# 메타데이터 전처리

1. read에 없는 글은 제거한다. (즉, 메타데이터에서 유저가 읽은적이 없는 글은 제거한다.)
2. metadata를 찾을 수 없는 유저가 읽은 글은 제거한다.
3. 필요 feature 추가 (reg_dt, type, read_cnt(=class) 및 필요없는거 제거함
4. 유저가 읽은 데이터에 metada 합치기

In [201]:
import numpy as np
import pandas as pd
import sys, os
sys.path.append(os.pardir)

from collections import Counter
from datetime import timedelta, datetime
import glob
from itertools import chain
import json
import re

import config as conf

## 데이터 로드 

In [202]:
metadata = pd.read_json(conf.data_root + 'metadata.json', lines=True)


In [203]:
print(len(metadata))

643104


In [204]:
metadata.head()

Unnamed: 0,magazine_id,user_id,title,keyword_list,display_url,sub_title,reg_ts,article_id,id
0,8982,@bookdb,"사진으로 옮기기에도 아까운, 리치필드 국립공원","[여행, 호주, 국립공원]",https://brunch.co.kr/@bookdb/782,세상 어디에도 없는 호주 Top 10,1474944427000,782,@bookdb_782
1,12081,@kohwang56,[시] 서러운 봄,"[목련꽃, 아지랑이, 동행]",https://brunch.co.kr/@kohwang56/81,,1463092749000,81,@kohwang56_81
2,0,@hannahajink,무엇을 위해,[],https://brunch.co.kr/@hannahajink/4,무엇 때문에,1447997287000,4,@hannahajink_4
3,16315,@bryceandjuli,싫다,"[감정, 마음, 위로]",https://brunch.co.kr/@bryceandjuli/88,,1491055161000,88,@bryceandjuli_88
4,29363,@mijeongpark,Dubliner#7,"[유럽여행, 더블린, 아일랜드]",https://brunch.co.kr/@mijeongpark/34,#7. 내 친구의 집은 어디인가,1523292942000,34,@mijeongpark_34


In [205]:
read_file_lst = glob.glob(conf.data_root + 'read/*')
exclude_file_lst = ['read.tar']


In [206]:
read_df_lst = []
for f in read_file_lst:
    file_name = os.path.basename(f)
    if file_name in exclude_file_lst:
        print(file_name)
    else:
        df_temp = pd.read_csv(f, header=None, names=['raw'])
        df_temp['dt'] = file_name[:8]
        df_temp['hr'] = file_name[8:10]
        df_temp['user_id'] = df_temp['raw'].str.split(' ').str[0]
        df_temp['article_id'] = df_temp['raw'].str.split(' ').str[1:].str.join(' ').str.strip()
        read_df_lst.append(df_temp)

In [207]:
read = pd.concat(read_df_lst)

In [208]:
read.head()

Unnamed: 0,raw,dt,hr,user_id,article_id
0,#a055d0c3520e1c002531001928217887 @charlessay_...,20181121,14,#a055d0c3520e1c002531001928217887,@charlessay_30 @wal8am_27 @uglyduckmin_40 @ant...
1,#a9cba8bf098c149170315d4cd8d3082e @jinbread_88...,20181121,14,#a9cba8bf098c149170315d4cd8d3082e,@jinbread_88 @jinbread_87 @jinbread_89
2,#207c8cb590a843e247b9fe7fdd0e5281 @sangheeshyn...,20181121,14,#207c8cb590a843e247b9fe7fdd0e5281,@sangheeshyn_66
3,#d76887a5b30adbf793f742c6f67b57bd @tenbody_150...,20181121,14,#d76887a5b30adbf793f742c6f67b57bd,@tenbody_1506 @tenbody_1506
4,#b260d83247a4d9e3b22143c950f7f471 @maumdal_257...,20181121,14,#b260d83247a4d9e3b22143c950f7f471,@maumdal_257 @maumdal_257


In [209]:
def chainer(s):
    return list(chain.from_iterable(s.str.split(' ')))

In [210]:
read_cnt_by_user = read['article_id'].str.split(' ').map(len)

In [211]:
read_raw = pd.DataFrame({'dt': np.repeat(read['dt'], read_cnt_by_user),
                         'hr': np.repeat(read['hr'], read_cnt_by_user),
                         'user_id': np.repeat(read['user_id'], read_cnt_by_user),
                         'article_id': chainer(read['article_id'])})

In [212]:
read_raw.head()

Unnamed: 0,dt,hr,user_id,article_id
0,20181121,14,#a055d0c3520e1c002531001928217887,@charlessay_30
0,20181121,14,#a055d0c3520e1c002531001928217887,@wal8am_27
0,20181121,14,#a055d0c3520e1c002531001928217887,@uglyduckmin_40
0,20181121,14,#a055d0c3520e1c002531001928217887,@anti-essay_133
0,20181121,14,#a055d0c3520e1c002531001928217887,@roysday_125


## metdata 전처리

In [213]:
atc = metadata.copy()

In [214]:
# reg_datetime, reg_dt, type 추가
atc['reg_datetime'] = atc['reg_ts'].apply(lambda x : datetime.fromtimestamp(x/1000.0)) # timestamp를 datetime으로 바꿔줌
atc.loc[atc['reg_datetime'] == atc['reg_datetime'].min(), 'reg_datetime'] = datetime(2090, 12, 31)
atc['reg_dt'] = atc['reg_datetime'].dt.date # date만 가져오기. (글 등록 날짜)
atc['type'] = atc['magazine_id'].apply(lambda x : '개인' if x == 0.0 else '매거진') # 매거진 id가 0이면 개인, 아니면 매거진으로 분류

In [215]:
# 컬럼명 변경
atc.columns = ['magazine_id', 'author_id', 'title', 'keyword_list', 'display_url', 'sub_title', 'reg_ts', 'id', 'article_id', 'reg_datetime', 'reg_dt', 'type']


In [216]:
atc.head()

Unnamed: 0,magazine_id,author_id,title,keyword_list,display_url,sub_title,reg_ts,id,article_id,reg_datetime,reg_dt,type
0,8982,@bookdb,"사진으로 옮기기에도 아까운, 리치필드 국립공원","[여행, 호주, 국립공원]",https://brunch.co.kr/@bookdb/782,세상 어디에도 없는 호주 Top 10,1474944427000,782,@bookdb_782,2016-09-27 11:47:07,2016-09-27,매거진
1,12081,@kohwang56,[시] 서러운 봄,"[목련꽃, 아지랑이, 동행]",https://brunch.co.kr/@kohwang56/81,,1463092749000,81,@kohwang56_81,2016-05-13 07:39:09,2016-05-13,매거진
2,0,@hannahajink,무엇을 위해,[],https://brunch.co.kr/@hannahajink/4,무엇 때문에,1447997287000,4,@hannahajink_4,2015-11-20 14:28:07,2015-11-20,개인
3,16315,@bryceandjuli,싫다,"[감정, 마음, 위로]",https://brunch.co.kr/@bryceandjuli/88,,1491055161000,88,@bryceandjuli_88,2017-04-01 22:59:21,2017-04-01,매거진
4,29363,@mijeongpark,Dubliner#7,"[유럽여행, 더블린, 아일랜드]",https://brunch.co.kr/@mijeongpark/34,#7. 내 친구의 집은 어디인가,1523292942000,34,@mijeongpark_34,2018-04-10 01:55:42,2018-04-10,매거진


In [217]:
# 글당 유저가 본 횟수(?)
atc_read_cnt = read_raw[read_raw.article_id != ''].groupby('article_id')['user_id'].count()


In [218]:
atc_read_cnt.head()

article_id
#00700c454af49d5c9a36a13fcba01d0a_1      112
#00700c454af49d5c9a36a13fcba01d0a_10      24
#00700c454af49d5c9a36a13fcba01d0a_100     37
#00700c454af49d5c9a36a13fcba01d0a_101     32
#00700c454af49d5c9a36a13fcba01d0a_102     40
Name: user_id, dtype: int64

In [219]:
atc_read_cnt = atc_read_cnt.reset_index()
atc_read_cnt.columns = ['article_id', 'read_cnt']

In [220]:
# 유저가 읽은 적이 없는 글은 제외함
atc_read_cnt = pd.merge(atc_read_cnt, atc, how='left', left_on='article_id', right_on='article_id')


In [221]:
atc_read_cnt.head()

Unnamed: 0,article_id,read_cnt,magazine_id,author_id,title,keyword_list,display_url,sub_title,reg_ts,id,reg_datetime,reg_dt,type
0,#00700c454af49d5c9a36a13fcba01d0a_1,112,,,,,,,,,NaT,,
1,#00700c454af49d5c9a36a13fcba01d0a_10,24,,,,,,,,,NaT,,
2,#00700c454af49d5c9a36a13fcba01d0a_100,37,,,,,,,,,NaT,,
3,#00700c454af49d5c9a36a13fcba01d0a_101,32,,,,,,,,,NaT,,
4,#00700c454af49d5c9a36a13fcba01d0a_102,40,,,,,,,,,NaT,,


In [222]:
len(atc_read_cnt)

505840

In [223]:
# metadata를 찾을 수 없는 소비 로그(유저가 읽은 글) 제외
# id가 없는 글들 삭제

atc_read_cnt_nn = atc_read_cnt[atc_read_cnt['id'].notnull()]

In [224]:
atc_read_cnt_nn.head(2)

Unnamed: 0,article_id,read_cnt,magazine_id,author_id,title,keyword_list,display_url,sub_title,reg_ts,id,reg_datetime,reg_dt,type
5417,@002_10,2,0.0,@002,'80% 안심계란' 포기하게 한 20% 살충제 계란,"[계란, 살충제, 피해]",https://brunch.co.kr/@002/10,정직한 농가에 대한 피해보상은 소비자 신뢰를 돌려주는 것,1504448000000.0,10.0,2017-09-03 23:12:35,2017-09-03,개인
5418,@002_2,6,0.0,@002,청년들은 '왜' 농촌으로 갔을까?,"[귀촌, 농촌, 청년농업인]",https://brunch.co.kr/@002/2,삶의 가치를 찾아 유랑하는 청년들의 'YOLO 라이프',1501030000000.0,2.0,2017-07-26 09:46:45,2017-07-26,개인


In [225]:
len(atc_read_cnt_nn)

476147

In [226]:
# 소비수 기준 분류값
def get_class(x):
    if x >= 142:
        result = '5%'
    elif x >= 72:
        result = '10%'
    elif x >= 25:
        result = '25%'
    elif x >= 8:
        result = '50%'
    elif x >= 3:
        result = '75%'
    else:
        result = '100%'
    return result

In [227]:
# 쇱수 기준으로 분류해 
atc_read_cnt_nn['class'] = atc_read_cnt_nn['read_cnt'].map(get_class)


In [228]:
atc_read_cnt_nn.head(2)

Unnamed: 0,article_id,read_cnt,magazine_id,author_id,title,keyword_list,display_url,sub_title,reg_ts,id,reg_datetime,reg_dt,type,class
5417,@002_10,2,0.0,@002,'80% 안심계란' 포기하게 한 20% 살충제 계란,"[계란, 살충제, 피해]",https://brunch.co.kr/@002/10,정직한 농가에 대한 피해보상은 소비자 신뢰를 돌려주는 것,1504448000000.0,10.0,2017-09-03 23:12:35,2017-09-03,개인,100%
5418,@002_2,6,0.0,@002,청년들은 '왜' 농촌으로 갔을까?,"[귀촌, 농촌, 청년농업인]",https://brunch.co.kr/@002/2,삶의 가치를 찾아 유랑하는 청년들의 'YOLO 라이프',1501030000000.0,2.0,2017-07-26 09:46:45,2017-07-26,개인,75%


In [266]:
metadata_df = atc_read_cnt_nn[['article_id', 'read_cnt', 'title', 'keyword_list', 'sub_title', 'reg_dt', 'reg_ts', 'type']]

In [267]:
metadata_df['keyword_list'] = metadata_df['keyword_list'].apply(lambda x: ','.join(x))

In [268]:
metadata_df.head()

Unnamed: 0,article_id,read_cnt,title,keyword_list,sub_title,reg_dt,reg_ts,type
5417,@002_10,2,'80% 안심계란' 포기하게 한 20% 살충제 계란,"계란,살충제,피해",정직한 농가에 대한 피해보상은 소비자 신뢰를 돌려주는 것,2017-09-03,1504448000000.0,개인
5418,@002_2,6,청년들은 '왜' 농촌으로 갔을까?,"귀촌,농촌,청년농업인",삶의 가치를 찾아 유랑하는 청년들의 'YOLO 라이프',2017-07-26,1501030000000.0,개인
5419,@002jesus_1,10,우리는 왜 가난해야 하나요?,"후원,기부,비영리",비영리를 바라보는 편견들 (1),2019-02-06,1549456000000.0,개인
5420,@002paper_2,3,할수 있는 것과 하고 싶은 것,"제주,서울생활,서울",현실과 꿈의 거리..,2016-08-11,1470905000000.0,개인
5421,@002paper_3,2,긍정적으로 생각하며 주문을 걸기..,"바램,일상에세이,긍정",된다. 된다. 될거다....,2016-08-11,1470913000000.0,개인


In [269]:
metadata_df.to_csv('../res/metadata.csv', index=False)

In [238]:
# 메타데이터에 없는 이상한 글idx 제거
content_ids = metadata["id"].unique().tolist()
content2idx = {x: i+1 for i, x in enumerate(content_ids)}
idx2content = {i+1: x for i, x in enumerate(content_ids)}


def remove_unable_id(x):
    arr = []
    for i in x:
        if i in content2idx.keys():
            arr.append(i)
    return arr



In [240]:
DATA_PATH = '../tmp/train'

# 유저가 본 데이터 로드
df = pd.read_csv(DATA_PATH, names=['user'])

# 유저, 유저가 본 글 분리
df['seen'] = df['user'].apply(lambda x: x.split()[1:])
df['user'] = df['user'].apply(lambda x: x.split()[0])
df['seen'] = df['seen'].apply(remove_unable_id) # 메타데이터에 없는 글 삭제

In [243]:
df.to_csv('../res/train.csv')

In [244]:
df.head()

Unnamed: 0,user,seen
0,#bfc17aa1eed7f6e1993aaeac8333bdb9,"[@eundang_472, @eundang_472, @bookguru_29, @bo..."
1,#1d68c0320dad496c3f05dd862e543a8b,"[@julien_111, @julien_111, @bree_41, @bree_41,..."
2,#7411fe2ebde59b981f7b9e22c153b3bb,"[@zirimnet_9, @font_70, @zirimnet_10, @ultra00..."
3,#45104ed2c8ccb8905753617109f3776e,"[@dong02_1048, @dong02_1044, @dong02_1044, @do..."
4,#b4990f775b46463f33a9a60369149511,"[@fernweh19xx_34, @jonnaalive_288, @fernweh19x..."


In [None]:
#########

In [142]:
sub_data = metadata_df[['article_id', 'read_cnt', 'reg_ts']] # 글 id, 조회수, 글 등록한 timestamp

In [143]:
sub_data.head()

Unnamed: 0,article_id,read_cnt,reg_ts
5417,@002_10,1e-05,0.969757
5418,@002_2,5.1e-05,0.967554
5419,@002jesus_1,9.3e-05,0.998769
5420,@002paper_2,2.1e-05,0.948136
5421,@002paper_3,1e-05,0.948141


In [144]:
sub_data.to_csv('/Users/ohyeji/Desktop/연구실/대회_해커톤/kakao-arena-2nd/res/' + 'metadata_subset.csv', index=False)

In [None]:
############3

In [None]:
# 유저가 읽은 데이터에 metada 합치기

In [183]:
len(read_raw)

22110706

In [184]:
df = pd.merge(read_raw, metadata_df, how='left', left_on='article_id', right_on='article_id')

In [185]:
df['read_dt'] = pd.to_datetime(df['dt'], format='%Y%m%d')
df['reg_dt'] = pd.to_datetime(df['reg_dt'], format='%Y-%m-%d')
df['off_day'] = (df['read_dt'] - df['reg_dt']).dt.days

In [187]:
df.head(1)

Unnamed: 0,dt,hr,user_id,article_id,read_cnt,title,keyword_list,sub_title,reg_dt,reg_ts,type,read_dt,off_day
0,20181121,14,#a055d0c3520e1c002531001928217887,@charlessay_30,31.0,프랑스 인큐베이팅 프로그램 킥오프(2),"[IT, 스타트업]",자신의 비전을 이루기 위해 여행을 떠나는 길동무들,2018-11-18,1542530000000.0,매거진,2018-11-21,3.0


In [191]:
# df 중에서 지금 당장 쓸 피처만 suf_df 에 저장

In [193]:
sub_df = df[['user_id', 'article_id', 'reg_ts', 'off_day', 'type', 'read_cnt']]

In [194]:
def normalize_col(df, col_name):
    df[col_name] = (df[col_name] - df[col_name].min()) / (df[col_name].max() - df[col_name].min())
    return df

In [195]:
# 값 정규화함
sub_df = normalize_col(sub_df, 'reg_ts')
sub_df = normalize_col(sub_df, 'read_cnt')
sub_df = normalize_col(sub_df, 'off_day')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [196]:
sub_df.head()

Unnamed: 0,user_id,article_id,reg_ts,off_day,type,read_cnt
0,#a055d0c3520e1c002531001928217887,@charlessay_30,0.994304,0.950034,매거진,0.000309
1,#a055d0c3520e1c002531001928217887,@wal8am_27,0.994345,0.949998,매거진,0.001461
2,#a055d0c3520e1c002531001928217887,@uglyduckmin_40,0.99445,0.949926,매거진,0.000154
3,#a055d0c3520e1c002531001928217887,@anti-essay_133,0.994041,0.950178,매거진,0.131866
4,#a055d0c3520e1c002531001928217887,@roysday_125,0.975028,0.96249,매거진,0.005226


In [199]:
def splitter(df):
    gp_user_like = df.groupby(['user_id']) # liked와 user_id로 그룹화
    return ([gp_user_like.get_group(gp)['article_id'].tolist() for gp in gp_user_like.groups]) 

In [200]:
pd.options.mode.chained_assignment = None
splitted_articles = splitter(sub_df)