# 전처리

- train data: ./tmp/train
    - (유저id, @글id1, @글id2, @글id3...)
    - 즉 유저x아이템 행렬
- val data: ./tmp/dev


주의) 평가시 사용하게될 데이터와는 다르게 여기서 분할한 데이터에는 한 사용자가 본 데이터가 학습과 평가 데이터에 모두 등장할 수도 있습니다.

- 유저가 읽은 글: ./res/read
- 글의 메타데이터: ./res/metadata.json
- 글 본문 정보: ./res/contents
- 사용자 정보: ./res/users.json
- 매거진 정보: ./res/magazine.json
- 예측할 사용자 정보: ./res/predict/dev.users



In [1]:
import os
import random

import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

import matplotlib.pyplot as plt


import config as conf


In [2]:
# Input split that this notebook preprocesses.
DATA_PATH = './tmp/dev'
# TRAIN_PATH = './tmp/train.csv'
# VAL_PATH = './tmp/val.csv'
# os.path.join works whether or not conf.data_root ends with a path separator,
# whereas plain string concatenation silently builds a wrong path without one.
METADATA_PATH = os.path.join(conf.data_root, 'metadata.json')

In [3]:
# Load the reading log; every row is read into a single raw text column.
raw_lines = pd.read_csv(DATA_PATH, names=['user'])['user']

# Split each row on whitespace once: the first token is the user id,
# the remaining tokens are the contents that user has seen.
tokens = raw_lines.apply(lambda line: line.split())
df = pd.DataFrame({
    'user': tokens.apply(lambda parts: parts[0]),
    'seen': tokens.apply(lambda parts: parts[1:]),
})

df.head()

Unnamed: 0,user,seen
0,#a0923400c3255b40221fcf35a0735bd4,"[@brunch_151, @syshine7_56, @cli-annah_639, @t..."
1,#1f6cf792f08224f9fd98713bb57297b8,"[@ellieyang47uu_193, @vivasunita_153, @contigo..."
2,#9e90d727a0534213ca10f94006d084f8,"[@workerhanee_248, @workerhanee_248, @workerha..."
3,#f39aef81bc8dce2c48e6e5c24cbf19f4,"[@ellieyang47uu_193, @moment-yet_161, @peregri..."
4,#1ef87663bb35c1ff8dd175ed22bc863d,"[@peregrino97_942, @peregrino97_884, @peregrin..."


In [4]:
# Load the article metadata (read as line-delimited JSON).
metadata = pd.read_json(path_or_buf=METADATA_PATH, lines=True)

metadata.head()

Unnamed: 0,magazine_id,user_id,title,keyword_list,display_url,sub_title,reg_ts,article_id,id
0,8982,@bookdb,"사진으로 옮기기에도 아까운, 리치필드 국립공원","[여행, 호주, 국립공원]",https://brunch.co.kr/@bookdb/782,세상 어디에도 없는 호주 Top 10,1474944427000,782,@bookdb_782
1,12081,@kohwang56,[시] 서러운 봄,"[목련꽃, 아지랑이, 동행]",https://brunch.co.kr/@kohwang56/81,,1463092749000,81,@kohwang56_81
2,0,@hannahajink,무엇을 위해,[],https://brunch.co.kr/@hannahajink/4,무엇 때문에,1447997287000,4,@hannahajink_4
3,16315,@bryceandjuli,싫다,"[감정, 마음, 위로]",https://brunch.co.kr/@bryceandjuli/88,,1491055161000,88,@bryceandjuli_88
4,29363,@mijeongpark,Dubliner#7,"[유럽여행, 더블린, 아일랜드]",https://brunch.co.kr/@mijeongpark/34,#7. 내 친구의 집은 어디인가,1523292942000,34,@mijeongpark_34


In [5]:
# Encode article ids as integer indices.
content_ids = metadata["id"].unique().tolist()

# Indices start at 1, so 0 is never assigned to an article.
content2idx = {content_id: idx for idx, content_id in enumerate(content_ids, start=1)}
# The ids are unique, so inverting the forward table yields the reverse lookup.
idx2content = {idx: content_id for content_id, idx in content2idx.items()}

content_size = len(content_ids)  # number of distinct articles
print(content_size)

def content2idx_(x, mapping=None):
    """Translate content ids into their integer indices, skipping unknown ids.

    Args:
        x: Iterable of content ids.
        mapping: Optional dict from content id to integer index. When omitted,
            the notebook-level ``content2idx`` table is used.

    Returns:
        list: Indices of the ids present in the mapping, in input order with
        duplicates preserved. Ids missing from the mapping are dropped.
    """
    if mapping is None:
        mapping = content2idx
    # Membership is tested on the dict itself; no need to materialise .keys().
    return [mapping[i] for i in x if i in mapping]

643104


In [6]:
# Replace each user's content ids with their integer indices (unknown ids are dropped).
# NOTE: not idempotent - re-running this cell feeds the already-encoded integers
# back through content2idx_, which finds none of them and leaves empty lists.
df['seen'] = df['seen'].map(content2idx_)

In [7]:
# Placeholder labels: one random integer per user => replace with real targets later!!
# A fixed seed keeps these placeholders identical across kernel restarts.
LABEL_SEED = 42
label_rng = np.random.default_rng(LABEL_SEED)

# Integers drawn uniformly from [0, content_size): the same value range the
# previous float draw followed by int() truncation produced.
arr = label_rng.integers(0, content_size, size=len(df))

df['label'] = arr

In [8]:
# Persist the encoded frame; the row index is written out as the first CSV column.
df.to_csv(path_or_buf='./tmp/train.csv', index=True)

In [9]:
# Display the frame to inspect the encoded 'seen' lists and the placeholder labels.
df

Unnamed: 0,user,seen,label
0,#a0923400c3255b40221fcf35a0735bd4,"[237937, 108324, 525760, 26051, 71998, 237937]",247447
1,#1f6cf792f08224f9fd98713bb57297b8,"[195230, 574818, 9342, 48531, 536457]",616447
2,#9e90d727a0534213ca10f94006d084f8,"[431904, 431904, 431904, 431904, 319127, 31912...",80515
3,#f39aef81bc8dce2c48e6e5c24cbf19f4,"[195230, 277972, 604612, 96932, 21475, 393538,...",84342
4,#1ef87663bb35c1ff8dd175ed22bc863d,"[604612, 88841, 59834, 612001, 149650, 29950, ...",232364
...,...,...,...
57907,#54fd2bf34ecbcf66b878e224eb288286,[485148],36404
57908,#6bf76601814bda659db04800a150857f,"[629602, 629871, 629926, 629585, 629585, 62951...",343574
57909,#019192a08c9ee661bca41aace483984a,"[37629, 59403, 204259]",51904
57910,#1d4aa85592f8f40dfb458381203d1a29,"[39624, 39624, 39624, 39624, 39624, 39624, 641...",597199


In [25]:
# Hyperparameter settings
model_name = "softmax_regression"
BATCH_SIZE = 2
NUM_EPOCHS = 3
VALID_SPLIT = 0.1
MAX_SEEN = 20  # revisit after EDA

# NOTE(review): content indices are assigned from 1 to content_size, so if the
# labels become real content indices the model may need content_size + 1
# classes - confirm against the model definition.
kargs = dict(
    model_name=model_name,
    num_layers=2,
    embedding_dim=16,
    dense_unit=64,
    num_class=content_size,
)

In [18]:
# X_train, X_test, y_train, y_test = train_test_split(df['seen'], df['label'], test_size=0.33, random_state=42)

In [26]:
# Bring every user's history to MAX_SEEN entries so the features form a dense matrix.
train_X = pad_sequences(df['seen'], maxlen=MAX_SEEN, dtype='float32')
train_y = np.array(df['label'], dtype='float32')

# Hand the target paths straight to np.save so NumPy opens and closes the files
# itself; the file objects previously created with open(..., 'wb') were never closed.
np.save('./tmp/train_X.npy', train_X)
np.save('./tmp/train_y.npy', train_y)