In [1]:
import os
import sys
import gc
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
with open('./data/user_train_test/user_train.json', 'r') as f:
    train = json.load(f)

In [3]:
with open('./data/user_train_test/user_test.json', 'r') as f:
    test = json.load(f)

In [4]:
train = pd.DataFrame(zip(train['userID'].values(),
                         train['wine_id'].values(),
                         train['rating_per_user'].values()),
                    columns = ['userID', 'wine_id', 'rating'])
test = pd.DataFrame(zip(test['userID'].values(),
                        test['wine_id'].values(),
                        test['rating_per_user'].values()),
                   columns = ['userID', 'wine_id', 'rating'])

In [5]:
train.head()

Unnamed: 0,userID,wine_id,rating
0,19484511,2532733,4.0
1,19484511,1253802,3.5
2,19484511,1123441,3.5
3,19484511,1157656,3.5
4,19484511,1134756,3.5


In [6]:
test.head()

Unnamed: 0,userID,wine_id,rating
0,19484511,1141133,4.0
1,352674,1141133,4.0
2,2148498,1141133,1.5
3,3450270,1141133,4.5
4,17786617,1141133,4.0


# normalize된 rating과 like 가져오기 -> 실패

In [65]:
with open('./data/train_v2_201130.json', 'r') as f:
    norm_train = json.load(f)

In [84]:
with open('./data/test_v2_201130.json', 'r') as f:
    norm_test = json.load(f)

In [66]:
norm_train.keys()

dict_keys(['index', 'user_note', 'rating_per_user', 'vintage_id', 'user_like_count', 'userID', 'wine_id', 'wine_name', 'url', 'like'])

In [69]:
norm_train = pd.DataFrame(zip(norm_train['userID'].values(),
                              norm_train['wine_id'].values(),
                              norm_train['rating_per_user'].values(),
                              norm_train['like'].values(),
                             ),
                    columns = ['userID', 'wine_id', 'rating', 'like'])
norm_train

Unnamed: 0,userID,wine_id,rating,like
0,19484511,1141133,4.0,1
1,352674,1141133,4.0,1
2,17786617,1141133,4.0,1
3,8078038,1141133,4.5,1
4,3014532,1141133,4.0,0
...,...,...,...,...
763382,11274168,87064,3.0,0
763383,11274168,63654,4.0,1
763384,11274168,5602,4.5,1
763385,11274168,1396664,3.0,0


In [85]:
norm_test = pd.DataFrame(zip(norm_test['userID'].values(),
                              norm_test['wine_id'].values(),
                              norm_test['rating_per_user'].values(),
                              norm_test['like'].values(),
                             ),
                    columns = ['userID', 'wine_id', 'rating', 'like'])

In [97]:
norm = pd.concat([norm_train, norm_test], axis = 0)

In [124]:
norm

Unnamed: 0,userID,wine_id,rating,like
0,19484511,1141133,4.0,1
1,352674,1141133,4.0,1
2,17786617,1141133,4.0,1
3,8078038,1141133,4.5,1
4,3014532,1141133,4.0,0
...,...,...,...,...
188713,46404017,1155203,4.0,1
188714,46404017,4382344,4.0,1
188715,46404017,1104850,2.5,0
188716,46404017,1981888,3.0,0


In [125]:
train = train.merge(norm.drop_duplicates(['userID', 'wine_id', 'rating']), on = ['userID', 'wine_id', 'rating'], how = 'left')

In [128]:
norm.loc[(norm['userID'] == 19484511) & (norm['wine_id'] == 10998)]

Unnamed: 0,userID,wine_id,rating,like


In [126]:
# 왜 like null이 존재하징 ㅠ
train.loc[train['like'].isnull()]

Unnamed: 0,userID,wine_id,rating,like
108,19484511,10998,3.0,
111,19484511,77132,5.0,
113,19484511,97902,4.0,
115,19484511,1162580,3.0,
122,19484511,17615,3.5,
...,...,...,...,...
937722,16607503,1273886,3.5,
937723,16607503,1743819,3.5,
937726,16607503,61744,4.0,
937742,16607503,1139255,4.5,


In [132]:
train.loc[train['userID'] == 19484511]

Unnamed: 0,userID,wine_id,rating,like
0,19484511,2532733,4.0,1.0
1,19484511,1253802,3.5,0.0
2,19484511,1123441,3.5,0.0
3,19484511,1157656,3.5,0.0
4,19484511,1134756,3.5,0.0
...,...,...,...,...
235,19484511,87222,4.5,1.0
236,19484511,24056,4.0,1.0
237,19484511,2955936,3.0,
238,19484511,1170133,4.0,1.0


In [133]:
norm.loc[norm['userID'] == 19484511]

Unnamed: 0,userID,wine_id,rating,like
0,19484511,1141133,4.0,1
30,19484511,2532733,4.0,1
36,19484511,1253802,3.5,0
38,19484511,1123441,3.5,0
39,19484511,1157656,3.5,0
...,...,...,...,...
154623,19484511,1690177,4.0,1
154624,19484511,1252056,3.5,0
154625,19484511,1992,3.5,0
154626,19484511,2389254,4.0,1


# Feature Engineering

In [7]:
# user가 wine_id를 rating하는 각 시점마다 history를 계산

In [8]:
train.userID.nunique()

6343

In [9]:
# userID '19484511'이 가지는 wine rating history
# index 내림차순으로 최근순
# 즉 index 0인 2532733에 대한 rating은 그 전 239개를 마시고 내린 rating이라고 볼 수 있음
u = train.userID.unique()[1]
train.loc[train['userID'] == u]

Unnamed: 0,userID,wine_id,rating
240,352674,23051,4.0
241,352674,18929,4.5
242,352674,1178210,4.0
243,352674,1211816,4.5
244,352674,76431,4.5
...,...,...,...
485,352674,1156411,4.0
486,352674,1191976,4.0
487,352674,1212255,4.5
488,352674,1189640,4.0


In [10]:
u_history = train.loc[train['userID'] == u]

In [11]:
# 해당 wine 이전 최근에 마신 wine 10개!
u_history['wine_id'][1:11]

241      18929
242    1178210
243    1211816
244      76431
245    2141966
246    1258186
247      22175
248    1926947
249    1870315
250    2929415
Name: wine_id, dtype: int64

In [12]:
u_history

Unnamed: 0,userID,wine_id,rating
240,352674,23051,4.0
241,352674,18929,4.5
242,352674,1178210,4.0
243,352674,1211816,4.5
244,352674,76431,4.5
...,...,...,...
485,352674,1156411,4.0
486,352674,1191976,4.0
487,352674,1212255,4.5
488,352674,1189640,4.0


In [13]:
## Embedding을 위한 seq부여

In [14]:
user_ids = train['userID'].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}

wine_ids = train['wine_id'].unique().tolist()
wine2wine_encoded = {x: i for i, x in enumerate(wine_ids)}
wine_encoded2wine = {i: x for i, x in enumerate(wine_ids)}

train['enc_userID'] = train['userID'].map(user2user_encoded)
train["enc_wine_id"] = train['wine_id'].map(wine2wine_encoded)

test['enc_userID'] = test['userID'].map(user2user_encoded)
test["enc_wine_id"] = test['wine_id'].map(wine2wine_encoded)

In [15]:
train

Unnamed: 0,userID,wine_id,rating,enc_userID,enc_wine_id
0,19484511,2532733,4.0,0,0
1,19484511,1253802,3.5,0,1
2,19484511,1123441,3.5,0,2
3,19484511,1157656,3.5,0,3
4,19484511,1134756,3.5,0,4
...,...,...,...,...,...
937751,16607503,1135203,3.5,6342,4672
937752,16607503,4303173,3.0,6342,1023
937753,16607503,2364090,4.0,6342,14075
937754,16607503,1679387,3.5,6342,3894


In [183]:
train['recent'] = None
train['like'] = None
train['dislike'] = None
test['recent'] = None
test['like'] = None
test['dislike'] = None

for u in tqdm(train.enc_userID.unique()):

    u_history = train.loc[train['enc_userID'] == u] # user의 history 전체
    test_history = list(u_history['enc_wine_id'][:10]) # test의 history
    test_like = list(u_history.loc[u_history['rating'] > 4, 'enc_wine_id'][:10])
    test_dislike = list(u_history.loc[u_history['rating'] < 3.5, 'enc_wine_id'][:10])
    
    recent_history = []
    like_list = []
    dislike_list = []
    for i, (idx, args) in enumerate(zip(u_history.index, u_history.values)):
        recent = list(u_history['enc_wine_id'][1+i : 11+i])
        recent_history.append(recent)
        like = list(u_history.loc[0+idx:].loc[u_history['rating'] > 4, 'enc_wine_id'][:10])
        like_list.append(like)
        dislike = list(u_history.loc[0+idx:].loc[u_history['rating'] < 3.5, 'enc_wine_id'][:10])
        dislike_list.append(dislike)
    
    train.loc[train['enc_userID'] == u, 'recent'] = recent_history
    try:
        train.loc[train['enc_userID'] == u, 'like'] = like_list
    except:
        pass
    try:
        train.loc[train['enc_userID'] == u, 'dislike'] = dislike_list
    except:
        pass
    
    i = test.loc[test['enc_userID'] == u].index[0]
    test._set_value(i, 'recent', test_history)
    test._set_value(i, 'like', test_like)
    test._set_value(i, 'dislike', test_dislike)

100%|██████████| 6343/6343 [39:38<00:00,  2.67it/s] 


In [185]:
idx = train.loc[train['like'].isnull(), 'like'].index
for i in idx:
    train._set_value(i, 'like', [])

In [186]:
idx = train.loc[train['dislike'].isnull(), 'dislike'].index
for i in idx:
    train._set_value(i, 'dislike', [])

In [187]:
train.loc[train['like'].apply(lambda x: type(x)) == int, 'like'] = (train.loc[train['like']
                                                                              .apply(lambda x: type(x)) == int, 'like']
                                                                    .apply(lambda x : [x])
)

train.loc[train['dislike'].apply(lambda x: type(x)) == int, 'dislike'] = (train.loc[train['dislike']
                                                                                   .apply(lambda x: type(x)) == int, 'dislike']
                                                                          .apply(lambda x : [x])
                                                                         )

In [188]:
import joblib
joblib.dump({
    'train' : train,
    'test' : test
}, './dataset.pkl')

['./dataset.pkl']

## 아이템 메타 추가

In [41]:
item = pd.read_csv('./data/Wine_Meta_final_201208.csv')

In [42]:
item

Unnamed: 0,wine_id,name,rating_count,rating_average,label_count,review_count,type_id,body,acidity_x,alcohol,...,wood smoke,wood varnish,yeast,yellow apple,yellow beet,yellow peach,yellow plum,yellow raisin,yerba mate,yogurt
0,1938520,1882 Cabernet Sauvignon,1697,4.1,14879,16,1,5.0,2.0,14.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,14604,Les Bessards Hermitage,1078,4.3,5370,3,1,5.0,3.0,14.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1930757,Patriarch Estate Grown,1072,4.6,6042,25,1,4.0,3.0,14.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1564280,Merlot,3577,4.3,18748,52,1,4.0,3.0,14.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2576427,Cabernet Sauvignon F Block,115,4.4,806,1,1,5.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50855,1669561,Garganega - Pinot Grigio,788,3.5,6635,9,2,3.0,3.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50856,1861275,Dadà Langhe Chardonnay,231,3.8,961,6,2,3.0,3.0,13.5,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
50857,2201892,Metodo Zero Prosecco Extra Dry,390,3.9,1983,14,3,1.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50858,2396179,Les Monts Damnés Sancerre,302,4.2,730,4,2,4.0,3.0,13.5,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [193]:
# 1886805 해당 wine에 대한 meta 정보가 없음
train.loc[train['wine_id'] == 1886805].shape[0]

47

In [194]:
train_item = train.merge(item,
           on = 'wine_id',
           how = 'left')

In [195]:
test_item = test.merge(item,
           on = 'wine_id',
           how = 'left')

In [196]:
train_item.loc[5168, item.columns[1:]] = item.loc[item['wine_id'] == 1183966].values[0][1:]

In [197]:
for idx in tqdm(train_item.loc[train_item['wine_id'] == 1886805].index):
    train_item.loc[idx, item.columns[1:]] = item.loc[item['wine_id'] == 1183966].values[0][1:]

100%|██████████| 47/47 [01:24<00:00,  1.81s/it]


In [198]:
train_item.loc[train_item['wine_id'] == 1886805].head()

Unnamed: 0,userID,wine_id,rating,enc_userID,enc_wine_id,recent,like,dislike,name,rating_count,...,wood smoke,wood varnish,yeast,yellow apple,yellow beet,yellow peach,yellow plum,yellow raisin,yerba mate,yogurt
5168,15409535,1886805,3.5,18,3924,"[3925, 1553, 3926, 3927, 3928, 3929, 3100, 393...","[3254, 2182, 3951, 3958, 3970, 3897, 3995, 400...","[3935, 3948, 3949, 991, 3954, 3961, 3964, 3967...",Méthode Champenoise Brut (Gold Label),4712.0,...,0.0,0.0,26.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
120521,643840,1886805,4.0,466,3924,"[18041, 19996, 2520, 27649, 33812, 34386, 1512...","[18041, 19996, 2520, 15125, 4709, 19271, 28926...","[34386, 2997, 34388, 3922, 14182, 3091, 10438,...",Méthode Champenoise Brut (Gold Label),4712.0,...,0.0,0.0,26.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
140564,13358262,1886805,3.0,545,3924,"[17669, 2548, 8819, 3537, 3537, 6872, 20193, 3...","[2548, 3537, 3537, 20193, 22192, 36916, 19289,...","[3924, 36911, 7917, 36912, 36913, 4917, 44, 18...",Méthode Champenoise Brut (Gold Label),4712.0,...,0.0,0.0,26.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
144555,12936090,1886805,2.5,560,3924,"[6879, 3092, 6552, 12940, 6880, 3846, 3695, 33...","[6880, 20201, 2514, 7928, 37415, 22940, 19443,...","[3924, 6879, 6552, 12940, 26679, 5886, 19697, ...",Méthode Champenoise Brut (Gold Label),4712.0,...,0.0,0.0,26.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
173278,2301477,1886805,3.5,674,3924,"[4089, 4252, 33914, 2805, 3742, 3660, 13173, 3...","[4252, 3742, 3660, 13173, 1605, 17558, 14178, ...","[5412, 2320, 3937, 33387, 7219, 9884, 3363, 10...",Méthode Champenoise Brut (Gold Label),4712.0,...,0.0,0.0,26.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [199]:
joblib.dump({
    'train' : train_item,
    'test' : test_item
}, './dataset_item.pkl')

['./dataset_item.pkl']

In [121]:
from sklearn.decomposition import PCA

In [167]:
pca = PCA(n_components = 50)

In [142]:
categorical_columns = list(item.columns[item.columns.str.contains('_id')])

In [161]:
categorical_columns = categorical_columns[1:] + ['country_code']

In [162]:
item_continuous = item.drop(categorical_columns[1:], axis = 1)

In [168]:
reduced = pca.fit_transform(item_continuous.iloc[:, 2:].fillna(0))

In [177]:
item_reduced = pd.DataFrame(reduced, index = item.wine_id, columns = ['pca_{}'.format(i) for i in range(1, 51)])

In [202]:
item_reduced = item_reduced.reset_index()

In [204]:
train_item_pca = train.merge(item_reduced,
           on = 'wine_id',
           how = 'left')

In [210]:
test_item_pca = test.merge(item_reduced,
           on = 'wine_id',
           how = 'left')

In [212]:
joblib.dump({
    'train' : train_item_pca,
    'test' : test_item_pca
}, './dataset_item_pca.pkl')

['./dataset_item_pca.pkl']

In [213]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()