In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm_notebook as tqdm
import sys
sys.path.append("../src/")
from logger import setup_logger, LOGGER
from trainer import train_lgbm
from util_tool import reduce_mem_usage
%matplotlib inline
from gensim.models import Word2Vec
pd.set_option('display.max_columns', 300)

In [2]:
# ==================
# Constant
# ==================
TRAIN_PATH = "../input/train.csv"
TEST_PATH = "../input/test.csv"
USER_PATH = "../input/user_x_anime.csv"

In [3]:
# =====================
# Settings
# =====================
SAVE_PATH = "../output/fe/fe010.feather"

In [4]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
user = pd.read_csv(USER_PATH)

In [5]:
user

Unnamed: 0,user_id,anime_id,rating,watching_status,watched_episodes
0,310366,11009,,6,0
1,321597,4063,,2,12
2,37696,114,9.0,2,26
3,316142,863,0.0,6,0
4,333376,32379,4.0,4,1
...,...,...,...,...,...
26992848,112800,617,,2,1
26992849,34131,16498,,2,25
26992850,110747,3927,8.0,2,25
26992851,94883,37497,,4,6


In [6]:
def join(df):
    x = [str(e) for e in list(df)]
    return " ".join(x)
docs = user.groupby("anime_id")["user_id"].apply(join).reset_index()

In [7]:
docs

Unnamed: 0,anime_id,user_id
0,1,224239 304695 159671 168286 107358 269963 6839...
1,5,60694 292766 227630 263928 161613 67633 207166...
2,6,189783 247823 317384 186582 75898 82347 341060...
3,7,23200 352054 27183 196753 50331 296444 334444 ...
4,8,301138 176539 351406 68862 207201 33842 16830 ...
...,...,...
12416,47398,49601 98067 302329 141191 256071 192352 12313 ...
12417,47402,35348 299483 208588 156092 224370 307026 72663...
12418,47614,352287 112173 298702 105314 49167 62732 281368...
12419,47616,347385 35099 86218 37692 334457 185217 147268 ...


In [8]:
user_id_list = [i.split(" ") for i in docs["user_id"]]

In [11]:
model = Word2Vec(user_id_list, vector_size=25, window=8, min_count=10, seed = 1)

In [14]:
model.wv[t]

array([ 0.20769429, -0.52633494,  0.2825187 , -0.577271  , -0.5315594 ,
        0.4782331 , -0.4064486 , -0.08355936,  0.20133924,  1.241977  ,
       -0.40565944, -0.45143375, -0.96313035,  0.6964697 ,  0.22254547,
        0.41469792, -0.01798311, -0.42459223,  0.42283988, -0.4728834 ,
        0.0108784 , -0.66292244,  0.87682337, -1.2631522 ,  0.16642274],
      dtype=float32)

In [15]:
train_embedding = np.zeros((len(user_id_list),model.vector_size))
vector_size = model.vector_size
for m,i in enumerate(tqdm(user_id_list)):
    embedding_ = np.zeros(vector_size)
    n = 0
    for t in i:
        try:
            embedding_ += model.wv[t]
            n += 1
        except KeyError:
            pass
    if n > 0:
        embedding_ /= n
    train_embedding[m,:] = embedding_

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/12421 [00:00<?, ?it/s]

In [17]:
train_embedding = pd.DataFrame(train_embedding)
train_embedding.columns = [f"user_id_emb_{i}" for i in range(len(train_embedding.columns))]

In [19]:
train_embedding["MAL_ID"] = docs["anime_id"]

In [18]:
cols = ["MAL_ID"]
train = pd.concat([train[cols],test[cols]]).reset_index(drop=True)

In [20]:
train = train.merge(train_embedding, how="left", on="MAL_ID")

In [21]:
train.head()

Unnamed: 0,MAL_ID,user_id_emb_0,user_id_emb_1,user_id_emb_2,user_id_emb_3,user_id_emb_4,user_id_emb_5,user_id_emb_6,user_id_emb_7,user_id_emb_8,user_id_emb_9,user_id_emb_10,user_id_emb_11,user_id_emb_12,user_id_emb_13,user_id_emb_14,user_id_emb_15,user_id_emb_16,user_id_emb_17,user_id_emb_18,user_id_emb_19,user_id_emb_20,user_id_emb_21,user_id_emb_22,user_id_emb_23,user_id_emb_24
0,1,0.10229,-0.365758,0.20519,-0.340661,-0.374389,0.276794,-0.225197,0.009175,0.060214,0.768348,-0.283377,-0.304937,-0.620362,0.475446,0.086024,0.282721,0.053735,-0.338736,0.250533,-0.280269,0.027448,-0.450279,0.589621,-0.783996,0.101727
1,16,0.106892,-0.417105,0.227414,-0.349482,-0.414721,0.258007,-0.209403,0.051625,0.024895,0.809126,-0.327126,-0.330655,-0.66251,0.53265,0.077705,0.327886,0.082292,-0.39566,0.257819,-0.301363,0.039265,-0.499097,0.658977,-0.835783,0.101267
2,22,0.09759,-0.428262,0.224944,-0.326305,-0.417663,0.221459,-0.178691,0.086413,-0.005084,0.775292,-0.337007,-0.32546,-0.642802,0.536308,0.061137,0.338721,0.100487,-0.411311,0.241916,-0.294526,0.045562,-0.498166,0.660915,-0.80766,0.089483
3,23,0.000148,-0.998276,0.381701,-0.09684,-0.784596,-0.418541,0.347105,0.90195,-0.691779,0.655408,-0.83404,-0.467902,-0.737008,1.049153,-0.223132,0.850205,0.58722,-1.100817,0.09014,-0.376615,0.223866,-0.883682,1.269114,-0.873574,-0.059426
4,47,0.096188,-0.416102,0.221239,-0.33416,-0.411639,0.242102,-0.196485,0.067321,0.016431,0.782168,-0.324935,-0.323808,-0.645043,0.523843,0.067901,0.324521,0.089544,-0.395625,0.247526,-0.294592,0.04068,-0.489621,0.647694,-0.810952,0.09286


In [22]:
train = reduce_mem_usage(train)
train.iloc[:,1:].to_feather(SAVE_PATH)

Memory usage of dataframe is 2.56 MB
column =  26
0
Memory usage after optimization is: 1.33 MB
Decreased by 48.1%


In [33]:
user

Unnamed: 0,user_id,anime_id,rating,watching_status,watched_episodes
0,310366,11009,,6,0
1,321597,4063,,2,12
2,37696,114,9.0,2,26
3,316142,863,0.0,6,0
4,333376,32379,4.0,4,1
...,...,...,...,...,...
26992848,112800,617,,2,1
26992849,34131,16498,,2,25
26992850,110747,3927,8.0,2,25
26992851,94883,37497,,4,6
