In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm_notebook as tqdm
import sys
sys.path.append("../src/")
from logger import setup_logger, LOGGER
from trainer import train_lgbm
from util_tool import reduce_mem_usage
%matplotlib inline
from gensim.models import Word2Vec
pd.set_option('display.max_columns', 300)

In [2]:
# ==================
# Constant
# ==================
# Locations of the raw CSV inputs, relative to the notebook's working directory.
# Each one is loaded with pd.read_csv further down.
TRAIN_PATH = "../input/train.csv"  # loaded into `train`; only its MAL_ID column is used
TEST_PATH = "../input/test.csv"  # loaded into `test`; only its MAL_ID column is used
USER_PATH = "../input/user_x_anime.csv"  # loaded into `user`; user_id / anime_id feed Word2Vec

In [3]:
# =====================
# Settings
# =====================
# Destination of the feather file that receives the final embedding table
# (written with DataFrame.to_feather at the end of the notebook).
SAVE_PATH = "../output/fe/fe021.feather"

In [7]:
# Read the three raw CSVs in order; the tuple of paths lines up with the
# tuple of target names on the left-hand side.
train, test, user = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH, USER_PATH)
)

In [8]:
# Build the two string tokens once, then write both derived columns.
# The "_u" suffix presumably keeps user tokens from colliding with the plain
# numeric anime_id tokens that end up in the same Word2Vec vocabulary — TODO confirm.
user_token = user["user_id"].astype(str) + "_" + "u"
anime_token = user["anime_id"].astype(str)

user["user_id"] = user_token
# Space-separated "<user token> <anime id>" pair, split back into tokens later.
user["user_id_anime_id"] = user_token + " " + anime_token

In [9]:
# Preview the first rows to check the suffixed user_id and the combined
# user_id_anime_id column.
user.head()

Unnamed: 0,user_id,anime_id,rating,watching_status,watched_episodes,user_id_anime_id
0,310366_u,11009,,6,0,310366_u 11009
1,321597_u,4063,,2,12,321597_u 4063
2,37696_u,114,9.0,2,26,37696_u 114
3,316142_u,863,0.0,6,0,316142_u 863
4,333376_u,32379,4.0,4,1,333376_u 32379


In [10]:
# One two-token "sentence" per interaction row: [user token, anime id token].
user_id_list = list(
    map(lambda pair_text: pair_text.split(" "), user["user_id_anime_id"])
)

In [11]:
# Train Word2Vec on the [user, anime] token pairs. Hyperparameters are gathered
# in one mapping so they can be read at a glance.
# NOTE(review): gensim's documentation says a fixed seed is only fully
# reproducible with a single worker thread (and a fixed PYTHONHASHSEED) —
# confirm whether run-to-run drift in the embeddings matters here.
w2v_params = dict(
    vector_size=25,
    window=1,
    min_count=10,
    seed=1,
    workers=6,
)
model = Word2Vec(user_id_list, **w2v_params)

In [12]:
# Only the anime id is needed from here on: stack train on top of test and
# renumber the rows 0..n-1.
# NOTE: `train` is rebound to the combined frame, so re-running this cell
# without reloading the CSVs appends the test rows a second time.
cols = ["MAL_ID"]
train = pd.concat([train[cols], test[cols]], ignore_index=True)

In [13]:
# Fill one embedding row per MAL_ID. Anime tokens are the plain stringified id
# (no "_u" suffix), so the lookup key is str(MAL_ID).
vector_size = model.vector_size
train_embedding = np.zeros((len(train), vector_size))
for m, i in enumerate(tqdm(train["MAL_ID"])):
    try:
        train_embedding[m, :] = model.wv[str(i)]
    except KeyError:
        # The id has no vector in the trained vocabulary (e.g. it never made it
        # past the model's min_count filter): keep the all-zero row on purpose.
        continue

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/12421 [00:00<?, ?it/s]

In [14]:
# Name the columns anime_id_emb_2_0 .. anime_id_emb_2_{d-1}, one per embedding
# dimension, then wrap the matrix in a DataFrame carrying those names.
emb_dim_count = train_embedding.shape[1]
emb_col_names = [f"anime_id_emb_2_{i}" for i in range(emb_dim_count)]

train_embedding = pd.DataFrame(train_embedding)
train_embedding.columns = emb_col_names

In [15]:
# Shrink the frame with the project helper before persisting it (the recorded
# run reports memory going from 2.37 MB to 1.18 MB).
train_embedding = reduce_mem_usage(train_embedding)

# to_feather does not create missing directories; make sure the target folder
# exists so a fresh checkout can run the notebook end to end.
save_dir = os.path.dirname(SAVE_PATH)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)
train_embedding.to_feather(SAVE_PATH)

Memory usage of dataframe is 2.37 MB
column =  25
0
Memory usage after optimization is: 1.18 MB
Decreased by 50.0%


In [16]:
# Display the saved feature table (one row per MAL_ID from train followed by
# test, one column per embedding dimension) as a final sanity check.
train_embedding

Unnamed: 0,anime_id_emb_2_0,anime_id_emb_2_1,anime_id_emb_2_2,anime_id_emb_2_3,anime_id_emb_2_4,anime_id_emb_2_5,anime_id_emb_2_6,anime_id_emb_2_7,anime_id_emb_2_8,anime_id_emb_2_9,anime_id_emb_2_10,anime_id_emb_2_11,anime_id_emb_2_12,anime_id_emb_2_13,anime_id_emb_2_14,anime_id_emb_2_15,anime_id_emb_2_16,anime_id_emb_2_17,anime_id_emb_2_18,anime_id_emb_2_19,anime_id_emb_2_20,anime_id_emb_2_21,anime_id_emb_2_22,anime_id_emb_2_23,anime_id_emb_2_24
0,0.828470,-0.512094,1.484275,-0.381984,0.547574,-0.660980,0.509306,0.833945,-0.357705,-1.912738,0.382140,-0.166094,0.749148,-0.707844,0.905214,-1.222592,-0.056702,0.367055,-0.332679,0.879063,0.517343,-0.500565,1.276704,3.009783,-0.046722
1,0.692177,0.153106,-1.339531,0.498740,0.522788,-0.272935,0.873724,-0.084520,-1.439002,-0.365812,1.960651,1.042215,-1.094734,0.319641,-0.043337,-1.287402,1.870026,-0.367263,-0.327284,-0.251001,-0.555189,1.181075,0.415925,2.752805,-0.305867
2,-1.392202,1.347502,-1.819914,-0.274341,0.405588,-0.866136,1.415945,-0.466630,-0.211839,-0.340117,2.574459,0.191747,1.659930,0.359158,1.448934,-1.621046,0.840218,-0.378040,-1.646559,0.956968,0.867703,1.331752,0.538527,1.299406,0.036711
3,-0.343084,0.402079,-1.009746,-0.104683,0.745376,1.022379,0.835605,0.020406,-0.079187,-0.501120,0.651127,-1.050638,0.160332,-0.327218,-0.475259,-0.798978,-0.045225,-0.342232,-0.781732,1.323672,0.656815,0.535761,-0.629454,0.982286,-0.167681
4,0.507001,-0.587803,0.209372,-0.120182,-0.803122,-0.222446,0.878648,0.548089,-2.093724,-1.662358,-0.170073,-0.691517,-0.257368,-0.011754,0.887957,-0.635207,-0.943778,0.683573,0.371293,1.256243,1.544043,-0.483077,-0.259540,2.905976,-0.776799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12416,0.106207,0.035991,-0.167076,-0.039182,0.163992,0.314667,0.642806,0.239619,-0.171892,-0.172361,0.137832,-0.367713,0.204655,0.070086,-0.862916,-0.295010,-0.489673,-0.121566,-0.278102,0.421327,0.029827,0.455431,-0.417655,0.494206,-0.032851
12417,0.523520,-0.380228,0.944874,0.282094,-1.140156,-0.656540,1.713146,0.543976,-0.327834,0.444132,-0.365301,-0.134992,0.777126,0.779570,-1.718804,-1.012382,-1.738283,0.894444,-1.192052,-0.093109,-0.054627,1.710930,-0.806632,0.331817,0.796880
12418,0.188083,-0.030143,0.046737,0.012644,0.051460,0.079142,0.744395,0.169159,-0.204537,-0.134799,0.208705,-0.247739,0.324337,0.218655,-0.800215,-0.209595,-0.459786,0.039716,-0.226421,0.207029,-0.081592,0.579449,-0.337169,0.544029,-0.018316
12419,0.504890,-0.306980,1.074097,0.232458,-1.098123,-0.649470,1.775873,0.451085,-0.326032,0.357601,-0.320449,-0.351341,0.720488,0.871239,-1.405287,-1.056616,-1.730225,1.011282,-1.423686,0.071794,0.113520,1.462710,-0.805427,0.436573,0.540962
