In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm_notebook as tqdm
import sys
sys.path.append("../src/")
from logger import setup_logger, LOGGER
from trainer import train_lgbm
from util_tool import reduce_mem_usage
import nltk
from nltk import stem
import re
from gensim.models import FastText
stemmer = stem.PorterStemmer()
nltk.download('stopwords')
%matplotlib inline
from gensim.models import Word2Vec
pd.set_option('display.max_columns', 300)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# ==================
# Constant
# ==================
TRAIN_PATH = "../input/train.csv"
TEST_PATH = "../input/test.csv"
USER_PATH = "../input/user_x_anime.csv"
WIKI_PATH = "../input/wiki.csv"

In [3]:
# =====================
# Settings
# =====================
SAVE_PATH = "../output/fe/fe013.feather"

In [62]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
wiki = pd.read_csv(WIKI_PATH)
stop_words = nltk.corpus.stopwords.words('english')

In [60]:
def analyzer(text):
    #stop_words = ['i', 'a', 'an', 'the', 'to', 'and', 'or', 'if', 'is', 'are', 'am', 'it', 'this', 'that', 'of', 'from', 'in', 'on']
    text = text.lower() # 小文字化
    text = text.replace('\n', '') # 改行削除
    text = text.replace('\t', '') # タブ削除
    puncts = r',.":)(-!?|;\'$&/[]>%=#*+\\•~@£·_{}©^®`<→°€™›♥←×§″′Â█½à…“★”–●â►−¢²¬░¶↑±¿▾═¦║―¥▓—‹─▒：¼⊕▼▪†■’▀¨▄♫☆é¯♦¤▲è¸¾Ã⋅‘∞∙）↓、│（»，♪╩╚³・╦╣╔╗▬❤ïØ¹≤‡√。【】'
    for punct in puncts:
        text = text.replace(punct, f' {punct} ')
    # for bad_word in contraction_mapping:
    #     if bad_word in text:
    #         text = text.replace(bad_word, contraction_mapping[bad_word])
    text = text.split(' ') # スペースで区切る
    text = [stemmer.stem(t) for t in text]
    
    words = []
    for word in text:
        if (re.compile(r'^.*[0-9]+.*$').fullmatch(word) is not None): # 数字が含まれるものは分割
            for w in re.findall(r'(\d+|\D+)', word):
                words.append(w)
            continue
        if word in stop_words: # ストップワードに含まれるものは除外
            continue
        if len(word) < 2: #  1文字、0文字（空文字）は除外
            continue
        words.append(word)
    return words

In [63]:
wiki["wiki_description"] = wiki["wiki_description"].fillna("NaN")

In [68]:
description = []
for i in tqdm(wiki["wiki_description"]):
    description.append(analyzer(i))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/12421 [00:00<?, ?it/s]

In [76]:
model_gensim = FastText(vector_size=28,window=7,alpha=0.012,min_count=5,workers=6)

In [77]:
model_gensim.build_vocab(description)
model_gensim.train(description,total_examples=model_gensim.corpus_count,epochs=5)

(61951388, 70007000)

In [79]:
vector_size = model_gensim.vector_size
train_embedding = np.zeros([len(description),vector_size ])
for m, i in enumerate(tqdm(description)):
    n = 0
    for t in i:
        try:
            train_embedding[m,:] += model_gensim.wv[t]
            n += 1
        except KeyError:
            pass
    if n > 0:
        train_embedding[m,:] /= n

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/12421 [00:00<?, ?it/s]

In [83]:
train_embedding = pd.DataFrame(train_embedding)
train_embedding.columns = [f"fasttext_{i}" for i in range(28)]
train_embedding["Japanese name"] = wiki["Japanese name"]

In [92]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
train = pd.concat([train[["Japanese name"]],test[["Japanese name"]]]).reset_index(drop=True)

In [95]:
train = train.merge(train_embedding, how="left", on ="Japanese name")

In [96]:
train = reduce_mem_usage(train)
train.iloc[:,1:].to_feather(SAVE_PATH)

Memory usage of dataframe is 3.28 MB
column =  29
0
Memory usage after optimization is: 1.75 MB
Decreased by 46.7%


In [97]:
train

Unnamed: 0,Japanese name,fasttext_0,fasttext_1,fasttext_2,fasttext_3,fasttext_4,fasttext_5,fasttext_6,fasttext_7,fasttext_8,fasttext_9,fasttext_10,fasttext_11,fasttext_12,fasttext_13,fasttext_14,fasttext_15,fasttext_16,fasttext_17,fasttext_18,fasttext_19,fasttext_20,fasttext_21,fasttext_22,fasttext_23,fasttext_24,fasttext_25,fasttext_26,fasttext_27
0,カウボーイビバップ,-0.206334,1.493912,2.186734,1.431040,-0.492521,-0.284186,-1.254118,1.434186,0.265543,1.619324,-2.874967,1.267993,0.640443,-2.726245,-1.567809,2.144842,-1.103651,-2.262416,-1.771703,0.039799,-1.592318,-0.508324,-3.045552,0.947452,-1.717842,1.556590,1.913596,1.668154
1,ハチミツとクローバー,0.305354,0.736276,1.306498,-0.043175,-0.215359,0.138844,-1.858458,1.279177,0.491700,0.829188,-2.121453,0.590140,0.180483,-1.974498,-1.651171,2.010990,-1.488573,-1.447191,-1.815787,0.706932,0.051537,1.105137,-2.467993,1.296277,-0.946907,1.400183,1.201452,2.525849
2,ハチミツとクローバー,0.305354,0.736276,1.306498,-0.043175,-0.215359,0.138844,-1.858458,1.279177,0.491700,0.829188,-2.121453,0.590140,0.180483,-1.974498,-1.651171,2.010990,-1.488573,-1.447191,-1.815787,0.706932,0.051537,1.105137,-2.467993,1.296277,-0.946907,1.400183,1.201452,2.525849
3,テニスの王子様,-0.814188,1.674957,1.612189,2.521630,-0.158885,-0.677422,-0.681004,2.229614,0.956186,1.392577,-4.061249,0.290248,0.830250,-3.368625,-1.396212,1.091629,-1.126088,-2.478320,-2.173334,0.066087,-0.420341,-0.032140,-1.659050,0.323317,-0.282862,1.076143,1.571643,1.202211
4,テニスの王子様,-0.814188,1.674957,1.612189,2.521630,-0.158885,-0.677422,-0.681004,2.229614,0.956186,1.392577,-4.061249,0.290248,0.830250,-3.368625,-1.396212,1.091629,-1.126088,-2.478320,-2.173334,0.066087,-0.420341,-0.032140,-1.659050,0.323317,-0.282862,1.076143,1.571643,1.202211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14310,新元号にゃんこゲリオン,-1.062325,-3.559201,2.805324,-0.252996,-1.516944,-3.860998,-3.093625,-0.614906,-1.190692,6.100202,-2.914112,-2.042595,-2.579016,5.381963,-3.567624,3.791503,-2.605394,-1.389988,2.805305,1.457624,-1.247129,5.302812,-2.372058,-5.622211,6.396119,-0.778917,-4.842937,-0.516924
14311,キメツ学園 バレンタイン編,0.020607,1.999887,0.979362,2.302924,0.242540,-0.820322,-0.039277,1.718449,0.469301,1.598941,-3.978972,0.517279,1.650517,-3.190477,-0.822037,0.782985,-0.420476,-2.436021,-1.960085,-0.243488,-0.487959,0.003400,-0.415051,-0.171810,-0.162965,1.233119,-0.060627,1.099842
14312,平行線,-0.954128,1.767141,-0.249498,2.371728,-0.795606,-0.759329,0.109456,2.261664,0.587848,2.233547,-4.336799,0.492878,1.187476,-1.899363,-0.726786,1.412960,-1.455662,-4.203078,-2.618130,-0.078086,0.083977,-0.846160,-0.491504,0.861815,-2.564026,0.234212,0.908757,0.647893
14313,約束のネバーランド 特別編「道標」,-1.062325,-3.559201,2.805324,-0.252996,-1.516944,-3.860998,-3.093625,-0.614906,-1.190692,6.100202,-2.914112,-2.042595,-2.579016,5.381963,-3.567624,3.791503,-2.605394,-1.389988,2.805305,1.457624,-1.247129,5.302812,-2.372058,-5.622211,6.396119,-0.778917,-4.842937,-0.516924
