In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score
from collections import Counter
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.decomposition import PCA
from tqdm.notebook import tqdm
import gensim
import os
from sklearn import preprocessing 
# Show which files are present in the data directory.
os.listdir("../data/")

['train_with_label.csv', 'test.csv', 'train.csv', 'sample_submission.csv']

In [2]:
# Read the training rows, the test rows and the sample submission file.
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
sub = pd.read_csv("../data/sample_submission.csv")
# Display the three shapes as a quick sanity check.
train.shape, test.shape, sub.shape

((24842, 6), (24843, 5), (24843, 2))

In [3]:
# Stack train and test into a single frame with a fresh 0..n-1 row index,
# so every feature is engineered once for both sets.
df = pd.concat([train, test], ignore_index=True)
df.shape

(49685, 6)

In [4]:
from re import search
# `re.sub` is imported under an alias: a bare `sub` would overwrite the
# sample-submission DataFrame that is also named `sub`, and the final export
# cell would then fail when it assigns a column to it.
from re import sub as re_sub
from nltk.stem import PorterStemmer

porter = PorterStemmer()

# Lower-case each product name, strip "," and "&", split on single spaces and
# drop the empty tokens that consecutive spaces produce.
df["words"] = df["product_name"].apply(
    lambda name: [token for token in name.lower().replace(",", "").replace("&", "").split(" ") if token]
)
# Adjacent word pairs, joined by one space.
df["bigram"] = df.words.apply(lambda words: [f'{first} {second}' for first, second in zip(words, words[1:])])
# True when at least one token contains digits followed by "%".
df["parcent"] = df.words.apply(lambda words: any(search(r'\d+\%', word) for word in words))
df["target"] = df.department_id

In [5]:
# Hand-picked groups of department ids.
# NOTE(review): id 18 is listed in both cluster_4 and cluster_5 — confirm this is intended.
clusters = {
    'cluster_0': [2, 3, 4, 7, 8, 9, 14, 16],
    'cluster_1': [0, 11, 19],
    'cluster_2': [5, 12],
    'cluster_3': [1, 10, 17],
    'cluster_4': [13, 18],
    'cluster_5': [6, 15, 18, 20],
}

# One column per cluster: the row's target when (id + 1) of a cluster member
# matches it, 0 otherwise (NaN where the target itself is missing).
for cluster_name, member_ids in clusters.items():
    shifted_ids = [member + 1 for member in member_ids]
    in_cluster = df.target.isin(shifted_ids)
    df[cluster_name] = in_cluster * df.target

In [6]:
from collections import Counter, defaultdict

# Bigram frequencies per department, counted on labelled rows only.
# Rows whose target is NaN are skipped: every NaN is a distinct dictionary
# key, so each such row would otherwise leave a throwaway Counter behind.
counter = defaultdict(Counter)

for row in df[df.target.notna()].itertuples():
    # A bigram is always "<word> <word>", so it can never be the empty string.
    counter[row.target].update(row.bigram)

# The 20 most frequent bigrams for each department id 0..20.
# NOTE(review): the cluster cell shifts ids by +1 — confirm targets really are 0-based.
keywords = {
    dept: [bigram for bigram, _ in counter[dept].most_common(20)]
    for dept in range(21)
}

In [7]:
# One feature per department: how many of that department's top bigrams
# appear among the product's own (distinct) bigrams.
keywords_feature = [f"keyword_{i}" for i in range(21)]

for dept, column in enumerate(keywords_feature):
    dept_keywords = set(keywords[dept])
    df[column] = df.bigram.apply(lambda bigrams: len(dept_keywords.intersection(bigrams)))

In [8]:
from collections import defaultdict
## Load the pre-trained word vectors, turn each word of product_name into a vector and combine them into one feature vector per product_id

## Loading from .vec with gensim is slow, so a copy that another notebook saved as a pickle is used instead
## NOTE(review): unpickling can execute arbitrary code — only load pickle files you created yourself
model = pd.read_pickle("../fast-text/fasttext_gensim_model.pkl") 

# Words for which no vector was found, with the number of times each was seen.
unused_words = Counter()

def get_weight(x, keyword_bigrams=None):
    """Return one multiplicative weight per token of ``x``.

    Every token starts at weight 1. The token ``'sleep'`` is boosted by a
    factor of 100, and both tokens of any adjacent pair that forms a keyword
    bigram are boosted by a further factor of 100 each.

    Parameters
    ----------
    x : list of str
        Tokens of one product name.
    keyword_bigrams : collection of str, optional
        Bigrams ("word word") that trigger the boost. When omitted, the
        bigrams are gathered from the values of the module-level ``keywords``
        dictionary. (The previous code tested membership against the
        dictionary itself, i.e. against its integer keys, so no bigram could
        ever match.)

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(x)``.
    """
    if keyword_bigrams is None:
        # `keywords` maps department id -> list of bigrams; flatten into one set.
        keyword_bigrams = {bigram for bigrams in keywords.values() for bigram in bigrams}

    weight = np.ones(len(x))

    for i, word in enumerate(x):
        if word == 'sleep':
            weight[i] *= 100

    for i in range(len(x) - 1):
        if f"{x[i]} {x[i+1]}" in keyword_bigrams:
            weight[i] *= 100
            weight[i+1] *= 100

    return weight

def to_vec(x, model, use_weight=False):
    """Embed a token list as the unit-length sum of its word vectors.

    Parameters
    ----------
    x : list of str
        Tokens of one product name.
    model : object
        Must expose ``vector_size`` and support ``model[word]``.
        Assumed to raise ``KeyError`` for unknown words, as gensim keyed
        vectors do — TODO confirm for the pickled model.
    use_weight : bool, default False
        When True, each word vector is multiplied by its ``get_weight``
        weight before summing. The default keeps the unweighted sum that this
        function has always returned; previously the weights were computed on
        every call and then discarded.

    Returns
    -------
    numpy.ndarray
        Vector of length ``model.vector_size`` with L2 norm 1, or all zeros
        when no token has a vector.

    Side effects
    ------------
    Non-empty tokens without a vector are counted in the module-level
    ``unused_words`` counter.
    """
    weight = get_weight(x) if use_weight else None

    v = np.zeros(model.vector_size)
    for i, w in enumerate(x):
        try:
            word_vec = model[w]  # succeeds when the word is in the pre-trained vocabulary
        except KeyError:
            # Only a missing word is expected here. Any other error (for
            # example `model` having been rebound to a different object) must
            # surface instead of silently yielding zero vectors.
            if w != "":
                unused_words[w] += 1  # remember words that had no vector
            continue
        v += word_vec if weight is None else weight[i] * word_vec
    v = v / (np.sqrt(np.sum(v ** 2)) + 1e-16)  # normalise to length 1; epsilon guards the all-zero case
    return v

# Embed every product name and stack the results into one 2-D array (one row per product).
vecs = np.vstack([to_vec(tokens, model) for tokens in df["words"]])
fasttext_pretrain_cols = [f"fasttext_pretrain_vec{dim}" for dim in range(vecs.shape[1])]
vec_df = pd.DataFrame(vecs, columns=fasttext_pretrain_cols)
# Append the embedding columns; both frames carry the default 0..n-1 row labels, so rows line up.
df = pd.concat([df, vec_df], axis = 1)

In [9]:
# Disabled step, kept for reference:
#df["words"] = df.words.apply(lambda words: [converter[word] if word in converter else word for word in words ])
# For each of the (up to) 200 most frequent words without a vector, count its occurrences per product.
unknowns = [word for word, _count in unused_words.most_common(200)]
for idx, unknown in enumerate(unknowns):
    df[f'unknown_{idx}'] = df.words.apply(lambda words: words.count(unknown))

In [17]:
## Names of the features used for prediction
features = (
    [f'unknown_{i}' for i in range(100)]
    + fasttext_pretrain_cols
    + keywords_feature
    + ["order_rate", "order_dow_mode", "order_hour_of_day_mode"]
)
target = 'target'
n_split = 5  ## number of cross-validation folds

In [11]:
# Rows with a target form the training set; rows whose target is missing form the test set.
# .copy() gives each split its own data, so the column assignments below write to
# independent frames rather than to slices of `df` (avoids SettingWithCopyWarning).
train = df[~df[target].isna()].copy()
test = df[df[target].isna()].copy()

# Standardise the features using statistics learned from the training rows only.
scaler = preprocessing.StandardScaler()
train[features] = scaler.fit_transform(train[features])
test[features] = scaler.transform(test[features])

In [16]:
## cross validation
scores = []
val_list = []
preds_test = []
pred_cluster = []

kfold = StratifiedKFold(n_splits=n_split, shuffle = True, random_state=42)

for i_fold, (train_idx, valid_idx) in enumerate(kfold.split(train, train.target)):
    print(f"--------fold {i_fold}-------")

    ## kfold.split yields row *positions*, so rows are selected with .iloc;
    ## label-based .loc is only correct while the index happens to be 0..n-1.
    ## train data
    x_tr = train[features].iloc[train_idx]
    y_tr = train[target].iloc[train_idx]

    ## valid data
    x_va = train[features].iloc[valid_idx]
    y_va = train[target].iloc[valid_idx]

    ## train LGBM model
    ## NOTE: this rebinds `model`, which previously held the word-vector model.
    ## NOTE(review): LightGBM documents that row subsampling needs a non-zero
    ## bagging frequency (subsample_freq) — confirm `subsample=0.8` has any effect here.
    model = LGBMClassifier(colsample_bytree=0.2, subsample=0.8, class_weight='balanced', n_estimators=1000, learning_rate=0.1, boosting='dart')
    model.fit(x_tr, y_tr)

    ## predict on valid
    pred_val = model.predict_proba(x_va)
    pred_cls = model.predict(x_va)

    ## evaluate
    ## `labels` ties the probability columns to the classes the model was fitted
    ## on, so the metric stays well-defined even if a fold lacks a class.
    score = {
        "logloss"  : log_loss(y_va, pred_val, labels=model.classes_),
        "f1_micro" : f1_score(y_va, pred_cls, average = "micro")}
    print(score)
    scores.append(score)

    ## predict on test
    pred_test = model.predict_proba(test[features])
    preds_test.append(pred_test)

    ## keep the out-of-fold predictions next to the raw columns for later inspection
    probe = pd.DataFrame(pred_val.round(3), index=y_va.index, columns=[f"probe_{i}" for i in range(pred_val.shape[1])])
    ## .copy() so that adding the 'label' column does not write into a slice of `df`
    df_new = df.loc[y_va.index, ['product_name', 'order_rate', 'order_dow_mode', 'order_hour_of_day_mode', 'department_id']].copy()
    ## NOTE(review): argmax is a column position; it equals the class label only if the labels are 0..n-1 — confirm.
    df_new['label'] = pd.Series(np.argmax(pred_val, axis = 1), index=y_va.index)
    val_list.append(pd.concat([df_new, probe], axis=1))

--------fold 0-------
{'logloss': 0.820042484321558, 'f1_micro': 0.7717321313586607}
--------fold 1-------
{'logloss': 0.8518544149563924, 'f1_micro': 0.7633612363168062}
--------fold 2-------


KeyboardInterrupt: 

In [124]:
# Micro average: the F1 score is computed over all labels pooled together.
# logloss: can it exceed 1? Yes — log loss has no upper bound; it grows as the probability given to the true class approaches 0.

score_df = pd.DataFrame(scores)
score_df

Unnamed: 0,logloss,f1_micro
0,1.049613,0.775408
1,1.039451,0.780036
2,1.101121,0.773349
3,1.069145,0.76872
4,1.045023,0.77818


In [125]:
# Save the out-of-fold predictions of every validation row, without the index column.
# `index` is documented as a boolean; False states the intent explicitly instead of relying on None being falsy.
pd.concat(val_list, axis=0).to_csv('../data/train_with_label.csv', index=False)

In [126]:
## Use the mean of the probabilities predicted by each CV fold as the final prediction
## NOTE(review): argmax yields a column position, which equals the department id only if the class labels are exactly 0..n-1 — confirm.
pred_test_final = np.argmax(np.mean(preds_test, axis = 0), axis = 1)

sub["department_id"] = pred_test_final
sub.to_csv("submission.csv", index = False)
sub.head()

Unnamed: 0,product_id,department_id
0,24842,18
1,24843,6
2,24844,6
3,24845,6
4,24846,12
