#### LightGBMでモデル構築

In [1]:
import os

# Move to the project root so that the relative paths used by later cells
# ('data/...', 'models/...') resolve.
# The location can be overridden with the KIVA_PROJECT_DIR environment variable;
# the original hard-coded path stays as the default for backward compatibility.
PROJECT_DIR = os.environ.get(
    'KIVA_PROJECT_DIR',
    '/Users/kinoshitashouhei/Desktop/competitions/05_Prob_Space/Kiva/',
)
os.chdir(PROJECT_DIR)

In [2]:
# ライブラリのインポート
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import gc
import lightgbm as lgb
import pickle
import warnings

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline

# 自作関数のインポート
from functions.common import *
from functions import preprocessing

# Suppress every warning from here on (this also hides deprecation warnings).
warnings.filterwarnings('ignore')

In [3]:
# Settings
N_COMP = 10  # number of LDA output columns generated per text field
DICT_DTYPE = dict(LOAN_ID='str', IMAGE_ID='str')  # dtype mapping passed to pd.read_csv

# Columns intended to be dropped (not yet applied in the cells shown here)
LIST_COL_DROPS = [
    COL_LOAN_ID, COL_DESCRIPTION, COL_IMAGE_ID,
    COL_COUNTRY_NAME, COL_CURRENCY_EXCHANGE_COVERAGE_RATE, COL_TAGS,
]

# Columns passed to target encoding / label encoding
LIST_TARGET_ENC = [COL_ORIGINAL_LANGUAGE]
LIST_LABEL_ENC = [
    COL_SECTOR_NAME,
    COL_CURRENCY_POLICY,
    COL_CURRENCY,
    COL_REPAYMENT_INTERVAL,
    COL_DISTRIBUTION_MODEL,
]

# Column names for the LDA outputs, numbered 1..N_COMP.
# (The 'DESCRIPTINO' spelling is kept because a later cell references this name.)
LIST_LDA_DESCRIPTINO_TRANSLATED = [
    f'COL_LDA_DESCRIPTION_TRANSLATED_{idx}' for idx in range(1, N_COMP + 1)
]
LIST_LDA_LOAN_USE = [
    f'COL_LDA_LOAN_USE_{idx}' for idx in range(1, N_COMP + 1)
]

In [4]:
# Locations of the pickled, pre-trained models
LDA_PATH_DESCRIPTION_TRANSLATED = 'models/lda_description_translated.pkl'
LDA_PATH_LOAN_USE = 'models/lda_loan_use.pkl'

# Load both models.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open(LDA_PATH_DESCRIPTION_TRANSLATED, 'rb') as fp:
    CLF_LDA_DESCRIPTION_TRANSLATED = pickle.load(fp)

with open(LDA_PATH_LOAN_USE, 'rb') as fp:
    CLF_LDA_LOAN_USE = pickle.load(fp)

In [5]:
# Load the train / test CSV files, applying DICT_DTYPE to both
df_train, df_test = (
    pd.read_csv(csv_path, dtype=DICT_DTYPE)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [6]:
# Fill missing values (training data)
df_train = preprocessing.fill_na_DESCRIPTION_TRANSLATED(df_train)

# NOTE(review): only df_train goes through the text preprocessing below;
# df_test receives no text preprocessing in the cells shown here.

# Text preprocessing (DESCRIPTION_TRANSLATED): replace_str, then lower_text
df_train[COL_DESCRIPTION_TRANSLATED] = (
    df_train[COL_DESCRIPTION_TRANSLATED]
    .apply(preprocessing.replace_str)
    .apply(preprocessing.lower_text)
)

# Text preprocessing (LOAN_USE): replace_str_to_use, then lower_text
df_train[COL_LOAN_USE] = (
    df_train[COL_LOAN_USE]
    .apply(preprocessing.replace_str_to_use)
    .apply(preprocessing.lower_text)
)

# Target encoding through the project's target_encoding_oof helper
df_train, df_test = preprocessing.target_encoding_oof(
    df_train, df_test, LIST_TARGET_ENC, COL_LOAN_AMOUNT
)

In [7]:
# NOTE(review): the vocabularies are re-fitted here on df_train; the pickled models applied
# in the following cells presumably expect the same vocabulary / column order — confirm.

# Word-count matrix (DESCRIPTION_TRANSLATED)
text_vec_description_translated = CountVectorizer()
bag_of_words_description_translated = text_vec_description_translated.fit_transform(
    df_train[COL_DESCRIPTION_TRANSLATED]
)

# Word-count matrix (LOAN_USE)
text_vec_loan_use = CountVectorizer()
bag_of_words_loan_use = text_vec_loan_use.fit_transform(df_train[COL_LOAN_USE])

In [8]:
# Apply the pre-trained model (DESCRIPTION_TRANSLATED)
transformed_lda_description_translated = CLF_LDA_DESCRIPTION_TRANSLATED.transform(bag_of_words_description_translated)

# Reuse df_train's index so the column-wise concat lines rows up by position
# even when df_train does not carry a default RangeIndex.
df_transformed_lda = pd.DataFrame(
    data=transformed_lda_description_translated,
    columns=LIST_LDA_DESCRIPTINO_TRANSLATED,
    index=df_train.index,
)

# Drop columns left over from a previous run of this cell so that re-running it
# does not append duplicated LDA columns.
df_train = pd.concat(
    [
        df_train.drop(columns=LIST_LDA_DESCRIPTINO_TRANSLATED, errors='ignore'),
        df_transformed_lda,
    ],
    axis=1,
)

# Release the intermediates
del transformed_lda_description_translated, df_transformed_lda
gc.collect()

24

In [10]:
# Apply the pre-trained model (LOAN_USE)
transformed_lda_loan_use = CLF_LDA_LOAN_USE.transform(bag_of_words_loan_use)

# Reuse df_train's index so the column-wise concat lines rows up by position
# even when df_train does not carry a default RangeIndex.
df_transformed_lda = pd.DataFrame(
    data=transformed_lda_loan_use,
    columns=LIST_LDA_LOAN_USE,
    index=df_train.index,
)

# Drop columns left over from a previous run of this cell so that re-running it
# does not append duplicated LDA columns.
df_train = pd.concat(
    [
        df_train.drop(columns=LIST_LDA_LOAN_USE, errors='ignore'),
        df_transformed_lda,
    ],
    axis=1,
)

# Release the intermediates
del transformed_lda_loan_use, df_transformed_lda
gc.collect()

24

In [11]:
# Apply label encoding.
# NOTE: this list repeats the LIST_LABEL_ENC definition from the settings cell; keep the two in sync.
LIST_LABEL_ENC = [
    COL_SECTOR_NAME,
    COL_CURRENCY_POLICY,
    COL_CURRENCY,
    COL_REPAYMENT_INTERVAL,
    COL_DISTRIBUTION_MODEL,
]
df_train, df_test = preprocessing.label_encoding(
    df_train, df_test, LIST_LABEL_ENC
)

In [33]:
# TODO: drop unneeded columns
# NOTE(review): X_train, y_train, X_test and y_test are not defined in any cell shown
# above this one — confirm where they are created before relying on Restart & Run All.

# Wrap the data for LightGBM; the evaluation set is built with the training set as its reference
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

# LightGBM parameters
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',  # objective: regression
    metric={'mae'},          # evaluation metric: MAE (shown as "l1" in the training log)
)

# Train the model
model = lgb.train(
    params,
    train_set=lgb_train,    # training data
    valid_sets=[lgb_eval],  # validation data
)

# Predict on the test data
# y_pred = model.predict(X_test)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2629
[LightGBM] [Info] Number of data points in the train set: 68499, number of used features: 16
[LightGBM] [Info] Start training from score 738.553483
[1]	valid_0's l1: 533.755
[2]	valid_0's l1: 507.715
[3]	valid_0's l1: 485.033
[4]	valid_0's l1: 465.997
[5]	valid_0's l1: 448.827
[6]	valid_0's l1: 435.258
[7]	valid_0's l1: 422.293
[8]	valid_0's l1: 410.839
[9]	valid_0's l1: 401.322
[10]	valid_0's l1: 393.341
[11]	valid_0's l1: 385.796
[12]	valid_0's l1: 379.893
[13]	valid_0's l1: 374.153
[14]	valid_0's l1: 369.176
[15]	valid_0's l1: 365.021
[16]	valid_0's l1: 361.02
[17]	valid_0's l1: 357.75
[18]	valid_0's l1: 354.343
[19]	valid_0's l1: 351.201
[20]	valid_0's l1: 348.181
[21]	valid_0's l1: 345.201
[22]	valid_0's l1: 343.135
[23]	valid_0's l1: 341.335
[24]	valid_0's l1: 339.361
[25]	valid_0's l1: 337.804
[26]	valid_0's l1: 336.562
[27]	