In [1]:
import os

from google.colab import drive

# Mount Google Drive inside the Colab runtime.
drive.mount('/content/drive')

# Switch the working directory to the competition folder so that the relative
# paths used by the rest of the notebook resolve against it.
os.chdir('/content/drive/MyDrive/分析コンペ/05_ProbSpace/民泊サービスの宿泊料金予測/')

Mounted at /content/drive


In [26]:
import datetime
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import collections
import warnings
import lightgbm as lgb

from tqdm import tqdm
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler
from geopy.distance import geodesic

from src.config import *
import src.preprocessing as pr

warnings.filterwarnings('ignore')

In [11]:
# Main train/test tables: the last-review column is parsed as a datetime and
# the remaining column dtypes are taken from DICT_DTYPES.
df_train, df_test = (
    pd.read_csv(csv_path, parse_dates=[COL_LAST_REVIEW], dtype=DICT_DTYPES)
    for csv_path in ('input/train_data.csv', 'input/test_data.csv')
)

# Distance-from-station feature tables (train, then test).
df_train_station_info, df_test_station_info = (
    pd.read_csv(csv_path, dtype=DICT_DTYPES)
    for csv_path in (
        'input/train_data_distance_from_station.csv',
        'input/test_data_distance_from_station.csv',
    )
)

# Pre-computed features derived from the listing name (train, then test).
df_train_svd_name, df_test_svd_name = (
    pd.read_csv(csv_path)
    for csv_path in (
        'input/train_data_name_features.csv',
        'input/test_data_name_features.csv',
    )
)

# Out-of-fold prediction files of the first-level models (rf, ridge,
# neighbors, svr) for the training rows.
(
    df_train_rf_oof,
    df_train_ridge_oof,
    df_train_neighbors_oof,
    df_train_svr_oof,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'input/rf_out_of_fold.csv',
        'input/ridge_out_of_fold.csv',
        'input/neighbors_out_of_fold.csv',
        'input/svr_out_of_fold.csv',
    )
)

# Submission files of the same four models; they supply the matching columns
# for the test rows.
(
    df_test_rf_oof,
    df_test_ridge_oof,
    df_test_neighbors_oof,
    df_test_svr_oof,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'submit/submission_rf_1-0-0.csv',
        'submit/submission_ridge_1-0-0.csv',
        'submit/submission_neighbors_1-0-0.csv',
        'submit/submission_svr_1-0-0.csv',
    )
)

# Submission template that the final predictions are written into.
sample_sub = pd.read_csv('input/submission.csv')

In [4]:
# Columns kept from the combined train/test table as base model features.
LIST_USE_COL = [
    COL_NEIGHBOURHOOD,
    COL_LATITUDE,
    COL_LONGITUDE,
    COL_ROOM_TYPE,
    COL_MINIMUM_NIGHTS,
    COL_NUMBER_OF_REVIEWS,
    COL_ELAPSED_DAYS,
    COL_REVIEWS_PER_MONTH,
    COL_AVAILABILITY_365,
]

# Columns that are converted to integer codes with LabelEncoder.
LIST_LABEL_ENC = [COL_NEIGHBOURHOOD, COL_ROOM_TYPE]

In [12]:
# Stack train on top of test so both receive identical preprocessing; the
# train rows come first and the index is renumbered from 0.
df_all = pd.concat([df_train, df_test]).reset_index(drop=True)

# Add the number of days between the last review and 2020-04-30 as a feature.
reference_date = datetime.datetime(2020, 4, 30)
df_all[COL_ELAPSED_DAYS] = (reference_date - df_all[COL_LAST_REVIEW]).dt.days

# Replace every missing value in the table with 0 (this includes any missing
# elapsed-days values produced above).
df_all = df_all.fillna(0)

# Integer-encode the categorical columns; each encoder is fitted on the
# combined train+test values.
for col in LIST_LABEL_ENC:
    le = LabelEncoder()
    encoded_values = le.fit_transform(df_all[col])
    df_all[col] = encoded_values

# Keep only the columns used as model features.
df_all = df_all[LIST_USE_COL]

In [13]:
# Station-distance tables for train and test, stacked train-first.
df_all_station_info = pd.concat(
    [df_train_station_info, df_test_station_info],
    axis=0,
).reset_index(drop=True)

# Standardise the columns, then reduce them to 10 principal components.
# The pipeline is fitted on the combined train+test rows.
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('pca', PCA(n_components=10, random_state=0)),
])
features_tmp = pipe.fit_transform(df_all_station_info)

# Name the components PCA_1 ... PCA_n.
pca_col_names = [f'PCA_{i+1}' for i in range(features_tmp.shape[1])]
df_features = pd.DataFrame(features_tmp, columns=pca_col_names)

# Split back into train and test parts by row position.
n_train_rows = df_train.shape[0]
df_train_features = df_features.iloc[:n_train_rows].reset_index(drop=True)
df_test_features = df_features.iloc[n_train_rows:].reset_index(drop=True)

In [14]:
# Place the four first-level out-of-fold frames side by side, one column block
# per model, to form the stacking features for the training rows.
list_train_oof = [
    df_train_rf_oof,
    df_train_ridge_oof,
    df_train_neighbors_oof,
    df_train_svr_oof,
]
df_train_stack = pd.concat(list_train_oof, axis=1)

In [15]:
# Preview the first rows of the training stacking features.
df_train_stack.head()

Unnamed: 0,rf_stacking,ridge_stacking,neighbors_stacking,svr_stacking
0,9.744429,10.046653,9.382321,9.359716
1,9.748376,8.295758,9.288937,9.174984
2,9.691751,9.858572,9.422865,9.424498
3,9.030463,9.378725,9.098005,9.091154
4,9.734504,10.218319,9.528909,9.786251


In [16]:
# Build the test-side stacking features from the first-level submission files,
# in the same model order as df_train_stack. log1p is applied to each
# prediction column before the frames are joined side by side.
list_test_pred = [
    df_test_rf_oof,
    df_test_ridge_oof,
    df_test_neighbors_oof,
    df_test_svr_oof,
]
df_test_stack = pd.concat(
    [np.log1p(df_pred[[COL_Y]]) for df_pred in list_test_pred],
    axis=1,
)

# Reuse the training column names so train and test features line up.
df_test_stack.columns = df_train_stack.columns

In [17]:
# Preview the first rows of the test stacking features.
df_test_stack.head()

Unnamed: 0,rf_stacking,ridge_stacking,neighbors_stacking,svr_stacking
0,9.502303,9.716433,9.310863,9.40844
1,9.625121,9.68181,10.109017,9.712629
2,9.619025,9.504424,9.291091,9.534028
3,9.467607,9.220721,9.1921,9.217928
4,9.486467,8.91399,9.459562,9.162946


In [18]:
# df_all holds the train rows first, followed by the test rows.
n_train_rows = df_train.shape[0]

# Training matrix: base features + station PCA + name features + stacking features.
X = pd.concat(
    [
        df_all.iloc[:n_train_rows].reset_index(drop=True),
        df_train_features,
        df_train_svd_name,
        df_train_stack,
    ],
    axis=1,
)

# The model is trained on the log1p-transformed target.
y = np.log1p(df_train[COL_Y])

# Inference matrix, assembled from the test-side frames in the same column order.
X_inference = pd.concat(
    [
        df_all.iloc[n_train_rows:].reset_index(drop=True),
        df_test_features,
        df_test_svd_name,
        df_test_stack,
    ],
    axis=1,
)

In [19]:
# Preview the first rows of the assembled training matrix.
X.head()

Unnamed: 0,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,elapsed_days,reviews_per_month,availability_365,PCA_1,...,name_nmf_45,name_nmf_46,name_nmf_47,name_nmf_48,name_nmf_49,name_nmf_50,rf_stacking,ridge_stacking,neighbors_stacking,svr_stacking
0,9,35.68185,139.8031,0,1,55,5.0,2.21,173,18.653366,...,0.0,0.0,0.0,0.0,0.0,0.00057,9.744429,10.046653,9.382321,9.359716
1,21,35.72063,139.78536,0,6,72,36.0,2.11,9,17.356235,...,0.0,0.0,0.0,0.0,0.0,0.0,9.748376,8.295758,9.288937,9.174984
2,7,35.74723,139.82349,0,1,18,38.0,3.46,288,32.363401,...,0.0,0.0,0.0,0.0,0.0,0.0,9.691751,9.858572,9.422865,9.424498
3,16,35.68456,139.68077,0,1,2,28.0,1.76,87,-24.277259,...,0.0,0.0,0.0,0.0,0.0,0.0,9.030463,9.378725,9.098005,9.091154
4,18,35.6984,139.70467,0,1,86,91.0,2.0,156,-16.020399,...,0.0,0.0,0.0,0.0,0.0,0.0,9.734504,10.218319,9.528909,9.786251


In [30]:
# LightGBM settings for the stacking regressor. The target `y` is on the
# log1p scale, so the squared-error metric is measured on that scale too.
params = {
    'objective': 'regression',
    'metric': 'mean_squared_error',
    "verbosity": -1,
    "boosting_type": "gbdt",
}

# Note: despite the variable name this is a plain (non-stratified) KFold.
skf = KFold(n_splits=5, shuffle=True, random_state=0)
for i, (train_idx, valid_idx) in enumerate(skf.split(X), start=1):
    print('='*50)
    print(f'fold: {i}')
    # KFold yields positional row numbers, so select rows by position rather
    # than by index label.
    X_train, X_valid = X.iloc[train_idx, :], X.iloc[valid_idx, :]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_test = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    lgb_results = {}                                    # container for the training history
    # NOTE(review): `early_stopping_rounds`, `evals_result` and `verbose_eval`
    # were removed from `lgb.train` in LightGBM 4.0 in favour of callbacks;
    # this cell needs lightgbm<4 or a migration to callbacks.
    model = lgb.train(params=params,                    # hyperparameters
                      train_set=lgb_train,              # data used for fitting
                      valid_sets=[lgb_train, lgb_test], # data sets evaluated every round
                      valid_names=['Train', 'Test'],    # display names of those data sets
                      num_boost_round=500,              # maximum number of boosting rounds
                      early_stopping_rounds=20,         # early-stopping patience
                      evals_result=lgb_results,         # store the metric history
                      verbose_eval=False)
    # The printed value is the square root of the MSE on the log1p-scaled
    # target (an RMSE on the log scale); the label text is kept unchanged so
    # it matches the recorded outputs.
    print(f'mean_squared_log_error: {np.sqrt(mean_squared_error(y_valid, model.predict(X_valid)))}')
    # Save the fitted fold model; the context manager guarantees the file is
    # flushed and closed before the next fold starts.
    filename = f'models/stacking_light_gbm_fold{i}.pkl'
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

fold: 1
mean_squared_log_error: 0.5584536348246786
fold: 2
mean_squared_log_error: 0.5527514381283666
fold: 3
mean_squared_log_error: 0.5452052910329076
fold: 4
mean_squared_log_error: 0.5198426494621813
fold: 5
mean_squared_log_error: 0.5248293116608138


In [21]:
# Predict the inference matrix with each of the five pickled fold models
# stored under models/ and collect the predictions in fold order.
list_preds_tmp = []

for i in range(1, 6):
    model_path = f'models/stacking_light_gbm_fold{i}.pkl'
    # pickle.load can execute arbitrary code, so only load model files that
    # this project produced itself. The context manager closes the file
    # handle as soon as the model is loaded.
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)
    pred = model.predict(X_inference)
    list_preds_tmp.append(pred)

In [22]:
# One column per fold model (model_1, model_2, ...), built from however many
# predictions were collected instead of a hard-coded list of five entries.
df_preds = pd.DataFrame({
    f'model_{fold_no}': np.squeeze(fold_pred)
    for fold_no, fold_pred in enumerate(list_preds_tmp, start=1)
})

# Ensemble prediction: row-wise mean over the fold-model columns.
df_preds['pred_avg'] = df_preds.mean(axis=1)

In [23]:
# Preview the per-fold predictions and their average.
df_preds.head()

Unnamed: 0,model_1,model_2,model_3,model_4,model_5,pred_avg
0,9.412617,9.376289,9.28682,9.362167,9.613106,9.4102
1,9.953736,9.772844,9.819943,9.682339,9.914876,9.828748
2,9.518443,9.431651,9.584434,9.609838,9.58196,9.545265
3,9.238812,9.332508,9.14708,9.377228,9.448403,9.308806
4,9.201192,9.239747,9.266692,9.222084,9.126643,9.211272


In [24]:
# The averaged predictions are on the log1p scale used for training; expm1
# converts them back to the original target scale.
pred_original_scale = np.expm1(df_preds['pred_avg'])
sample_sub[COL_Y] = pred_original_scale
sample_sub.head()

Unnamed: 0,id,y
0,1,12211.311379
1,2,18558.699294
2,3,13977.354385
3,4,11033.764559
4,5,10008.319328


In [25]:
# Write the submission file without the DataFrame index column.
sample_sub.to_csv('submit/submission_lightgbm_1-3-0.csv', index=False)