In [1]:
import os

from google.colab import drive

# Mount Google Drive so the files stored there are reachable from this runtime.
drive.mount('/content/drive')

# Set the current working directory; the relative paths used in later cells
# ('input/...', 'models/...', 'submit/...') are resolved against it.
os.chdir('/content/drive/MyDrive/分析コンペ/05_ProbSpace/民泊サービスの宿泊料金予測/')

Mounted at /content/drive


In [2]:
import datetime
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import collections
import warnings
import lightgbm as lgb

from tqdm import tqdm
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler
from geopy.distance import geodesic

from src.config import *
import src.preprocessing as pr

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

In [3]:
# Main train / test tables. The last-review column is parsed as a datetime and
# the remaining dtypes come from DICT_DTYPES (defined in src.config).
df_train = pd.read_csv(
    'input/train_data.csv',
    parse_dates=[COL_LAST_REVIEW],
    dtype=DICT_DTYPES,
)
df_test = pd.read_csv(
    'input/test_data.csv',
    parse_dates=[COL_LAST_REVIEW],
    dtype=DICT_DTYPES,
)

# Pre-computed distance-from-station tables, one per split.
df_train_station_info = pd.read_csv(
    'input/train_data_distance_from_station.csv',
    dtype=DICT_DTYPES,
)
df_test_station_info = pd.read_csv(
    'input/test_data_distance_from_station.csv',
    dtype=DICT_DTYPES,
)

# Pre-computed features derived from the listing name, one table per split.
df_train_svd_name = pd.read_csv('input/train_data_name_features.csv')
df_test_svd_name = pd.read_csv('input/test_data_name_features.csv')

# Submission template; its target column is overwritten with predictions later.
sample_sub = pd.read_csv('input/submission.csv')

In [4]:
# Columns of the combined train+test frame that are kept as model inputs.
LIST_USE_COL = [
    COL_NEIGHBOURHOOD,
    COL_LATITUDE,
    COL_LONGITUDE,
    COL_ROOM_TYPE,
    COL_MINIMUM_NIGHTS,
    COL_NUMBER_OF_REVIEWS,
    COL_ELAPSED_DAYS,
    COL_REVIEWS_PER_MONTH,
    COL_AVAILABILITY_365,
]

# Columns that are replaced by integer codes via LabelEncoder.
LIST_LABEL_ENC = [
    COL_NEIGHBOURHOOD,
    COL_ROOM_TYPE,
]

In [5]:
# Stack train on top of test (train rows first) with a fresh 0..n-1 index, so
# the first len(df_train) rows can be sliced back out as the training part.
df_all = pd.concat([df_train, df_test], ignore_index=True)

In [6]:
# Preview the first rows of the combined train+test frame.
df_all.head()

Unnamed: 0,id,name,host_id,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,availability_365,y
0,1,KiyosumiShirakawa 3min|★SkyTree★|WIFI|Max4|Tre...,242899459,Koto Ku,35.68185,139.8031,Entire home/apt,1,55,2020-04-25,2.21,173,12008.0
1,2,Downtown Tokyo Iriya next to Ueno,308879948,Taito Ku,35.72063,139.78536,Entire home/apt,6,72,2020-03-25,2.11,9,6667.0
2,3,"Japan Style,Private,Affordable,4min to Sta.",300877823,Katsushika Ku,35.74723,139.82349,Entire home/apt,1,18,2020-03-23,3.46,288,9923.0
3,4,4 min to Shinjuku Sta. by train / 2 ppl / Wi-fi,236935461,Shibuya Ku,35.68456,139.68077,Entire home/apt,1,2,2020-04-02,1.76,87,8109.0
4,5,LICENSED SHINJUKU HOUSE: Heart of the action!,243408889,Shinjuku Ku,35.6984,139.70467,Entire home/apt,1,86,2020-01-30,2.0,156,100390.0


In [7]:
# Add the number of days between each listing's last review and 2020-04-30
# as a feature.
reference_date = datetime.datetime(2020, 4, 30)
df_all[COL_ELAPSED_DAYS] = (reference_date - df_all[COL_LAST_REVIEW]).dt.days

In [8]:
# Replace every missing value in the combined frame with 0.
# NOTE(review): this applies to all columns, including the elapsed-days feature
# and the target column of the test rows — confirm 0 is the intended fill value.
df_all = df_all.fillna(0)

In [9]:
# Encode each categorical column as integer codes. The encoder is fitted on the
# combined train+test frame, so both splits share the same code mapping.
for col in LIST_LABEL_ENC:
    le = LabelEncoder().fit(df_all[col])
    df_all[col] = le.transform(df_all[col])

In [10]:
# Keep only the model input columns, in the order given by LIST_USE_COL.
df_all = df_all.loc[:, LIST_USE_COL]

In [11]:
# Preview the combined frame after encoding and column selection.
df_all.head()

Unnamed: 0,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,elapsed_days,reviews_per_month,availability_365
0,9,35.68185,139.8031,0,1,55,5.0,2.21,173
1,21,35.72063,139.78536,0,6,72,36.0,2.11,9
2,7,35.74723,139.82349,0,1,18,38.0,3.46,288
3,16,35.68456,139.68077,0,1,2,28.0,1.76,87
4,18,35.6984,139.70467,0,1,86,91.0,2.0,156


In [12]:
# Stack the train and test station-distance tables (train rows first) under a
# fresh 0..n-1 index.
df_all_station_info = pd.concat(
    [df_train_station_info, df_test_station_info],
    axis=0,
    ignore_index=True,
)

# Standardise every column, then project onto 10 principal components.
pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('pca', PCA(n_components=10, random_state=0)),
])

In [13]:
# Fit the scale+PCA pipeline on the combined station table and transform it in
# one pass; the components become columns PCA_1 .. PCA_k.
features_tmp = pipe.fit_transform(df_all_station_info)
pca_columns = [f'PCA_{k}' for k in range(1, features_tmp.shape[1] + 1)]
df_features = pd.DataFrame(features_tmp, columns=pca_columns)
df_features.head()

Unnamed: 0,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,PCA_6,PCA_7,PCA_8,PCA_9,PCA_10
0,18.653366,0.8584,-13.804691,2.301068,-0.302593,2.757417,0.460876,-1.628651,0.657036,-0.117527
1,17.356235,-8.494013,-0.796745,0.504377,-2.05283,-1.85929,0.186437,0.440242,-0.522165,0.054421
2,32.363401,6.726955,13.218806,0.054782,2.093366,-2.422866,-1.568053,0.301075,-0.832967,0.726261
3,-24.277259,0.255721,-0.03913,1.613609,2.273263,-1.378739,0.306705,-0.403338,-0.098069,-0.514598
4,-16.020399,-8.673477,-1.703626,-1.10478,2.383075,-0.055141,1.253202,0.4178,-0.442989,0.166816


In [14]:
# The combined table was built train-first, so the first len(df_train) rows are
# the training part and the remainder is the test part.
n_train = len(df_train)
df_train_features = df_features.iloc[:n_train].reset_index(drop=True)
df_test_features = df_features.iloc[n_train:].reset_index(drop=True)

In [15]:
n_train_rows = df_train.shape[0]

# Training matrix: base columns + station PCA components + name features,
# joined side by side on the shared 0..n-1 row index.
X = pd.concat(
    [
        df_all.iloc[:n_train_rows].reset_index(drop=True),
        df_train_features,
        df_train_svd_name,
    ],
    axis=1,
)
# The model is trained on log1p of the target; predictions are mapped back with
# expm1 when the submission is built.
y = np.log1p(df_train[COL_Y])

# Inference matrix, assembled the same way from the test part of each table.
X_inference = pd.concat(
    [
        df_all.iloc[n_train_rows:].reset_index(drop=True),
        df_test_features,
        df_test_svd_name,
    ],
    axis=1,
)

In [16]:
# Preview the assembled training matrix.
X.head()

Unnamed: 0,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,elapsed_days,reviews_per_month,availability_365,PCA_1,...,name_nmf_41,name_nmf_42,name_nmf_43,name_nmf_44,name_nmf_45,name_nmf_46,name_nmf_47,name_nmf_48,name_nmf_49,name_nmf_50
0,9,35.68185,139.8031,0,1,55,5.0,2.21,173,18.653366,...,0.0,0.003259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00057
1,21,35.72063,139.78536,0,6,72,36.0,2.11,9,17.356235,...,0.0,0.001156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7,35.74723,139.82349,0,1,18,38.0,3.46,288,32.363401,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16,35.68456,139.68077,0,1,2,28.0,1.76,87,-24.277259,...,0.007228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,18,35.6984,139.70467,0,1,86,91.0,2.0,156,-16.020399,...,0.003686,0.0,0.0,0.121278,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# LightGBM settings passed to lgb.train; anything not listed keeps the library default.
params = {
    'objective': 'regression',
    'metric': 'mean_squared_error',
    "verbosity": -1,
    "boosting_type": "gbdt",
}

# Make sure the output directory exists before any fold tries to write into it.
os.makedirs('models', exist_ok=True)

# 5-fold cross-validation: one model is trained and saved per fold.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
for i, (train_idx, valid_idx) in enumerate(kf.split(X), start=1):
    print('='*50)
    print(f'fold: {i}')
    # KFold.split yields positional indices, so rows are selected by position
    # (.iloc) rather than by index label.
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_test = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    # NOTE(review): early_stopping_rounds / evals_result / verbose_eval are
    # passed as keyword arguments here; newer LightGBM releases may expect
    # callbacks instead — confirm against the installed version before upgrading.
    lgb_results = {}                                    # receives the evaluation history
    model = lgb.train(params=params,                    # hyperparameters
                      train_set=lgb_train,              # data the model is fitted on
                      valid_sets=[lgb_train, lgb_test], # evaluated on both the training and held-out fold
                      valid_names=['Train', 'Test'],    # display names for the two evaluation sets
                      num_boost_round=200,              # upper bound on boosting rounds
                      early_stopping_rounds=30,         # early-stopping patience
                      evals_result=lgb_results,         # store the evaluation history
                      verbose_eval=False)
    # y is log1p of the target, so the square root of the MSE computed here is
    # a root-mean-squared log error, even though the printed label omits "root".
    print(f'mean_squared_log_error: {np.sqrt(mean_squared_error(y_valid, model.predict(X_valid)))}')
    # Save the fitted model; the context manager closes the file handle even if
    # pickling fails.
    filename = f'models/light_gbm_fold{i}.pkl'
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

fold: 1
mean_squared_log_error: 0.5905946593619703
fold: 2
mean_squared_log_error: 0.5692342694091266
fold: 3
mean_squared_log_error: 0.5606896278835029
fold: 4
mean_squared_log_error: 0.5477088438340253
fold: 5
mean_squared_log_error: 0.5472818417237109


In [19]:
# One prediction vector per saved fold model, in fold order.
list_preds_tmp = []

for i in range(1, 6):
    model_path = f'models/light_gbm_fold{i}.pkl'
    # Unpickling can execute arbitrary code: only load model files written by
    # the training cell of this notebook. The context manager closes the file
    # handle as soon as the model is loaded.
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)
    pred = model.predict(X_inference)
    list_preds_tmp.append(pred)

In [20]:
# One column per fold model (model_1, model_2, ...), built from however many
# prediction vectors were collected, so the frame stays in sync with the number
# of saved models instead of hard-coding five columns.
df_preds = pd.DataFrame({
    f'model_{k}': np.squeeze(fold_pred)
    for k, fold_pred in enumerate(list_preds_tmp, start=1)
})

# Row-wise mean over the per-model columns. It is computed before 'pred_avg'
# exists, so the average never includes itself.
df_preds['pred_avg'] = df_preds.mean(axis=1)

In [21]:
# Preview the per-model predictions and their average (still on the log1p scale).
df_preds.head()

Unnamed: 0,model_1,model_2,model_3,model_4,model_5,pred_avg
0,9.554312,9.605958,9.459806,9.543096,9.670905,9.566815
1,9.89545,9.459679,9.799125,9.777247,9.512462,9.688793
2,9.294062,9.410572,9.834655,9.530908,9.899429,9.593925
3,9.101279,9.344646,9.271451,9.409075,9.245743,9.274439
4,9.162454,9.302155,9.221029,9.006214,9.512732,9.240917


In [22]:
# The models were trained on log1p(target), so expm1 maps the averaged
# prediction back to the original scale before it goes into the submission frame.
avg_log_pred = df_preds['pred_avg']
sample_sub[COL_Y] = np.expm1(avg_log_pred)
sample_sub.head()

Unnamed: 0,id,y
0,1,14281.859191
1,2,16134.749746
2,3,14674.358695
3,4,10660.970524
4,5,10309.487098


In [23]:
# Write the submission file without the DataFrame index column.
# NOTE(review): assumes the 'submit' directory already exists under the working directory.
sample_sub.to_csv('submit/submission_lightgbm_1-1-0.csv', index=False)