In [1]:
from google.colab import drive
drive.mount('/content/drive')
# カレントディレクトリの指定
import os
os.chdir('/content/drive/MyDrive/分析コンペ/05_ProbSpace/民泊サービスの宿泊料金予測/')

Mounted at /content/drive


In [2]:
import datetime
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import collections
import warnings
import xgboost as xgb

from tqdm import tqdm
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler
from geopy.distance import geodesic

from src.config import *

warnings.filterwarnings('ignore')

In [3]:
def create_elapsed_days(df):
    """
    2020/4/30までの経過日数
    """
    df[COL_ELAPSED_DAYS] = (datetime.datetime(2020, 4, 30) - df[COL_LAST_REVIEW]).dt.days
    return df


def enc_categorical(df, col_list, method):
    """
    カテゴリカル変数に対して、one-hotかlabel-encを行う
    """
    if method == 'one-hot':
        df = pd.get_dummies(df, columns=col_list, drop_first=True)
        return df
    elif method == 'label-enc':
        for col in col_list:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
        return df

In [4]:
df_train = pd.read_csv('input/train_data.csv', parse_dates=[COL_LAST_REVIEW], dtype=DICT_DTYPES)
df_test = pd.read_csv('input/test_data.csv', parse_dates=[COL_LAST_REVIEW], dtype=DICT_DTYPES)
df_train_station_info = pd.read_csv('input/train_data_distance_from_station.csv', dtype=DICT_DTYPES)
df_test_station_info = pd.read_csv('input/test_data_distance_from_station.csv', dtype=DICT_DTYPES)
df_train_name_features = pd.read_csv('input/train_data_name_features.csv')
df_test_name_features = pd.read_csv('input/test_data_name_features.csv')
df_train_gaussian_mixture = pd.read_csv('input/train_data_gaussianmixture.csv')
df_test_gaussian_mixture = pd.read_csv('input/test_data_gaussianmixture.csv')
df_train_mds = pd.read_csv('input/train_data_mds.csv')
df_test_mds = pd.read_csv('input/test_data_mds.csv')
df_train_nearest_station = pd.read_csv('input/train_data_station_info.csv', usecols=['nearest_station_index'])
df_test_nearest_station = pd.read_csv('input/test_data_station_info.csv', usecols=['nearest_station_index'])
df_train_neighbourhood_roomtype = pd.read_csv('input/train_data_neighbourhood_roomtype.csv', usecols=['neighbourhood_roomtype_le'])
df_test_neighbourhood_roomtype = pd.read_csv('input/test_data_neighbourhood_roomtype.csv', usecols=['neighbourhood_roomtype_le'])
sample_sub = pd.read_csv('input/submission.csv')

In [5]:
df_all = pd.concat([
    df_train,
    df_test
]).reset_index(drop=True)

In [6]:
df_all.head()

Unnamed: 0,id,name,host_id,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,availability_365,y
0,1,KiyosumiShirakawa 3min|★SkyTree★|WIFI|Max4|Tre...,242899459,Koto Ku,35.68185,139.8031,Entire home/apt,1,55,2020-04-25,2.21,173,12008.0
1,2,Downtown Tokyo Iriya next to Ueno,308879948,Taito Ku,35.72063,139.78536,Entire home/apt,6,72,2020-03-25,2.11,9,6667.0
2,3,"Japan Style,Private,Affordable,4min to Sta.",300877823,Katsushika Ku,35.74723,139.82349,Entire home/apt,1,18,2020-03-23,3.46,288,9923.0
3,4,4 min to Shinjuku Sta. by train / 2 ppl / Wi-fi,236935461,Shibuya Ku,35.68456,139.68077,Entire home/apt,1,2,2020-04-02,1.76,87,8109.0
4,5,LICENSED SHINJUKU HOUSE: Heart of the action!,243408889,Shinjuku Ku,35.6984,139.70467,Entire home/apt,1,86,2020-01-30,2.0,156,100390.0


In [7]:
# 2020.4.30からの経過日数を特徴量として追加
df_all[COL_ELAPSED_DAYS] = (datetime.datetime(2020, 4, 30) - df_all[COL_LAST_REVIEW]).dt.days

In [8]:
df_all.fillna(0, inplace=True)

In [9]:
for col in LIST_ENC_COL:
    le = LabelEncoder()
    df_all[col] = le.fit_transform(df_all[col])

In [10]:
df_all = df_all[LIST_USE_COL]

In [11]:
df_all.head()

Unnamed: 0,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,elapsed_days,reviews_per_month,availability_365
0,9,35.68185,139.8031,0,1,55,5.0,2.21,173
1,21,35.72063,139.78536,0,6,72,36.0,2.11,9
2,7,35.74723,139.82349,0,1,18,38.0,3.46,288
3,16,35.68456,139.68077,0,1,2,28.0,1.76,87
4,18,35.6984,139.70467,0,1,86,91.0,2.0,156


In [12]:
df_all_station_info = pd.concat([df_train_station_info, df_test_station_info], axis=0).reset_index(drop=True)
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('pca', PCA(n_components=10, random_state=0))
])

In [13]:
features_tmp = pipe.fit_transform(df_all_station_info)
df_features = pd.DataFrame(features_tmp, columns=[f'PCA_{i+1}' for i in range(features_tmp.shape[1])])
df_features.head()

Unnamed: 0,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,PCA_6,PCA_7,PCA_8,PCA_9,PCA_10
0,18.653366,0.8584,-13.804691,2.301068,-0.302593,2.757417,0.460876,-1.628651,0.657036,-0.117527
1,17.356235,-8.494013,-0.796745,0.504377,-2.05283,-1.85929,0.186437,0.440242,-0.522165,0.054421
2,32.363401,6.726955,13.218806,0.054782,2.093366,-2.422866,-1.568053,0.301075,-0.832967,0.726261
3,-24.277259,0.255721,-0.03913,1.613609,2.273263,-1.378739,0.306705,-0.403338,-0.098069,-0.514598
4,-16.020399,-8.673477,-1.703626,-1.10478,2.383075,-0.055141,1.253202,0.4178,-0.442989,0.166816


In [14]:
df_train_features = df_features[:df_train.shape[0]].reset_index(drop=True)
df_test_features = df_features[df_train.shape[0]:].reset_index(drop=True)

In [15]:
X = df_all[:df_train.shape[0]].reset_index(drop=True)
X = pd.concat([X, df_train_features, df_train_name_features, df_train_gaussian_mixture, df_train_mds, df_train_nearest_station, df_train_neighbourhood_roomtype], axis=1)
y = np.log1p(df_train[COL_Y])

X_inference = df_all[df_train.shape[0]:].reset_index(drop=True)
X_inference = pd.concat([X_inference, df_test_features, df_test_name_features, df_test_gaussian_mixture, df_test_mds, df_test_nearest_station, df_test_neighbourhood_roomtype], axis=1)

In [16]:
X.head()

Unnamed: 0,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,elapsed_days,reviews_per_month,availability_365,PCA_1,...,mds_3,mds_4,mds_5,mds_6,mds_7,mds_8,mds_9,mds_10,nearest_station_index,neighbourhood_roomtype_le
0,9,35.68185,139.8031,0,1,55,5.0,2.21,173,18.653366,...,-58.109808,176.259606,-37.868741,200.372995,-90.446952,97.374829,106.743579,-223.862741,627,36
1,21,35.72063,139.78536,0,6,72,36.0,2.11,9,17.356235,...,-64.322166,-74.630171,-21.281596,221.45788,33.833832,54.11985,161.492633,-273.066649,179,82
2,7,35.74723,139.82349,0,1,18,38.0,3.46,288,32.363401,...,-122.71374,44.936694,183.089396,-4.669583,-100.740114,173.948258,63.183299,-507.104732,694,28
3,16,35.68456,139.68077,0,1,2,28.0,1.76,87,-24.277259,...,33.29165,-53.459711,-65.131115,-122.304476,45.540907,-93.036075,-121.970158,388.522576,80,62
4,18,35.6984,139.70467,0,1,86,91.0,2.0,156,-16.020399,...,1.69436,-171.467625,-160.070368,-25.120009,73.119588,-128.352844,-0.527934,287.441065,346,70


In [17]:
from numpy.ma.core import argsort
def fit_for_xgboost(X, y, model_name):
    """
    KFoldによる学習、検証
    out of foldによる予測をデータフレームで返す
    """
    scores = []
    preds = []
    va_idxes = []
    kf = KFold(n_splits=5, shuffle=True, random_state=0)
    for i, (train_idx, valid_idx) in enumerate(kf.split(X), start=1):
        print('='*50)
        print(f'fold: {i}')
        X_train, X_valid = X.loc[train_idx, :], X.loc[valid_idx, :]
        y_train, y_valid = y[train_idx], y[valid_idx]
        xgb_train = xgb.DMatrix(X_train, label=y_train)
        xgb_test = xgb.DMatrix(X_valid, label=y_valid)
        
        params = {
        'objective': 'reg:squarederror','silent':1, 'random_state':1234, 
        # 学習用の指標 (RMSE)
        'eval_metric': 'rmse',
        }
        num_round = 200
        watchlist = [(xgb_train, 'train'), (xgb_test, 'eval')]#訓練データはdtrain、評価用のテストデータはdvalidと設定

        model = xgb.train(params,
                    xgb_train,#訓練データ
                    num_round,#設定した学習回数
                    early_stopping_rounds=20,
                    evals=watchlist,
                    )

        score = np.sqrt(mean_squared_error(y_valid, model.predict(xgb.DMatrix(X_valid))))
        print(f'mean_squared_log_error: {score}')
        pred = model.predict(xgb.DMatrix(X_valid))
        scores.append(score)
        preds.append(pred)
        va_idxes.append(valid_idx)

        filename = f'models/{model_name}_fold{i}.pkl'
        pickle.dump(model, open(filename, 'wb'))
        print(f'save model: {filename}')
        
    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = argsort(va_idxes)
    df_oof = pd.DataFrame(preds[order], columns=[f'{model_name}_stacking'])
    rmsle_mean = np.mean(scores)
    print('='*50)
    print(f'rmsle_mean: {rmsle_mean}')
    print(f'{model_name}_oof')
    print(df_oof.head())
    return df_oof

In [18]:
df_oof = fit_for_xgboost(X, y, 'xgb')

fold: 1
[0]	train-rmse:6.3345	eval-rmse:6.38406
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 20 rounds.
[1]	train-rmse:4.47231	eval-rmse:4.51704
[2]	train-rmse:3.17688	eval-rmse:3.22388
[3]	train-rmse:2.28035	eval-rmse:2.32947
[4]	train-rmse:1.66613	eval-rmse:1.72548
[5]	train-rmse:1.25141	eval-rmse:1.32442
[6]	train-rmse:0.977137	eval-rmse:1.06452
[7]	train-rmse:0.802488	eval-rmse:0.904349
[8]	train-rmse:0.690999	eval-rmse:0.809528
[9]	train-rmse:0.620397	eval-rmse:0.753625
[10]	train-rmse:0.566035	eval-rmse:0.718082
[11]	train-rmse:0.533445	eval-rmse:0.69972
[12]	train-rmse:0.506788	eval-rmse:0.684725
[13]	train-rmse:0.48872	eval-rmse:0.680674
[14]	train-rmse:0.479813	eval-rmse:0.677137
[15]	train-rmse:0.467711	eval-rmse:0.673053
[16]	train-rmse:0.461165	eval-rmse:0.671284
[17]	train-rmse:0.454322	eval-rmse:0.670257
[18]	train-rmse:0.443359	eval-rmse:0.667702
[19]	train-rmse:0.436213	eval-rmse:0.66

In [19]:
df_oof.to_csv(f'input/train_xgb_out_of_fold.csv', index=False)

In [20]:
def predict_fold_avg(X_inference, model_name):
    """
    テストデータに対する予測
    5foldで作成したモデルのそれぞれの予測と平均をデータフレームで返す
    """
    list_preds_tmp = []
    for i in range(1, 6):
        model_path = f'models/{model_name}_fold{i}.pkl'
        model = pickle.load(open(model_path, 'rb'))
        pred = model.predict(xgb.DMatrix(X_inference))
        list_preds_tmp.append(pred)
    df_preds = pd.DataFrame({'model_1': np.squeeze(list_preds_tmp[0]),
                               'model_2': np.squeeze(list_preds_tmp[1]),
                               'model_3': np.squeeze(list_preds_tmp[2]),
                               'model_4': np.squeeze(list_preds_tmp[3]),
                               'model_5': np.squeeze(list_preds_tmp[4])})
    df_preds[f'{model_name}_stacking'] = df_preds.mean(axis=1)
    print(f'{model_name}_predict')
    print(df_preds.head())
    return df_preds[[f'{model_name}_stacking']]

In [21]:
df_preds = predict_fold_avg(X_inference, 'xgb')

xgb_predict
     model_1   model_2   model_3   model_4   model_5  xgb_stacking
0   9.554907  9.450179  9.065234  9.788546  9.366606      9.445094
1  11.029101  9.554940  9.524618  9.772481  9.978577      9.971944
2   9.729311  9.366158  9.430955  9.392135  9.218604      9.427433
3   9.385055  9.447559  9.516781  9.556825  8.982764      9.377797
4   9.582204  9.507136  9.668526  8.892940  9.397411      9.409643


In [22]:
df_preds

Unnamed: 0,xgb_stacking
0,9.445094
1,9.971944
2,9.427433
3,9.377797
4,9.409643
...,...
4991,9.403758
4992,9.156425
4993,9.267194
4994,8.835741


In [23]:
df_preds.to_csv(f'input/test_xgb_out_of_fold.csv', index=False)

In [None]:
sample_sub[COL_Y] = np.expm1(df_preds['xgb_stacking'])
sample_sub.head()

Unnamed: 0,id,y
0,1,12644.972656
1,2,21416.076172
2,3,12423.591797
3,4,11821.941406
4,5,12204.514648


In [None]:
sample_sub.to_csv('submit/submission_xgboost.csv', index=False)

In [None]:
sample_sub[COL_Y] = np.expm1(df_preds['pred_avg'])
sample_sub.head()

Unnamed: 0,id,y
0,1,14281.859191
1,2,16134.749746
2,3,14674.358695
3,4,10660.970524
4,5,10309.487098


In [None]:
sample_sub.to_csv('submit/submission_lightgbm_1-1-0.csv', index=False)