In [None]:
# Mount Google Drive so the project files are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Set the current working directory to the competition folder so that the
# relative paths used later (input/, models/, submit/) resolve against it.
import os
os.chdir('/content/drive/MyDrive/分析コンペ/05_ProbSpace/民泊サービスの宿泊料金予測/')

Mounted at /content/drive


In [None]:
import datetime
import pickle
import numpy as np
import pandas as pd
import warnings
import lightgbm as lgb

from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from glob import glob
from tqdm import tqdm
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error

from src.config import *
warnings.filterwarnings('ignore')

In [None]:
# Paths of the out-of-fold prediction files for the training rows and of the
# corresponding test-set prediction files, sorted by file name.
# NOTE(review): the train/test lists line up column-for-column only if both
# folders use matching file names — confirm.
LIST_TRAIN_DATA = sorted(glob('input/train_*_out_of_fold.csv'))
LIST_TEST_DATA = sorted(glob('input/test_*_out_of_fold.csv'))

# Load only the target column. COL_Y is not defined in this notebook; it is
# presumably provided by the star import of src.config.
df_y = pd.read_csv('input/train_data.csv', usecols=[COL_Y])
# The models are trained on log1p(target); predictions are mapped back with
# expm1 before the submission is written.
y = np.log1p(df_y[COL_Y])
# Submission template; its COL_Y column is overwritten with the final predictions.
sample_sub = pd.read_csv('input/submission.csv')

In [None]:
def df_concat(list_path):
    """Read every CSV in ``list_path`` and join the frames column-wise.

    Args:
        list_path: Iterable of CSV file paths. The frames are aligned on the
            row index that ``pd.read_csv`` assigns to each file.

    Returns:
        A DataFrame containing the columns of all files, in the order the
        paths were given. An empty DataFrame when ``list_path`` is empty.
    """
    frames = [pd.read_csv(path) for path in list_path]
    if not frames:
        # pd.concat raises ValueError on an empty list; keep returning an
        # empty frame for that case.
        return pd.DataFrame()
    # Concatenate once instead of growing the frame inside the loop, which
    # re-copies every previously loaded column on each iteration.
    return pd.concat(frames, axis=1)


def calc_rsmle(y_pred, data):
    """Custom evaluation metric for LightGBM (RMSLE).

    The value computed here is the plain root-mean-squared error between the
    dataset labels and ``y_pred``; it is an RMSLE only when both are already
    in log1p space, which is how the target is prepared in this notebook.

    Args:
        y_pred: Predicted values.
        data: Object exposing ``get_label()`` (an ``lgb.Dataset``) from which
            the true target values are read.

    Returns:
        Tuple ``('rmsle', value, False)``, laid out as LightGBM's custom-metric
        convention (name, value, is_higher_better) — the trailing ``False``
        marks the metric as lower-is-better.
    """
    labels = data.get_label()  # true target values stored on the dataset
    mse = mean_squared_error(labels, y_pred)
    return 'rmsle', np.sqrt(mse), False

In [None]:
# Assemble the feature matrices: each loaded file contributes its columns
# (the *_stacking prediction columns shown in the previews below).
X = df_concat(LIST_TRAIN_DATA)
X_inference = df_concat(LIST_TEST_DATA)

In [None]:
# Preview the first rows of the training feature matrix.
X.head()

Unnamed: 0,catboost_stacking,lightgbm_stacking,neighbors_stacking,nn_stacking,rf_stacking,ridge_stacking,svr_stacking,xgb_stacking
0,9.685351,9.548714,9.19969,9.011193,9.849388,10.114502,9.377393,9.627528
1,9.79855,9.213942,9.288937,9.254456,9.46644,8.360527,9.220028,9.337125
2,9.75102,9.305787,9.169691,9.234566,9.635884,9.909695,9.385733,9.390121
3,9.083029,9.042127,9.098005,9.050856,9.049234,9.356619,9.090586,9.037077
4,9.961771,9.877283,9.528909,10.121135,9.789036,10.18626,9.805191,10.51377


In [None]:
# Preview the first rows of the test feature matrix (same columns as X).
X_inference.head()

Unnamed: 0,catboost_stacking,lightgbm_stacking,neighbors_stacking,nn_stacking,rf_stacking,ridge_stacking,svr_stacking,xgb_stacking
0,9.402112,9.432202,9.310863,9.401971,9.523879,9.710025,9.420764,9.445094
1,9.723535,9.871946,10.109017,10.865754,9.551622,9.713884,9.715402,9.971944
2,9.333528,9.595329,9.336297,9.242704,9.53116,9.517798,9.552163,9.427433
3,9.486393,9.369932,9.1921,9.164177,9.447652,9.154997,9.214931,9.377797
4,9.128863,9.293471,9.427344,9.269113,9.527033,8.974034,9.191017,9.409643


In [None]:
from numpy.ma.core import argsort

# Candidate models to fit on the stacked prediction features. The key is the
# model name that fit_for_sklearn uses for the saved model file names and for
# the out-of-fold column name.
DICT_MODEL_LIST = {
    'ridge_stacking': Ridge(random_state=0),
    'svr_stacking': SVR()  # library-default hyperparameters
}

In [None]:
def fit_for_sklearn(X, y, dict_model_list, model_name, model_save=True):
    """Train and validate a scikit-learn style model with 5-fold KFold.

    Args:
        X: Feature DataFrame.
        y: Target values aligned row-for-row with ``X`` (pandas Series or
            array-like).
        dict_model_list: Mapping from model name to an estimator exposing
            ``fit`` and ``predict``.
        model_name: Key into ``dict_model_list``; also used in the saved file
            names and in the output column name.
        model_save: When True, pickle the model fitted on each fold to
            ``models/{model_name}_fold{i}.pkl`` (the directory is created if
            it does not exist).

    Returns:
        DataFrame with the single column ``{model_name}_stacking`` holding the
        out-of-fold predictions in the original row order of ``X``.
    """
    def _take(values, idx):
        # KFold yields positional indices, so rows must be selected by
        # position; label-based lookup only works on a default RangeIndex.
        return values.iloc[idx] if hasattr(values, 'iloc') else np.asarray(values)[idx]

    scores = []
    preds = []
    va_idxes = []
    kf = KFold(n_splits=5, shuffle=True, random_state=0)
    for i, (train_idx, valid_idx) in enumerate(kf.split(X), start=1):
        print('='*30)
        print(f'fold: {i}')
        X_train, X_valid = X.iloc[train_idx, :], X.iloc[valid_idx, :]
        y_train, y_valid = _take(y, train_idx), _take(y, valid_idx)
        # The same estimator instance is refit on every fold; each fold's
        # fitted state is captured by the pickle written right after fitting.
        model = dict_model_list[model_name]
        model.fit(X_train, y_train)
        pred = model.predict(X_valid)
        score = np.sqrt(mean_squared_error(y_valid, pred))
        print(f'score: {score}')
        scores.append(score)
        preds.append(pred)
        va_idxes.append(valid_idx)

        if model_save:
            filename = f'models/{model_name}_fold{i}.pkl'
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            # Context manager guarantees the file is flushed and closed.
            with open(filename, 'wb') as f:
                pickle.dump(model, f)

    # Folds were visited in shuffled order: sort the stacked predictions by
    # their row position to restore the original row order.
    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    # NOTE(review): when model_name already ends in '_stacking' (as the keys of
    # DICT_MODEL_LIST do) the column reads '..._stacking_stacking'; kept as is
    # because downstream consumers may rely on the name — confirm.
    df_oof = pd.DataFrame(preds[order], columns=[f'{model_name}_stacking'])
    rmsle_mean = np.mean(scores)
    print('='*30)
    print(f'rmsle_mean: {rmsle_mean}')
    print(f'{model_name}_oof')
    print(df_oof.head())
    return df_oof

In [None]:
# Fit the 'svr_stacking' model with 5-fold CV; model_save=True pickles each
# fold's model under models/ for the inference cell below.
# NOTE(review): the saved output under this cell prints 'ridge_stacking_oof',
# so it appears to come from an earlier run with 'ridge_stacking' — re-run
# the cell to refresh it.
df_oof = fit_for_sklearn(X, y, DICT_MODEL_LIST, 'svr_stacking', True)

fold: 1
score: 0.5280775374555033
fold: 2
score: 0.5103455064948162
fold: 3
score: 0.5123454573782579
fold: 4
score: 0.4996242540028309
fold: 5
score: 0.4966333058234603
rmsle_mean: 0.5094052122309738
ridge_stacking_oof
   ridge_stacking_stacking
0                 9.319757
1                 9.279334
2                 9.303061
3                 9.041339
4                10.063235


In [None]:
# Predict the test features with each of the five fold models saved during training.
list_preds_tmp = []

for i in range(1, 6):
    model_path = f'models/svr_stacking_fold{i}.pkl'
    # Context manager closes the file handle deterministically.
    # NOTE: pickle.load can execute arbitrary code — only load files written
    # by this notebook's own training step.
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
    pred = model.predict(X_inference)
    list_preds_tmp.append(pred)

In [None]:
# One column per fold model (model_1 ... model_N), built from however many
# prediction arrays were collected instead of a hard-coded five.
df_preds = pd.DataFrame({
    f'model_{fold}': np.squeeze(fold_pred)
    for fold, fold_pred in enumerate(list_preds_tmp, start=1)
})

# Final prediction per row: simple average across the fold models.
df_preds['pred_avg'] = df_preds.mean(axis=1)

In [None]:
# Preview the per-fold predictions and their average.
df_preds.head()

Unnamed: 0,model_1,model_2,model_3,model_4,model_5,pred_avg
0,9.435128,9.419443,9.422145,9.423371,9.433241,9.426665
1,10.396986,10.387768,10.392313,10.41423,10.375695,10.393398
2,9.412436,9.394407,9.407467,9.398822,9.424625,9.407551
3,9.27081,9.261544,9.271115,9.272092,9.272392,9.269591
4,9.304853,9.287616,9.289594,9.282967,9.298452,9.292696


In [None]:
# Fail loudly if the number of predictions does not match the template rows.
assert len(sample_sub) == len(df_preds), 'prediction count does not match the submission template'
# The target was log1p-transformed for training, so invert with expm1.
# Assign by position: assigning a Series aligns on index labels and would
# silently introduce NaN if the two indexes ever differed.
sample_sub[COL_Y] = np.expm1(df_preds['pred_avg'].to_numpy())
sample_sub.head()

Unnamed: 0,id,y
0,1,12414.059562
1,2,32642.414381
2,3,12179.009807
3,4,10609.410034
4,5,10857.421314


In [None]:
# Write the submission file; index=False keeps the pandas row index out of the CSV.
sample_sub.to_csv('submit/submission_svr_stacking.csv', index=False)