## XGBデフォルト

In [1]:
import numpy as np
import pandas as pd

import xgboost
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split


In [2]:
# pickle ファイルから読み込み
train_pkl = pd.read_pickle('./train.pk3')

In [3]:
train_pkl.shape

(55369, 13)

## 訓練データとテストデータに分割

In [4]:
# 訓練とテストデータに分割
train, test = train_test_split(train_pkl, test_size=0.2, random_state=42)

In [5]:
# ターゲットと特徴量の分割
train_X_tmp = train.copy()
train_X_tmp.drop(['id', 'y'], axis=1, inplace=True)
train_X = train_X_tmp.iloc[:].values
train_y = train.y.values

## 訓練データで訓練

In [6]:
XGB = XGBRegressor()

In [7]:
XGB = XGB.fit(train_X, train_y)



In [8]:
XGB.feature_importances_

array([0.12433461, 0.1215694 , 0.36389393, 0.07497145, 0.02051009,
       0.01374692, 0.00651229, 0.00437408, 0.02113449, 0.19066755,
       0.05828515], dtype=float32)

In [11]:
sorted(
    zip(map(lambda x: round(x, 3), XGB.feature_importances_), train.iloc[:, [2, 3, 4, 5, 7, 8, 9, 10, 11]].columns),
    reverse=True)

[(0.364, 'host_response_rate'),
 (0.124, 'bathrooms'),
 (0.122, 'bedrooms'),
 (0.075, 'number_of_reviews'),
 (0.021, 'room_type_Private room'),
 (0.021, 'cancellation_policy_moderate'),
 (0.014, 'cancellation_policy_strict'),
 (0.007, 'cancellation_policy_super_strict_30'),
 (0.004, 'cancellation_policy_super_strict_60')]

## テストデータで実行

In [12]:
# ターゲットと特徴量の分割
test_X_tmp = test.copy()
test_X_tmp.drop(['id', 'y'], axis=1, inplace=True)
test_x = test_X_tmp.iloc[:].values
test_y = test.y.values

In [13]:
test_x.shape, test_y.shape

((11074, 11), (11074,))

In [14]:
pred_y = XGB.predict(test_x)

In [15]:
np.sqrt(mean_squared_error(test_y, pred_y))

125.14434097693592

## 検証データで実行

In [16]:
# 検証データ読み込み
valid = pd.read_pickle('./test.pk3')

In [17]:
valid.shape

(18528, 12)

In [18]:
# ID の保存
valid_pass = valid.id.values

In [19]:
valid_X = valid.iloc[:, 1:].values

In [20]:
valid_X.shape, train_X.shape

((18528, 11), (44295, 11))

In [21]:
# valid_X.head()

In [22]:
pred_valid_y = XGB.predict(valid_X)

In [23]:
pred_valid_y.shape

(18528,)

In [24]:
type(valid_pass), type(pred_valid_y)

(numpy.ndarray, numpy.ndarray)

In [25]:
result_df = pd.DataFrame(pred_valid_y, valid_pass, columns=['y'])

In [None]:
result_df.to_csv("./XGB_3.csv", header=False)