### ニューラルネットワーク回帰　パラメータチューニング

In [1]:
import numpy as np
import pandas as pd

from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV


In [2]:
# Load the training data from a pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
train_pkl = pd.read_pickle('./train.pk2')

In [3]:
# Row and column counts of the loaded frame.
train_pkl.shape

(55175, 12)

In [4]:
# Column dtypes, non-null counts and memory footprint.
train_pkl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55175 entries, 0 to 55582
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      55175 non-null  int64  
 1   accommodates            55175 non-null  float16
 2   bathrooms               55175 non-null  float16
 3   bedrooms                55175 non-null  float16
 4   beds                    55175 non-null  float16
 5   cleaning_fee            55175 non-null  int8   
 6   host_has_profile_pic    55175 non-null  int8   
 7   host_identity_verified  55175 non-null  int8   
 8   instant_bookable        55175 non-null  int8   
 9   number_of_reviews       55175 non-null  float16
 10  review_scores_rating    55175 non-null  float16
 11  y                       55175 non-null  float64
dtypes: float16(6), float64(1), int64(1), int8(4)
memory usage: 2.1 MB


In [5]:
# Per-column missing-value check: True marks a column that contains at least
# one null. Reducing with `.all()` would only flag columns that are entirely
# null and would miss partially missing columns, so `.any()` is used here.
train_pkl.isnull().any()

id                        False
accommodates              False
bathrooms                 False
bedrooms                  False
beds                      False
cleaning_fee              False
host_has_profile_pic      False
host_identity_verified    False
instant_bookable          False
number_of_reviews         False
review_scores_rating      False
y                         False
dtype: bool

In [6]:
# Largest id and largest target value.
train_pkl['id'].max(), train_pkl['y'].max()

(55582, 1998.9999999999993)

In [7]:
# Smallest id and smallest target value.
train_pkl['id'].min(), train_pkl['y'].min()

(0, 1.0)

In [8]:
# Downcast the target column from float64 to float32.
train_pkl['y'] = train_pkl.y.astype(np.float32)

## 訓練データとテストデータに分割

In [9]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
train, test = train_test_split(
    train_pkl,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Separate the target from the features. Column 0 is `id` and the last column
# is `y`, so the positional slice 1:-1 keeps only the predictor columns.
train_y = train['y'].values
train_X = train.iloc[:, 1:-1].values

In [11]:
# Search space for MLPRegressor: one grid per solver, each with the same
# candidate values. `learning_rate_init` (candidates once considered:
# 0.001, 0.01, 0.1, 1) and `max_iter` are not searched, so they keep the
# estimator defaults.
grid_param = [
    {
        'solver': [solver_name],
        'activation': ['logistic', 'relu'],
        'hidden_layer_sizes': [3, 5, 6, 7, 10, 15],
        'alpha': [0.1, 0.01, 0.001, 0.0001],
        # Single-value entry: pins the seed for every candidate.
        'random_state': [42],
    }
    for solver_name in ('sgd', 'adam')
]


### グリッドサーチ

In [12]:
# 5-fold grid search over `grid_param`. The scorer negates the MSE
# (greater_is_better=False) so that GridSearchCV, which maximises the score,
# selects the candidate with the lowest error.
gs = GridSearchCV(
    estimator=MLPRegressor(),
    param_grid=grid_param,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    cv=5,
    return_train_score=False,
)

In [13]:
# Cross-validate every candidate: 2 solvers x 2 activations x 6 layer sizes
# x 4 alphas = 96 candidates, each fitted on 5 folds (480 fits in total).
# This is likely the slowest cell in the notebook.
gs.fit(train_X, train_y)



GridSearchCV(cv=5, error_score=nan,
             estimator=MLPRegressor(activation='relu', alpha=0.0001,
                                    batch_size='auto', beta_1=0.9, beta_2=0.999,
                                    early_stopping=False, epsilon=1e-08,
                                    hidden_layer_sizes=(100,),
                                    learning_rate='constant',
                                    learning_rate_init=0.001, max_fun=15000,
                                    max_iter=200, momentum=0.9,
                                    n_iter_no_change=10,
                                    nesterovs_momentum=True, power_t=0.5,
                                    random_state=...
                          'hidden_layer_sizes': [3, 5, 6, 7, 10, 15],
                          'random_state': [42], 'solver': ['sgd']},
                         {'activation': ['logistic', 'relu'],
                          'alpha': [0.1, 0.01, 0.001, 0.0001],
                          'h

### 一番良かったパラメータで学習

In [14]:
# Mean cross-validated score of the best candidate. The scorer is negated
# MSE, so this is -MSE (closer to 0 is better); the recorded value of
# -16407.08 corresponds to an RMSE of roughly 128.
gs.best_score_

-16407.076871600417

In [15]:
# Parameter combination with the best mean cross-validated score.
gs.best_params_

{'activation': 'relu',
 'alpha': 0.1,
 'hidden_layer_sizes': 7,
 'random_state': 42,
 'solver': 'sgd'}

In [16]:
# gs.cv_results_['mean_test_score']

In [17]:
# Build a fresh regressor from the winning parameters (the seed is included,
# because `random_state` is part of the grid).
# NOTE(review): with GridSearchCV's default refit=True, gs.best_estimator_
# should already hold an equivalent fitted model — confirm before relying on it.
NN = MLPRegressor(**gs.best_params_)

In [18]:
# Train on the whole training split; fit() returns the estimator itself.
NN = NN.fit(train_X, train_y)

### テストデータで評価

In [19]:
# Split the held-out rows the same way as the training rows: drop `id`
# (column 0) from the features and take `y` (last column) as the target.
test_y = test['y'].values
test_x = test.iloc[:, 1:-1].values

In [20]:
# Sanity check: features and target must have the same number of rows.
test_x.shape, test_y.shape

((11035, 10), (11035,))

In [21]:
# Predict the target for the held-out test rows.
pred_y = NN.predict(test_x)

In [22]:
# RMSE on the held-out test split (square root of the MSE).
np.sqrt(mean_squared_error(test_y, pred_y))

152.1629998198824

### 検証データで予測

In [23]:
# Load the data to generate predictions for.
# NOTE(review): the file is named test.pk2 but the notebook calls it "valid".
# It has 11 columns versus 12 in the training frame — presumably the
# unlabeled set without `y`; confirm.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
valid = pd.read_pickle('./test.pk2')

In [24]:
# Row and column counts of the prediction set.
valid.shape

(18528, 11)

In [25]:
# Keep the ids so they can be paired with the predictions in the output file.
valid_pass = valid.id.values

In [26]:
# Drop `id` (column 0) and hand over a plain NumPy array, matching how
# train_X and test_x are built. The model is fitted on an array; predicting
# on a DataFrame instead is inconsistent and, in recent scikit-learn
# releases, triggers a feature-names mismatch warning.
valid_X = valid.iloc[:, 1:].values

In [27]:
# The prediction set must have the same number of feature columns as the
# training set.
valid_X.shape, train_X.shape

((18528, 10), (44140, 10))

In [28]:
# Predict the target for every row of the prediction set.
pred_valid_y = NN.predict(valid_X)

In [29]:
# Expect one prediction per input row.
pred_valid_y.shape

(18528,)

In [30]:
# Confirm both pieces of the result frame are NumPy arrays.
type(valid_pass), type(pred_valid_y)

(numpy.ndarray, numpy.ndarray)

In [31]:
# One row per prediction, indexed by the saved ids, with a single column `y`.
result_df = pd.DataFrame(
    data=pred_valid_y,
    index=valid_pass,
    columns=['y'],
)

In [32]:
# Write one "id,prediction" row per sample; header=False omits the
# column-name line.
result_df.to_csv("./NN_2.csv", header=False)