In [1]:
# Basic module
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import pickle
%matplotlib inline

# Machine Learning module
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV, LassoCV, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

# Model selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Feature selection
from sklearn.feature_selection import RFE

# Evaluation method
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [142]:
# 前処理済みデータ(pickle)読み込み
train = pd.read_pickle('../1. data/train_pre_proccessing_v2.pkl')
test = pd.read_pickle('../1. data/test_pre_proccessing_v2.pkl')
submit = pd.read_csv('../1. data/sample_submit.csv', header=None, names=['id','value'])

In [143]:
train.columns

Index(['mpg', 'cylinders_std', 'displacement_std', 'horsepower_std',
       'weight_std', 'acceleration_std', 'model year_70', 'model year_71',
       'model year_72', 'model year_73', 'model year_74', 'model year_75',
       'model year_76', 'model year_77', 'model year_78', 'model year_79',
       'model year_80', 'model year_81', 'model year_82', 'origin_std',
       'maker_amc', 'maker_datsun', 'maker_dodge', 'maker_ford', 'maker_gm',
       'maker_honda', 'maker_others', 'maker_peugeot', 'maker_plymouth',
       'maker_toyota', 'maker_volkswagen'],
      dtype='object')

In [144]:
# データ作成
X = train.drop(['mpg'], axis=1)
y = train['mpg'].copy().values

In [145]:
# データ分割
train_X, test_X, train_y, test_y = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    shuffle=True
                                                   )

In [146]:
len(X.columns)

30

----------------------------------------------

■ Xgboost

In [132]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

print("Parameter optimization")
xgb_model = xgb.XGBRegressor()
reg_xgb = GridSearchCV(xgb_model,
                   {'max_depth': [3,4,5,6,7,8],
                    'n_estimators': [40,50,60,70,80,90]}, verbose=1)
reg_xgb.fit(X, y)
print(reg_xgb.best_params_)

Parameter optimization
Fitting 3 folds for each of 28 candidates, totalling 84 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


{'max_depth': 5, 'n_estimators': 40}


[Parallel(n_jobs=1)]: Done  84 out of  84 | elapsed:    4.5s finished


In [147]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

def create_model(optimizer='adam'):
    model = Sequential()
    model.add(Dense(30, input_dim=X.shape[1], kernel_initializer='normal', activation='sigmoid'))
    model.add(Dense(30, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))

    model.compile(loss='mean_squared_error', optimizer=optimizer)
    return model

model = KerasRegressor(build_fn=create_model, verbose=0)
# define the grid search parameters
optimizer = ['SGD','Adam']
batch_size = [10, 30, 50]
epochs = [10, 50, 100]
param_grid = dict(optimizer=optimizer, batch_size=batch_size, epochs=epochs)
reg_dl = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
reg_dl.fit(X, y)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=<keras.wrappers.scikit_learn.KerasRegressor object at 0x000001CDE1492CC8>,
             iid='warn', n_jobs=-1,
             param_grid={'batch_size': [10, 30, 50], 'epochs': [10, 50, 100],
                         'optimizer': ['SGD', 'Adam']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [148]:
# SVR
from sklearn.svm import SVR

reg_svr = GridSearchCV(SVR(kernel='rbf'), 
                       cv=5,
                       param_grid={"C": [300,400,500,600], "gamma": [0.01,0.02,0.03,0.04,0.05]}
                      )
reg_svr.fit(X, y)
print(reg_svr.best_params_)

{'C': 300, 'gamma': 0.03}


In [150]:
# RandomForest
reg_forest = GridSearchCV(
                    RandomForestRegressor(),
                    param_grid={'n_estimators': [60,70,80,90], 'min_samples_split':[4,5,6], 'max_depth': [8,9,10,15,20]}
)
reg_forest.fit(X, y)
print(reg_forest.best_params_)



{'max_depth': 15, 'min_samples_split': 5, 'n_estimators': 60}


In [151]:
# second feature matrix
X_train2 = pd.DataFrame( 
    {'XGB': reg_xgb.predict(X),
     'DL': reg_dl.predict(X).ravel(),
     'SVR': reg_svr.predict(X),
     'RDF': reg_forest.predict(X)
    }
)
X_train2.head()

Unnamed: 0,XGB,DL,SVR,RDF
0,23.035496,25.084976,23.26037,23.205022
1,17.488577,17.95639,17.324863,17.340254
2,17.611713,19.319284,16.96723,17.533286
3,23.067909,27.402966,22.565093,22.517691
4,17.564112,16.621805,17.277875,17.58775


In [152]:
# second-feature modeling using linear regression
from sklearn import linear_model

reg = linear_model.LinearRegression()
reg.fit(X_train2, y)

# prediction using the test set
X_test2 = pd.DataFrame( 
    {'XGB': reg_xgb.predict(test),
     'DL': reg_dl.predict(test).ravel(),
     'SVR': reg_svr.predict(test),
     'RDF': reg_forest.predict(test)
    }
)

y_pre = reg.predict(X_test2)

In [153]:
# 提出用データ作成
submit['value'] = y_pre
submit.to_csv('submit_20201121_xgboost_v6.csv',header=False, index=False)