### ----------------------------------------------------------------------------------------------------------
## 社会変革型 医療データサイエンティスト育成講座
# Chapter 5: モデル選択
### ----------------------------------------------------------------------------------------------------------

In [1]:
# データのロード
import pandas as pd
bace_data = pd.read_csv("~/DK-SRP/bace_data.csv")
bace_data.head()

Unnamed: 0.1,Unnamed: 0,MW,AlogP,HBA,HBD,RB,HeavyAtomCount,ChiralCenterCount,ChiralCenterCountAllPossible,RingCount,...,PEOE7 (PEOE7),PEOE8 (PEOE8),PEOE9 (PEOE9),PEOE10 (PEOE10),PEOE11 (PEOE11),PEOE12 (PEOE12),PEOE13 (PEOE13),PEOE14 (PEOE14),pIC50,class
0,0,431.56979,4.4014,3.0,2.0,5.0,32.0,2.0,2.0,4.0,...,78.640335,226.85541,107.43491,37.133846,0.0,7.98017,0.0,0.0,9.154901,1.0
1,1,657.81073,2.6412,5.0,4.0,16.0,47.0,6.0,6.0,4.0,...,47.1716,365.67694,174.07675,34.923889,7.98017,24.148668,0.0,24.663788,8.853872,1.0
2,2,591.74091,2.5499,4.0,3.0,11.0,42.0,2.0,3.0,5.0,...,47.941147,192.40652,255.75255,23.654478,0.230159,15.87979,0.0,24.663788,8.69897,1.0
3,3,591.67828,3.168,4.0,3.0,12.0,40.0,4.0,5.0,3.0,...,37.954151,194.35304,202.76335,36.498634,0.980913,8.188327,0.0,26.385181,8.69897,1.0
4,4,629.71283,3.5086,3.0,3.0,11.0,44.0,2.0,3.0,5.0,...,39.361153,179.71288,220.4613,23.654478,0.230159,15.87979,0.0,26.100143,8.69897,1.0


In [2]:
import numpy as np

# 今回は説明変数をすべて使用します！
X = np.array(bace_data.iloc[:,1:-2])
y = np.array(bace_data[['pIC50']])

In [3]:
# データ標準化
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_st = scaler.fit_transform(X)

In [4]:
# データを訓練用と評価用に分割します
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_st,y,test_size=1/4, 
                                                    random_state=1)
print(X_train.shape)
print(X_test.shape)

(1141, 589)
(381, 589)


In [5]:
# 正則化なしの線形回帰モデルによるfitting
from sklearn.linear_model import LinearRegression
linModel = LinearRegression()
linModel.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [6]:
# 評価用データのX_testからy_testを予測
y_pred = linModel.predict(X_test)

In [7]:
# 評価用データへの当てはまりの指標としてMSEを計算
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred)

4.1367535050431354e+17

In [8]:
# Lassoによるfitting
from sklearn.linear_model import Lasso
lassoModel = Lasso()
lassoModel.fit(X_train,y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [9]:
# Lassoの予測精度を計算
y_pred_lasso = lassoModel.predict(X_test)
mean_squared_error(y_test, y_pred_lasso)

1.7762794619018865

In [10]:
# Ridgeによるfitting
from sklearn.linear_model import Ridge
ridgeModel = Ridge()
ridgeModel.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [11]:
# Ridgeの予測精度を計算
y_pred_ridge = ridgeModel.predict(X_test)
mean_squared_error(y_test, y_pred_ridge)

0.7086735424092137

In [12]:
# Ridgeで最適ハイパーパラメータ探索
from sklearn.linear_model import RidgeCV
ridgeModel_cv = RidgeCV()
ridgeModel_cv.fit(X_train,y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=None, fit_intercept=True,
    gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

In [13]:
# RidgeCVの予測精度を計算
y_pred_ridgecv = ridgeModel_cv.predict(X_test)
mean_squared_error(y_test, y_pred_ridgecv)

0.6244735024682864