### SVM　パラメータチューニング

In [1]:
import numpy as np
import pandas as pd

from sklearn import svm
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV


In [2]:
# pickle ファイルから読み込み
train_pkl = pd.read_pickle('./pd_train.pk2')

In [3]:
train_pkl.shape

(891, 15)

In [4]:
# Age, Fare は削除
# ダミー変数の先頭は削除
train_pkl.drop(['Age', 'Fare', 'Sex_female', 'Embarked_C', 'Pclass_1'], inplace=True, axis=1)

In [5]:
# 学習が終わらないので、特徴量を上位５つにする
train_pkl.drop(['Parch', 'Embarked_S', 'Embarked_Q'], inplace=True, axis=1)

In [6]:
train_pkl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   SibSp     891 non-null    int64  
 2   Age_bin   891 non-null    float64
 3   Fare_bin  891 non-null    float64
 4   Sex_male  891 non-null    uint8  
 5   Pclass_2  891 non-null    uint8  
 6   Pclass_3  891 non-null    uint8  
dtypes: float64(2), int64(2), uint8(3)
memory usage: 30.6 KB


In [7]:
train_pkl.describe()[['Age_bin', 'Fare_bin']]

Unnamed: 0,Age_bin,Fare_bin
count,891.0,891.0
mean,2.420875,0.904602
std,1.356289,1.980492
min,0.0,0.0
25%,2.0,0.0
50%,2.0,0.0
75%,3.0,1.0
max,8.0,20.0


In [8]:
train_pkl.Age_bin.max(), train_pkl.Age_bin.min()

(8.0, 0.0)

In [9]:
train_pkl.Fare_bin.max(), train_pkl.Fare_bin.min()

(20.0, 0.0)

In [10]:
train_pkl['Age_bin'] = train_pkl['Age_bin'].astype('float16')

In [11]:
train_pkl['Fare_bin'] = train_pkl['Fare_bin'].astype('float16')

In [12]:
train_pkl['SibSp'] = train_pkl['SibSp'].astype('int8')

In [13]:
train_pkl['Sex_male'] = train_pkl['Sex_male'].astype('bool')

In [14]:
train_pkl['Pclass_2'] = train_pkl['Pclass_2'].astype('bool')

In [15]:
train_pkl['Pclass_3'] = train_pkl['Pclass_3'].astype('bool')

In [16]:
train_pkl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   SibSp     891 non-null    int8   
 2   Age_bin   891 non-null    float16
 3   Fare_bin  891 non-null    float16
 4   Sex_male  891 non-null    bool   
 5   Pclass_2  891 non-null    bool   
 6   Pclass_3  891 non-null    bool   
dtypes: bool(3), float16(2), int64(1), int8(1)
memory usage: 14.0 KB


In [17]:
# 訓練とテストデータに分割
train, test = train_test_split(train_pkl, test_size=0.2, random_state=42)

In [18]:
# ターゲットと特徴量の分割
train_X = train.iloc[:, 1:].values
train_y = train.Survived.values

### 検証するパラメータの設定

In [19]:
grid_param = [
    {'C': [0.001, 0.01, 0.1, 1, 5, 10, 50, 100, 1000],
    'kernel': ['linear', 'rbf'],
    'gamma': [.0001, .001, .01, .1, .5, 1, 10, 30],
#     'shrinking': [True, False],
    'random_state': [42]},
    {'C': [0.001, 0.01, 0.1, 1, 10],
    'gamma': [.0001, .001, .01,.1,.2,.5,1,10],
    'random_state': [42],
    'kernel':['poly']}
]

### グリッドサーチ

In [21]:
gs = GridSearchCV(estimator=svm.SVC(grid_param), param_grid=grid_param, scoring='explained_variance', cv=3, return_train_score=False)

In [22]:
gs.fit(train_X, train_y)

GridSearchCV(cv=3, error_score=nan,
             estimator=SVC(C=[{'C': [0.001, 0.01, 0.1, 1, 5, 10, 50, 100, 1000],
                               'gamma': [0.0001, 0.001, 0.01, 0.1, 0.5, 1, 10,
                                         30],
                               'kernel': ['linear', 'rbf'],
                               'random_state': [42]},
                              {'C': [0.001, 0.01, 0.1, 1, 10],
                               'gamma': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 1,
                                         10],
                               'kernel': ['poly'], 'random_state': [42]}],
                           break_ties=False, cache_size=200, class_weight=None,
                           coef0...
             param_grid=[{'C': [0.001, 0.01, 0.1, 1, 5, 10, 50, 100, 1000],
                          'gamma': [0.0001, 0.001, 0.01, 0.1, 0.5, 1, 10, 30],
                          'kernel': ['linear', 'rbf'], 'random_state': [42]},
                         {'C': 

### 一番良かったパラメータで学習

In [23]:
gs.best_score_

0.24141219871556951

In [24]:
gs.best_params_

{'C': 0.1, 'gamma': 0.5, 'kernel': 'poly', 'random_state': 42}

In [25]:
# ターゲットと特徴量の分割
train_X = train.iloc[:, 1:].values
train_y = train.Survived.values

In [26]:
SVC = svm.SVC(**gs.best_params_)

In [27]:
SVC = SVC.fit(train_X, train_y)

### テストデータで予測

In [28]:
# ターゲットと特徴量の分割
test_x = test.iloc[:, 1:].values
test_y = test.Survived.values

In [29]:
test_x.shape, test_y.shape

((179, 6), (179,))

In [30]:
pred_y = SVC.predict(test_x)

In [31]:
confusion_matrix(test_y, pred_y)

array([[92, 13],
       [20, 54]], dtype=int64)

In [32]:
accuracy_score(test_y, pred_y)

0.8156424581005587

### 検証データで予測

In [33]:
# 検証データ読み込み
valid = pd.read_pickle('./pd_test.pk2')

In [34]:
valid.shape

(418, 15)

In [35]:
# ID の保存
valid_pass = valid.PassengerId.values

In [36]:
valid_X = valid.iloc[:, 1:]

In [37]:
valid_X.drop(['Age', 'Fare', 'Sex_female', 'Embarked_C', 'Pclass_1'], inplace=True, axis=1)

In [38]:
# 学習が終わらないので、特徴量を上位５つにする
valid_X.drop(['Parch', 'Embarked_S', 'Embarked_Q'], inplace=True, axis=1)

In [39]:
valid_X['Age_bin'] = valid_X['Age_bin'].astype('float16')

In [40]:
valid_X['Fare_bin'] = valid_X['Fare_bin'].astype('float16')

In [41]:
valid_X['SibSp'] = valid_X['SibSp'].astype('int8')

In [42]:
valid_X['Sex_male'] = valid_X['Sex_male'].astype('bool')

In [43]:
valid_X['Pclass_2'] = valid_X['Pclass_2'].astype('bool')

In [44]:
valid_X['Pclass_3'] = valid_X['Pclass_3'].astype('bool')

In [45]:
valid_X.shape, train_X.shape

((418, 6), (712, 6))

In [46]:
pred_valid_y = SVC.predict(valid_X)

In [47]:
pred_valid_y.shape

(418,)

In [48]:
type(valid_pass), type(pred_valid_y)

(numpy.ndarray, numpy.ndarray)

In [49]:
result_df = pd.DataFrame(pred_valid_y, valid_pass, columns=['Survived'])

In [50]:
result_df.to_csv("./SVC_2.csv", index_label='PassengerId')