In [5]:
import pandas as pd
# Load the training CSV into a DataFrame and preview its first five rows.
df_train = pd.read_csv(filepath_or_buffer="train.csv")
df_train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Count the missing values in each column.
df_train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
# (rows, columns) of the raw training frame, before any rows are dropped.
df_train.shape

(891, 12)

In [8]:
# Discard every sample that has a missing value in Embarked or Age.
required_cols = ["Embarked", "Age"]
df = df_train.dropna(axis=0, subset=required_cols)

In [9]:
# One-hot encode the two categorical columns with pandas to obtain dummy variables.
sex_dum_train, emb_dum_train = (
    pd.get_dummies(df[col]) for col in ("Sex", "Embarked")
)

In [10]:
# Feature matrix: the numeric columns plus the one-hot dummy columns.
# Columns are selected by name rather than by position (previously 2, 5, 6, 7, 9)
# so the selection keeps working if the CSV column order ever changes.
numeric_features = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
X = pd.concat([df[numeric_features], sex_dum_train, emb_dum_train], axis=1)

In [11]:
# Target vector: the Survived label, selected by name instead of by position 1
# so it does not depend on the CSV column order.
y = df["Survived"]

In [12]:
# データの標準化
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X_std = scaler.transform(X)

In [13]:
# ホールドアウト法による分割
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.3, random_state=0)

In [14]:
# Train a support vector classifier with an RBF kernel on the training split.
from sklearn.svm import SVC
svc_params = dict(kernel='rbf', gamma=0.1, C=10)
rbf_svm = SVC(**svc_params).fit(X_train, y_train)
# fit() returns the estimator itself; end on it so the cell still displays its repr.
rbf_svm

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [15]:
# Accuracy of the fitted SVC on the held-out test split.
rbf_svm.score(X=X_test, y=y_test)

0.78037383177570097

In [25]:
# Hyperparameter values to explore in the grid search.
param_grid = dict(
    C=[0.1, 1.0, 10, 100, 1000, 10000],
    gamma=[0.001, 0.01, 0.1, 1, 10],
)

In [26]:
# Build a grid search over param_grid, using a default-configured SVC as the base estimator.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
base_svc = SVC()
gs_svc = GridSearchCV(estimator=base_svc, param_grid=param_grid)

In [27]:
# Run the grid search on the training split only.
gs_svc.fit(X=X_train, y=y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1.0, 10, 100, 1000, 10000], 'gamma': [0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [28]:
# Show the hyperparameter combination selected by the grid search.
# The search was fitted on X_train / y_train only, so this selection does not
# involve X_test (it is not "the best accuracy on the test split").
gs_svc.best_params_

{'C': 1.0, 'gamma': 0.1}

In [29]:
# Accuracy of the fitted grid search object on the held-out test split.
gs_svc.score(X=X_test, y=y_test)

0.76168224299065423

In [30]:
# Accuracy on the training split, for comparison with the test-split score.
gs_svc.score(X=X_train, y=y_train)

0.85140562248995988