In [63]:
import re

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix

In [39]:
train_df = pd.read_csv('../dataset/train.csv', index_col=0)
test_df = pd.read_csv('../dataset/test.csv', index_col=0)

In [40]:
train_df.head(5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [41]:
test_df.head(5)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [42]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [43]:
train_df['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [44]:
p_class_le = LabelEncoder()
p_class_le.fit(train_df['Pclass'])
train_df['Pclass'] = p_class_le.transform(train_df['Pclass'])

In [45]:
train_df['Name'] = [re.search('[A-Z][a-z]*\.', name).group(0)[:-1] for name in train_df['Name']]

In [46]:
train_df['Name'].value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Col           2
Major         2
Countess      1
Sir           1
Ms            1
Don           1
Jonkheer      1
Lady          1
Mme           1
Capt          1
Name: Name, dtype: int64

In [47]:
train_df['Name'] = train_df['Name'].replace(
    ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
    'Rare')
train_df['Name'] = train_df['Name'].replace('Mlle', 'Miss')
train_df['Name'] = train_df['Name'].replace('Ms', 'Miss')
train_df['Name'] = train_df['Name'].replace('Mme', 'Mrs')
train_df['Name'].value_counts()

Mr        517
Miss      185
Mrs       126
Master     40
Rare       23
Name: Name, dtype: int64

In [48]:
name_le = LabelEncoder()
name_le.fit(train_df['Name'])
train_df['Name'] = name_le.transform(train_df['Name'])

In [49]:
sex_le = LabelEncoder()
sex_le.fit(train_df['Sex'])
train_df['Sex'] = sex_le.transform(train_df['Sex'])

In [50]:
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())

In [51]:
train_df['Ticket'] = [str(ticket[0]) for ticket in train_df['Ticket']]
train_df['Ticket'] = train_df['Ticket'].replace('[3-9]', '3', regex=True)

ticket_le = LabelEncoder()
ticket_le.fit(train_df['Ticket'])
train_df['Ticket'] = ticket_le.transform(train_df['Ticket'])

In [52]:
train_df['Cabin'] = ['None' if pd.isna(cabin) else cabin[0] for cabin in train_df['Cabin']]

cabin_le = LabelEncoder()
cabin_le.fit(train_df['Cabin'])
train_df['Cabin'] = cabin_le.transform(train_df['Cabin'])

In [53]:
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])

embarked_le = LabelEncoder()
embarked_le.fit(train_df['Embarked'])
train_df['Embarked'] = embarked_le.transform(train_df['Embarked'])

In [54]:
train_df.info()
train_df.head(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    int64  
 3   Sex       891 non-null    int64  
 4   Age       891 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    int64  
 8   Fare      891 non-null    float64
 9   Cabin     891 non-null    int64  
 10  Embarked  891 non-null    int64  
dtypes: float64(2), int64(9)
memory usage: 83.5 KB


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,2,2,1,22.0,1,0,3,7.25,7,2
2,1,0,3,0,38.0,1,0,7,71.2833,2,0
3,1,2,1,0,26.0,0,0,8,7.925,7,2
4,1,0,3,0,35.0,1,0,0,53.1,2,2
5,0,2,2,1,35.0,0,0,2,8.05,7,2


In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_df.iloc[:, 1:],
                                                    train_df['Survived'],
                                                    test_size=0.2,
                                                    shuffle=True)

def param():
  ret = {
      'C':[1, 10, 100],
      'kernel':['linear'],
      'degree':np.arange(1, 6, 1),
      'gamma':np.linspace(0.01, 1.0, 50)
  }
  return ret

gscv = GridSearchCV(SVC(), param(), cv=4, verbose=3, n_jobs=36)
gscv.fit(X_train, y_train)

Fitting 4 folds for each of 750 candidates, totalling 3000 fits


[Parallel(n_jobs=36)]: Using backend LokyBackend with 36 concurrent workers.
[Parallel(n_jobs=36)]: Done  56 tasks      | elapsed:   21.4s
[Parallel(n_jobs=36)]: Done 216 tasks      | elapsed:  1.2min
[Parallel(n_jobs=36)]: Done 440 tasks      | elapsed:  2.4min
[Parallel(n_jobs=36)]: Done 728 tasks      | elapsed:  3.9min
[Parallel(n_jobs=36)]: Done 1080 tasks      | elapsed:  7.4min
[Parallel(n_jobs=36)]: Done 1496 tasks      | elapsed: 16.5min
[Parallel(n_jobs=36)]: Done 1976 tasks      | elapsed: 26.9min
[Parallel(n_jobs=36)]: Done 2520 tasks      | elapsed: 100.3min


In [None]:
best = gscv.best_estimator_
pred = best.predict(X_test)

print(confusion_matrix(y_test, pred))
print('正解率(train):{:.3f}'.format(best.score(X_train, y_train)))
print('正解率(test):{:.3f}'.format(best.score(X_test, y_test)))