# 作業 : (Kaggle)鐵達尼生存預測
https://www.kaggle.com/c/titanic

# 作業1
* 參考範例，將鐵達尼的艙位代碼( 'Cabin' )欄位使用特徵雜湊 / 標籤編碼 / 目標均值編碼三種轉換後，  
與其他數值型欄位一起預估生存機率

In [1]:
import pandas as pd
import numpy as np
import copy, time
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

In [2]:
# 做完特徵工程前的所有準備 (與前範例相同)
data_path = 'data_part02/'
df_train = pd.read_csv(data_path + 'titanic_train.csv')
df_test = pd.read_csv(data_path + 'titanic_test.csv')

train_Y = df_train['Survived']
train_num = train_Y.shape[0]

ids = df_test['PassengerId']
df_train = df_train.drop(['PassengerId', 'Survived'] , axis=1)
df_test = df_test.drop(['PassengerId'] , axis=1)

df = pd.concat([df_train,df_test])
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 10 columns):
Pclass      1309 non-null int64
Name        1309 non-null object
Sex         1309 non-null object
Age         1046 non-null float64
SibSp       1309 non-null int64
Parch       1309 non-null int64
Ticket      1309 non-null object
Fare        1308 non-null float64
Cabin       295 non-null object
Embarked    1307 non-null object
dtypes: float64(2), int64(3), object(5)
memory usage: 112.5+ KB


In [4]:
len(train_Y)

891

In [5]:
#只取類別值 (object) 型欄位, 存於 object_features 中
object_features = []
num_features = []
for dtype, feature in zip(df.dtypes, df.columns):
    if dtype == 'object':
        object_features.append(feature)
    elif dtype == 'float64' or dtype == 'int64':
        num_features.append(feature)
print(f'{len(object_features)} Categorical Features : {object_features}\n')
print(f'{len(num_features)} Numberical Features : {num_features}\n')

print("========================================================")
# 類別型欄位
df_obj = df[object_features]
df_obj = df_obj.fillna('None')
print(df_obj.head())

print("========================================================")
# 數值型欄位
df_num = df[num_features]
df_num = df_num.fillna(-1)
print(df_num.head())

5 Categorical Features : ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

5 Numberical Features : ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

                                                Name     Sex  \
0                            Braund, Mr. Owen Harris    male   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   
2                             Heikkinen, Miss. Laina  female   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   
4                           Allen, Mr. William Henry    male   

             Ticket Cabin Embarked  
0         A/5 21171  None        S  
1          PC 17599   C85        C  
2  STON/O2. 3101282  None        S  
3            113803  C123        S  
4            373450  None        S  
   Pclass   Age  SibSp  Parch     Fare
0       3  22.0      1      0   7.2500
1       1  38.0      1      0  71.2833
2       3  26.0      0      0   7.9250
3       1  35.0      1      0  53.1000
4       3  35.0      0      0   8.0500


In [6]:
df_new = pd.concat([df_obj, df_num], axis = 1)
df_new.head()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Pclass,Age,SibSp,Parch,Fare
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S,3,22.0,1,0,7.25
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C,1,38.0,1,0,71.2833
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S,3,26.0,0,0,7.925
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S,1,35.0,1,0,53.1
4,"Allen, Mr. William Henry",male,373450,,S,3,35.0,0,0,8.05


In [7]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 10 columns):
Name        1309 non-null object
Sex         1309 non-null object
Ticket      1309 non-null object
Cabin       1309 non-null object
Embarked    1309 non-null object
Pclass      1309 non-null int64
Age         1309 non-null float64
SibSp       1309 non-null int64
Parch       1309 non-null int64
Fare        1309 non-null float64
dtypes: float64(2), int64(3), object(5)
memory usage: 112.5+ KB


In [8]:
df_new[:train_num].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 10 columns):
Name        891 non-null object
Sex         891 non-null object
Ticket      891 non-null object
Cabin       891 non-null object
Embarked    891 non-null object
Pclass      891 non-null int64
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
dtypes: float64(2), int64(3), object(5)
memory usage: 76.6+ KB


In [9]:
# Mean Encoding
data = pd.concat([df_new[:train_num], train_Y], axis = 1)
cabin_cnt = data.groupby(['Cabin'])['Survived'].mean().reset_index()
cabin_cnt.columns = ['Cabin', 'Cabin_mean']

df_new = pd.merge(df_new, cabin_cnt, on = 'Cabin', how = 'left')
df_new.head()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Pclass,Age,SibSp,Parch,Fare,Cabin_mean
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S,3,22.0,1,0,7.25,0.299854
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C,1,38.0,1,0,71.2833,1.0
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S,3,26.0,0,0,7.925,0.299854
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S,1,35.0,1,0,53.1,0.5
4,"Allen, Mr. William Henry",male,373450,,S,3,35.0,0,0,8.05,0.299854


In [10]:
# Feature hash
df_new['Cabin_hash'] = df_new['Cabin'].map(lambda x: hash(x) % 15)
df_new.head()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Pclass,Age,SibSp,Parch,Fare,Cabin_mean,Cabin_hash
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S,3,22.0,1,0,7.25,0.299854,14
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C,1,38.0,1,0,71.2833,1.0,7
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S,3,26.0,0,0,7.925,0.299854,14
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S,1,35.0,1,0,53.1,0.5,6
4,"Allen, Mr. William Henry",male,373450,,S,3,35.0,0,0,8.05,0.299854,14


In [11]:
# labelencoding
df_new['Cabin_lbe'] = LabelEncoder().fit_transform(df_new['Cabin'])
    
df_new.head()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Pclass,Age,SibSp,Parch,Fare,Cabin_mean,Cabin_hash,Cabin_lbe
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S,3,22.0,1,0,7.25,0.299854,14,185
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C,1,38.0,1,0,71.2833,1.0,7,106
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S,3,26.0,0,0,7.925,0.299854,14,185
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S,1,35.0,1,0,53.1,0.5,6,70
4,"Allen, Mr. William Henry",male,373450,,S,3,35.0,0,0,8.05,0.299854,14,185


In [12]:
df_new.columns

Index(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Pclass', 'Age', 'SibSp',
       'Parch', 'Fare', 'Cabin_mean', 'Cabin_hash', 'Cabin_lbe'],
      dtype='object')

In [13]:
num_features

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [14]:
features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin_mean', 'Cabin_hash', 'Cabin_lbe']
df_new1 = df_new[features]

train_X = df_new1[:train_num]
clf = LogisticRegression()
start = time.time()

print('shape: {}'.format(train_X.shape))
print('score: {}'.format(cross_val_score(clf, train_X, train_Y, cv = 5).mean()))
print('time:  {} sec'.format(round((time.time() - start), 4)))

shape: (891, 8)
score: 0.757709039725341
time:  0.0311 sec


# 作業2
* 承上題，三者比較效果何者最好?
> ANS: 若個別看, 以 Cabin 目標均值 的最好.

In [15]:
# 對照組 : 'Cabin'標籤編碼 + 邏輯斯迴歸
features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin_lbe']
df_new1 = df_new[features]

train_X = df_new1[:train_num]
clf = LogisticRegression()
start = time.time()

print('shape: {}'.format(train_X.shape))
print('score: {}'.format(cross_val_score(clf, train_X, train_Y, cv = 5).mean()))
print('time:  {} sec'.format(round((time.time() - start), 4)))

shape: (891, 6)
score: 0.6949063173920583
time:  0.0251 sec


In [16]:
# 'Cabin'目標均值 + 邏輯斯迴歸
features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin_mean']
df_new1 = df_new[features]

train_X = df_new1[:train_num]
clf = LogisticRegression()
start = time.time()

print('shape: {}'.format(train_X.shape))
print('score: {}'.format(cross_val_score(clf, train_X, train_Y, cv = 5).mean()))
print('time:  {} sec'.format(round((time.time() - start), 4)))

shape: (891, 6)
score: 0.7509043413562797
time:  0.0273 sec


In [17]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin_hash']
df_new1 = df_new[features]

train_X = df_new1[:train_num]
clf = LogisticRegression()
start = time.time()

print('shape: {}'.format(train_X.shape))
print('score: {}'.format(cross_val_score(clf, train_X, train_Y, cv = 5).mean()))
print('time:  {} sec'.format(round((time.time() - start), 4)))

shape: (891, 6)
score: 0.692633876246681
time:  0.022 sec


In [18]:
# 'Cabin'特徵雜湊 ＋ 標籤編碼 ＋ 目標均值 + 邏輯斯迴歸
features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin_mean', 'Cabin_hash', 'Cabin_lbe']
df_new1 = df_new[features]

train_X = df_new1[:train_num]
clf = LogisticRegression()
start = time.time()

print('shape: {}'.format(train_X.shape))
print('score: {}'.format(cross_val_score(clf, train_X, train_Y, cv = 5).mean()))
print('time:  {} sec'.format(round((time.time() - start), 4)))

shape: (891, 8)
score: 0.757709039725341
time:  0.0274 sec
