# 作業 : (Kaggle)鐵達尼生存預測
***
https://www.kaggle.com/c/titanic

# [作業目標]
- 試著模仿範例寫法, 在鐵達尼生存預測中, 觀察群聚編碼的效果

# [作業重點]
- 仿造範例, 完成自己挑選特徵的群聚編碼 (In[6], Out[6], In[7], Out[7])
- 觀察群聚編碼, 搭配邏輯斯迴歸, 看看有什麼影響 (In[9], Out[9], In[18], Out[18])

# 作業1
* 試著使用鐵達尼號的例子，創立兩種以上的群聚編碼特徵( mean、median、mode、max、min、count 均可 )

In [1]:
# 做完特徵工程前的所有準備 (與前範例相同)
import pandas as pd
import numpy as np
import copy
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Load the Titanic training set from the local data directory.
data_path = 'data/'
df = pd.read_csv(data_path + 'titanic_train.csv')

# Split the target column off into train_Y, then discard the row identifier
# so that df holds only candidate feature columns.
train_Y = df.pop('Survived')
df = df.drop(columns=['PassengerId'])
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Frequency of each distinct Pclass value.
df['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [3]:
# Frequency of each distinct Embarked value.
df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [4]:
# Replace any missing SibSp entries with 0, then tabulate the value frequencies.
sibsp_filled = df['SibSp'].fillna(0)
df['SibSp'] = sibsp_filled
sibsp_filled.value_counts()

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

In [5]:
# Replace any missing Parch entries with 0, then tabulate the value frequencies.
parch_filled = df['Parch'].fillna(0)
df['Parch'] = parch_filled
parch_filled.value_counts()

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

In [6]:
# Group-by (mean) encoding: average Fare within each
# (Pclass, Embarked, Sex) combination, flattened into a lookup table.
fare_by_group = df.groupby(['Pclass', 'Embarked', 'Sex'])['Fare']
mean_df = fare_by_group.mean().reset_index()
mean_df

Unnamed: 0,Pclass,Embarked,Sex,Fare
0,1,C,female,115.640309
1,1,C,male,93.536707
2,1,Q,female,90.0
3,1,Q,male,90.0
4,1,S,female,99.02691
5,1,S,male,52.949947
6,2,C,female,25.268457
7,2,C,male,25.42125
8,2,Q,female,12.35
9,2,Q,male,12.35


In [7]:
# Group-by (min) encoding: smallest Fare within each
# (Pclass, Embarked, Sex) combination, flattened into a lookup table.
fare_by_group = df.groupby(['Pclass', 'Embarked', 'Sex'])['Fare']
min_df = fare_by_group.min().reset_index()
min_df

Unnamed: 0,Pclass,Embarked,Sex,Fare
0,1,C,female,27.7208
1,1,C,male,26.55
2,1,Q,female,90.0
3,1,Q,male,90.0
4,1,S,female,25.9292
5,1,S,male,0.0
6,2,C,female,12.0
7,2,C,male,12.875
8,2,Q,female,12.35
9,2,Q,male,12.35


In [8]:
# Collect the columns whose dtype is int64 or float64 into num_features.
num_features = [
    feature
    for feature, dtype in zip(df.columns, df.dtypes)
    if dtype == 'int64' or dtype == 'float64'
]
print(f'{len(num_features)} Numeric Features : {num_features}\n')

# Keep only the numeric columns and mark missing values with -1.
num_df = df[num_features].fillna(-1)
# The scaler is only instantiated here; it is not fitted in this cell.
MMEncoder = MinMaxScaler()
num_df.head()

5 Numeric Features : ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']



Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3,22.0,1,0,7.25
1,1,38.0,1,0,71.2833
2,3,26.0,0,0,7.925
3,1,35.0,1,0,53.1
4,3,35.0,0,0,8.05


# 作業2
* 將上述的新特徵，合併原有的欄位做生存率預估，結果是否有改善?

原始特徵：0.7039

新特徵：0.7745

有改善; 但新特徵組除了群聚編碼 (Fare_mean、Fare_min) 之外, 還加入了 Sex 與 Embarked 並移除了原始 Fare, 因此這個提升不能完全歸因於群聚編碼。


In [9]:
# Baseline: min-max scale the numeric-only features and score a logistic
# regression with 5-fold cross-validation, reporting the mean fold score.
estimator = LogisticRegression()
train_X = MMEncoder.fit_transform(num_df)
baseline_scores = cross_val_score(estimator, train_X, train_Y, cv=5)
baseline_scores.mean()

  return self.partial_fit(X, y)


0.7038635542329971

In [10]:
# Build one lookup table holding both group statistics per
# (Pclass, Embarked, Sex) combination. No model is fitted in this cell.
# The suffixes name the two clashing 'Fare' columns directly as Fare_mean and
# Fare_min, so no follow-up rename is needed and the table keeps its default
# integer index instead of having its labels converted to strings.
temp = pd.merge(
    mean_df,
    min_df,
    how='left',
    on=['Pclass', 'Embarked', 'Sex'],
    suffixes=('_mean', '_min'),
)
temp

Unnamed: 0,Pclass,Embarked,Sex,Fare_mean,Fare_min
0,1,C,female,115.640309,27.7208
1,1,C,male,93.536707,26.55
2,1,Q,female,90.0,90.0
3,1,Q,male,90.0,90.0
4,1,S,female,99.02691,25.9292
5,1,S,male,52.949947,0.0
6,2,C,female,25.268457,12.0
7,2,C,male,25.42125,12.875
8,2,Q,female,12.35,12.35
9,2,Q,male,12.35,12.35


In [11]:
# Preview the first rows of df.
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [12]:
# Attach the per-group Fare statistics to each row of df by matching on the
# three grouping columns; the left join keeps every row of df.
new_df = df.merge(temp, how='left', on=['Pclass', 'Embarked', 'Sex'])
new_df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Fare_mean,Fare_min
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,13.307149,0.0
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,115.640309,27.7208
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,18.670077,7.25
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,99.02691,25.9292
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,13.307149,0.0


In [13]:
# Remove Name, Ticket, Cabin and the raw Fare column; the group statistics
# Fare_mean / Fare_min stay in the frame.
dropped_columns = ['Name', 'Ticket', 'Cabin', 'Fare']
new_df = new_df.drop(dropped_columns, axis=1)
new_df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked,Fare_mean,Fare_min
0,3,male,22.0,1,0,S,13.307149,0.0
1,1,female,38.0,1,0,C,115.640309,27.7208
2,3,female,26.0,0,0,S,18.670077,7.25
3,1,female,35.0,1,0,S,99.02691,25.9292
4,3,male,35.0,0,0,S,13.307149,0.0


In [14]:
# Give missing Embarked values an explicit 'None' category.
embarked_filled = new_df['Embarked'].fillna('None')
new_df['Embarked'] = embarked_filled

In [15]:
# Preview the first rows of new_df.
new_df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked,Fare_mean,Fare_min
0,3,male,22.0,1,0,S,13.307149,0.0
1,1,female,38.0,1,0,C,115.640309,27.7208
2,3,female,26.0,0,0,S,18.670077,7.25
3,1,female,35.0,1,0,S,99.02691,25.9292
4,3,male,35.0,0,0,S,13.307149,0.0


In [16]:
from sklearn.preprocessing import LabelEncoder

# Label-encode the two remaining text columns, fitting a fresh encoder for each.
for column in ('Sex', 'Embarked'):
    new_df[column] = LabelEncoder().fit_transform(new_df[column])
new_df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked,Fare_mean,Fare_min
0,3,1,22.0,1,0,3,13.307149,0.0
1,1,0,38.0,1,0,0,115.640309,27.7208
2,3,0,26.0,0,0,3,18.670077,7.25
3,1,0,35.0,1,0,3,99.02691,25.9292
4,3,1,35.0,0,0,3,13.307149,0.0


In [18]:
# Group-encoded feature set: fill the remaining gaps with -1, min-max scale,
# and score the same estimator with 5-fold cross-validation.
# NOTE(review): compared with the numeric-only baseline, this frame also adds
# Sex and Embarked and drops the raw Fare, so the score difference cannot be
# attributed to the group statistics alone.
new_df = new_df.fillna(-1)
train_X = MMEncoder.fit_transform(new_df)
new_scores = cross_val_score(estimator, train_X, train_Y, cv=5)
new_scores.mean()

  return self.partial_fit(X, y)


0.7744561911945832