# 作業 : (Kaggle)鐵達尼生存預測

In [2]:
# 載入套件與資料
import pandas as pd
import numpy as np

# Read the train and test CSV files (in that order) from the shared data directory.
data_path = '../data/titanic/'
df_train, df_test = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('titanic_train.csv', 'titanic_test.csv')
)
# Cell output: (rows, columns) of the training frame.
df_train.shape

(891, 12)

In [3]:
# Reshape the data into the form used for training / prediction:
# keep the label and the test ids aside, drop them from the feature frames,
# then stack train and test features into a single frame.
ids = df_test['PassengerId']
train_Y = df_train['Survived']
df_train = df_train.drop(columns=['PassengerId', 'Survived'])
df_test = df_test.drop(columns=['PassengerId'])
df = pd.concat([df_train, df_test])
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Show how many columns there are of each dtype.
# "Count" initially holds the column names; counting them per dtype
# turns that column into the tally shown in the result.
dtype_df = pd.DataFrame({"Count": df.columns, "Column Type": df.dtypes.values})
dtype_df = dtype_df.groupby("Column Type").count().reset_index()
dtype_df


Unnamed: 0,Column Type,Count
0,int64,3
1,float64,2
2,object,5


In [5]:
# Having confirmed that only int64, float64 and object dtypes occur,
# collect the column names of each kind into its own list.
# Anything that is neither int64 nor float64 lands in object_features.
int_features = [name for name, kind in zip(df.columns, df.dtypes) if kind == 'int64']
float_features = [name for name, kind in zip(df.columns, df.dtypes) if kind == 'float64']
object_features = [
    name
    for name, kind in zip(df.columns, df.dtypes)
    if kind != 'float64' and kind != 'int64'
]
print(f'{len(int_features)} Integer Features : {int_features}\n')
print(f'{len(float_features)} Float Features : {float_features}\n')
print(f'{len(object_features)} Object Features : {object_features}')

3 Integer Features : ['Pclass', 'SibSp', 'Parch']

2 Float Features : ['Age', 'Fare']

5 Object Features : ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


# 作業1 
* 試著執行作業程式，觀察三種類型 (int / float / object) 的欄位分別進行( 平均 mean / 最大值 Max / 相異值 nunique )  
中的九次操作會有哪些問題? 並試著解釋那些發生 Error 的程式區塊的原因?  
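> 對 object 欄位求取 平均 mean 時，五個欄位皆會出錯，因為字串無法做數值平均；求取 最大值 max 時，Name / Sex / Ticket 可依字串順序比較而成功，但 Cabin / Embarked 含有缺失值 (NaN，型別為 float)，字串與 float 無法互相比較而出現錯誤；相異值 nunique 則在三種類型的欄位上皆可正常執行。int / float 欄位的三種操作都沒有問題。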

# 作業2
* 思考一下，試著舉出今天五種類型以外的一種或多種資料類型，你舉出的新類型是否可以歸在三大類中的某些大類?  
所以三大類特徵中，哪一大類處理起來應該最複雜?
> XML文件內容資料，本身可能含有更細緻之內容欄位區分。初步應會被轉換為 object 欄位處理，但因內容為複合資訊，不能歸屬於三大類(數值/類別/時間)之任一種，應再做細部區分<br>
> 數值及時間兩類類型特徵各有不同層面之處理考量

> 五類型: 數值型特徵 / 類別型特徵 / 二元特徵 / 排序型特徵 / 時間型特徵


In [15]:
# Example: column-wise mean of the integer features.
df.loc[:, int_features].mean()

Pclass    2.294882
SibSp     0.498854
Parch     0.385027
dtype: float64

In [7]:
# Column-wise maximum of the integer features.
df.loc[:, int_features].max()

Pclass    3
SibSp     8
Parch     9
dtype: int64

In [8]:
# Number of distinct values in each integer feature.
df.loc[:, int_features].nunique()

Pclass    3
SibSp     7
Parch     8
dtype: int64

In [9]:
# Column-wise mean of the float features.
df.loc[:, float_features].mean()

Age     29.881138
Fare    33.295479
dtype: float64

In [10]:
# Column-wise maximum of the float features.
df.loc[:, float_features].max()

Age      80.0000
Fare    512.3292
dtype: float64

In [11]:
# Number of distinct values in each float feature.
df.loc[:, float_features].nunique()

Age      98
Fare    281
dtype: int64

In [12]:
# Column-wise mean over the object-typed features.
# In the recorded run this returned an empty Series instead of raising.
# NOTE(review): the outcome presumably depends on the installed pandas version - confirm.
df.loc[:, object_features].mean()

Series([], dtype: float64)

In [55]:
# Try mean() on each object column on its own, so that one failing column
# does not hide the outcome of the others.
for col in object_features:
    try:
        print(f"{col}: {df[col].mean()}")
    except Exception as err:
        # Deliberately broad: the point of this cell is to observe whatever
        # goes wrong. Report the exception type and message instead of
        # discarding them; the message is shortened because it can be very long.
        reason = str(err)
        if len(reason) > 80:
            reason = reason[:77] + "..."
        print(f"Error: df['{col}'].mean() -> {type(err).__name__}: {reason}")
    

Error: df['Name'].mean()
Error: df['Sex'].mean()
Error: df['Ticket'].mean()
Error: df['Cabin'].mean()
Error: df['Embarked'].mean()


In [13]:
# Column-wise maximum over the object-typed features.
# In the recorded run only Name, Sex and Ticket appear in the result;
# Cabin and Embarked are absent from it.
df.loc[:, object_features].max()

Name      van Melkebeke, Mr. Philemon
Sex                              male
Ticket                      WE/P 5735
dtype: object

In [54]:
# Try max() on each object column on its own, so that one failing column
# does not hide the outcome of the others.
for col in object_features:
    try:
        print(f"{col}: {df[col].max()}")
    except Exception as err:
        # Deliberately broad: the point of this cell is to observe whatever
        # goes wrong. Report the exception type and message instead of
        # discarding them; the message is shortened to keep the output readable.
        reason = str(err)
        if len(reason) > 80:
            reason = reason[:77] + "..."
        print(f"Error: df['{col}'].max() -> {type(err).__name__}: {reason}")
    

Name: van Melkebeke, Mr. Philemon
Sex: male
Ticket: WE/P 5735
Error: df['Cabin'].max()
Error: df['Embarked'].max()


In [14]:
# Number of distinct values in each object-typed feature.
df.loc[:, object_features].nunique()

Name        1307
Sex            2
Ticket       929
Cabin        186
Embarked       3
dtype: int64

In [56]:
# Try nunique() on each object column on its own, mirroring the per-column
# mean() / max() checks.
for col in object_features:
    try:
        print(f"{col}: {df[col].nunique()}")
    except Exception as err:
        # Deliberately broad: report the exception type and message instead of
        # discarding them; the message is shortened to keep the output readable.
        reason = str(err)
        if len(reason) > 80:
            reason = reason[:77] + "..."
        print(f"Error: df['{col}'].nunique() -> {type(err).__name__}: {reason}")
    

Name: 1307
Sex: 2
Ticket: 929
Cabin: 186
Embarked: 3


In [57]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include='all')

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,1309.0,1309,1309,1046.0,1309.0,1309.0,1309,1308.0,295,1307
unique,,1307,2,,,,929,,186,3
top,,"Connolly, Miss. Kate",male,,,,CA. 2343,,C23 C25 C27,S
freq,,2,843,,,,11,,6,914
mean,2.294882,,,29.881138,0.498854,0.385027,,33.295479,,
std,0.837836,,,14.413493,1.041658,0.86556,,51.758668,,
min,1.0,,,0.17,0.0,0.0,,0.0,,
25%,2.0,,,21.0,0.0,0.0,,7.8958,,
50%,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,3.0,,,39.0,1.0,0.0,,31.275,,


資料集統計資訊 `describe(include='all')` 中，出現 NaN 的格子即表明該統計量不適用於該欄位的類型 (例如 object 欄位沒有 mean / std / min，數值欄位沒有 unique / top / freq)。