# 타이타닉
타이타닉 생존자를 예측하는 모델을 만드시오.

Submission File Format:

You should submit a csv file with exactly 418 entries plus a header row. Your submission will show an error if you have extra columns (beyond PassengerId and Survived) or rows.

The file should have exactly 2 columns:

PassengerId (sorted in any order)
Survived (contains your binary predictions: 1 for survived, 0 for deceased)

In [138]:
import pandas as pd
import numpy as np

In [139]:
submission_format = pd.read_csv('gender_submission.csv')
submission_format

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [140]:
df_train = pd.read_csv('train.csv') # 0 ~ 891
df_test = pd.read_csv('test.csv') # 892 ~ 1309
y_train = df_train['Survived']

df_train.drop(['Survived'], axis=1, inplace=True)
df_temp = pd.concat([df_train, df_test], axis=0).reset_index(drop=True)

In [141]:
df_temp

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [145]:
# 컬럼 정보 확인
print(df_temp.columns)

# 1309 * 11
# PassengerId      0 승객 식별번호
# Pclass           0 룸 클래스
# Name             0 이름
# Sex              0 성별
# Age            177 연령
# SibSp            0 ??? 여/부
# Parch            0 ??? 여/부
# Ticket           0 목적지 티켓 번호
# Fare             0 요금
# Cabin          687 캐비닛 번호인가?!?
# Embarked         2 승선

# 이상치, 결측치 확인

# 나이에 결측치가 있다. - 평균값으로 대체해야 할듯
# Fare - 요금에 결측치가 있다. 요금은 상관관계가 없으므로 드랍 시킨다.
# 캐비닛 번호에 결측치가 있다. 드랍.
# 승선 정보에 결측건 2건은 삭제한다.

# 결측치 대체 및 제거
df_temp['Age'].fillna(df_temp['Age'].mean(), inplace=True)
df_temp['Cabin'].fillna('NA', inplace=True)
df_temp['Fare'].fillna(0, inplace=True)


# 결측치 체우기전 항목들의 분포를 확인
df_temp['Embarked'].fillna('C', inplace=True)
print(df_temp.isna().sum())

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


In [146]:
# 변수선택
df_temp.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)

# 카테고리 분포 확인
print(df_temp['Pclass'].value_counts())

3    709
1    323
2    277
Name: Pclass, dtype: int64


In [147]:
# Cabin
cabin = df_temp['Cabin'].value_counts() > 2
cabin = cabin[cabin.values]
cabin.index

# 상기 분류코드를 제외하고 모두 'NA' 처리
mask = ~df_temp['Cabin'].isin(cabin.index)
df_temp.loc[mask, 'Cabin'] = 'NA'
df_temp

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22.000000,1,0,7.2500,,S
1,1,female,38.000000,1,0,71.2833,,C
2,3,female,26.000000,0,0,7.9250,,S
3,1,female,35.000000,1,0,53.1000,,S
4,3,male,35.000000,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...
1304,3,male,29.881138,0,0,8.0500,,S
1305,1,female,39.000000,0,0,108.9000,,C
1306,3,male,38.500000,0,0,7.2500,,S
1307,3,male,29.881138,0,0,8.0500,,S


In [148]:
# 나머지 카테고리 값들도 확인
mask = df_temp['SibSp'] >= 5 # 5이상 정리
df_temp.loc[mask, 'SibSp'] = 0

mask = df_temp['Parch']>=3 # 3이상 정리
df_temp.loc[mask, 'Parch'] = 0

In [149]:
# 범주형 연속형 분리
df_temp.loc[:, 'Age'] = df_temp['Age'].round().astype('int')
df_temp

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22,1,0,7.2500,,S
1,1,female,38,1,0,71.2833,,C
2,3,female,26,0,0,7.9250,,S
3,1,female,35,1,0,53.1000,,S
4,3,male,35,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...
1304,3,male,30,0,0,8.0500,,S
1305,1,female,39,0,0,108.9000,,C
1306,3,male,38,0,0,7.2500,,S
1307,3,male,30,0,0,8.0500,,S


In [150]:
# cat, num 분리
X_cat = df_temp[['Pclass', 'Sex', 'SibSp', 'Parch', 'Cabin', 'Embarked']]
X_num = df_temp[['Age', 'Fare']]

# dummy 변수화
X_cat_dummy = pd.get_dummies(X_cat)

# Scaling
from sklearn.preprocessing import MinMaxScaler
X_num_scaled = MinMaxScaler().fit_transform(X_num)
X_num_scaled = pd.DataFrame(X_num_scaled, index=X_num.index, columns=X_num.columns)

X_temp = pd.concat([X_cat_dummy, X_num_scaled], axis=1)

# train, test 데이터 분리
X_train = X_temp.iloc[:891]
X_test = X_temp.iloc[891:]

In [151]:
# model 적용
from sklearn.linear_model import LogisticRegression
model = LogisticRegression().fit(X_train, y_train)

# cv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

kfold = KFold(n_splits=5, random_state=9999, shuffle=True)
score = cross_val_score(LogisticRegression(), X_train, y_train, cv=kfold)

print("CV Score ", score)
print("CV Score Mean", score.mean())

CV Score  [0.79888268 0.78089888 0.79775281 0.75280899 0.81460674]
CV Score Mean 0.7889900194589166


In [163]:
# KNN, 의사결정트리
from sklearn.neighbors import KNeighborsClassifier
model_knn = KNeighborsClassifier().fit(X_train, y_train)

score = cross_val_score(KNeighborsClassifier(n_neighbors=3), X_train, y_train, cv=kfold)

print("CV Score ", score)
print("CV Score Mean", score.mean())

CV Score  [0.80446927 0.7752809  0.76966292 0.76966292 0.83707865]
CV Score Mean 0.7912309334002887


In [195]:
# 의사결정트리 - 테스트 데이터에서는 별로 였음. 0.736..
from sklearn.tree import DecisionTreeClassifier
model_tree = DecisionTreeClassifier().fit(X_train, y_train)
score = cross_val_score(DecisionTreeClassifier(max_depth=7), X_train, y_train, cv=kfold)

print("CV Score ", score)
print("CV Score Mean", score.mean())

CV Score  [0.84916201 0.80337079 0.84269663 0.80337079 0.85955056]
CV Score Mean 0.8316301550436258


In [177]:
# 앙상블 적용해보기
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier

estimators = [('svc', SVC()), ('rf', RandomForestClassifier())]
model_stack = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression()).fit(X_train, y_train)

score = cross_val_score(StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
                        , X_train, y_train, cv=kfold)

print("CV Score ", score)
print("CV Score Mean", score.mean())

CV Score  [0.82122905 0.79775281 0.82022472 0.76966292 0.85393258]
CV Score Mean 0.812560416797439


In [197]:
# XGBoost 
from xgboost import XGBClassifier
model_xgb = XGBClassifier().fit(X_train, y_train)

score = cross_val_score(XGBClassifier(), X_train, y_train, cv=kfold)

print("CV Score ", score)
print("CV Score Mean", score.mean())







CV Score  [0.78212291 0.7752809  0.79775281 0.81460674 0.8258427 ]
CV Score Mean 0.7991212102190698


In [196]:
# Test 데이터셋 훈련
pred_test = model_stack.predict(X_test)
result = pd.DataFrame({'PassengerId':df_test['PassengerId'], 'Survived':pred_test})

result.to_csv('submission.csv', index=False)
result

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,1
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
