In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
titanic_df = pd.read_csv('./데이터/train.csv')
test_df = pd.read_csv('./데이터/test.csv')
sub_df = pd.read_csv('./데이터/gender_submission.csv')

In [4]:
# train, test set에 공통된 티켓 정보가 있어서 동증자 수를 측정하기 위해 concat 하고 
# 동승자 수 변수 생성 후 다시 train, test 분리

df = pd.concat([titanic_df, test_df], axis = 0)
dic = dict(zip(df['Ticket'].value_counts().index, df['Ticket'].value_counts().values))
df['Passengers'] = df['Ticket'].replace(dic)

In [5]:
titanic_df = df[:891]
test_df = df[891:]

In [6]:
# 전처리 함수
def preprocess(df):
    # title 파생변수 생성
    df['Title']=df['Name'].str.split(', ').str[1].str.split('.').str[0]
    
    # 등급별 성별 나이의 중앙값으로 결측치 대체
    age_median1 = df.groupby(['Pclass', 'Sex'])['Age'].median()
    df['Age'] = df.apply(lambda row: age_median1[row['Pclass'], row['Sex']] if pd.isnull(row['Age']) else row['Age'], axis=1)
    
    # 가족 파생변수 생성
    df['Family'] = df['SibSp'] + df['Parch']
    
    # 탑승 항구 결측치 S로 대체 (이름을 검색 -> S)
    df['Embarked'] = df['Embarked'].fillna('S')
    
    # 500이상을 이상치 -> 500을 제외한 가장 큰 값으로 대체
    df['Fare'] = np.where(df['Fare']>500, np.nan, df['Fare'])
    df['Fare'] = df['Fare'].fillna(df['Fare'].max())
    
    return df

In [7]:
# embarked, title 라벨인코딩 함수
def change_object(df):
    from sklearn.preprocessing import LabelEncoder
    ordinal_features = ['Embarked','Title']

    for feature in ordinal_features:
        le = LabelEncoder()
        # 라벨인코딩
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

In [8]:
# 불필요한 열 삭제 함수
def remove_col(df):
    df.drop('PassengerId', axis=1, inplace = True)
    df.drop('Name', axis=1, inplace = True)
    df.drop('Ticket', axis=1, inplace = True)
    df.drop('Cabin', axis=1, inplace = True)

In [9]:
titanic_df = preprocess(titanic_df)
test_df = preprocess(test_df)

In [11]:
remove_col(titanic_df)
remove_col(test_df)

In [12]:
titanic_df = change_object(titanic_df)
test_df = change_object(test_df)

In [13]:
# 원 핫 인코딩(판다스 내장함수) - 성별
titanic_df = pd.get_dummies(titanic_df)
test_df = pd.get_dummies(test_df)

In [14]:
titanic_df['Sex_female'] = np.where(titanic_df['Sex_female']==False, 0, 1)
titanic_df['Sex_male'] = np.where(titanic_df['Sex_male']==False, 0, 1)

In [15]:
test_df['Sex_female'] = np.where(test_df['Sex_female']==False, 0, 1)
test_df['Sex_male'] = np.where(test_df['Sex_male']==False, 0, 1)

In [16]:
test_df.drop('Survived', axis=1, inplace = True)

In [187]:
titanic_df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,Passengers,Title,Family,Sex_female,Sex_male
0,0.0,3,22.0,1,0,7.2500,2,1,11,1,0,1
1,1.0,1,38.0,1,0,71.2833,0,1,12,1,1,0
2,1.0,3,26.0,0,0,7.9250,2,1,8,0,1,0
3,1.0,1,35.0,1,0,53.1000,2,2,12,1,1,0
4,0.0,3,35.0,0,0,8.0500,2,1,11,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,2,27.0,0,0,13.0000,2,1,14,0,0,1
887,1.0,1,19.0,0,0,30.0000,2,1,8,0,1,0
888,0.0,3,21.5,1,2,23.4500,2,2,8,3,1,0
889,1.0,1,26.0,0,0,30.0000,0,1,11,0,0,1


# 모델링

## random forest - 전체 변수

In [17]:
X = titanic_df.drop(['Survived'],axis=1)
y = titanic_df['Survived']

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

params = { 'n_estimators' : [10, 20, 30],
           'max_depth' : [3, 6, 8],
           'min_samples_leaf' : [8, 12, 16],
          'min_samples_split' : [8, 16, 20]
            }

# RandomForestClassifier 객체 생성 후 GridSearchCV 수행
rf_clf = RandomForestClassifier(random_state = 42, n_jobs = -1)

# 교차검증 10번
grid_cv = GridSearchCV(rf_clf, param_grid = params, cv = 10, n_jobs = -1, return_train_score=True, verbose = 0)
grid_cv.fit(X_train, y_train)

In [20]:
grid_cv.best_params_

{'max_depth': 6,
 'min_samples_leaf': 8,
 'min_samples_split': 20,
 'n_estimators': 30}

In [21]:
# train, val 정확도로 overfitting 확인
pred_rf_train = grid_cv.predict(X_train)
pred_rf_val = grid_cv.predict(X_val)

accuracy_score(y_train, pred_rf_train), accuracy_score(y_val, pred_rf_val)

(0.8497191011235955, 0.8212290502793296)

## random forest - 5개 변수 제외
- 변수 중요도를 봤을 때, sibsp, parch, family, embarked 가 0.05보다 작아서 중요도가 작다고 생각함  
- 그리고, 추가적으로 목표에 맞게 시대적 배경이 존재하는 호칭을 제외함

In [31]:
X = titanic_df.drop(['Survived','SibSp','Parch','Family','Embarked','Title'],axis=1)
test_df1 = test_df.drop(['SibSp','Parch','Family','Embarked','Title'],axis=1)
y = titanic_df['Survived']

In [32]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

params = { 'n_estimators' : [10,20, 30],
          'max_depth' : [3],
          'min_samples_leaf' : [4, 8, 12, 18],
          'min_samples_split' : [10, 14, 20],
            }


# RandomForestClassifier 객체 생성 후 GridSearchCV 수행
rf_clf = RandomForestClassifier(random_state = 42, n_jobs = -1, criterion='entropy')
grid_rf_cv = GridSearchCV(rf_clf, param_grid = params, cv = 10, n_jobs = -1, return_train_score=True, verbose = 0)
grid_rf_cv.fit(X_train, y_train)

In [34]:
grid_rf_cv.best_params_

{'max_depth': 3,
 'min_samples_leaf': 4,
 'min_samples_split': 14,
 'n_estimators': 10}

In [35]:
# 검증 최고 score
grid_rf_cv.best_score_

0.8230242566510171

In [37]:
pred_rf_train = grid_rf_cv.predict(X_train)
pred_rf_val = grid_rf_cv.predict(X_val)

accuracy_score(y_train, pred_rf_train), accuracy_score(y_val, pred_rf_val)

(0.8216292134831461, 0.7988826815642458)

## Logit - 전체

In [40]:
X = titanic_df.drop(['Survived'],axis=1)
y = titanic_df['Survived']

In [41]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [42]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=42, solver='lbfgs')
lr.fit(X_train, y_train)

pred_lr_train = lr.predict(X_train)
pred_lr_val = lr.predict(X_val)

accuracy_score(y_train, pred_lr_train), accuracy_score(y_val, pred_lr_val)

(0.8103932584269663, 0.8100558659217877)

## Logit - 5개 변수 제외

In [44]:
X = titanic_df.drop(['Survived','SibSp','Parch','Family','Embarked','Title'],axis=1)
test_df1 = test_df.drop(['SibSp','Parch','Family','Embarked','Title'],axis=1)
y = titanic_df['Survived']

In [45]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [46]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)

pred_lr_train = lr.predict(X_train)
pred_lr_val = lr.predict(X_val)

accuracy_score(y_train, pred_lr_train), accuracy_score(y_val, pred_lr_val)

(0.800561797752809, 0.8156424581005587)

## SVC - 전체

In [48]:
X = titanic_df.drop(['Survived'],axis=1)
y = titanic_df['Survived']

In [49]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [50]:
from sklearn.svm import SVC
svc= SVC(random_state=42)
params = {'kernel': ['rbf'], 
                  'gamma': [ 0.001, 0.01, 0.1, 1],
                  'C': [1, 10, 50, 100,200,300]}

grid__svc_cv = GridSearchCV(svc,param_grid = params, cv=10,
                      scoring="accuracy")

grid__svc_cv.fit(X_train, y_train)
pred_svm_train = grid__svc_cv.predict(X_train)
pred_svm_val = grid__svc_cv.predict(X_val)

accuracy_score(y_train, pred_svm_train), accuracy_score(y_val, pred_svm_val)

(0.8525280898876404, 0.7932960893854749)

In [51]:
grid__svc_cv.best_score_

0.8019953051643194

## SVC - 5개 변수 제외

In [179]:
X = titanic_df.drop(['Survived','SibSp','Parch','Family','Embarked','Title'],axis=1)
test_df1 = test_df.drop(['SibSp','Parch','Family','Embarked','Title'],axis=1)
y = titanic_df['Survived']

In [180]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [181]:
from sklearn.svm import SVC
svc= SVC(random_state=42, probability=True)
params = {'kernel': ['rbf'], 
                  'gamma': [ 0.001, 0.01, 0.1, 1],
                  'C': [1, 10, 50, 100,200,300]}

grid__svc_cv = GridSearchCV(svc,param_grid = params, cv=10,
                      scoring="accuracy")

grid__svc_cv.fit(X_train, y_train)
pred_svm_train = grid__svc_cv.predict(X_train)
pred_svm_val = grid__svc_cv.predict(X_val)

accuracy_score(y_train, pred_svm_train), accuracy_score(y_val, pred_svm_val)

(0.8497191011235955, 0.8044692737430168)

In [182]:
grid__svc_cv.best_params_

{'C': 200, 'gamma': 0.001, 'kernel': 'rbf'}

In [183]:
grid__svc_cv.best_score_

0.8062206572769952

In [184]:
pred_svc = grid__svc_cv.predict(test_df1)

##  SGD - 전체

In [57]:
X = titanic_df.drop(['Survived'],axis=1)
y = titanic_df['Survived']

In [58]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [59]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()

param = {
              'n_estimators' : [10,30,50],
              'learning_rate': [0.1, 0.05, 0.01],
              'max_depth': [4, 8],
              'min_samples_leaf': [10,30],
              'max_features': [0.3, 0.1] 
              }

grid_sgd_cv = GridSearchCV(gb, param_grid=param, cv=10)

grid_sgd_cv.fit(X_train, y_train)
pred_sgd_train = grid_sgd_cv.predict(X_train)
pred_sgd_val = grid_sgd_cv.predict(X_val)

accuracy_score(y_train, pred_sgd_train), accuracy_score(y_val, pred_sgd_val)

(0.8581460674157303, 0.8100558659217877)

In [60]:
grid_sgd_cv.best_score_

0.8356611893583723

## SGD - 5개 변수 제외

In [79]:
X = titanic_df.drop(['Survived','SibSp','Parch','Family','Embarked','Title'],axis=1)
test_df1 = test_df.drop(['SibSp','Parch','Family','Embarked','Title'],axis=1)
y = titanic_df['Survived']

In [80]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [81]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier()

param = {
              'n_estimators' : [50,70],
              'learning_rate': [0.05, 0.1],
              'max_depth': [3, 5],
              'min_samples_leaf': [10,30],
              'max_features': [0.3,0.5] 
              }

grid_sgd_cv = GridSearchCV(gb, param_grid=param, cv=10)

grid_sgd_cv.fit(X_train, y_train)
pred_sgd_train = grid_sgd_cv.predict(X_train)
pred_sgd_val = grid_sgd_cv.predict(X_val)

accuracy_score(y_train, pred_sgd_train), accuracy_score(y_val, pred_sgd_val)

(0.8609550561797753, 0.8044692737430168)

In [82]:
grid_sgd_cv.best_params_

{'learning_rate': 0.05,
 'max_depth': 5,
 'max_features': 0.5,
 'min_samples_leaf': 30,
 'n_estimators': 70}

In [83]:
grid_sgd_cv.best_score_

0.8328638497652582

In [84]:
pred_sgd = grid_sgd_cv.predict(test_df1)

In [85]:
sub_df['Survived'] = pred_sgd.astype(int)
sub_df.to_csv('submin_sgd.csv', index= False)

## EXT - 5개 변수 제외

In [86]:
X = titanic_df.drop(['Survived','SibSp','Parch','Family','Embarked','Title'],axis=1)
test_df1 = test_df.drop(['SibSp','Parch','Family','Embarked','Title'],axis=1)
y = titanic_df['Survived']

In [87]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [88]:
from sklearn.tree import ExtraTreeClassifier
et = ExtraTreeClassifier(random_state=42)

param = {
    'max_depth': [3, 5],
    'min_samples_leaf': [4,10,20],
    'min_samples_split' : [4, 8]
}

grid_et_cv = GridSearchCV(et, param_grid=param, cv=10)

grid_et_cv.fit(X_train, y_train)
pred_et_train = grid_et_cv.predict(X_train)
pred_et_val = grid_et_cv.predict(X_val)

accuracy_score(y_train, pred_et_train), accuracy_score(y_val, pred_et_val)

(0.8103932584269663, 0.7932960893854749)

In [89]:
grid_et_cv.best_params_

{'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 4}

In [90]:
grid_et_cv.best_score_

0.8089593114241002

In [168]:
pred_et = grid_et_cv.predict(test_df1)

# 최종1 - votting (soft : 다수결 투표)

In [185]:
from sklearn.ensemble import VotingClassifier
vot = VotingClassifier(estimators=[('rf',grid_rf_cv),
                                  ('sgd',grid_sgd_cv),
                                  ('lr',lr),
                                   ('et', grid_et_cv),
                                  ('svc',grid__svc_cv)],
                      voting='soft',n_jobs=-1)
vot.fit(X_train,y_train)

pred_vot1 = vot.predict(X_train)
pred_vot2 = vot.predict(X_val)

accuracy_score(y_train, pred_vot1), accuracy_score(y_val, pred_vot2)

(0.8258426966292135, 0.7988826815642458)

In [186]:
pred_vot = vot.predict(test_df1)
sub_df['Survived'] = pred_vot.astype(int)
sub_df.to_csv('submission_123456789.csv', index=False)

# kaggle : 0.7751

# 최종2 - 가중치

In [160]:
pred_weighted = pred_rf * 0.35 + pred_sgd * 0.40 + pred_lr * 0.25

In [162]:
pred_weighted = np.where(pred_weighted>0.8, 1, 0)

In [164]:
sub_df['Survived'] = pred_weighted.astype(int)
sub_df.to_csv('submission_123456789.csv', index=False)

# kaggle : 0.7676