# 라이브러리 불러오기

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

# 데이터 읽어오기

In [None]:
# Load the train split, the test split and the sample submission file
# from the Kaggle input directory, in that order.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
        "/kaggle/input/titanic/gender_submission.csv",
    )
)

In [None]:
# Show the first rows of the training frame as a quick sanity check.
train.head()

In [None]:
# Show the first rows of the test frame as a quick sanity check.
test.head()

In [None]:
# Show the first rows of the sample submission to see the expected layout.
submission.head()

# 전처리 & 피처엔지니어링

In [None]:
# Stack the train and test frames into a single DataFrame.
# ignore_index=True gives the combined frame a fresh, unique 0..n-1 index.
# Without it each input keeps its own row labels, so labels repeat in the
# result and label-based selection/alignment on it becomes ambiguous.
all_data = pd.concat([train, test], sort=False, ignore_index=True)
# Print column dtypes and non-null counts to spot missing values.
all_data.info()

In [None]:
# Replace missing Age and Fare values with the median of the respective column.
for numeric_col in ('Age', 'Fare'):
    column_median = all_data[numeric_col].median()
    all_data[numeric_col] = all_data[numeric_col].fillna(column_median)

In [None]:
# Re-check non-null counts after the Age/Fare imputation.
all_data.info()

In [None]:
# Count plot of rows per Embarked value; the result is used below to pick
# the fill value for missing entries.
sns.catplot(x='Embarked', kind='count', data=all_data);

In [None]:
# Fill missing Embarked values with 'S', the most frequent category.
embarked = all_data['Embarked']
all_data['Embarked'] = embarked.where(embarked.notna(), 'S')
all_data.info()

In [None]:
# Bin the Age feature into ordinal codes (right-closed intervals):
#   0: <= 15, 1: (15, 25], 2: (25, 35], 3: (35, 45], 4: (45, 60], 5: > 60
age_edges = [-np.inf, 15, 25, 35, 45, 60, np.inf]
age_codes = pd.cut(all_data['Age'], bins=age_edges, labels=False)
# Store the codes in the Age column itself, keeping the column's dtype.
all_data['Age'] = age_codes.astype(all_data['Age'].dtype)

In [None]:
# Title extraction helper
import re

# A space followed by a run of ASCII letters that ends in a period,
# e.g. " Mr." -- the captured group excludes the leading space.
# Raw string: "\." in a plain string literal is an invalid escape sequence.
_TITLE_PATTERN = re.compile(r' ([A-Za-z]+\.)')


def get_title(name):
    """Return the first title-like token found in *name*.

    A title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period; the period is part of the result
    (for example ``"Mr."``).

    Args:
        name: The full name to search. Values that are not strings (such as
            NaN or None for a missing name) are treated as having no title.

    Returns:
        The matched title including its trailing period, or an empty string
        when no title is found.
    """
    # Missing names show up as non-string values; searching them would raise.
    if not isinstance(name, str):
        return ""

    title_search = _TITLE_PATTERN.search(name)
    if title_search:
        return title_search.group(1)
    return ""

In [None]:
# Derive a Title column by running get_title over every Name value.
all_data['Title'] = all_data['Name'].map(get_title)
# Show how often each extracted title occurs.
all_data['Title'].value_counts()

In [None]:
# Collapse the extracted titles into a few representative titles.
# Each key is the representative title; its list holds the titles folded into it.
title_groups = {
    'Officer.': ['Capt.', 'Dr.', 'Major.', 'Rev.'],
    'Royal.': ['Lady.', 'Countess.', 'Don.', 'Sir.', 'Jonkheer.', 'Dona.'],
    'Miss.': ['Mlle.', 'Ms.'],
    'Mrs.': ['Mme.'],
}
for representative, raw_titles in title_groups.items():
    all_data['Title'] = all_data['Title'].replace(raw_titles, representative)
# Show the distribution after grouping.
all_data['Title'].value_counts()

In [None]:
# Handle missing Cabin values: label them 'Missing', then keep only the
# first character of every entry (so missing cabins end up as 'M').
all_data['Cabin'] = all_data['Cabin'].fillna('Missing').str[0]
# Show how many rows fall under each leading character.
all_data['Cabin'].value_counts()

In [None]:
# Add family-size and travelling-alone features.
# Family_Size = SibSp + Parch + 1 (the trailing 1 counts the passenger).
all_data['Family_Size'] = all_data['SibSp'].add(all_data['Parch']).add(1)
# IsAlone is 1 when Family_Size is exactly 1, otherwise 0.
all_data['IsAlone'] = all_data['Family_Size'].eq(1).astype('int64')
all_data.head()

In [None]:
# Remove the columns that are not used for modelling (in place).
unused_columns = ['Name', 'Ticket']
all_data.drop(columns=unused_columns, inplace=True)
all_data.head()

In [None]:
# One-hot encode the categorical features; drop_first=True omits the first
# level of each encoded column.
all_dummies = pd.get_dummies(data=all_data, drop_first=True)
all_dummies.head()

In [None]:
# The training set is every row that has a Survived value.
has_label = all_dummies['Survived'].notna()
train = all_dummies.loc[has_label]
train.info()

In [None]:
# The test set is every row whose Survived value is missing.
lacks_label = all_dummies['Survived'].isna()
test = all_dummies.loc[lacks_label]
test.info()

# 모델링 및 훈련/예측

In [None]:
# Columns that are not model inputs: the row identifier and the target.
non_feature_columns = ['PassengerId', 'Survived']

# Target vector and feature matrix for training.
y = train['Survived']
X = train.drop(columns=non_feature_columns)

# Give the test frame the same feature columns as X.
test = test.drop(columns=non_feature_columns)

In [None]:
# Random forest hyper-parameters; random_state is fixed for reproducibility.
forest_params = {
    'n_estimators': 200,
    'min_samples_split': 10,
    'random_state': 42,
}
clf = RandomForestClassifier(**forest_params)
# Fit on the training features and labels.
clf.fit(X, y)

In [None]:
# Predict labels for the test features and cast them to integers.
predicted_labels = clf.predict(test)
pred = predicted_labels.astype(int)
# Overwrite the Survived column of the submission frame with the predictions.
submission['Survived'] = pred

In [None]:
# Write the submission frame to disk without the index column.
submission.to_csv(path_or_buf="submission.csv", index=False)