In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases removed the old names, so use whichever one is available.
mpl_style = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
plt.style.use(mpl_style)
# Apply seaborn's default theme with enlarged fonts for every later figure.
sns.set(font_scale=2.5)
import missingno as msno

# Silence every warning for the rest of the session so that library notices
# do not clutter the cell outputs.
# NOTE(review): this blanket filter also hides deprecation warnings that may
# point at real problems - consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

In [2]:
# Read the training and test CSV files into two separate DataFrames.
df_train, df_test = map(pd.read_csv, ("./titanic/train.csv", "./titanic/test.csv"))

# Null Data Check

In [3]:
# train data: print, for every column, the percentage of rows that are missing.
nan_report_template = 'column: {:>10}\t Percent of NaN value: {:.2f}%'
for column_name, column_values in df_train.items():
    missing_count = column_values.isnull().sum()
    row_count = column_values.shape[0]
    print(nan_report_template.format(column_name, 100 * (missing_count / row_count)))

column: PassengerId	 Percent of NaN value: 0.00%
column:   Survived	 Percent of NaN value: 0.00%
column:     Pclass	 Percent of NaN value: 0.00%
column:       Name	 Percent of NaN value: 0.00%
column:        Sex	 Percent of NaN value: 0.00%
column:        Age	 Percent of NaN value: 19.87%
column:      SibSp	 Percent of NaN value: 0.00%
column:      Parch	 Percent of NaN value: 0.00%
column:     Ticket	 Percent of NaN value: 0.00%
column:       Fare	 Percent of NaN value: 0.00%
column:      Cabin	 Percent of NaN value: 77.10%
column:   Embarked	 Percent of NaN value: 0.22%


In [4]:
# test data: same missing-value report as above, with the counts computed once
# for the whole frame and then looked up per column.
test_row_total = df_test.shape[0]
test_nan_counts = df_test.isnull().sum()
for col in df_test.columns:
    nan_percent = 100 * (test_nan_counts[col] / test_row_total)
    print('column: {:>10}\t Percent of NaN value: {:.2f}%'.format(col, nan_percent))

column: PassengerId	 Percent of NaN value: 0.00%
column:     Pclass	 Percent of NaN value: 0.00%
column:       Name	 Percent of NaN value: 0.00%
column:        Sex	 Percent of NaN value: 0.00%
column:        Age	 Percent of NaN value: 20.57%
column:      SibSp	 Percent of NaN value: 0.00%
column:      Parch	 Percent of NaN value: 0.00%
column:     Ticket	 Percent of NaN value: 0.00%
column:       Fare	 Percent of NaN value: 0.24%
column:      Cabin	 Percent of NaN value: 78.23%
column:   Embarked	 Percent of NaN value: 0.00%


In [10]:
# Logistic regression
from sklearn.linear_model import LogisticRegression

# Support vector machines
from sklearn.svm import SVC, LinearSVC

# Random forest
from sklearn.ensemble import RandomForestClassifier

# k-nearest neighbours
from sklearn.neighbors import KNeighborsClassifier

In [77]:
# Re-read both CSV files so the feature engineering below always starts from
# the unmodified data, no matter how often this cell is re-run.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("./titanic/train.csv", "./titanic/test.csv")
)

# Pclass
# One-hot encode the passenger class. The dummy columns are relabelled by
# position, then the original column is swapped for the indicators.
pclass_labels = ["Pclass_1","Pclass_2","Pclass_3"]

pclass_train_dummies = pd.get_dummies(df_train['Pclass']).set_axis(pclass_labels, axis=1)
pclass_test_dummies = pd.get_dummies(df_test['Pclass']).set_axis(pclass_labels, axis=1)

df_train = df_train.drop(columns=['Pclass']).join(pclass_train_dummies)
df_test = df_test.drop(columns=['Pclass']).join(pclass_test_dummies)


# Embarked
# One-hot encode the port of embarkation.
# get_dummies names each column after the value it encodes and emits them in
# sorted order (C, Q, S). Select the ports by label rather than overwriting the
# column names by position, which would attach the wrong label to every column.
# reindex also guarantees that train and test end up with the same three
# columns even if one port is absent from a set (the missing one is all zeros).
# Rows whose Embarked value is missing get zeros in all three columns.
embarked_ports = ['S', 'C', 'Q']

embarked_train_dummies = pd.get_dummies(df_train['Embarked']).reindex(columns=embarked_ports, fill_value=0)
embarked_test_dummies = pd.get_dummies(df_test['Embarked']).reindex(columns=embarked_ports, fill_value=0)

# Replace the raw column with its indicators. The test rows must receive the
# dummies built from the test set, not the ones built from the training set.
df_train = df_train.drop(columns=['Embarked']).join(embarked_train_dummies)
df_test = df_test.drop(columns=['Embarked']).join(embarked_test_dummies)


# Age
# Impute missing ages in both sets with the mean age of the training set.
# The filled column is assigned back explicitly: calling
# df["Age"].fillna(..., inplace=True) works on a temporary Series and is not
# guaranteed to update the frame (it does not under pandas Copy-on-Write).
train_age_mean = df_train["Age"].mean()
df_train["Age"] = df_train["Age"].fillna(train_age_mean)
df_test["Age"] = df_test["Age"].fillna(train_age_mean)


# Sex
# One-hot encode the passenger's sex and relabel the two dummy columns by
# position (assumes the raw values sort as female, male - TODO confirm).
sex_labels = ['Female', 'Male']

sex_train_dummies = pd.get_dummies(df_train['Sex'])
sex_test_dummies = pd.get_dummies(df_test['Sex'])
for sex_dummies in (sex_train_dummies, sex_test_dummies):
    sex_dummies.columns = sex_labels

# Swap the raw column for its indicator columns.
df_train = df_train.drop(columns=['Sex']).join(sex_train_dummies)
df_test = df_test.drop(columns=['Sex']).join(sex_test_dummies)


# FamilySize
# SibSp + Parch, plus one so that the passenger themselves is counted.
for passengers in (df_train, df_test):
    passengers['FamilySize'] = passengers['SibSp'].add(passengers['Parch']).add(1)


# Fare
# Replace the NaN fares in the test set with the mean fare of the test set.
# NOTE(review): Age is imputed with the training-set mean; consider doing the
# same here for consistency.
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].mean())


def _log_fare(fare):
    """Return the natural log of a positive fare, and 0 for anything else."""
    if fare > 0:
        return np.log(fare)
    # Zero and negative fares (and NaN, which fails the comparison) map to 0.
    return 0


df_train['Fare'] = df_train['Fare'].map(_log_fare)
df_test['Fare'] = df_test['Fare'].map(_log_fare)


# Drop Feature
# Remove the columns that are not used as model inputs. PassengerId is dropped
# from the training frame only; the test frame keeps it.
unused_columns = ['Name', 'Ticket', 'Cabin','SibSp','Parch']
df_train = df_train.drop(columns=['PassengerId'] + unused_columns)
df_test = df_test.drop(columns=unused_columns)


# info
# Summarise the remaining training columns, their non-null counts and dtypes.
df_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Age         891 non-null    float64
 2   Fare        891 non-null    float64
 3   Pclass_1    891 non-null    uint8  
 4   Pclass_2    891 non-null    uint8  
 5   Pclass_3    891 non-null    uint8  
 6   S           891 non-null    uint8  
 7   C           891 non-null    uint8  
 8   Q           891 non-null    uint8  
 9   Female      891 non-null    uint8  
 10  Male        891 non-null    uint8  
 11  FamilySize  891 non-null    int64  
dtypes: float64(2), int64(2), uint8(8)
memory usage: 34.9 KB


In [78]:
# Split the prepared frames into model inputs and target.
target_column = "Survived"
Y_train = df_train[target_column]
X_train = df_train.drop(columns=[target_column])
# PassengerId is an identifier, not a feature, so it is excluded from the test inputs.
X_test = df_test.drop(columns=["PassengerId"]).copy()

In [79]:
# Fix the seed so the forest - and therefore the predictions written to the
# submission file - is identical on every run.
RANDOM_STATE = 42

random_forest = RandomForestClassifier(random_state=RANDOM_STATE)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)

# Accuracy on the same rows the model was fitted on: an optimistic figure,
# not an estimate of how well the model generalises to unseen passengers.
random_forest.score(X_train, Y_train)

# Alternative model, kept for reference:
# svc = SVC(C=10, gamma=0.1)
# svc.fit(X_train,Y_train)
# Y_pred = svc.predict(X_test)
# svc.score(X_train, Y_train)

0.9820426487093153

In [80]:
# Pair every test passenger id with its predicted label and write the result
# to disk without the index column.
submission = df_test[["PassengerId"]].assign(Survived=Y_pred)

submission.to_csv('titanic.csv', index = False)