In [1]:
import pandas as pd

In [2]:
# Read dataset/train.csv (path is relative to the working directory) into `train`.
train = pd.read_csv('dataset/train.csv')

In [3]:
# Read dataset/test.csv (path is relative to the working directory) into `test`.
test = pd.read_csv('dataset/test.csv')

In [4]:
# Read dataset/submission.csv into `sub`; its 'Survived' column is used later
# in this notebook as the reference labels for the test-set predictions.
sub = pd.read_csv('dataset/submission.csv')

# Train, Test 전처리

In [5]:
# Group both frames so one preprocessing loop can handle them. The list holds
# references, so in-place column edits made through it land on `train`/`test`.
train_test_dataset = [train, test]

In [6]:
# Feature engineering applied in place to every frame in `train_test_dataset`.
# Adds Title, Title2, Age2, Sex2, Embarked2, Fare2, Cabin2 and FamilySize, and
# fills missing values in Age, Embarked and Fare.

# Loop-invariant encodings, built once instead of on every iteration.
# Every title other than Mr/Mrs/Miss shares code 3.
title_mapping = {'Mr': 0, 'Mrs': 1, 'Miss': 2, 'Master': 3, 'Don': 3, 'Rev': 3, 'Dr': 3, 'Mme': 3, 'Ms': 3, 'Major': 3, 'Lady': 3, 'Sir': 3, 'Mlle': 3, 'Col': 3, 'Capt': 3, 'Countess': 3, 'Jonkheer': 3, 'Dona': 3}
other_title_code = 3
sex_mapping = {'male': 0, 'female': 1}
embarked_mapping = {'S': 0, 'C': 1, 'Q': 2}
cabin_mapping = {'A': 0, 'B': 0.4, 'C': 0.8, 'D': 1.2, 'E': 1.6, 'F': 2, 'G': 2.4, 'T': 2.8}

# Right-closed bin edges: Age -> (-inf,16], (16,26], (26,36], (36,62], (62,inf)
#                         Fare -> (-inf,17], (17,30], (30,100], (100,inf)
age_bin_edges = [float('-inf'), 16, 26, 36, 62, float('inf')]
fare_bin_edges = [float('-inf'), 17, 30, 100, float('inf')]

for df in train_test_dataset:
    # Title = first run of letters immediately followed by a period in Name.
    # Raw string avoids the invalid "\." escape; expand=False yields a Series.
    df['Title'] = df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

    # Titles missing from the mapping (or names with no title) fall into the
    # shared "other" code instead of becoming NaN, which would otherwise leave
    # Age unfilled for those rows.
    df['Title2'] = df['Title'].map(title_mapping).fillna(other_title_code).astype(int)

    # Missing ages take the median age of the row's title group. The median is
    # computed within the frame being processed, so each frame uses its own.
    df['Age'] = df['Age'].fillna(df.groupby('Title2')['Age'].transform('median'))
    # labels=False gives the bin index 0..4; cast to float to keep the dtype
    # the column had when it was filled through .loc assignments.
    df['Age2'] = pd.cut(df['Age'], bins=age_bin_edges, labels=False).astype(float)

    df['Sex2'] = df['Sex'].map(sex_mapping)

    # Assign the result back rather than calling fillna(inplace=True) on a
    # column selection: the chained in-place form does not update the frame
    # when pandas Copy-on-Write is active.
    df['Embarked'] = df['Embarked'].fillna('S')
    df['Embarked2'] = df['Embarked'].map(embarked_mapping)

    # Missing fares take the median fare of the passenger's Pclass.
    df['Fare'] = df['Fare'].fillna(df.groupby('Pclass')['Fare'].transform('median'))
    df['Fare2'] = pd.cut(df['Fare'], bins=fare_bin_edges, labels=False).astype(float)

    # Cabin2 = numeric code of the first character of Cabin; rows without a
    # cabin take the median code of their Pclass.
    df['Cabin2'] = df['Cabin'].str[0].map(cabin_mapping)
    df['Cabin2'] = df['Cabin2'].fillna(df.groupby('Pclass')['Cabin2'].transform('median'))

    # Family size is SibSp + Parch + 1, scaled so that size 1 -> 0.0,
    # 2 -> 0.4, 3 -> 0.8, ... i.e. (size - 1) * 0.4 rounded to 2 decimals.
    df['FamilySize'] = ((df['SibSp'] + df['Parch']) * 0.4).round(2)

In [7]:
# Columns removed before modelling: identifiers and free text, raw columns
# that now have an encoded counterpart (Sex2, Age2, Fare2, Cabin2, Embarked2,
# FamilySize), and the title helpers that are only used to impute Age.
features_drop = [
    'PassengerId',
    'Name',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Ticket',
    'Fare',
    'Cabin',
    'Embarked',
    'Title',
    'Title2',
]

In [8]:
# Keep only the label and the engineered features. Rebinds `train`, so running
# this cell a second time raises KeyError (the columns are already gone).
train = train.drop(columns=features_drop)

In [9]:
# Same column removal for the test frame. Rebinds `test`, so running this cell
# a second time raises KeyError (the columns are already gone).
test = test.drop(columns=features_drop)

In [10]:
# Missing-value count per remaining column of the training frame.
train.isna().sum()

Survived      0
Pclass        0
Age2          0
Sex2          0
Embarked2     0
Fare2         0
Cabin2        0
FamilySize    0
dtype: int64

In [11]:
# Missing-value count per remaining column of the test frame.
test.isna().sum()

Pclass        0
Age2          0
Sex2          0
Embarked2     0
Fare2         0
Cabin2        0
FamilySize    0
dtype: int64

# 머신러닝 준비

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [13]:
# Feature matrix: every processed training column except the label.
train_data = train.drop(columns='Survived')

In [14]:
# Bare last expression: renders the feature matrix as the cell output.
train_data

Unnamed: 0,Pclass,Age2,Sex2,Embarked2,Fare2,Cabin2,FamilySize
0,3,1.0,0,0,0.0,2.0,0.4
1,1,3.0,1,1,2.0,0.8,0.4
2,3,1.0,1,0,0.0,2.0,0.0
3,1,2.0,1,0,2.0,0.8,0.4
4,3,2.0,0,0,0.0,2.0,0.0
...,...,...,...,...,...,...,...
886,2,2.0,0,0,0.0,1.8,0.0
887,1,1.0,1,0,1.0,0.4,0.0
888,3,1.0,1,0,1.0,2.0,1.2
889,1,1.0,0,1,1.0,0.8,0.0


In [15]:
# Label vector: the 'Survived' column of the processed training frame.
target = train['Survived']

In [16]:
# Sanity check: features and labels should have the same number of rows.
print(train_data.shape, target.shape)

(891, 7) (891,)


# DecisionTreeClassifier 머신러닝

In [17]:
# Decision tree with default hyper-parameters. The seed is fixed so that
# re-running the notebook rebuilds the same tree; the value matches the
# random_state used for the random forest in this notebook.
classifier = DecisionTreeClassifier(random_state=1)

In [18]:
# Fit on all training rows; no hold-out split is made in the visible notebook.
classifier.fit(train_data, target)

DecisionTreeClassifier()

In [19]:
# Predict on the same rows the tree was fitted on (used for the training score).
train_predict = classifier.predict(train_data)

In [20]:
# Training accuracy in percent. It is measured on the rows the model was
# fitted on, so it is an optimistic figure. Arguments follow the documented
# (y_true, y_pred) order.
accuracy_score(target, train_predict) * 100

89.56228956228956

In [21]:
# Predict labels for the preprocessed test frame.
test_predict = classifier.predict(test)

In [22]:
# Percentage of test predictions that match sub['Survived'], with arguments in
# the documented (y_true, y_pred) order.
# NOTE(review): `sub` is read from dataset/submission.csv. If that file is a
# sample submission rather than the true labels, this number is agreement with
# the sample, not test accuracy — confirm the file's provenance.
accuracy_score(sub['Survived'], test_predict) * 100

81.10047846889952

# RandomForestClassifier 머신러닝

In [31]:
# Random forest of 20 trees with a fixed seed; a node needs at least 8 samples
# before it may be split.
# NOTE(review): the repr recorded under the fit cell does not list
# min_samples_split=8, so the saved outputs may pre-date this setting — re-run
# the notebook from a fresh kernel to confirm the reported scores.
classifier2 = RandomForestClassifier(
    n_estimators=20,
    min_samples_split=8,
    random_state=1,
)

In [32]:
# Fit the forest on all training rows; no hold-out split is made in the visible notebook.
classifier2.fit(train_data, target)

RandomForestClassifier(n_estimators=20, random_state=1)

In [33]:
# Predict on the same rows the forest was fitted on (used for the training score).
train_predict2 = classifier2.predict(train_data)

In [34]:
# Training accuracy of the forest in percent. It is measured on the rows the
# model was fitted on, so it is an optimistic figure. Arguments follow the
# documented (y_true, y_pred) order.
accuracy_score(target, train_predict2) * 100

89.337822671156

In [35]:
# Predict labels for the preprocessed test frame with the forest.
test_predict2 = classifier2.predict(test)

In [36]:
# Percentage of the forest's test predictions that match sub["Survived"], with
# arguments in the documented (y_true, y_pred) order.
# NOTE(review): `sub` is read from dataset/submission.csv. If that file is a
# sample submission rather than the true labels, this number is agreement with
# the sample, not test accuracy — confirm the file's provenance.
accuracy_score(sub["Survived"], test_predict2) * 100

83.01435406698565