In [255]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [256]:
train_path = "H:\\Hobby\\ML Dataset\\titanic\\train.csv"
test_path = "H:\\Hobby\\ML Dataset\\titanic\\test.csv"

In [273]:
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)


y=train['Survived']
train=train.drop(['Survived','PassengerId', 'Ticket'], axis=1)

In [259]:
train['Age'] = train['Age'].fillna(train['Age'].mean())
test['Age'] = test['Age'].fillna(test['Age'].mean())
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

train['Embarked'] = train['Embarked'].fillna('S')

In [260]:
train['Cabin'] = train['Cabin'].str[:1]

In [261]:
cabin_mapping = {"A": 0, "B": 0.4, "C": 0.8, "D": 1.2, "E": 1.6, "F": 2, "G": 2.4, "T": 2.8}
train['Cabin'] = train['Cabin'].map(cabin_mapping)
test['Cabin'] = test['Cabin'].map(cabin_mapping)

In [262]:
train_test_data = [train, test] 

for dataset in train_test_data:
    dataset['Title'] = dataset['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

title_mapping = {"Mr": 0, "Miss": 1, "Mrs": 2, 
                 "Master": 3, "Dr": 3, "Rev": 3, "Col": 3, "Major": 3, "Mlle": 3,"Countess": 3,
                 "Ms": 3, "Lady": 3, "Jonkheer": 3, "Don": 3, "Dona" : 3, "Mme": 3,"Capt": 3,"Sir": 3 }

for dataset in train_test_data:
    dataset['Title'] = dataset['Title'].map(title_mapping)

In [263]:
train = train.drop(['Name'], axis=1)
test = test.drop(['Name'], axis=1)

In [264]:
cols = (train.dtypes=='object')
cols = list(cols[cols].index)

cols

['Sex', 'Embarked']

In [265]:
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_train = pd.DataFrame(encoder.fit_transform(train[cols]))
OH_test = pd.DataFrame(encoder.transform(test[cols]))

In [266]:
OH_train.index = train.index
OH_test.index = test.index

In [267]:
train = train.drop(cols, axis=1)
test = test.drop(cols, axis=1)

train = pd.concat([train, OH_train], axis=1)
test = pd.concat([test, OH_test], axis=1)

In [268]:
train.isnull().sum()

Pclass      0
Age         0
SibSp       0
Parch       0
Fare        0
Cabin     687
Title       0
0           0
1           0
2           0
3           0
4           0
dtype: int64

In [269]:
imputer = SimpleImputer()

temp = test.copy()
test = test.drop(['PassengerId'], axis=1)
temp_test = test.copy()

train = pd.DataFrame(imputer.fit_transform(train))
test = pd.DataFrame(imputer.transform(test))

train.columns = temp_test.columns
test.columns = temp_test.columns

In [270]:
train.isnull().sum()

Pclass    0
Age       0
SibSp     0
Parch     0
Fare      0
Cabin     0
Title     0
0         0
1         0
2         0
3         0
4         0
dtype: int64

In [271]:
xTrain, xVal, yTrain, yVal = train_test_split(train, y)

greatest = 0;optimal=0
for x in range(5,100,5):
    dtc = DecisionTreeClassifier(max_leaf_nodes=x, random_state=0)
    dtc.fit(xTrain, yTrain)
    preds = dtc.predict(xVal)
    score = accuracy_score(preds, yVal)
    if score>greatest:
        greatest=score
        optimal=x
    print(score, x)
print('Largest: %.3f, Optimal: %d' %(greatest,optimal))

0.7713004484304933 5
0.7847533632286996 10
0.7937219730941704 15
0.7847533632286996 20
0.7937219730941704 25
0.8071748878923767 30
0.820627802690583 35
0.8251121076233184 40
0.8161434977578476 45
0.820627802690583 50
0.8116591928251121 55
0.8026905829596412 60
0.7937219730941704 65
0.7937219730941704 70
0.7847533632286996 75
0.7892376681614349 80
0.7847533632286996 85
0.7802690582959642 90
0.7757847533632287 95
Largest: 0.825, Optimal: 40


In [272]:
dtc = DecisionTreeClassifier(max_leaf_nodes=40)
dtc.fit(train, y)
preds = dtc.predict(test)

output = pd.DataFrame({'PassengerId': temp.PassengerId,
                      'Survived': preds})

output.to_csv('4thSubmission.csv', index=False)