In [1]:
from pathlib import Path

import graphviz
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import label_binarize, LabelEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz

In [2]:
def read_csv(path):
    """Load a CSV file into a DataFrame.

    Args:
        path: Location of the CSV; passed straight through to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    frame = pd.read_csv(path)
    return frame

In [3]:
def drop_column(df, columns):
    """Return a new DataFrame with the given column(s) removed.

    Args:
        df: Source DataFrame; it is not modified.
        columns: A single column label or a list of labels to drop.

    Returns:
        A DataFrame without the requested columns.
    """
    remaining = df.drop(columns=columns)
    return remaining

In [4]:
# Load the Titanic training data and display it for a first look.
ship_df = read_csv("./titanic/train.csv")
ship_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
ship_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# Count the missing values in each column to decide how to clean the data.
ship_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
# Keep only the columns used as model inputs (plus the Survived target, which is
# split off later): Name, Ticket, Cabin, Fare and PassengerId are dropped here.
X_train = drop_column(ship_df, ["Name", "Ticket", "Cabin", "Fare", "PassengerId"])

In [8]:
# Encode Sex as 0 (male) / 1 (female) and impute missing ages with the column mean,
# then drop the rows that still contain nulls (the ones with a missing Embarked).
# Results are assigned back instead of using `inplace=True` on a selected column:
# that chained in-place form is deprecated and does not update the frame under
# pandas copy-on-write.
X_train = X_train.assign(
    # label_binarize returns an (n, 1) array for two classes; flatten it to 1-D.
    Sex=label_binarize(X_train["Sex"], classes=["male", "female"]).ravel(),
    Age=X_train["Age"].fillna(X_train["Age"].mean()),
).dropna()
# Encode the port of embarkation as integer codes.
X_train = X_train.assign(Embarked=LabelEncoder().fit_transform(X_train["Embarked"]))
# Split the target off from the features.
y_train = pd.DataFrame({"Survived": X_train["Survived"]})
X_train = drop_column(X_train, ["Survived"])

In [9]:
# Inspect the prepared training features.
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,3,0,22.000000,1,0,2
1,1,1,38.000000,1,0,0
2,3,1,26.000000,0,0,2
3,1,1,35.000000,1,0,2
4,3,0,35.000000,0,0,2
...,...,...,...,...,...,...
886,2,0,27.000000,0,0,2
887,1,1,19.000000,0,0,2
888,3,1,29.699118,1,2,2
889,1,0,26.000000,0,0,0


In [10]:
# Load the test set and apply the same column selection and encodings as for training.
ship_df_test = read_csv("./titanic/test.csv")
X_test = drop_column(ship_df_test, ["Name", "Ticket", "Cabin", "Fare", "PassengerId"])
# label_binarize returns an (n, 1) array for two classes; flatten it to 1-D.
X_test["Sex"] = label_binarize(X_test["Sex"], classes=["male", "female"]).ravel()
# Assign the filled column back rather than calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated and does not update
# the frame under pandas copy-on-write.
X_test["Age"] = X_test["Age"].fillna(X_test["Age"].mean())
# NOTE(review): this encoder is fit on the test data independently of the one used
# for training, so the integer codes only line up if both sets contain the same
# Embarked categories - confirm, or reuse a single fitted encoder.
X_test["Embarked"] = LabelEncoder().fit_transform(X_test["Embarked"])
# Passenger ids used to label the rows of the prediction output.
y_index = read_csv("./titanic/gender_submission.csv")["PassengerId"]

In [11]:
def train_model(criterion, max_depth, X_train, y_train, X_test, y_index, random_state=None):
    """Fit a decision tree, predict on the test features and build a result frame.

    Args:
        criterion: Split quality measure passed to ``DecisionTreeClassifier``
            (e.g. ``'gini'`` or ``'entropy'``).
        max_depth: Maximum depth of the tree.
        X_train: Training features.
        y_train: Training target.
        X_test: Features to predict on.
        y_index: Passenger ids used to label the predictions.
        random_state: Optional seed forwarded to ``DecisionTreeClassifier`` so a
            run can be reproduced. Defaults to ``None``, which keeps the
            previous unseeded behaviour.

    Returns:
        A tuple ``(classifier, model, prediction, result)``. ``model`` is the
        value returned by ``classifier.fit``; ``result`` is the DataFrame built
        by ``generate_result_df``.
    """
    classifier = DecisionTreeClassifier(
        max_depth=max_depth,
        criterion=criterion,
        random_state=random_state,
    )
    model = classifier.fit(X_train, y_train)
    prediction = model.predict(X_test)
    result = generate_result_df(y_index, prediction)
    return classifier, model, prediction, result

In [12]:
def generate_result_df(index, prediction):
    """Pair passenger ids with their predicted labels.

    Args:
        index: Passenger ids, one per prediction.
        prediction: Array of predictions; it is flattened to 1-D before use.

    Returns:
        A DataFrame with the columns ``PassengerId`` and ``Survived``.
    """
    columns = {
        'PassengerId': index,
        'Survived': prediction.flatten(),
    }
    return pd.DataFrame(columns)

In [13]:
def export_csv(df, name):
    """Write ``df`` to ``<name>.csv`` without the index column.

    The target's parent directory is created first if it is missing, so the
    export also works on a fresh checkout where the output folder does not
    exist yet.

    Args:
        df: DataFrame to write.
        name: Output path without the ``.csv`` extension (a string).
    """
    target = name + '.csv'
    # to_csv does not create missing directories, so make sure the folder exists.
    Path(target).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(target, index=False)

In [14]:
def score(model, X_test, y_test):
    """Evaluate ``model`` on the given data.

    Delegates to ``model.score(X_test, y_test)`` and returns its result unchanged.
    """
    result = model.score(X_test, y_test)
    return result

In [15]:
def draw_tree(name, classifier, X_train):
    """Export a fitted decision tree to Graphviz and render it as a PNG.

    Args:
        name: Output file name handed to ``graphviz.Source.render``.
        classifier: The fitted tree to draw.
        X_train: Training features; only its ``columns`` are used, as the
            feature names shown in the tree.
    """
    class_labels = ["Not Survived", "Survived"]
    # out_file=None makes export_graphviz return the DOT source as a string.
    dot_source = export_graphviz(
        classifier,
        out_file=None,
        feature_names=X_train.columns,
        class_names=class_labels,
        filled=True,
    )
    graphviz.Source(dot_source, format="png").render(name)

In [16]:
# Run 1: Gini criterion, max depth 10. Fit, export the predictions to CSV and render the tree.
classifier1, model1, prediction1, result1 = train_model('gini', 10, X_train, y_train, X_test, y_index)
export_csv(result1, './output/result1')
draw_tree("./output/result1", classifier1, X_train)

In [17]:
# Run 2: entropy criterion, max depth 5. Fit, export the predictions to CSV and render the tree.
classifier2, model2, prediction2, result2 = train_model('entropy', 5, X_train, y_train, X_test, y_index)
export_csv(result2, './output/result2')
draw_tree("./output/result2", classifier2, X_train)

In [18]:
# Run 3: Gini criterion, max depth 10. Fit, export the predictions to CSV and render the tree.
# NOTE(review): these are the same criterion and max_depth as the result1 run -
# confirm whether a different configuration was intended here.
classifier3, model3, prediction3, result3 = train_model('gini', 10, X_train, y_train, X_test, y_index)
export_csv(result3, './output/result3')
draw_tree("./output/result3", classifier3, X_train)

In [19]:
# Run 4: Gini criterion, max depth 5. Fit, export the predictions to CSV and render the tree.
classifier4, model4, prediction4, result4 = train_model('gini', 5, X_train, y_train, X_test, y_index)
export_csv(result4, './output/result4')
draw_tree("./output/result4", classifier4, X_train)