In [0]:
# Import statements

import pandas as pd
import numpy as np
import random

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from IPython.core.display import display, HTML

# import matplotlib.pyplot as plt

In [2]:
!wget -nc -P decision_tree/ https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv

--2019-10-11 17:05:38--  https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv
Resolving web.stanford.edu (web.stanford.edu)... 171.67.215.200
Connecting to web.stanford.edu (web.stanford.edu)|171.67.215.200|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 44225 (43K) [text/csv]
Saving to: ‘decision_tree/titanic.csv’


2019-10-11 17:05:39 (313 KB/s) - ‘decision_tree/titanic.csv’ saved [44225/44225]



In [3]:
# Read the data.
titan = pd.read_csv("decision_tree/titanic.csv")

"""
the following part is exploratory analysis
"""
titan.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [5]:
# plot the histogram based on age only
age = np.array(titan["Age"])
hist = [go.Histogram(x = age)]
fig = go.Figure(hist)
fig.show()

In [6]:
# plot the histogram based on age and survival
age0 = np.array(titan[titan["Survived"] == 0]["Age"])
age1 = np.array(titan[titan["Survived"] == 1]["Age"])

trace1 = go.Histogram(
    x=age0,
    opacity=0.75
)
trace2 = go.Histogram(
    x=age1,
    opacity=0.75
)

data = [trace1, trace2]
layout = go.Layout(barmode='overlay')
fig = go.Figure(data=data, layout=layout)

fig.show()
# from the histogram above, we cannot really tell the difference between
# survived/deceased group

In [0]:
colors = [[0, "rgb(0,0,255)"], [0.1, "rgb(51,153,255)"], [0.2, "rgb(102,204,255)"], [0.3, "rgb(153,204,255)"], [0.4, "rgb(204,204,255)"], [0.5, "rgb(255,255,255)"], [0.6, "rgb(255,204,255)"], [0.7, "rgb(255,153,255)"], [0.8, "rgb(255,102,204)"], [0.9, "rgb(255,102,102)"], [1, "rgb(255,0,0)"]]

In [20]:
# some correlation plot
corr = np.array(titan.corr())
cols = list(titan.corr().columns)

fig = go.Figure(go.Heatmap(z = corr, x = cols, y = cols, colorscale=colors))
fig.show()
# correlation marix

In [21]:
"""
this part starts to predict
"""

titan2 = titan.drop(["Name"], axis = 1)
# convert Sex to categorical, and then integer
titan2.Sex = titan2.Sex.astype("category", ordered = True, \
    categories = titan2.Sex.unique()).cat.codes

# train test split
n = titan.shape[0]
order = random.sample(range(0,n),n)  # random shuffle
X = np.array(titan2.iloc[:,1:])
y = np.array(titan.iloc[:,0])

X_train = X[order[0:600],:]
X_test = X[order[600:],:]

y_train = y[order[0:600]]
y_test = y[order[600:]]

model = DecisionTreeClassifier()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)

# Calculate the accuracy and assign it to the variable acc.
acc = accuracy_score(y_test, y_pred)



specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead



In [22]:
print(acc)

0.7735191637630662


In [23]:
"""
adding cross-validation for better results
"""
# from sklearn.model_selection import GridSearchCV
parameters = {'max_depth':range(3,20)}
clf = GridSearchCV(DecisionTreeClassifier(),\
    parameters, n_jobs=8, cv = 10)
clf.fit(X=X_train, y=y_train)
tree_model = clf.best_estimator_
print (clf.best_score_, clf.best_params_)

0.83 {'max_depth': 3}



The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.

