In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [2]:
data = pd.read_csv('./Data/titanic.csv')
data.head()

Unnamed: 0,Survived,Sex,Age,Class
0,0,male,22.0,3
1,1,female,38.0,1
2,1,female,26.0,3
3,1,female,35.0,1
4,0,male,35.0,3


In [3]:
data = data[data['Age'].notnull()]

In [4]:
data = data.replace({'Sex': {'male': 0,'female': 1}})
data.head()

Unnamed: 0,Survived,Sex,Age,Class
0,0,0,22.0,3
1,1,1,38.0,1
2,1,1,26.0,3
3,1,1,35.0,1
4,0,0,35.0,3


In [5]:
#split dataset in features and target variable
feature_cols = ['Sex', 'Age', 'Class']
X = data[feature_cols] # Features
y = data.Survived # Target variable

In [6]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test

In [7]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [8]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8325581395348837


In [9]:
#Perform feature selection to make narrower tree
x, y = data.drop(columns=["Survived"]), data['Survived']

skb = SelectKBest(chi2, k=2)
skb.fit(x, y)
x_new = skb.transform(x)

selected = [x.columns[i] for i in skb.get_support(indices=True)]
print("Selected features:", ", ".join(selected))

Selected features: Sex, Age


In [10]:
#Perform feature selection to make narrower tree
#split dataset in features and target variable
feature_cols = selected
X = data[feature_cols] # Features
y = data.Survived # Target variable

In [11]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test

In [12]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [13]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7953488372093023
