In [1]:
#Importing libraries
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
import pandas as pd



In [2]:
#Import the csv documnet
titanic_df = pd.read_csv("titanic.csv")
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
#Drop whichever column is unnessesery for your data 
titanic_df.drop("Cabin", axis=1, inplace=True)
titanic_df.drop("Ticket", axis=1, inplace=True)
titanic_df.drop("SibSp", axis=1, inplace=True)
titanic_df.drop("Parch", axis=1, inplace=True)
titanic_df.drop("PassengerId", axis=1, inplace=True)
titanic_df.drop("Name", axis=1, inplace=True)

In [4]:
titanic_df["Sex"].unique()

titanic_df = pd.get_dummies(titanic_df, prefix="Sex", columns=["Sex"])
titanic_df.head()

Unnamed: 0,Survived,Pclass,Age,Fare,Embarked,Sex_female,Sex_male
0,0,3,22.0,7.25,S,False,True
1,1,1,38.0,71.2833,C,True,False
2,1,3,26.0,7.925,S,True,False
3,1,1,35.0,53.1,S,True,False
4,0,3,35.0,8.05,S,False,True


In [5]:
titanic_df = pd.get_dummies(titanic_df, prefix="Embarked", columns=["Embarked"])
titanic_df.head()

Unnamed: 0,Survived,Pclass,Age,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,7.25,False,True,False,False,True
1,1,1,38.0,71.2833,True,False,True,False,False
2,1,3,26.0,7.925,True,False,False,False,True
3,1,1,35.0,53.1,True,False,False,False,True
4,0,3,35.0,8.05,False,True,False,False,True


In [6]:
#Check the columns for the empty values 
titanic_df.isnull().sum()

Survived        0
Pclass          0
Age           177
Fare            0
Sex_female      0
Sex_male        0
Embarked_C      0
Embarked_Q      0
Embarked_S      0
dtype: int64

In [7]:
#Remove any null values from the identified columns above 
titanic_df = titanic_df.dropna(subset=["Age"])
#Check the columns once again for any empty values 
titanic_df.isnull().sum()

Survived      0
Pclass        0
Age           0
Fare          0
Sex_female    0
Sex_male      0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [8]:
#Identify the X and y values 
X = titanic_df.iloc[:,[1, 2, 3, 4, 5, 6, 7, 8]].values
y = titanic_df.iloc[:,0].values
r = 7

In [9]:
#Split the identified values into test, train and development data sets 
X_train_full, X_test, y_train_full, y_test = train_test_split(X, 
                y, test_size=0.25,random_state=r)
X_train, X_dev, y_train, y_dev = train_test_split(X_train_full, 
                y_train_full, test_size=0.25,random_state=r)

### Tree models

In [10]:
#Train the tree model without pruning or max_depth
unristricted_depth_tree = DecisionTreeClassifier(max_depth=None,random_state=r)
unristricted_depth_tree.fit(X_train, y_train)
print("Accuracy (base):", unristricted_depth_tree.score(X_test, y_test))
print("At depth:", unristricted_depth_tree.tree_.max_depth)

Accuracy (base): 0.7486033519553073
At depth: 19


In [11]:
#Bagged tree model
bagged_tree = BaggingClassifier(base_estimator=unristricted_depth_tree, n_estimators=100, random_state=r)
bagged_tree.fit(X_train, y_train)
print("Accuracy:", bagged_tree.score(X_test, y_test))



Accuracy: 0.7653631284916201


In [12]:
#Boosted tree model
boosted_tree = AdaBoostClassifier(base_estimator=unristricted_depth_tree, n_estimators=100, random_state=r)
boosted_tree.fit(X_train, y_train)
print("Accuracy:", boosted_tree.score(X_test, y_test))



Accuracy: 0.7318435754189944


In [13]:
#Random Forest Tree 
random_forest_tree = RandomForestClassifier(n_estimators=100, random_state=r)
random_forest_tree.fit(X_train,y_train)
random_forest_tree_imp = pd.Series(random_forest_tree.feature_importances_).sort_values(ascending=False)
random_forest_tree_imp

1    0.292160
2    0.275231
3    0.172329
4    0.131825
0    0.096247
5    0.013388
7    0.012483
6    0.006338
dtype: float64

In [14]:
#Select important features
X = X[:, :5]
X
#Use those features to train the model 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25) 
unristricted_depth_tree.fit(X_train,y_train)
random_forest_tree.fit(X_train,y_train)

print("Accuracy of a model trained on all variables in the modified titanic dataset:",unristricted_depth_tree.score(X_test, y_test))
print("Accuracy of a model trained only on the features that contribute the most to the passenger's survival probability:",random_forest_tree.score(X_test, y_test))

Accuracy of a model trained on all variables in the modified titanic dataset: 0.7597765363128491
Accuracy of a model trained only on the features that contribute the most to the passenger's survival probability: 0.8156424581005587


### Bagged tree model tunning (max_depth and n_estimators)

In [15]:
#Train the tree model at a max_depth of 6
unristricted_depth_tree_1 = DecisionTreeClassifier(max_depth=6,random_state=r)
unristricted_depth_tree_1.fit(X_train, y_train)

In [16]:
#Bagged tree model tunning
bagged_tree = BaggingClassifier(base_estimator = unristricted_depth_tree_1, n_estimators = 200, random_state = r)
bagged_tree.fit(X_train, y_train)
print("Accuracy:", bagged_tree.score(X_test, y_test))



Accuracy ensemble: 0.8100558659217877


In [17]:
#Train the tree model at a max_depth of 8
unristricted_depth_tree_2 = DecisionTreeClassifier(max_depth=8,random_state=r)
unristricted_depth_tree_2.fit(X_train, y_train)

In [18]:
#Bagged tree model tunning
bagged_tree = BaggingClassifier(base_estimator = unristricted_depth_tree_2, n_estimators = 400, random_state = r)
bagged_tree.fit(X_train, y_train)
print("Accuracy:", bagged_tree.score(X_test, y_test))



Accuracy ensemble: 0.8044692737430168


In [19]:
#Train the tree model at a max_depth of 10
unristricted_depth_tree_3 = DecisionTreeClassifier(max_depth=10,random_state=r)
unristricted_depth_tree_3.fit(X_train, y_train)

In [20]:
#Bagged tree model tunning
bagged_tree = BaggingClassifier(base_estimator = unristricted_depth_tree_3, n_estimators = 600, random_state = r)
bagged_tree.fit(X_train, y_train)
print("Accuracy:", bagged_tree.score(X_test, y_test))



Accuracy ensemble: 0.8212290502793296


From the tunning of the Bagged tree model, it can be noticed that upon the increase of both the max_depth and the n_estimators values, the resulting accuracy value showed a positive correlation and increased as well. The highest accuracy value was reached when the max_depth = 10 and the n_estimators = 600, with the accuracy equaling to 0.82, the highest amoung previous models (1- max_depth = 6 and n_estimators = 200; 2 - max_depth = 8 and n_estimators = 400).