Decision Trees

In [2]:
#Initial Imports

import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import pickle

In [3]:
# Read the Titanic CSV (Windows-1252 encoded) and preview the first rows

file_path = Path('../Resources/titanic.csv')
df_titanic = pd.read_csv(
    file_path,
    encoding='windows-1252',
)
df_titanic.head()

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
0,"Abbing, Mr. Anthony",male,42.0,3rd,S,United States,5547.0,7.11,0.0,0.0,no
1,"Abbott, Mr. Eugene Joseph",male,13.0,3rd,S,United States,2673.0,20.05,0.0,2.0,no
2,"Abbott, Mr. Rossmore Edward",male,16.0,3rd,S,United States,2673.0,20.05,1.0,1.0,no
3,"Abbott, Mrs. Rhoda Mary 'Rosa'",female,39.0,3rd,S,England,2673.0,20.05,1.0,1.0,yes
4,"Abelseth, Miss. Karen Marie",female,16.0,3rd,S,Norway,348125.0,7.13,0.0,0.0,yes


In [4]:
# Collapse the four crew job categories into a single 'crew' label
# (passenger classes such as '3rd' are not in the mapping and stay as they are).
mapping = dict.fromkeys(
    ['victualling crew', 'engineering crew', 'restaurant staff', 'deck crew'],
    'crew',
)
# Apply the crew consolidation to the 'class' column only, producing a new frame
df_t = df_titanic.replace(to_replace={'class': mapping})
df_t.head()

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
0,"Abbing, Mr. Anthony",male,42.0,3rd,S,United States,5547.0,7.11,0.0,0.0,no
1,"Abbott, Mr. Eugene Joseph",male,13.0,3rd,S,United States,2673.0,20.05,0.0,2.0,no
2,"Abbott, Mr. Rossmore Edward",male,16.0,3rd,S,United States,2673.0,20.05,1.0,1.0,no
3,"Abbott, Mrs. Rhoda Mary 'Rosa'",female,39.0,3rd,S,England,2673.0,20.05,1.0,1.0,yes
4,"Abelseth, Miss. Karen Marie",female,16.0,3rd,S,Norway,348125.0,7.13,0.0,0.0,yes


# Drop Columns that are not needed. 
- Name is not needed so we will drop that.
- Ticket Number won't add value to our analysis so we will drop that as well.

In [5]:
# Remove the passenger name and ticket number columns
df_t = df_t.drop(['name', 'ticketno'], axis=1)
df_t.head()

Unnamed: 0,gender,age,class,embarked,country,fare,sibsp,parch,survived
0,male,42.0,3rd,S,United States,7.11,0.0,0.0,no
1,male,13.0,3rd,S,United States,20.05,0.0,2.0,no
2,male,16.0,3rd,S,United States,20.05,1.0,1.0,no
3,female,39.0,3rd,S,England,20.05,1.0,1.0,yes
4,female,16.0,3rd,S,Norway,7.13,0.0,0.0,yes


# Adjust gender column
## Keep 'gender' column, but change the strings 'female' and 'male' to numerical 0 and 1, to allow machine learning to work on this column.

In [6]:
# Encode gender as an integer flag: female -> 0, male -> 1
gender = {'female': 0, 'male': 1}
df_t.replace({'gender': gender}, inplace=True)
# Only the last expression of a cell is rendered, so a bare `df_t.head()`
# placed before `len(df_t)` is silently discarded. Show the preview explicitly.
display(df_t.head())
len(df_t)

2207

# Adjust survived column
## Keep 'survived' column, but much like 'gender', change the strings 'no' and 'yes' to numerical 0 and 1, to allow machine learning to work on this column as well.

In [7]:
# Encode the target as an integer flag: no -> 0, yes -> 1
survival = dict(no=0, yes=1)
df_t.replace(to_replace={'survived': survival}, inplace=True)
df_t.head()

Unnamed: 0,gender,age,class,embarked,country,fare,sibsp,parch,survived
0,1,42.0,3rd,S,United States,7.11,0.0,0.0,0
1,1,13.0,3rd,S,United States,20.05,0.0,2.0,0
2,1,16.0,3rd,S,United States,20.05,1.0,1.0,0
3,0,39.0,3rd,S,England,20.05,1.0,1.0,1
4,0,16.0,3rd,S,Norway,7.13,0.0,0.0,1


# Split Countries into groups based on geographical region
## We will keep any countries with 50 or more passengers, and group all countries with fewer than 50 passengers into the following regions:
- Europe: 'EUR'
- Asia: 'ASA'
- Australia/Oceania: 'AUS'
- North America: 'NAM'
- South America: 'SAM'
- Africa: 'AFR'

## Note the countries that remain as is (>=50 Passengers) are:
- England: 'ENG'
- United States: 'USA'
- Ireland: 'IRL'
- Sweden: 'SWE'
- Lebanon: 'LBN'
- Finland: 'FIN'

## This gives a total of 12 country codes (the 6 regional groups plus the 6 countries kept as-is).

In [8]:
# Country -> code lookup, written grouped by target code and then inverted.
# The six countries that are kept as-is each get their own code; every other
# country is folded into its regional code.
_members_by_code = {
    'ENG': ['England'],
    'USA': ['United States'],
    'IRL': ['Ireland'],
    'SWE': ['Sweden'],
    'LBN': ['Lebanon'],
    'FIN': ['Finland'],
    'EUR': [
        'Scotland', 'France', 'Norway', 'Belgium', 'Northern Ireland', 'Wales',
        'Bulgaria', 'Switzerland', 'Channel Islands', 'Croatia (Modern)',
        'Croatia', 'Italy', 'Spain', 'Hungary', 'Denmark', 'Germany', 'Bosnia',
        'Slovenia', 'Poland', 'Austria', 'Greece', 'Netherlands', 'Russia',
        'Slovakia (Modern day)', 'Latvia', 'Yugoslavia',
    ],
    'NAM': ['Canada', 'Mexico', 'Cuba'],
    'ASA': ['India', 'Turkey', 'Siam', 'Syria', 'Japan', 'China/Hong Kong'],
    'SAM': ['Argentina', 'Uruguay', 'Peru', 'Guyana'],
    'AFR': ['South Africa', 'Egypt'],
    'AUS': ['Australia'],
}
countrycode = {
    country: code
    for code, members in _members_by_code.items()
    for country in members
}
# Rewrite the 'country' column in place using the country -> code lookup
df_t.replace(to_replace={'country': countrycode}, inplace=True)
df_t.head()

Unnamed: 0,gender,age,class,embarked,country,fare,sibsp,parch,survived
0,1,42.0,3rd,S,USA,7.11,0.0,0.0,0
1,1,13.0,3rd,S,USA,20.05,0.0,2.0,0
2,1,16.0,3rd,S,USA,20.05,1.0,1.0,0
3,0,39.0,3rd,S,ENG,20.05,1.0,1.0,1
4,0,16.0,3rd,S,EUR,7.13,0.0,0.0,1


In [9]:
# Inspect column dtypes to see which columns are still non-numeric (object)
df_t.dtypes

gender        int64
age         float64
class        object
embarked     object
country      object
fare        float64
sibsp       float64
parch       float64
survived      int64
dtype: object

In [10]:
# One-hot encode the remaining categorical columns (class, embarked, country)
df_t = pd.get_dummies(data=df_t)
df_t.head()

Unnamed: 0,gender,age,fare,sibsp,parch,survived,class_1st,class_2nd,class_3rd,class_crew,...,country_AUS,country_ENG,country_EUR,country_FIN,country_IRL,country_LBN,country_NAM,country_SAM,country_SWE,country_USA
0,1,42.0,7.11,0.0,0.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,1,13.0,20.05,0.0,2.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,1,16.0,20.05,1.0,1.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,0,39.0,20.05,1.0,1.0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
4,0,16.0,7.13,0.0,0.0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0


In [11]:
# Count missing values per column
df_t.isna().sum()

gender           0
age              2
fare           916
sibsp          900
parch          900
survived         0
class_1st        0
class_2nd        0
class_3rd        0
class_crew       0
embarked_B       0
embarked_C       0
embarked_Q       0
embarked_S       0
country_AFR      0
country_ASA      0
country_AUS      0
country_ENG      0
country_EUR      0
country_FIN      0
country_IRL      0
country_LBN      0
country_NAM      0
country_SAM      0
country_SWE      0
country_USA      0
dtype: int64

In [12]:
# fare, sibsp and parch are each missing for roughly 900 rows, so they are
# excluded from the model rather than imputed.
df_t.drop(["fare", "sibsp", "parch"], axis=1, inplace=True)
df_t.head()

Unnamed: 0,gender,age,survived,class_1st,class_2nd,class_3rd,class_crew,embarked_B,embarked_C,embarked_Q,...,country_AUS,country_ENG,country_EUR,country_FIN,country_IRL,country_LBN,country_NAM,country_SAM,country_SWE,country_USA
0,1,42.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,13.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,16.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,39.0,1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,16.0,1,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [14]:
# Remove every row that still has a missing value in any column
df_t.dropna(axis=0, how='any', inplace=True)
df_t.head()

Unnamed: 0,gender,age,survived,class_1st,class_2nd,class_3rd,class_crew,embarked_B,embarked_C,embarked_Q,...,country_AUS,country_ENG,country_EUR,country_FIN,country_IRL,country_LBN,country_NAM,country_SAM,country_SWE,country_USA
0,1,42.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,13.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,16.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,39.0,1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,16.0,1,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [15]:
# Define features set: every column except the target.
# `drop` already returns a new DataFrame, so no separate copy is needed.
X = df_t.drop(columns="survived")
# Only the last expression of a cell is rendered, so a bare `X.head()` placed
# before `len(X)` is silently discarded. Show the preview explicitly.
display(X.head())
len(X)

2205

In [16]:
# Target as a single-column 2-D array (one row per sample)
y = df_t["survived"].to_numpy().reshape(-1, 1)

In [17]:
# Partition into train and test sets using the default split proportions;
# the fixed seed keeps the partition the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [18]:
# Standardizer used to rescale the feature columns (not fitted yet)
scaler = StandardScaler()

In [19]:
# Learn the scaling statistics from the training split only
X_scaler = scaler.fit(X=X_train)

In [20]:
# Standardize both splits with the scaler fitted on the training split
X_train_scaled = X_scaler.transform(X=X_train)
X_test_scaled = X_scaler.transform(X=X_test)

In [21]:
# Same two transform calls as the preceding cell: the scaled splits are simply
# recomputed and reassigned here.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(X_train),
    X_scaler.transform(X_test),
)

## Fitting the Decision Tree Model

In [22]:
# Creating the decision tree classifier instance.
# random_state is pinned because the tree's split search involves randomness
# (features are permuted at each split, which decides ties between equally
# good splits). Without a seed, the fitted tree and the metrics reported
# further down can differ from one run to the next.
model = tree.DecisionTreeClassifier(random_state=78)

In [23]:
# Train the classifier on the standardized training split
model = model.fit(X=X_train_scaled, y=y_train)

## Making Predictions Using the Tree Model

In [24]:
# Predict survival for the standardized test split
predictions = model.predict(X=X_test_scaled)

In [25]:
# Confusion matrix wrapped in a labelled frame:
# rows are the actual class, columns are the predicted class.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

# Fraction of test rows whose predicted label matches the actual label
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [26]:
# Report the confusion matrix, the overall accuracy and the per-class metrics
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_true=y_test, y_pred=predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,328,46
Actual 1,80,98


Accuracy Score : 0.7717391304347826
Classification Report
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       374
           1       0.68      0.55      0.61       178

    accuracy                           0.77       552
   macro avg       0.74      0.71      0.72       552
weighted avg       0.76      0.77      0.76       552



In [27]:
# Persist the trained classifier.
with open('titanic_model.pkl', 'wb') as file:
    pickle.dump(model, file)

# The classifier was fitted on standardized features, so its split thresholds
# only make sense for inputs passed through the same fitted scaler. Save the
# scaler next to the model so raw rows can be transformed before prediction.
# NOTE: pickle files should only ever be loaded from a trusted source.
with open('titanic_scaler.pkl', 'wb') as file:
    pickle.dump(X_scaler, file)
    