# 13. Machine learning techniques

### Importing Packages

In [56]:
import pandas as pd
import matplotlib.pyplot as plt

In [57]:
import random as rd
rd.seed(0)

## 13.1 Loading and exploring the dataset

For the next two cells, please run only one of them.
- Run the first cell if you cloned the Github Repo
- Run the second cell if you opened this as a Google Colab

In [None]:
# IMPORTANT: ONLY RUN THIS CELL IF YOU HAVE CLONED THE REPO
raw_data = pandas.read_csv('./titanic.csv')
raw_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [58]:
# IMPORTANT: ONLY RUN THIS CELL IF YOU ARE WORKING ON A COLAB

url = "https://raw.githubusercontent.com/luisguiserrano/manning/master/Chapter_13_End_to_end_example/titanic.csv"
raw_data = pd.read_csv(url)
raw_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Next, we can explore the dataset.

In [59]:
# Examining the length of the dataset
print("The dataset has", len(raw_data), "rows")

The dataset has 891 rows


In [60]:
# Examining the columns in the dataset
print("Columns (features of the dataset)")
raw_data.columns

Columns (features of the dataset)


Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [61]:
# Examining the labels
print("Labels")
raw_data["Survived"]

Labels


Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


In [62]:
# Examining how many passengers survived
print(sum(raw_data['Survived']),'passengers survived out of',len(raw_data))

342 passengers survived out of 891


In [63]:
# One can look at several columns together
raw_data[["Name", "Age"]]

Unnamed: 0,Name,Age
0,"Braund, Mr. Owen Harris",22.0
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0
2,"Heikkinen, Miss. Laina",26.0
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0
4,"Allen, Mr. William Henry",35.0
...,...,...
886,"Montvila, Rev. Juozas",27.0
887,"Graham, Miss. Margaret Edith",19.0
888,"Johnston, Miss. Catherine Helen ""Carrie""",
889,"Behr, Mr. Karl Howell",26.0


## 13.2. Cleaning up the data

Now, let's look at how many columns have missing data

In [64]:
raw_data.isna().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


The Cabin column is missing too many values to be useful. Let's drop it altogether.

In [65]:
raw_data['Cabin']

Unnamed: 0,Cabin
0,
1,C85
2,
3,C123
4,
...,...
886,
887,B42
888,
889,C148


In [66]:
print("The Cabin column is missing", sum(raw_data['Cabin'].isna()), "values out of",len(raw_data['Cabin']))

The Cabin column is missing 687 values out of 891


In [67]:
preprocessed_data = raw_data.drop('Cabin', axis=1)

In [68]:
preprocessed_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


Other columns such as Age or Embarked are missing some values, but they can still be useful.

For the age column, let's fill in the missing values with the median of all ages.

For the Embarked column, let's make a new category called 'U', for Unknown port of embarkment.

In [69]:
preprocessed_data['Age']

Unnamed: 0,Age
0,22.0
1,38.0
2,26.0
3,35.0
4,35.0
...,...
886,27.0
887,19.0
888,
889,26.0


In [70]:
median_age = raw_data["Age"].median()
median_age

28.0

In [71]:
preprocessed_data["Age"] = preprocessed_data["Age"].fillna(median_age)

In [72]:
preprocessed_data["Embarked"] = preprocessed_data["Embarked"].fillna('U')

In [73]:
preprocessed_data.isna().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,0
SibSp,0
Parch,0
Ticket,0
Fare,0


In [74]:
preprocessed_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


## 12.3 Manipulating the features

- One-hot encoding
- Binning
- Feature selection

### 13.3.1 One-hot encoding

One-hot encoding the gender feature

In [75]:
gender_columns = pandas.get_dummies(preprocessed_data['Sex'], prefix='Sex')
print(gender_columns)
embarked_columns = pandas.get_dummies(preprocessed_data["Embarked"], prefix="Embarked")
print(embarked_columns)

     Sex_female  Sex_male
0         False      True
1          True     False
2          True     False
3          True     False
4         False      True
..          ...       ...
886       False      True
887        True     False
888        True     False
889       False      True
890       False      True

[891 rows x 2 columns]
     Embarked_C  Embarked_Q  Embarked_S  Embarked_U
0         False       False        True       False
1          True       False       False       False
2         False       False        True       False
3         False       False        True       False
4         False       False        True       False
..          ...         ...         ...         ...
886       False       False        True       False
887       False       False        True       False
888       False       False        True       False
889        True       False       False       False
890       False        True       False       False

[891 rows x 4 columns]


In [76]:
preprocessed_data = pandas.concat([preprocessed_data, gender_columns], axis=1)
preprocessed_data = pandas.concat([preprocessed_data, embarked_columns], axis=1)

In [77]:
preprocessed_data = preprocessed_data.drop(['Sex', 'Embarked'], axis=1)

In [78]:
preprocessed_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,False,True,False,False,True,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,True,False,True,False,False,False
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,True,False,False,False,True,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,True,False,False,False,True,False
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,False,True,False,False,True,False


### A rule of thumb for when to one-hot encode or not

In [79]:
class_survived = preprocessed_data[['Pclass', 'Survived']]

first_class = class_survived[class_survived['Pclass'] == 1]
second_class = class_survived[class_survived['Pclass'] == 2]
third_class = class_survived[class_survived['Pclass'] == 3]

print("In first class", sum(first_class['Survived'])/len(first_class)*100, "% of passengers survived")
print("In second class", sum(second_class['Survived'])/len(second_class)*100, "% of passengers survived")
print("In third class", sum(third_class['Survived'])/len(third_class)*100, "% of passengers survived")

In first class 62.96296296296296 % of passengers survived
In second class 47.28260869565217 % of passengers survived
In third class 24.236252545824847 % of passengers survived


In [80]:
categorized_pclass_columns = pandas.get_dummies(preprocessed_data['Pclass'], prefix='Pclass')
preprocessed_data = pandas.concat([preprocessed_data, categorized_pclass_columns], axis=1)
preprocessed_data = preprocessed_data.drop(['Pclass'], axis=1)

In [81]:
preprocessed_data.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,Pclass_1,Pclass_2,Pclass_3
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,False,True,False,False,True,False,False,False,True
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,True,False,True,False,False,False,True,False,False
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,True,False,False,False,True,False,False,False,True
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,True,False,False,False,True,False,True,False,False
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,False,True,False,False,True,False,False,False,True


### 13.3.3 Binning

In [82]:
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80]
categorized_age = pandas.cut(preprocessed_data['Age'], bins)
preprocessed_data['Categorized_age'] = categorized_age
preprocessed_data = preprocessed_data.drop(["Age"], axis=1)

In [83]:
preprocessed_data.head()

Unnamed: 0,PassengerId,Survived,Name,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,Pclass_1,Pclass_2,Pclass_3,Categorized_age
0,1,0,"Braund, Mr. Owen Harris",1,0,A/5 21171,7.25,False,True,False,False,True,False,False,False,True,"(20, 30]"
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0,PC 17599,71.2833,True,False,True,False,False,False,True,False,False,"(30, 40]"
2,3,1,"Heikkinen, Miss. Laina",0,0,STON/O2. 3101282,7.925,True,False,False,False,True,False,False,False,True,"(20, 30]"
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0,113803,53.1,True,False,False,False,True,False,True,False,False,"(30, 40]"
4,5,0,"Allen, Mr. William Henry",0,0,373450,8.05,False,True,False,False,True,False,False,False,True,"(30, 40]"


In [84]:
cagegorized_age_columns = pandas.get_dummies(preprocessed_data['Categorized_age'], prefix='Categorized_age')
preprocessed_data = pandas.concat([preprocessed_data, cagegorized_age_columns], axis=1)
preprocessed_data = preprocessed_data.drop(['Categorized_age'], axis=1)

In [85]:
preprocessed_data.head()

Unnamed: 0,PassengerId,Survived,Name,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Embarked_C,...,Pclass_2,Pclass_3,"Categorized_age_(0, 10]","Categorized_age_(10, 20]","Categorized_age_(20, 30]","Categorized_age_(30, 40]","Categorized_age_(40, 50]","Categorized_age_(50, 60]","Categorized_age_(60, 70]","Categorized_age_(70, 80]"
0,1,0,"Braund, Mr. Owen Harris",1,0,A/5 21171,7.25,False,True,False,...,False,True,False,False,True,False,False,False,False,False
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0,PC 17599,71.2833,True,False,True,...,False,False,False,False,False,True,False,False,False,False
2,3,1,"Heikkinen, Miss. Laina",0,0,STON/O2. 3101282,7.925,True,False,False,...,False,True,False,False,True,False,False,False,False,False
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0,113803,53.1,True,False,False,...,False,False,False,False,False,True,False,False,False,False
4,5,0,"Allen, Mr. William Henry",0,0,373450,8.05,False,True,False,...,False,True,False,False,False,True,False,False,False,False


### 13.3.3 Feature selection

In [86]:
preprocessed_data = preprocessed_data.drop(['Name', 'Ticket', 'PassengerId'], axis=1)

In [87]:
preprocessed_data.head()

Unnamed: 0,Survived,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,...,Pclass_2,Pclass_3,"Categorized_age_(0, 10]","Categorized_age_(10, 20]","Categorized_age_(20, 30]","Categorized_age_(30, 40]","Categorized_age_(40, 50]","Categorized_age_(50, 60]","Categorized_age_(60, 70]","Categorized_age_(70, 80]"
0,0,1,0,7.25,False,True,False,False,True,False,...,False,True,False,False,True,False,False,False,False,False
1,1,1,0,71.2833,True,False,True,False,False,False,...,False,False,False,False,False,True,False,False,False,False
2,1,0,0,7.925,True,False,False,False,True,False,...,False,True,False,False,True,False,False,False,False,False
3,1,1,0,53.1,True,False,False,False,True,False,...,False,False,False,False,False,True,False,False,False,False
4,0,0,0,8.05,False,True,False,False,True,False,...,False,True,False,False,False,True,False,False,False,False


# 13.4 Training models

### 13.4.1 Features-labels split and train-validation split

In [89]:
data = preprocessed_data

In [90]:
features = data.drop(["Survived"], axis=1)
labels = data["Survived"]

In [91]:
from sklearn.model_selection import train_test_split

In [92]:
# remark: we fix random_state the end, to make sure we always get the same split
features_train, features_validation_test, labels_train, labels_validation_test = train_test_split(
    features, labels, test_size=0.4, random_state=100)

In [93]:
features_validation, features_test, labels_validation, labels_test = train_test_split(
    features_validation_test, labels_validation_test, test_size=0.5, random_state=100)

In [94]:
print(len(features_train))
print(len(features_validation))
print(len(features_test))
print(len(labels_train))
print(len(labels_validation))
print(len(labels_test))

534
178
179
534
178
179


### 13.4.2 Training different models on our dataset

We'll train four models:
- Logistic regression (perceptron)
- Decision tree
- Naive Bayes
- Support vector machine (SVM)

In [95]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression()
lr_model.fit(features_train, labels_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [96]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()
dt_model.fit(features_train, labels_train)

In [97]:
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()
nb_model.fit(features_train, labels_train)

In [98]:
from sklearn.svm import SVC

svm_model = SVC()
svm_model.fit(features_train, labels_train)

In [99]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(features_train, labels_train)

In [100]:
from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier()
gb_model.fit(features_train, labels_train)

In [101]:
from sklearn.ensemble import AdaBoostClassifier

ab_model = AdaBoostClassifier()
ab_model.fit(features_train, labels_train)

### 13.4.3 Evaluating the models

#### Accuracy

In [102]:
print("Scores of the models")
print("Logistic regression:", lr_model.score(features_validation, labels_validation))
print("Decision tree:", dt_model.score(features_validation, labels_validation))
print("Naive Bayes:", nb_model.score(features_validation, labels_validation))
print("SVM:", svm_model.score(features_validation, labels_validation))
print("Random forest:", rf_model.score(features_validation, labels_validation))
print("Gradient boosting:", gb_model.score(features_validation, labels_validation))
print("AdaBoost:", ab_model.score(features_validation, labels_validation))

Scores of the models
Logistic regression: 0.7696629213483146
Decision tree: 0.7696629213483146
Naive Bayes: 0.7471910112359551
SVM: 0.6797752808988764
Random forest: 0.7696629213483146
Gradient boosting: 0.8089887640449438
AdaBoost: 0.7359550561797753


#### F1-score

In [103]:
from sklearn.metrics import f1_score

print("F1-scores of the models:")

lr_predicted_labels = lr_model.predict(features_validation)
print("Logistic regression:", f1_score(labels_validation, lr_predicted_labels))

dt_predicted_labels = dt_model.predict(features_validation)
print("Decision Tree:", f1_score(labels_validation, dt_predicted_labels))

nb_predicted_labels = nb_model.predict(features_validation)
print("Naive Bayes:", f1_score(labels_validation, nb_predicted_labels))

svm_predicted_labels = svm_model.predict(features_validation)
print("Support Vector Machine:", f1_score(labels_validation, svm_predicted_labels))

rf_predicted_labels = rf_model.predict(features_validation)
print("Random Forest:", f1_score(labels_validation, rf_predicted_labels))

gb_predicted_labels = gb_model.predict(features_validation)
print("Gradient boosting:", f1_score(labels_validation, gb_predicted_labels))

ab_predicted_labels = ab_model.predict(features_validation)
print("AdaBoost:", f1_score(labels_validation, ab_predicted_labels))

F1-scores of the models:
Logistic regression: 0.6870229007633588
Decision Tree: 0.6917293233082706
Naive Bayes: 0.6808510638297872
Support Vector Machine: 0.4
Random Forest: 0.6917293233082706
Gradient boosting: 0.7384615384615385
AdaBoost: 0.6466165413533834


### 13.4.4 Testing the model

Finding the accuracy and the F1-score of the model in the testing set.

In [104]:
gb_model.score(features_test, labels_test)

0.8324022346368715

In [105]:
gb_predicted_test_labels = gb_model.predict(features_test)
f1_score(labels_test, gb_predicted_test_labels)

0.8026315789473685

# 13.5 Grid search

In [106]:
from sklearn.model_selection import GridSearchCV

In [107]:
# Grid search with an rbf kernel

print("SVM grid search with a radial basis function kernel")

# rbf, C=1, gamma=0.1
svm_1_01 = SVC(kernel='rbf', C=1, gamma=0.1)
svm_1_01.fit(features_train, labels_train)
print("C=1, gamma=0.1", svm_1_01.score(features_validation, labels_validation))

# rbf, C=1, gamma=1
svm_1_1 = SVC(kernel='rbf', C=1, gamma=1)
svm_1_1.fit(features_train, labels_train)
print("C=1, gamma=1", svm_1_1.score(features_validation, labels_validation))

# rbf, C=1, gamma=10
svm_1_10 = SVC(kernel='rbf', C=1, gamma=10)
svm_1_10.fit(features_train, labels_train)
print("C=1, gamma=10", svm_1_10.score(features_validation, labels_validation))

# rbf, C=10, gamma=0.1
svm_10_01 = SVC(kernel='rbf', C=10, gamma=0.1)
svm_10_01.fit(features_train, labels_train)
print("C=10, gamma=0.1", svm_10_01.score(features_validation, labels_validation))

# rbf, C=10, gamma=1
svm_10_1 = SVC(kernel='rbf', C=10, gamma=1)
svm_10_1.fit(features_train, labels_train)
print("C=10, gamma=1", svm_10_1.score(features_validation, labels_validation))

# rbf, C=10, gamma=10
svm_10_10 = SVC(kernel='rbf', C=10, gamma=10)
svm_10_10.fit(features_train, labels_train)
print("C=10, gamma=10", svm_10_10.score(features_validation, labels_validation))

SVM grid search with a radial basis function kernel
C=1, gamma=0.1 0.702247191011236
C=1, gamma=1 0.6966292134831461
C=1, gamma=10 0.6685393258426966
C=10, gamma=0.1 0.7247191011235955
C=10, gamma=1 0.6910112359550562
C=10, gamma=10 0.651685393258427


In [108]:
svm_parameters = {'kernel': ['rbf'],
                  'C': [0.01, 0.1, 1 , 10, 100],
                  'gamma': [0.01, 0.1, 1, 10, 100]
                }
svm = SVC()
svm_gs = GridSearchCV(estimator = svm,
                      param_grid = svm_parameters)
svm_gs.fit(features_train, labels_train)

svm_winner = svm_gs.best_estimator_
svm_winner

svm_winner.score(features_validation, labels_validation)

0.7191011235955056

In [109]:
svm_winner

# 13.6 Cross validation

In [110]:
svm_gs.cv_results_

{'mean_fit_time': array([0.01075692, 0.00999722, 0.01085267, 0.01248336, 0.0110374 ,
        0.00947075, 0.01015854, 0.01137509, 0.01363521, 0.01234083,
        0.01018143, 0.01105995, 0.01169319, 0.01235037, 0.01154022,
        0.01241469, 0.0121304 , 0.01409969, 0.01340399, 0.01202645,
        0.01706409, 0.01779289, 0.01294398, 0.01360765, 0.01277399]),
 'std_fit_time': array([2.15020543e-03, 3.30988764e-04, 2.00545124e-04, 9.92893673e-04,
        3.00044355e-04, 1.18385463e-04, 9.31526743e-05, 3.03970041e-04,
        3.24536497e-03, 1.60259227e-03, 1.19222227e-03, 1.63469584e-04,
        2.72955493e-04, 2.54539293e-04, 1.95914592e-04, 1.73901447e-03,
        9.09562443e-04, 7.67016938e-04, 5.98716888e-04, 3.28605352e-04,
        4.26935804e-03, 2.08964493e-03, 5.30501922e-04, 6.25691060e-04,
        5.98487813e-04]),
 'mean_score_time': array([0.00495477, 0.00490875, 0.0049468 , 0.00565486, 0.00506916,
        0.00453844, 0.00469518, 0.00500827, 0.00595551, 0.00516267,
        0.00

# Exercise 13.1

In [113]:
# IMPORTANT: ONLY RUN THIS CELL IF YOU HAVE CLONED THE REPO
test_data = pd.read_csv('test.csv')

In [116]:
# IMPORTANT: ONLY RUN THIS CELL IF YOU ARE WORKING ON A COLAB
url = "https://raw.githubusercontent.com/luisguiserrano/manning/master/Chapter_13_End_to_end_example/test.csv"
test_data = pd.read_csv(url)

In [117]:
test_data.isna().sum()

Unnamed: 0,0
PassengerId,0
Pclass,0
Name,0
Sex,0
Age,86
SibSp,0
Parch,0
Ticket,0
Fare,1
Cabin,327


In [118]:
# Cleaning the data
test_data = test_data.drop('Cabin', axis=1)
test_data["Age"] = test_data["Age"].fillna(28.0)

# Catch! The test data has one missing fare. Let's fix that
average_fare = test_data["Fare"].mean()
test_data['Fare'] = test_data['Fare'].fillna(average_fare)

# Preprocessing the data
test_gender_columns = pandas.get_dummies(test_data['Sex'], prefix='Sex')
test_embarked_columns = pandas.get_dummies(test_data["Embarked"], prefix="Embarked")
test_data = pandas.concat([test_data, test_gender_columns], axis=1)
test_data = pandas.concat([test_data, test_embarked_columns], axis=1)
test_data = test_data.drop(['Sex', 'Embarked'], axis=1)

# Another small catch, the test data has no missing 'Embarked' fields. Therefore, the processed test data will not
# have an 'Embarked_Q' column. We need to artificially add one filled with zeros.
test_data['Embarked_U'] = pandas.DataFrame([0 for i in range(len(test_data))])

test_categorized_pclass_columns = pandas.get_dummies(test_data['Pclass'], prefix='Pclass')
test_data = pandas.concat([test_data, test_categorized_pclass_columns], axis=1)
test_data = test_data.drop(['Pclass'], axis=1)

bins = [0, 10, 20, 30, 40, 50, 60, 70, 80]
test_categorized_age = pandas.cut(test_data['Age'], bins)
test_data['Categorized_age'] = categorized_age
test_data = test_data.drop(["Age"], axis=1)

test_cagegorized_age_columns = pandas.get_dummies(test_data['Categorized_age'], prefix='Categorized_age')
test_data = pandas.concat([test_data, test_cagegorized_age_columns], axis=1)
test_data = test_data.drop(['Categorized_age'], axis=1)

test_data = test_data.drop(['Name', 'Ticket', 'PassengerId'], axis=1)
test_data

Unnamed: 0,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,Pclass_1,Pclass_2,Pclass_3,"Categorized_age_(0, 10]","Categorized_age_(10, 20]","Categorized_age_(20, 30]","Categorized_age_(30, 40]","Categorized_age_(40, 50]","Categorized_age_(50, 60]","Categorized_age_(60, 70]","Categorized_age_(70, 80]"
0,0,0,7.8292,False,True,False,True,False,0,False,False,True,False,False,True,False,False,False,False,False
1,1,0,7.0000,True,False,False,False,True,0,False,False,True,False,False,False,True,False,False,False,False
2,0,0,9.6875,False,True,False,True,False,0,False,True,False,False,False,True,False,False,False,False,False
3,0,0,8.6625,False,True,False,False,True,0,False,False,True,False,False,False,True,False,False,False,False
4,1,1,12.2875,True,False,False,False,True,0,False,False,True,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,0,0,8.0500,False,True,False,False,True,0,False,False,True,False,False,True,False,False,False,False,False
414,0,0,108.9000,True,False,True,False,False,0,True,False,False,False,False,False,False,True,False,False,False
415,0,0,7.2500,False,True,False,False,True,0,False,False,True,False,False,True,False,False,False,False,False
416,0,0,8.0500,False,True,False,False,True,0,False,False,True,False,False,False,True,False,False,False,False


Now, to check how many survivors were predicted by each model

In [119]:
# Logistic regression
sum(lr_model.predict(test_data))

np.int64(153)

In [120]:
# Decision tree
sum(dt_model.predict(test_data))

np.int64(160)

In [121]:
# Naive Bayes
sum(nb_model.predict(test_data))

np.int64(195)

In [122]:
# Support vector machine
sum(svm_model.predict(test_data))

np.int64(61)

In [123]:
# Random forest
sum(rf_model.predict(test_data))

np.int64(153)

In [124]:
# Gradient boosting
sum(gb_model.predict(test_data))

np.int64(156)

In [125]:
# AdaBoost
sum(ab_model.predict(test_data))

np.int64(151)

Since the three strongest models in terms of accuracy were random forests, gradient boosting, and adaboost, and they predicted that 154, 156, and 155 passengers survived out of the 418 in the test set, a good estimate for the number of survivors is the average of these three predictions, or 155.