# Running Multiple Classification Models using Titanic Example

See bookmarked Titanic notebook on Kaggle. From memmory, it has a for loop to run multiple classificaiton models efficiently. 

## Import Libraries

In [463]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

## Import Data

In [464]:
raw_train=pd.read_csv('train.csv')
raw_test=pd.read_csv('test.csv')

In [465]:
train=raw_train
test=raw_test

## Inspection

In [466]:
#print training dataset
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [467]:
#Get Summary Stats
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [468]:
#Get information about the Dataframe
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## Cleaning

### Missing Data

In [494]:
#Check the two rows with missing Embarked values
train[pd.isnull(train['Embarked'])]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [495]:
#Out of all the fields, only Pclass could be related to which port a passenger Embarked from if those three ports 
#are of differnt economic demographics.
#Check the distribution of Embarked across Pclass to see if Pclass 1 predominantly from a specific port.

train.groupby(['Pclass','Embarked']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin
Pclass,Embarked,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,C,85,85,85,85,74,85,85,85,85,66
1,Q,2,2,2,2,2,2,2,2,2,2
1,S,127,127,127,127,108,127,127,127,127,106
2,C,17,17,17,17,15,17,17,17,17,2
2,Q,3,3,3,3,2,3,3,3,3,1
2,S,164,164,164,164,156,164,164,164,164,13
3,C,66,66,66,66,41,66,66,66,66,1
3,Q,72,72,72,72,24,72,72,72,72,1
3,S,353,353,353,353,290,353,353,353,353,10


In [496]:
#Given that the two passengers are Pclass 1, they are most likely to have embarked from S.

In [497]:
#formally - the mode by Pclass
train.groupby('Pclass').agg(pd.Series.mode)

Unnamed: 0_level_0,Survived,Sex,SibSp,Parch,Fare,Embarked
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,male,0,0,26.55,S
2,0,male,0,0,13.0,S
3,0,male,0,0,8.05,S


In [498]:
#However the mode for Pclass 1 is the same as the mode for the whole data.
#We assign the modal value to the missing Embarked data (the [0] after mode because mode is a pandas series)
mode=train['Embarked'].mode()
train['Embarked'].fillna(mode[0],inplace=True)

In [499]:
#Show the replaced values
train[train['PassengerId'].isin([62,830])]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,S
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,S


In [500]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [501]:
#Looking at the Correlation Matrix below, it seems Age is correlated with Pclass
train.corr()

#You could also encode other categorical variables to seem them in the matrix

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [502]:
#SibSp and Parch are also correlated with Age.
#We could use Regression with Age as target and Pclass, Parch and SibSp as Features.
#https://towardsdatascience.com/predict-missing-values-in-the-dataset-897912a54b7b

#But for now we will simple assign the mean age for the Pclass of the missing data rows

In [503]:
#Average age different for different Pclass
train.groupby('Pclass').mean()

Unnamed: 0_level_0,PassengerId,Survived,Age,SibSp,Parch,Fare
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,461.597222,0.62963,38.233441,0.416667,0.356481,84.154687
2,445.956522,0.472826,29.87763,0.402174,0.380435,20.662183
3,439.154786,0.242363,25.14062,0.615071,0.393075,13.67555


In [504]:
#Replace Missing Vales of Age with Mean by Pclass
train['Age'] = train['Age'].fillna(train.groupby(['Pclass'])['Age'].transform('mean'))


#for test data
test['Age'] = test['Age'].fillna(train.groupby(['Pclass'])['Age'].transform('mean'))


In [506]:
#Replace Missing Values for Fare in test Data by Mean of Fare in train data
test['Fare'] = test['Fare'].fillna(train['Fare'].mean())

In [509]:
#Design New Feature
train['Cabin_Class'] = train['Cabin'].str[0]

#for test
test['Cabin_Class'] = test['Cabin'].str[0]

In [510]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_Class
0,1,0,3,"Braund, Mr. Owen Harris",male,22.00000,1,0,A/5 21171,7.2500,,S,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.00000,1,0,PC 17599,71.2833,C85,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.00000,0,0,STON/O2. 3101282,7.9250,,S,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.00000,1,0,113803,53.1000,C123,S,C
4,5,0,3,"Allen, Mr. William Henry",male,35.00000,0,0,373450,8.0500,,S,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.00000,0,0,211536,13.0000,,S,
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.00000,0,0,112053,30.0000,B42,S,B
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,25.14062,1,2,W./C. 6607,23.4500,,S,
889,890,1,1,"Behr, Mr. Karl Howell",male,26.00000,0,0,111369,30.0000,C148,C,C


In [511]:
pd.crosstab(train['Cabin_Class'], train['Pclass'], margins = False)

Pclass,1,2,3
Cabin_Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,15,0,0
B,47,0,0
C,59,0,0
D,29,4,0
E,25,4,3
F,0,8,5
G,0,0,4
T,1,0,0


In [512]:
#Average Fares per Cabin Class
train.groupby(['Cabin_Class']).mean()

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
Cabin_Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A,439.6,0.466667,1.0,43.513355,0.133333,0.133333,39.623887
B,521.808511,0.744681,1.0,35.09504,0.361702,0.574468,113.505764
C,406.440678,0.59322,1.0,36.377755,0.644068,0.474576,100.151341
D,475.939394,0.757576,1.121212,38.983845,0.424242,0.30303,57.244576
E,502.4375,0.75,1.3125,37.862846,0.3125,0.3125,46.026694
F,370.384615,0.615385,2.384615,20.752403,0.538462,0.538462,18.696792
G,216.0,0.5,3.0,14.75,0.5,1.25,13.58125
T,340.0,0.0,1.0,45.0,0.0,0.0,35.5


In [513]:
#Replace the Missing values in Cabin_Class with "U" for Unknown
train['Cabin_Class'].fillna('U',inplace=True)


#for test
test['Cabin_Class'].fillna('U',inplace=True)

### One Hot Ecoding Emabrked and Cabin_Class 

In [514]:
#Splitting Y from X before One Hot Encoding to ensure similar Encoding Dictionary and column numbers for train and test

y_train = train['Survived']
y_train


0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [515]:
train.drop(columns=['Survived'], inplace=True)
train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_Class
0,1,3,"Braund, Mr. Owen Harris",male,22.00000,1,0,A/5 21171,7.2500,,S,U
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.00000,1,0,PC 17599,71.2833,C85,C,C
2,3,3,"Heikkinen, Miss. Laina",female,26.00000,0,0,STON/O2. 3101282,7.9250,,S,U
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.00000,1,0,113803,53.1000,C123,S,C
4,5,3,"Allen, Mr. William Henry",male,35.00000,0,0,373450,8.0500,,S,U
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.00000,0,0,211536,13.0000,,S,U
887,888,1,"Graham, Miss. Margaret Edith",female,19.00000,0,0,112053,30.0000,B42,S,B
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,25.14062,1,2,W./C. 6607,23.4500,,S,U
889,890,1,"Behr, Mr. Karl Howell",male,26.00000,0,0,111369,30.0000,C148,C,C


In [518]:
from feature_engine.encoding import OneHotEncoder

In [519]:
#Create Instance and Fit
# drop_lastto return k-1, false to return k
ohe = OneHotEncoder(top_categories=None, variables=['Cabin_Class', 'Embarked'], drop_last=True)
ohe.fit(train)

OneHotEncoder(drop_last=True, variables=['Cabin_Class', 'Embarked'])

In [520]:
#Transform
train = ohe.transform(train)
train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Cabin_Class_U,Cabin_Class_C,Cabin_Class_E,Cabin_Class_G,Cabin_Class_D,Cabin_Class_A,Cabin_Class_B,Cabin_Class_F,Embarked_S,Embarked_C
0,1,3,"Braund, Mr. Owen Harris",male,22.00000,1,0,A/5 21171,7.2500,,1,0,0,0,0,0,0,0,1,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.00000,1,0,PC 17599,71.2833,C85,0,1,0,0,0,0,0,0,0,1
2,3,3,"Heikkinen, Miss. Laina",female,26.00000,0,0,STON/O2. 3101282,7.9250,,1,0,0,0,0,0,0,0,1,0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.00000,1,0,113803,53.1000,C123,0,1,0,0,0,0,0,0,1,0
4,5,3,"Allen, Mr. William Henry",male,35.00000,0,0,373450,8.0500,,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.00000,0,0,211536,13.0000,,1,0,0,0,0,0,0,0,1,0
887,888,1,"Graham, Miss. Margaret Edith",female,19.00000,0,0,112053,30.0000,B42,0,0,0,0,0,0,1,0,1,0
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,25.14062,1,2,W./C. 6607,23.4500,,1,0,0,0,0,0,0,0,1,0
889,890,1,"Behr, Mr. Karl Howell",male,26.00000,0,0,111369,30.0000,C148,0,1,0,0,0,0,0,0,0,1


In [524]:
#for test
#Using the Encoder fitted with training data
test = ohe.transform(test)
test

ValueError: The number of columns in this dataset is different from the one used to fit this transformer (when using the fit() method).

In [525]:
train.isnull().sum()

PassengerId        0
Pclass             0
Name               0
Sex                0
Age                0
SibSp              0
Parch              0
Ticket             0
Fare               0
Cabin            687
Cabin_Class_U      0
Cabin_Class_C      0
Cabin_Class_E      0
Cabin_Class_G      0
Cabin_Class_D      0
Cabin_Class_A      0
Cabin_Class_B      0
Cabin_Class_F      0
Embarked_S         0
Embarked_C         0
dtype: int64

In [526]:
#Cabin Feature does not carry much information since many Distinct values and majority values missing.
#Therefore, we drop it along with the other uninformative columns.

train.drop(columns=['Cabin', 'Ticket', 'Name', 'PassengerId'], inplace=True)

In [527]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Pclass         891 non-null    int64  
 1   Sex            891 non-null    object 
 2   Age            891 non-null    float64
 3   SibSp          891 non-null    int64  
 4   Parch          891 non-null    int64  
 5   Fare           891 non-null    float64
 6   Cabin_Class_U  891 non-null    int32  
 7   Cabin_Class_C  891 non-null    int32  
 8   Cabin_Class_E  891 non-null    int32  
 9   Cabin_Class_G  891 non-null    int32  
 10  Cabin_Class_D  891 non-null    int32  
 11  Cabin_Class_A  891 non-null    int32  
 12  Cabin_Class_B  891 non-null    int32  
 13  Cabin_Class_F  891 non-null    int32  
 14  Embarked_S     891 non-null    int32  
 15  Embarked_C     891 non-null    int32  
dtypes: float64(2), int32(10), int64(3), object(1)
memory usage: 76.7+ KB


In [528]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PassengerId    418 non-null    int64  
 1   Pclass         418 non-null    int64  
 2   Name           418 non-null    object 
 3   Sex            418 non-null    object 
 4   Age            418 non-null    float64
 5   SibSp          418 non-null    int64  
 6   Parch          418 non-null    int64  
 7   Ticket         418 non-null    object 
 8   Fare           418 non-null    float64
 9   Cabin          91 non-null     object 
 10  Cabin_Class_U  418 non-null    int32  
 11  Cabin_Class_C  418 non-null    int32  
 12  Cabin_Class_E  418 non-null    int32  
 13  Cabin_Class_G  418 non-null    int32  
 14  Cabin_Class_D  418 non-null    int32  
 15  Cabin_Class_A  418 non-null    int32  
 16  Cabin_Class_B  418 non-null    int32  
 17  Cabin_Class_F  418 non-null    int32  
 18  Embarked_S

In [529]:
#For test
test.drop(columns=['Cabin', 'Ticket', 'Name', 'PassengerId'], inplace=True)
test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_Class_U,Cabin_Class_C,Cabin_Class_E,Cabin_Class_G,Cabin_Class_D,Cabin_Class_A,Cabin_Class_B,Cabin_Class_F,Embarked_S,Embarked_C
0,3,male,34.50000,0,0,7.8292,1,0,0,0,0,0,0,0,0,0
1,3,female,47.00000,1,0,7.0000,1,0,0,0,0,0,0,0,1,0
2,2,male,62.00000,0,0,9.6875,1,0,0,0,0,0,0,0,0,0
3,3,male,27.00000,0,0,8.6625,1,0,0,0,0,0,0,0,1,0
4,3,female,22.00000,1,1,12.2875,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,male,29.87763,0,0,8.0500,1,0,0,0,0,0,0,0,1,0
414,1,female,39.00000,0,0,108.9000,0,1,0,0,0,0,0,0,0,1
415,3,male,38.50000,0,0,7.2500,1,0,0,0,0,0,0,0,1,0
416,3,male,29.87763,0,0,8.0500,1,0,0,0,0,0,0,0,1,0


## Preparing Data for ML

### Encoding Categorical Variables

In [532]:
#Embarked and Cabin_Class already One Hot Encoded

In [533]:
#Sex is binary - so Ordinal Encoding
# Using Feature Engine because it allows us to code multiple variables at a time (sklearn doesn't) and allows us to 
#view the dictionary later (pandas doesn't)

from feature_engine.encoding import OrdinalEncoder

In [534]:
ordinal_enc = OrdinalEncoder(encoding_method='arbitrary', variables=['Sex'])

ordinal_enc.fit(train)

OrdinalEncoder(encoding_method='arbitrary', variables=['Sex'])

In [535]:
# in the encoder dict we can observe the numbers
# assigned to each category for all the indicated variables

ordinal_enc.encoder_dict_

{'Sex': {'male': 0, 'female': 1}}

In [536]:
# this is the list of variables that the encoder will transform

ordinal_enc.variables_

['Sex']

In [537]:
#Transform Train
train = ordinal_enc.transform(train)

train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_Class_U,Cabin_Class_C,Cabin_Class_E,Cabin_Class_G,Cabin_Class_D,Cabin_Class_A,Cabin_Class_B,Cabin_Class_F,Embarked_S,Embarked_C
0,3,0,22.00000,1,0,7.2500,1,0,0,0,0,0,0,0,1,0
1,1,1,38.00000,1,0,71.2833,0,1,0,0,0,0,0,0,0,1
2,3,1,26.00000,0,0,7.9250,1,0,0,0,0,0,0,0,1,0
3,1,1,35.00000,1,0,53.1000,0,1,0,0,0,0,0,0,1,0
4,3,0,35.00000,0,0,8.0500,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,0,27.00000,0,0,13.0000,1,0,0,0,0,0,0,0,1,0
887,1,1,19.00000,0,0,30.0000,0,0,0,0,0,0,1,0,1,0
888,3,1,25.14062,1,2,23.4500,1,0,0,0,0,0,0,0,1,0
889,1,0,26.00000,0,0,30.0000,0,1,0,0,0,0,0,0,0,1


In [538]:
test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_Class_U,Cabin_Class_C,Cabin_Class_E,Cabin_Class_G,Cabin_Class_D,Cabin_Class_A,Cabin_Class_B,Cabin_Class_F,Embarked_S,Embarked_C
0,3,male,34.50000,0,0,7.8292,1,0,0,0,0,0,0,0,0,0
1,3,female,47.00000,1,0,7.0000,1,0,0,0,0,0,0,0,1,0
2,2,male,62.00000,0,0,9.6875,1,0,0,0,0,0,0,0,0,0
3,3,male,27.00000,0,0,8.6625,1,0,0,0,0,0,0,0,1,0
4,3,female,22.00000,1,1,12.2875,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,male,29.87763,0,0,8.0500,1,0,0,0,0,0,0,0,1,0
414,1,female,39.00000,0,0,108.9000,0,1,0,0,0,0,0,0,0,1
415,3,male,38.50000,0,0,7.2500,1,0,0,0,0,0,0,0,1,0
416,3,male,29.87763,0,0,8.0500,1,0,0,0,0,0,0,0,1,0


In [539]:
#Transform Test using same Encoder object

test = ordinal_enc.transform(test)

test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_Class_U,Cabin_Class_C,Cabin_Class_E,Cabin_Class_G,Cabin_Class_D,Cabin_Class_A,Cabin_Class_B,Cabin_Class_F,Embarked_S,Embarked_C
0,3,0,34.50000,0,0,7.8292,1,0,0,0,0,0,0,0,0,0
1,3,1,47.00000,1,0,7.0000,1,0,0,0,0,0,0,0,1,0
2,2,0,62.00000,0,0,9.6875,1,0,0,0,0,0,0,0,0,0
3,3,0,27.00000,0,0,8.6625,1,0,0,0,0,0,0,0,1,0
4,3,1,22.00000,1,1,12.2875,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,0,29.87763,0,0,8.0500,1,0,0,0,0,0,0,0,1,0
414,1,1,39.00000,0,0,108.9000,0,1,0,0,0,0,0,0,0,1
415,3,0,38.50000,0,0,7.2500,1,0,0,0,0,0,0,0,1,0
416,3,0,29.87763,0,0,8.0500,1,0,0,0,0,0,0,0,1,0


In [540]:
#Convert to nparrays
nptrain=train.values
nptest=test.values
y_train=y_train.values

# Running Various Classification Models - With Cross Validation

https://towardsdatascience.com/predicting-survivors-of-titanic-e7280822b00b

In [548]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [550]:
# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr_score = np.mean(cross_val_score(lr, nptrain, y_train))
print(f'Logistic Regression: {lr_score}')

Logistic Regression: 0.8024794425961961


In [551]:
# Support Vector Machine
svc = SVC()
svc_score = np.mean(cross_val_score(svc, nptrain, y_train))
print(f'Support Vector Machine: {svc_score}')

Support Vector Machine: 0.6734982110350888


In [552]:
# Decision Tree
dtc = DecisionTreeClassifier()
dtc_score = np.mean(cross_val_score(dtc, nptrain, y_train))
print(f'Decision Tree: {dtc_score}')

Decision Tree: 0.7867867679367271


In [553]:
# K Nearest Neighbors
knc = KNeighborsClassifier()
knc_score = np.mean(cross_val_score(knc, nptrain, y_train))
print(f'K Nearest Neighbors: {knc_score}')

K Nearest Neighbors: 0.7003703471219636


In [555]:
# Multi-Layer Perceptron
mlpc = MLPClassifier(max_iter=1000)
mlpc_score = np.mean(cross_val_score(mlpc, nptrain, y_train))
print(f'Multi-Layer Perceptron: {mlpc_score}')

Multi-Layer Perceptron: 0.8013746783001695
