# Titanic Code Machine Learning Project

### Rudimentary Set-Up 
Importing Libraries that will (most probably) be required alongside importing our dataframes

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import sklearn 
import seaborn as sns

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
tempdf = pd.read_csv('gender_submission.csv') #Might come useful later

## Data Exploration

In [4]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


### Exploring the Data

In [6]:
train_df.nunique()

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

In [7]:
train_df.isnull().sum() #Clearly Able to see Cabin and Age are the areas with the most issues

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### Train Data Cleaning

In [8]:
train_df = train_df.drop('Cabin', axis = 1)
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [9]:
train_df['Age'].fillna(train_df['Age'].mean(), inplace = True)

In [10]:
train_df = train_df.dropna(subset = ['Embarked'])

In [11]:
train_df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [12]:
train_df['Travelling Alone'] = train_df.apply(lambda row: row.SibSp == 0 & row.Parch == 0, 
                                              axis = 1)

train_df.loc[train_df['Travelling Alone'] == True, 'Travelling Alone'] = 1

train_df.loc[train_df['Travelling Alone'] == False, 'Travelling Alone'] = 0

In [13]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Travelling Alone
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,1


In [14]:
train_df = train_df.drop(['Ticket', 'Name'], axis = 1)

In [15]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Travelling Alone
0,1,0,3,male,22.0,1,0,7.25,S,0
1,2,1,1,female,38.0,1,0,71.2833,C,0
2,3,1,3,female,26.0,0,0,7.925,S,1
3,4,1,1,female,35.0,1,0,53.1,S,0
4,5,0,3,male,35.0,0,0,8.05,S,1


In [16]:
train_df.loc[train_df['Sex'] == 'male', 'Sex'] = 1

train_df.loc[train_df['Sex'] == 'female', 'Sex'] = 0


In [17]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Travelling Alone
0,1,0,3,1,22.0,1,0,7.25,S,0
1,2,1,1,0,38.0,1,0,71.2833,C,0
2,3,1,3,0,26.0,0,0,7.925,S,1
3,4,1,1,0,35.0,1,0,53.1,S,0
4,5,0,3,1,35.0,0,0,8.05,S,1


In [18]:
train_df['Family Size'] = train_df.apply(lambda row: row.SibSp + row.Parch, 
                                              axis = 1)

In [19]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Travelling Alone,Family Size
0,1,0,3,1,22.0,1,0,7.25,S,0,1
1,2,1,1,0,38.0,1,0,71.2833,C,0,1
2,3,1,3,0,26.0,0,0,7.925,S,1,0
3,4,1,1,0,35.0,1,0,53.1,S,0,1
4,5,0,3,1,35.0,0,0,8.05,S,1,0


### Test Data Cleaning

In [20]:
test_df = test_df.drop(['Ticket', 'Name', 'Cabin'], axis = 1)

test_df['Age'].fillna(test_df['Age'].mean(), inplace = True)
test_df = test_df.dropna(subset = ['Embarked'])

test_df['Travelling Alone'] = test_df.apply(lambda row: row.SibSp == 0 & row.Parch == 0, 
                                              axis = 1)

test_df.loc[test_df['Travelling Alone'] == True, 'Travelling Alone'] = 1

test_df.loc[test_df['Travelling Alone'] == False, 'Travelling Alone'] = 0

test_df.loc[test_df['Sex'] == 'male', 'Sex'] = 1

test_df.loc[test_df['Sex'] == 'female', 'Sex'] = 0

test_df['Family Size'] = test_df.apply(lambda row: row.SibSp + row.Parch, 
                                              axis = 1)

## Machine Learning Models

In [21]:
train_df.loc[train_df['Embarked'] == 'Q', 'Embarked'] = 1

train_df.loc[train_df['Embarked'] == 'S', 'Embarked'] = 2

train_df.loc[train_df['Embarked'] == 'C', 'Embarked'] = 3

test_df.loc[test_df['Embarked'] == 'Q', 'Embarked'] = 1

test_df.loc[test_df['Embarked'] == 'S', 'Embarked'] = 2

test_df.loc[test_df['Embarked'] == 'C', 'Embarked'] = 3




In [22]:
test_df['Fare'].fillna(test_df['Fare'].mean(), inplace = True)

In [23]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Travelling Alone,Family Size
0,1,0,3,1,22.0,1,0,7.25,2,0,1
1,2,1,1,0,38.0,1,0,71.2833,3,0,1
2,3,1,3,0,26.0,0,0,7.925,2,1,0
3,4,1,1,0,35.0,1,0,53.1,2,0,1
4,5,0,3,1,35.0,0,0,8.05,2,1,0


In [24]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Travelling Alone,Family Size
0,892,3,1,34.5,0,0,7.8292,1,1,0
1,893,3,0,47.0,1,0,7.0,2,0,1
2,894,2,1,62.0,0,0,9.6875,1,1,0
3,895,3,1,27.0,0,0,8.6625,2,1,0
4,896,3,0,22.0,1,1,12.2875,2,0,2


In [25]:
features = ['Sex', 'SibSp', 'Parch', 'Travelling Alone', 'Family Size', 'Embarked', 'Pclass']

Y = train_df['Survived']
X = train_df[features]

X_test = test_df[features]

In [26]:
X_test.isnull().sum()

Sex                 0
SibSp               0
Parch               0
Travelling Alone    0
Family Size         0
Embarked            0
Pclass              0
dtype: int64

In [27]:
X.isnull().sum()

Sex                 0
SibSp               0
Parch               0
Travelling Alone    0
Family Size         0
Embarked            0
Pclass              0
dtype: int64

### Random Forest Model

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
rf = RandomForestClassifier(n_estimators = 100,
                           n_jobs = -1,
                           oob_score = True,
                           bootstrap = True,
                           random_state = 42)

In [30]:
rfm = rf.fit(X,Y)

In [31]:
rfm.score(X,Y)

0.8368953880764904

In [32]:
predictions = rfm.predict(X_test)

In [33]:
predictions

array([0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [34]:
from sklearn.model_selection import cross_val_score

In [35]:
accuracyRFM = cross_val_score(estimator=rfm, X = X, y = Y, cv=5)
accuracyRFM.mean()

0.7840411350219006

In [36]:
IDtest = test_df["PassengerId"]

test_Survived = pd.Series(rfm.predict(X_test), name="Survived")

results = pd.concat([IDtest,test_Survived],axis=1)

results.to_csv("ensemble_python_voting.csv",index=False)

### SVMs

In [37]:
from sklearn import svm

In [38]:
svms = svm.SVC()

In [39]:
s = svms.fit(X,Y)

In [40]:
svms.score(X,Y)

0.8087739032620922

In [41]:
predictions = svms.predict(X_test)

In [42]:
predictions

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [43]:
accuracySVM = cross_val_score(estimator=s, X = X, y = Y, cv=5)
accuracySVM.mean()

0.80315495461182

In [44]:
IDtest = test_df["PassengerId"]

test_Survived = pd.Series(s.predict(X_test), name="Survived")

results = pd.concat([IDtest,test_Survived],axis=1)

results.to_csv("ensemble_python_voting.csv",index=False)

### Logistic Regression

In [45]:
from sklearn.linear_model import LogisticRegression

In [46]:
lr = LogisticRegression()

In [47]:
l = lr.fit(X,Y)

In [48]:
l.score(X,Y)

0.8008998875140607

In [49]:
predictions = l.predict(X_test)

In [50]:
predictions

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [51]:
accuracySVM = cross_val_score(estimator=l, X = X, y = Y, cv=5)
accuracySVM.mean()

0.7885482130387862

In [52]:
IDtest = test_df["PassengerId"]

test_Survived = pd.Series(l.predict(X_test), name="Survived")

results = pd.concat([IDtest,test_Survived],axis=1)

results.to_csv("ensemble_python_voting.csv",index=False)

### Decision Trees

In [53]:
from sklearn.tree import DecisionTreeClassifier

In [54]:
dtc = DecisionTreeClassifier()

In [55]:
d = dtc.fit(X,Y)

In [56]:
dtc.score(X,Y)

0.8368953880764904

In [57]:
predictions = dtc.predict(X_test)

In [58]:
predictions

array([0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [59]:
accuracyDTC = cross_val_score(estimator=d, X = X, y = Y, cv=5)
accuracyDTC.mean()

0.7952834380752872

In [60]:
IDtest = test_df["PassengerId"]

test_Survived = pd.Series(d.predict(X_test), name="Survived")

results = pd.concat([IDtest,test_Survived],axis=1)

results.to_csv("ensemble_python_voting.csv",index=False)

### KNN

In [61]:
from sklearn.neighbors import KNeighborsClassifier

In [62]:
knn = KNeighborsClassifier()

In [63]:
k = knn.fit(X,Y)

In [64]:
k.score(X,Y)

0.8076490438695163

In [65]:
predictions = k.predict(X_test)
predictions

array([0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [66]:
accuracyKNN = cross_val_score(estimator=k, X = X, y = Y, cv=5)
accuracyKNN.mean()

0.7750650669713707

### AdaBoostClassifier 

In [67]:
from sklearn.ensemble import AdaBoostClassifier

In [68]:
abc = AdaBoostClassifier()

In [69]:
a = abc.fit(X,Y)

In [70]:
a.score(X,Y)

0.7975253093363329

In [71]:
predictions = a.predict(X_test)
predictions

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,

In [72]:
accuracyABC = cross_val_score(estimator=a, X = X, y = Y, cv=5)
accuracyABC.mean()

0.7862819780359297

### Gradient Boost Classifier

In [73]:
from sklearn.ensemble import GradientBoostingClassifier

In [74]:
gbc = GradientBoostingClassifier()

In [75]:
g = gbc.fit(X,Y)

In [76]:
g.score(X,Y)

0.8267716535433071

In [77]:
predictions = g.predict(X_test)
predictions

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [78]:
accuracyGBC = cross_val_score(estimator=g, X = X, y = Y, cv=5)
accuracyGBC.mean()

0.7896781565416112

# Tuning the Models

In [79]:
from pprint import pprint

In [80]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn import linear_model, decomposition
from sklearn.preprocessing import StandardScaler

In [81]:
kfold = StratifiedKFold(n_splits=10)

### Hyper-Parameter Tuning for Decision Trees

In [82]:
dtc = DecisionTreeClassifier()

pprint(dtc.get_params)

dtc_param_grid = {'criterion': ["gini", "entropy"], 'splitter': ["best", "random"]}

best_dtc = GridSearchCV(dtc, param_grid = dtc_param_grid, cv = kfold, scoring = 'accuracy', n_jobs = 4, verbose = 2)

best_dtc.fit(X, Y)

pprint(best_dtc.best_score_)

best_dtc = best_dtc.best_estimator_



<bound method BaseEstimator.get_params of DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')>
Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


0.8054136874361593


[Parallel(n_jobs=4)]: Done  40 out of  40 | elapsed:    0.9s finished


### Hyper-Parameter Tuning for Adaboost

In [83]:
adaDTC = AdaBoostClassifier(dtc, random_state = 7)

ada_param_grid = {"base_estimator__criterion" : ["gini", "entropy"],
              "base_estimator__splitter" :   ["best", "random"],
              "algorithm" : ["SAMME","SAMME.R"],
              "n_estimators" :[1,2],
              "learning_rate":  [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3,1.5]}

best_abc = GridSearchCV(adaDTC, param_grid = ada_param_grid, cv = kfold, scoring = 'accuracy', n_jobs = 4, verbose = 2)

best_abc.fit(X,Y)

pprint(best_abc.best_score_)

best_abc = best_abc.best_estimator_



[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 112 candidates, totalling 1120 fits


[Parallel(n_jobs=4)]: Done 136 tasks      | elapsed:    0.4s


0.807673646578141


[Parallel(n_jobs=4)]: Done 1120 out of 1120 | elapsed:    2.1s finished


### Hyper-Parameter Tuning for SVMs

In [84]:
from sklearn.svm import SVC

svmc = SVC(probability = True)

svmc_param_grid = {"kernel": ['rbf'], 'gamma': [0.001, 0.01, 0.1, 1, 10], 
                   'C': [0.01, 0.1, 1, 10, 50, 100, 500, 1000, 5000, 10000]}

best_svmc = GridSearchCV(svmc, param_grid = svmc_param_grid, cv = kfold, scoring = 'accuracy', n_jobs = 4, verbose = 2)

best_svmc.fit(X,Y)

pprint(best_svmc.best_score_)

best_svmc = best_svmc.best_estimator_

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=4)]: Done  58 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done 436 tasks      | elapsed:   15.3s


0.8008937691521961


[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:   27.9s finished


### Hyper Parameter Tuning for Random Forest Classifier

In [85]:
rfc = RandomForestClassifier()

rf_param_grid = {'max_features' : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 
                'min_samples_split': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 
                 'min_samples_leaf': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 150], 'max_features': [3, 0.1]}

best_rfc = GridSearchCV(rfc, param_grid = rf_param_grid, cv = kfold, scoring = 'accuracy', n_jobs  = 4, verbose = 2)

best_rfc.fit(X,Y)

pprint(best_rfc.best_score_)

best_rfc = best_rfc.best_estimator_

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 286 candidates, totalling 2860 fits


[Parallel(n_jobs=4)]: Done  92 tasks      | elapsed:    1.1s
[Parallel(n_jobs=4)]: Done 576 tasks      | elapsed:   16.3s
[Parallel(n_jobs=4)]: Done 1388 tasks      | elapsed:   41.2s
[Parallel(n_jobs=4)]: Done 2520 tasks      | elapsed:  1.2min


0.8009065372829417


[Parallel(n_jobs=4)]: Done 2860 out of 2860 | elapsed:  1.4min finished


### Hyper-Parameter Tuning for Gradient Boosting Classifier

In [86]:
gbc = GradientBoostingClassifier()

gb_param_grid = {'loss': ["deviance"], 'n_estimators': [1,0.1,0.05, 0.01],
                'max_depth':[0,2,4,6,8], 'min_samples_leaf': [50,100,150], 'max_features': [3,1,0.3,0.1]}

best_gbc = GridSearchCV(gbc, param_grid = gb_param_grid, cv = kfold, scoring = 'accuracy', n_jobs = 4, verbose = 2)

best_gbc.fit(X,Y)

pprint(best_gbc.best_score_)

best_gbc = best_gbc.best_estimator_

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 240 candidates, totalling 2400 fits


[Parallel(n_jobs=4)]: Done 136 tasks      | elapsed:    0.2s


0.6175434116445352


[Parallel(n_jobs=4)]: Done 2400 out of 2400 | elapsed:    2.0s finished


### Hyper-Parameter Tuning for KNN

In [87]:
knnc = KNeighborsClassifier()

knn_param_grid = {'n_neighbors':[5,6,7,8,9,10],
          'leaf_size':[1,2,3,5],
          'weights':['uniform', 'distance'],
          'algorithm':['auto', 'ball_tree','kd_tree','brute'],
          'n_jobs':[-1]}

best_knn = GridSearchCV(knnc, param_grid = knn_param_grid, cv = kfold, scoring = 'accuracy', n_jobs = 4, verbose = 2)

best_knn.fit(X,Y)

pprint(best_knn.best_score_)

best_knn = best_knn.best_estimator_

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 192 candidates, totalling 1920 fits


[Parallel(n_jobs=4)]: Done  58 tasks      | elapsed:    1.9s
[Parallel(n_jobs=4)]: Done 300 tasks      | elapsed:    9.1s
[Parallel(n_jobs=4)]: Done 706 tasks      | elapsed:   21.1s
[Parallel(n_jobs=4)]: Done 1272 tasks      | elapsed:   37.4s


0.8020301327885597


[Parallel(n_jobs=4)]: Done 1920 out of 1920 | elapsed:   56.5s finished


### Hyper-Parameter Tuning for Logistic Regression

In [117]:
lr = LogisticRegression()

lr_param_grid = {'penalty': ['l1','l2'], 'C': [50, 10, 1.0, 0.1, 0.01], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}

best_lr = GridSearchCV(lr, param_grid = lr_param_grid, cv = kfold, scoring = 'accuracy', n_jobs = 4, verbose = 2)

best_lr.fit(X,Y)

pprint(best_lr.best_score_)

best_lr = best_lr.best_estimator_


Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 136 tasks      | elapsed:    0.4s


0.7997701736465782


[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    1.2s finished


## Combining all of the Tuned Models into an Ensemble

In [135]:
from sklearn.ensemble import VotingClassifier

In [136]:
votingC = VotingClassifier(estimators=[('rfc', best_rfc),('svc', best_svmc), 
                                       ('adac',best_abc),('gbc',best_gbc), ('knn', best_knn), ('lr', best_lr)], voting='soft', n_jobs=4)

votingC = votingC.fit(X, Y)

In [137]:
votingC.score(X,Y)

0.8256467941507312

In [138]:
predictions = votingC.predict(X_test)

predictions

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [139]:
accuracyKNN = cross_val_score(estimator=votingC, X = X, y = Y, cv=5)
accuracyKNN.mean()

0.7964133815781121

In [140]:
test_Survived = pd.Series(votingC.predict(X_test), name="Survived")

results = pd.concat([IDtest,test_Survived],axis=1)

results.to_csv("ensemble_python_voting.csv",index=False)