## Kaggle competition (Titanic)

In [1]:
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV,cross_val_score,cross_validate
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.linear_model import LogisticRegression,Lasso,LinearRegression
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,mean_squared_error, r2_score,roc_curve
import scipy.cluster.hierarchy as shc


import warnings
warnings.filterwarnings('ignore')

# Explanatory Data Analysis

In [2]:
df_titanic = pd.read_csv("train.csv")
df_titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
#import the test dataset
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
df_titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
df_titanic.shape

(891, 12)

In [6]:
# Check for missing values
df_titanic.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

#### Train Data
Of  891 values Cabin has 687 missing values, that is too much missing values, we will drop cabin. Age has 177 missing we will fill those missing values, for embark fill with the most recurring embark value

In [7]:
df_train = df_titanic.drop(columns = ['Cabin'])

In [8]:
df_train['Age'] = df_train['Age'].fillna(round(df_train['Age'].mean(),2))

In [9]:
df_train.Embarked.unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [10]:
print(f"There are {df_train.loc[df_train['Embarked']=='S', 'Embarked'].count()} for S")
print(f"There are {df_train.loc[df_train['Embarked']=='C', 'Embarked'].count()} for C")
print(f"There are {df_train.loc[df_train['Embarked']=='Q', 'Embarked'].count()} for Q")

There are 644 for S
There are 168 for C
There are 77 for Q


In [11]:
df_train['Embarked'] = df_train['Embarked'].fillna('S')

In [12]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [13]:
#confirm filling went well
df_train.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

#### Test Data

In [14]:
df_test.sample(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
155,1047,3,"Duquemin, Mr. Joseph",male,24.0,0,0,S.O./P.P. 752,7.55,,S
161,1053,3,"Touma, Master. Georges Youssef",male,7.0,1,1,2650,15.2458,,C
410,1302,3,"Naughton, Miss. Hannah",female,,0,0,365237,7.75,,Q
113,1005,3,"Buckley, Miss. Katherine",female,18.5,0,0,329944,7.2833,,Q
245,1137,1,"Kenyon, Mr. Frederick R",male,41.0,1,0,17464,51.8625,D21,S


In [15]:
df_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

Fill age and fill fare with the average, and drop Cabin data

In [16]:
df_test = df_test.drop(columns = ['Cabin'])

In [17]:
df_test['Age'] = df_test['Age'].fillna(round(df_test['Age'].mean(),2))

In [18]:
df_test['Fare'] = df_test['Fare'].fillna(round(df_test['Fare'].mean(),2))

In [19]:
#confirm the fill in went well

In [20]:
df_test.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [21]:
df_train_all = df_train
df_test_all = df_test

In [22]:
# df_train = df_train.drop(columns = ['Ticket', 'Fare', 'Embarked', 'SibSp', 'Parch'])
# df_test = df_test.drop(columns = ['Ticket', 'Fare', 'Embarked', 'SibSp', 'Parch'])

In [23]:
#df_train.head()

## Data preprocessing

In [24]:
#encoding SEX variable
label = LabelEncoder()
# df_train['Sex'] = label.fit_transform(df_train['Sex'])
# df_test['Sex'] = label.fit_transform(df_test['Sex'])

df_train['Sex'] = label.fit_transform(df_train['Sex'])
df_test['Sex'] = label.fit_transform(df_test['Sex'])

df_train['Embarked'] = label.fit_transform(df_train['Embarked'])
df_test['Embarked'] = label.fit_transform(df_test['Embarked'])

#### We will use all variables for the first test, with Random Forest and Logistic Regression as they have been performing well on the assignmnets except ticket, name

In [25]:
df_train = df_train.drop(columns = ['Ticket'])
df_test = df_test.drop(columns = ['Ticket'])

In [26]:
df_train = df_train.drop(columns = ['Name'])
df_test = df_test.drop(columns = ['Name'])

In [27]:
df_train.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
748,749,0,1,1,19.0,1,0,53.1,2
132,133,0,3,0,47.0,1,0,14.5,2
401,402,0,3,1,26.0,0,0,8.05,2
225,226,0,3,1,22.0,0,0,9.35,2
267,268,1,3,1,25.0,1,0,7.775,2


In [28]:
X = df_train.loc[:,["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]]
y = df_train.loc[:, ["Survived"]]

X_test = df_test.loc[:,["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]]

## Data Modelling

### Logistic Regression

In [29]:
#Use StratifiedKFold with 10 folds
k = StratifiedKFold(n_splits = 10, shuffle= False)

In [30]:
#logistic regression
logit = LogisticRegression(random_state= 42)
logit.fit(X,y)
logit_score = cross_val_score(logit, X, y, cv = k, scoring = 'roc_auc').mean()
logit_score

0.8542521008403362

### Random Forest

In [31]:
#selectin the optimal leaves using  GridSearch
param = { 'max_leaf_nodes': [int(i) for i in range(20,110,10)]}

rfr = RandomForestClassifier(random_state = 32)
rfr_grid_search = GridSearchCV(estimator = rfr, param_grid = param, cv = k,verbose = 1, n_jobs = -1)
rfr_grid_search.fit(X, y)

Fitting 10 folds for each of 9 candidates, totalling 90 fits


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(random_state=32), n_jobs=-1,
             param_grid={'max_leaf_nodes': [20, 30, 40, 50, 60, 70, 80, 90,
                                            100]},
             verbose=1)

In [32]:
#selectin the optimal leaves using  GridSearch
param = { 'max_leaf_nodes': [int(i) for i in range(20,110,10)]}

rfr2 = RandomForestClassifier(random_state = 32)
rfr2_grid_search = GridSearchCV(estimator = rfr2, param_grid = param, cv = k,verbose = 1, n_jobs = -1)
rfr2_grid_search.fit(X, y)

Fitting 10 folds for each of 9 candidates, totalling 90 fits


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(random_state=32), n_jobs=-1,
             param_grid={'max_leaf_nodes': [20, 30, 40, 50, 60, 70, 80, 90,
                                            100]},
             verbose=1)

In [33]:
#outpu the optimal leaves
rfr_grid_search.best_estimator_

RandomForestClassifier(max_leaf_nodes=80, random_state=32)

In [34]:
#outpu the optimal leaves
rfr2_grid_search.best_estimator_

RandomForestClassifier(max_leaf_nodes=80, random_state=32)

In [35]:
#selecting the optimal number of trees uisng the optimal leaves we selected above
param = { 'max_leaf_nodes': [60],
            'n_estimators': [int(i) for i in range(10,210,10)]}

rfr = RandomForestClassifier(random_state = 32)
rfr_grid_search = GridSearchCV(estimator = rfr, param_grid = param, cv = k,verbose = 1, n_jobs = -1)
rfr_grid_search.fit(X, y)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(random_state=32), n_jobs=-1,
             param_grid={'max_leaf_nodes': [60],
                         'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90,
                                          100, 110, 120, 130, 140, 150, 160,
                                          170, 180, 190, 200]},
             verbose=1)

In [36]:
#selecting the optimal number of trees uisng the optimal leaves we selected above
param = {'n_estimators': [int(i) for i in range(10,210,10)]}

rfr2 = RandomForestClassifier(random_state = 32)
rfr2_grid_search = GridSearchCV(estimator = rfr2, param_grid = param, cv = k,verbose = 1, n_jobs = -1)
rfr2_grid_search.fit(X, y)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(random_state=32), n_jobs=-1,
             param_grid={'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90,
                                          100, 110, 120, 130, 140, 150, 160,
                                          170, 180, 190, 200]},
             verbose=1)

In [37]:
rfr_grid_search.best_estimator_

RandomForestClassifier(max_leaf_nodes=60, n_estimators=110, random_state=32)

In [38]:
rfr2_grid_search.best_estimator_

RandomForestClassifier(n_estimators=90, random_state=32)

In [39]:
#create a model with the best selected parameters
rfr_best_params = RandomForestClassifier(max_leaf_nodes=60, n_estimators=70, random_state=32)
rfr_best_params = rfr_best_params.fit(X,y)
rfr_best_params_score = cross_val_score(rfr_best_params, X, y, cv = k, scoring = 'roc_auc').mean()
rfr_best_params_score

0.8714000792236085

In [40]:
#create a model with the best selected parameters
rfr2_best_params = RandomForestClassifier(max_leaf_nodes = 80, n_estimators=90, random_state=32)
rfr2_best_params = rfr2_best_params.fit(X,y)
rfr2_best_params_score = cross_val_score(rfr2_best_params, X, y, cv = k, scoring = 'roc_auc').mean()
rfr2_best_params_score

0.8719430438842203

In [41]:
models = ["Logistic Regression", "Random Forest 1", "Random Forest 2"]
accuracy = [logit_score, rfr_best_params_score, rfr2_best_params_score]

In [42]:
zipped = list(zip(models, accuracy))
df_scores = pd.DataFrame(zipped, columns=["Models", "Accuracy"])
df_scores

Unnamed: 0,Models,Accuracy
0,Logistic Regression,0.854252
1,Random Forest 1,0.8714
2,Random Forest 2,0.871943


## Predictions

In [43]:
out_logit = df_test['PassengerId']
out_logit = pd.DataFrame(out_logit)

out_ran = df_test['PassengerId']
out_ran = pd.DataFrame(out_ran)

out_ran2 = df_test['PassengerId']
out_ran2 = pd.DataFrame(out_ran2)

In [44]:
#logistic regression

out_logit['Survived'] = logit.predict(X_test)

out_logit.head()

#write to csv
out_logit.to_csv('predictions_ran.csv',index=False)

In [45]:
#Random Forest

out_ran['Survived'] = rfr_best_params.predict(X_test)

out_ran.head()

#write to csv
out_ran.to_csv('predictions_ran.csv',index=False)

In [46]:
#Random Forest

out_ran2['Survived'] = rfr2_best_params.predict(X_test)

out_ran2.head()

#write to csv
out_ran2.to_csv('predictions_ran2.csv',index=False)