# Titanic project 

### Project Overview 
Come up with a machine learning model that predicts if a passenger survives 

### Data Inderstanding 

The dataset is from kaggle 
891rows and 12 columns 

In [1]:
#libraries
import pandas as pd
import numpy as np

#loadset
df= pd.read_csv('titanic.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Define X and y


In [2]:
from sklearn.model_selection import train_test_split
X = df.drop('Survived', axis=1)
y = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20, random_state=42)

# Preprocessing data 

In [3]:
#missing values 
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
#categorical for mising 
X_train_fillna = X_train.copy()
X_train_fillna.fillna({'Cabin':'missing cabin', 'Embarked':'missing embarked'}, inplace=True)
X_train_fillna.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            140
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin            0
Embarked         0
dtype: int64

In [5]:
#for numeric we will use SimpleImputer
from sklearn.impute import SimpleImputer

imputer = SimpleImputer()
imputer.fit(X_train_fillna[['Age']])
#add in a dataframe 
age_imputed = pd.DataFrame(imputer.transform(X_train_fillna[['Age']]),index=X_train_fillna.index, columns=['Age'])

X_train_fillna['Age'] = age_imputed
X_train_fillna.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

## Deal with Categorical Variables by One Hot Encoding 

In [6]:
#categorical columns 
X_train_categorical = X_train_fillna.select_dtypes(exclude=['int64', 'float64']).copy()
# One hot encode categorical columns 
from sklearn.preprocessing import OneHotEncoder


categorical_features = ['Sex', 'Cabin', 'Embarked']
X_train_categorical = X_train_fillna[categorical_features].copy()
X_train_categorical

#instantiate 
one_hot = OneHotEncoder(handle_unknown='ignore', sparse = False)
#fit 
one_hot.fit(X_train_categorical)
X_train_one_hot = pd.DataFrame(one_hot.transform(X_train_categorical),index=X_train_categorical.index,
                              columns=np.hstack(one_hot.categories_))
X_train_one_hot

Unnamed: 0,female,male,A10,A14,A16,A19,A23,A24,A26,A32,...,F33,F38,F4,G6,T,missing cabin,C,Q,S,missing embarked
331,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
733,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
382,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
704,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
813,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
270,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
860,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
435,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# Normalization 

In [7]:
#numerical columns 
numerical_feature =['Pclass', 'Age', 'SibSp', 'Fare']
X_train_numericals = X_train_fillna[numerical_feature].copy()
X_train_numericals

Unnamed: 0,Pclass,Age,SibSp,Fare
331,1,45.500000,0,28.5000
733,2,23.000000,0,13.0000
382,3,32.000000,0,7.9250
704,3,26.000000,1,7.8542
813,3,6.000000,4,31.2750
...,...,...,...,...
106,3,21.000000,0,7.6500
270,1,29.498846,0,31.0000
860,3,41.000000,2,14.1083
435,1,14.000000,1,120.0000


In [8]:
#use minmax scaler for normalisation 
from sklearn.preprocessing import MinMaxScaler

min_max = MinMaxScaler()
#fit 
min_max.fit(X_train_numericals)
#dataframe 
X_train_scaled = pd.DataFrame(min_max.transform(X_train_numericals), index=X_train_numericals.index,
                             columns=X_train_numericals.columns)
X_train_scaled

Unnamed: 0,Pclass,Age,SibSp,Fare
331,0.0,0.566474,0.000,0.055628
733,0.5,0.283740,0.000,0.025374
382,1.0,0.396833,0.000,0.015469
704,1.0,0.321438,0.125,0.015330
813,1.0,0.070118,0.500,0.061045
...,...,...,...,...
106,1.0,0.258608,0.000,0.014932
270,0.0,0.365404,0.000,0.060508
860,1.0,0.509927,0.250,0.027538
435,0.0,0.170646,0.125,0.234224


# Concat all the transformed dataframes 

In [9]:
X_train_full = pd.concat([X_train_scaled, X_train_one_hot], axis=1)
X_train_full

Unnamed: 0,Pclass,Age,SibSp,Fare,female,male,A10,A14,A16,A19,...,F33,F38,F4,G6,T,missing cabin,C,Q,S,missing embarked
331,0.0,0.566474,0.000,0.055628,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
733,0.5,0.283740,0.000,0.025374,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
382,1.0,0.396833,0.000,0.015469,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
704,1.0,0.321438,0.125,0.015330,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
813,1.0,0.070118,0.500,0.061045,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,1.0,0.258608,0.000,0.014932,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
270,0.0,0.365404,0.000,0.060508,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
860,1.0,0.509927,0.250,0.027538,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
435,0.0,0.170646,0.125,0.234224,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# Fitting Model

In [10]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(fit_intercept=False, C=1e12, solver='liblinear')
logreg.fit(X_train_full, y_train)

LogisticRegression(C=1000000000000.0, fit_intercept=False, solver='liblinear')

## Training Data Performance 

In [11]:
y_pred = logreg.predict(X_train_full)
Loss = np.abs(y_train-y_pred)
print(Loss.value_counts())
print(Loss.value_counts(normalize=True))

0    604
1    108
Name: Survived, dtype: int64
0    0.848315
1    0.151685
Name: Survived, dtype: float64


# Performance on Test Data

## Preprocessing

In [12]:
#copy 
X_test_fillna = X_test.copy()
X_test_fillna.fillna({'Cabin':'Cabin missing', 'Embarked':'Embarked missing'}, inplace =True)
X_test_imputed = pd.DataFrame(imputer.transform(X_test_fillna[['Age']]), index=X_test_fillna.index, columns=['Age'] )
X_test_fillna['Age'] = X_test_imputed

In [13]:
#onehot encode
X_test_categorical = X_test_fillna[categorical_features].copy()
#fit
#transform
X_test_one_hot = pd.DataFrame(one_hot.transform(X_test_categorical), index=X_test_categorical.index,
                             columns=np.hstack(one_hot.categories_))

#test numerical 
X_test_numericals =X_test_fillna[numerical_feature].copy()
#fit
#transform 
X_test_scaled = pd.DataFrame(min_max.transform(X_test_numericals), index = X_test_numericals.index, 
                            columns = X_test_numericals.columns)


### concat the Dataframes 

In [14]:
X_test_full = pd.concat([X_test_scaled, X_test_one_hot], axis=1)
X_test_full

Unnamed: 0,Pclass,Age,SibSp,Fare,female,male,A10,A14,A16,A19,...,F33,F38,F4,G6,T,missing cabin,C,Q,S,missing embarked
709,1.0,0.365404,0.125,0.029758,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
439,0.5,0.384267,0.000,0.020495,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
840,1.0,0.246042,0.000,0.015469,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
720,0.5,0.070118,0.000,0.064412,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
39,1.0,0.170646,0.125,0.021942,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433,1.0,0.208344,0.000,0.013907,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
773,1.0,0.365404,0.000,0.014102,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25,1.0,0.472229,0.125,0.061264,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
84,0.5,0.208344,0.000,0.020495,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [16]:
#Prediction scores 
y_t_pred = logreg.predict(X_test_full)
loss_test = np.abs(y_test- y_t_pred)
print(loss_test.value_counts())
print(loss_test.value_counts(normalize=True))


0    97
1    82
Name: Survived, dtype: int64
0    0.541899
1    0.458101
Name: Survived, dtype: float64


We can see withtraining data model is performing poorly its test prediction is lower than train meaning it has underperformed


Lets tune our model to perform better 