In [1]:
# to handle datasets
import pandas as pd
import numpy as np

# for text / string processing
import re

# for plotting
import matplotlib.pyplot as plt
% matplotlib inline

# to divide train and test set
from sklearn.model_selection import train_test_split

# feature scaling
from sklearn.preprocessing import MinMaxScaler

# for tree binarisation
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score


# to build the models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# to evaluate the models
from sklearn.metrics import roc_auc_score
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

UsageError: Line magic function `%` not found.


In [2]:
# Read the training split into `data` and preview five randomly chosen rows.
data = pd.read_csv(filepath_or_buffer='train.csv')
data.sample(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
582,583,0,2,"Downton, Mr. William James",male,54.0,0,0,28403,26.0,,S
267,268,1,3,"Persson, Mr. Ernst Ulrik",male,25.0,1,0,347083,7.775,,S
342,343,0,2,"Collander, Mr. Erik Gustaf",male,28.0,0,0,248740,13.0,,S
618,619,1,2,"Becker, Miss. Marion Louise",female,4.0,2,1,230136,39.0,F4,S
660,661,1,1,"Frauenthal, Dr. Henry William",male,50.0,2,0,PC 17611,133.65,,S


### Data inspection

In [None]:
# Summary statistics of the training frame.
data.describe()


The dataset contains null values and a mix of data types (numeric and categorical).

In [3]:
# Drop the columns that are not used as model features.
# The result is assigned back (no inplace=True) and already-missing columns
# are ignored, so the cell can be re-run safely; the in-place version raised
# KeyError on a second execution because the columns were already gone.
data = data.drop(columns=['PassengerId', 'Name', 'Ticket'], errors='ignore')
data.sample(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
161,1,2,female,40.0,0,0,15.75,,S
753,0,3,male,23.0,0,0,7.8958,,S
359,1,3,female,,0,0,7.8792,,Q
41,0,2,female,27.0,1,0,21.0,,S
130,0,3,male,33.0,0,0,7.8958,,C


In [None]:
# Column names, dtypes and non-null counts of the training frame.
data.info()

In [None]:
# Number of passenger records (rows) in the training frame.
n_passengers = len(data.index)
print('Number  of passengers on the Titanic: ', n_passengers)

In [None]:
# Columns stored with object dtype are treated as the categorical variables.
categorical = data.columns[data.dtypes == 'O'].tolist()
print('There are {} categorical variables'.format(len(categorical)))

In [None]:
# Every column that is not object-typed is treated as numeric.
numeric = data.columns[data.dtypes != 'O'].tolist()
print('There are {} numeric variables'.format(len(numeric)))

In [None]:
# First rows of the categorical columns only.
data[categorical].head()

In [None]:
# Cardinality of each categorical variable.
# 'Ticket' was dropped from `data` in an earlier cell, so indexing it
# unconditionally raised KeyError; columns that are no longer present are
# skipped instead.
for var in ['Sex', 'Ticket', 'Cabin', 'Embarked']:
    if var in data.columns:
        print('unique labels in {} variable: '.format(var), len(data[var].unique()))

In [1]:
# Number of distinct values in the target and in each numeric feature.
for message, column in [
    ('unique values in Survived variable: ', 'Survived'),
    ('unique values in Passenger class variable: ', 'Pclass'),
    ('unique values in Sibling, Spouse variable: ', 'SibSp'),
    ('unique values in Parent, Child variable: ', 'Parch'),
    ('unique values in Fare variable: ', 'Fare'),
]:
    distinct = data[column].unique()
    print(message, len(distinct))

NameError: name 'data' is not defined

* 3 discrete variables: Passenger class (Pclass), Sibling/Spouse (SibSp) and Parent/Child (Parch)
* 2 continuous variables: Fare and Age
* 1 Id variable: PassengerId (it is a label for each of the passengers)
* 1 binary variable: Survived (target variable)

In [None]:
# Print the distinct values taken by each discrete variable.
discrete_vars = ['Pclass', 'SibSp', 'Parch']
for name in discrete_vars:
    distinct_values = data[name].unique()
    print(name, ' values: ', distinct_values)

In [None]:
# Pairwise correlation between the numeric columns.
# The string columns (Sex, Cabin, Embarked are not encoded yet at this point)
# are excluded explicitly: recent pandas versions no longer drop non-numeric
# columns silently in corr() and raise on them instead.
data.select_dtypes(include='number').corr()

### Missing values

In [None]:
# Number of missing values per column.
data.isnull().sum()

About 77% of the values are null in Cabin, 19% in Age, and less than 1% in Embarked.

In [4]:
# Impute missing values column by column.
# Each result is assigned back instead of calling fillna(inplace=True) on
# data[col]: that form mutates an intermediate Series and, with pandas
# copy-on-write, leaves `data` itself unchanged.
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])  # labelled category: most frequent value
data['Age'] = data['Age'].fillna(data['Age'].mean())  # continuous numeric variable: mean
data['Cabin'] = data['Cabin'].fillna('Missing')  # mostly-missing category: explicit 'Missing' label
data.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Cabin       0
Embarked    0
dtype: int64

### Outliers

In [5]:
# Inter-quartile range of every numeric column, as a first look at spread.
# Quantiles are taken on the numeric columns only: `data` still contains
# string columns here, and quantile() fails on those in recent pandas.
numeric_data = data.select_dtypes(include='number')
Q1 = numeric_data.quantile(0.25)
Q3 = numeric_data.quantile(0.75)
IQR = Q3 - Q1
print(IQR)

Survived     1.0000
Pclass       1.0000
Age         13.0000
SibSp        1.0000
Parch        0.0000
Fare        23.0896
dtype: float64


In [None]:
# Rows with a fare above 500, i.e. the most extreme fare values.
extreme_fare = data['Fare'] > 500
data[extreme_fare]

In [None]:
# Rows for passengers older than 70.
data[data['Age'] > 70]

In [None]:
import seaborn as sns

# Box plots of all numeric columns, drawn in the left panel of a 1x2 grid.
plt.figure(figsize=(20, 8))
plt.subplot(121)
fig = data.boxplot(column=numeric)

In [None]:
# Box plot of Age in the left panel of a 1x2 grid.
plt.figure(figsize=(15, 6))
plt.subplot(121)
fig = data.boxplot(column='Age')

In [None]:
# Box plot of Fare in the left panel of a 1x2 grid.
plt.figure(figsize=(15, 6))
plt.subplot(121)
fig = data.boxplot(column=['Fare'][0])

In [6]:
# Age: limits at three standard deviations either side of the mean.
age_mean = data.Age.mean()
age_std = data.Age.std()
Upper_boundary = age_mean + 3 * age_std
Lower_boundary = age_mean - 3 * age_std
print('Age outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_boundary, upperboundary=Upper_boundary))

# Fare: fences at three inter-quartile ranges beyond the quartiles.
fare_q1 = data.Fare.quantile(0.25)
fare_q3 = data.Fare.quantile(0.75)
IQR = fare_q3 - fare_q1
Lower_fence = fare_q1 - (IQR * 3)
Upper_fence = fare_q3 + (IQR * 3)
print('Fare outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))

Age outliers are values < -9.30692803094989 or > 68.70516332506742
Fare outliers are values < -61.358399999999996 or > 100.2688


In [7]:
# Floor and cap Age at the mean +/- 3 standard-deviation limits.
# (These limits are not quantile-based, despite the earlier comment.)
# The computed Lower_boundary / Upper_boundary are used instead of literals
# pasted from a previous output, so the caps stay in sync with the data.
data['Age'] = data['Age'].clip(lower=Lower_boundary, upper=Upper_boundary)
print(data['Age'].skew())

0.37820359295636596


In [None]:
# Box plot of Age in the left panel of a 1x2 grid.
plt.figure(figsize=(15, 6))
plt.subplot(121)
fig = data.boxplot(column='Age')


In [None]:
# How many passengers fall in the upper Fare ranges?
total_passengers = len(data)
print('total passengers: {}'.format(total_passengers))

paid_over_65 = data.loc[data['Fare'] > 65]
print('passengers that paid more than 65: {}'.format(len(paid_over_65)))

paid_over_100 = data.loc[data['Fare'] > 100]
print('passengers that paid more than 100: {}'.format(len(paid_over_100)))

In [8]:
# Floor and cap Fare at the quartile +/- 3*IQR fences.
# The computed Lower_fence / Upper_fence are used instead of literals pasted
# from a previous output, so the caps stay in sync with the data.
data['Fare'] = data['Fare'].clip(lower=Lower_fence, upper=Upper_fence)
print(data['Fare'].skew())

1.5592914741132582


In [None]:
# summary statistics of the frame after the Age and Fare capping above
data.describe()

How does survival target correlate with Age?

We can determine the relation between survival and Age by plotting the mean survival per Age. In this case, I will calculate the Survival rate per each year of Age. See below.

In [None]:
# Mean survival per distinct Age value, drawn as a line on a fresh figure.
plt.figure()
survival_by_age = data.groupby(['Age'])['Survived'].mean()
fig = survival_by_age.plot()
fig.set_title('Normal relationship between variable and target')
fig.set_ylabel('Survived')

### converting categorical data into numerical

In [9]:
# Boolean Series over the columns: True where the column has object dtype.
categorical_feature_mask = (data.dtypes == 'object')
# Names of the columns flagged by the mask, as a plain list.
categorical_cols = [name for name, is_categorical in categorical_feature_mask.items() if is_categorical]

In [10]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Integer-encode every categorical column with its own LabelEncoder.
# One fitted encoder per column is kept in `label_encoders`. The original
# refit a single shared `le` for each column, so only the last column's
# mapping survived and the others could not be reused or inverted later.
label_encoders = {col: LabelEncoder().fit(data[col]) for col in categorical_cols}
for col, encoder in label_encoders.items():
    data[col] = encoder.transform(data[col])
# `le` is still defined and, as before, ends up fitted on the last categorical column.
le = label_encoders[categorical_cols[-1]] if categorical_cols else LabelEncoder()
data[categorical_cols].sample(5)

Unnamed: 0,Sex,Cabin,Embarked
445,1,10,2
260,1,146,1
424,1,146,2
4,1,146,2
412,0,78,1


# Handling testing dataset

In [11]:
import os

# Directory holding the Titanic CSV files. It defaults to the location that
# was previously hard-coded; set the TITANIC_DATA_DIR environment variable to
# run the notebook on another machine without editing this cell.
titanic_dir = os.environ.get('TITANIC_DATA_DIR', 'D://Titanic')
test = pd.read_csv('{}/test.csv'.format(titanic_dir))
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [12]:
# Drop the same unused columns as in the training frame.
# Assigning the result and ignoring already-missing columns keeps the cell
# safe to re-run (the in-place drop raised KeyError the second time).
test = test.drop(columns=['PassengerId', 'Name', 'Ticket'], errors='ignore')
test.sample()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
256,3,male,,0,0,7.75,,Q


In [None]:
# Column names, dtypes and non-null counts of the test frame.
test.info()

In [None]:
# Number of missing values per column in the test frame.
test.isnull().sum()

In [13]:
# Impute missing values in the test set using statistics of the training
# frame `data`, so no information is taken from the test set itself.
# Results are assigned back instead of calling fillna(inplace=True) on
# test[col]: that form mutates an intermediate Series and, with pandas
# copy-on-write, leaves `test` itself unchanged.
test['Age'] = test['Age'].fillna(data['Age'].mean())  # continuous numeric variable: training mean
test['Fare'] = test['Fare'].fillna(data['Fare'].mean())  # continuous numeric variable: training mean
test['Cabin'] = test['Cabin'].fillna('Missing')  # mostly-missing category: explicit 'Missing' label
test.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Cabin       0
Embarked    0
dtype: int64

In [None]:
# converting categorical data into numeric for test data

In [14]:
# Boolean Series over the test columns: True where the column has object dtype.
categorical_feature_mask = (test.dtypes == 'object')
# Names of the columns flagged by the mask, as a plain list.
categorical_cols = [name for name, is_categorical in categorical_feature_mask.items() if is_categorical]

In [15]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le = LabelEncoder()
# apply le on categorical feature columns
# NOTE(review): the encoder is refit on the test columns here, so the integer
# codes come from the test set's own labels and need not match the codes
# given to the training columns. In the sample outputs the dominant Cabin
# code is 146 for the training frame but 76 for the test frame, which looks
# like the same 'Missing' label mapped to two different integers. Models
# fitted on `data` would then see inconsistent Cabin codes at prediction
# time. TODO confirm, then encode `test` with encoders fitted on the training
# data, with a policy for labels unseen in training.
test[categorical_cols] = test[categorical_cols].apply(lambda col: le.fit_transform(col))
test[categorical_cols].sample(5)

Unnamed: 0,Sex,Cabin,Embarked
121,1,76,1
182,0,54,2
47,1,76,1
341,1,76,2
345,0,76,2


In [36]:
import os

# Separate the features from the target in the training dataset.
# The target is removed by name rather than by position (iloc[:, 1:]), so the
# split does not depend on 'Survived' being the first column.
X_train = data.drop(columns=['Survived'])
y_train = data['Survived'].values.reshape(-1, 1)

# Load the labels that the test-set predictions are scored against.
# NOTE(review): gender_submission.csv looks like a sample-submission file
# rather than ground-truth outcomes. Confirm this before trusting any "test"
# metric computed against it.
# The directory defaults to the previously hard-coded location and can be
# overridden with the TITANIC_DATA_DIR environment variable.
titanic_dir = os.environ.get('TITANIC_DATA_DIR', 'D://Titanic')
submission = pd.read_csv('{}/gender_submission.csv'.format(titanic_dir))
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [37]:
# Drop the identifier column so only the 'Survived' labels remain.
# Assigning the result and ignoring an already-missing column keeps the cell
# safe to re-run (the in-place drop raised KeyError the second time).
submission = submission.drop(columns=['PassengerId'], errors='ignore')
submission.head()

Unnamed: 0,Survived
0,0
1,1
2,0
3,0
4,1


In [38]:
# Report the dimensions of the feature and label sets.
X_train_shape = X_train.shape
test_shape = test.shape
y_train_shape = y_train.shape
submission_shape = submission.shape
print('The Shape of X_train is', X_train_shape)
print('The Shape of test is', test_shape)
print('The Shape of y_train is', y_train_shape)
print('The Shape of submission is', submission_shape)

The Shape of X_train is (891, 8)
The Shape of test is (418, 8)
The Shape of y_train is (891, 1)
The Shape of submission is (418, 1)


In [39]:
# Logistic regression of Survived on the training features.
# NOTE: X_train is passed as-is, with no constant column added, so the
# fitted model has no intercept term; the summary output lists only the
# eight feature coefficients.
import statsmodels.api as sm
logit=sm.Logit(y_train, X_train)
# fit the model; `result` holds the estimated coefficients
result=logit.fit()

Optimization terminated successfully.
         Current function value: 0.470190
         Iterations 6


In [40]:
# summary of the logistic regression model
result.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,891.0
Model:,Logit,Df Residuals:,883.0
Method:,MLE,Df Model:,7.0
Date:,"Wed, 11 Dec 2019",Pseudo R-squ.:,0.2939
Time:,22:35:38,Log-Likelihood:,-418.94
converged:,True,LL-Null:,-593.33
Covariance Type:,nonrobust,LLR p-value:,2.253e-71

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Pclass,-0.2400,0.147,-1.628,0.104,-0.529,0.049
Sex,-2.4209,0.186,-13.015,0.000,-2.786,-2.056
Age,-0.0132,0.007,-2.006,0.045,-0.026,-0.000
SibSp,-0.4542,0.104,-4.365,0.000,-0.658,-0.250
Parch,-0.1893,0.113,-1.673,0.094,-0.411,0.032
Fare,0.0348,0.004,8.253,0.000,0.027,0.043
Cabin,0.0094,0.003,3.006,0.003,0.003,0.015
Embarked,0.0129,0.112,0.115,0.908,-0.206,0.232


In [41]:
# predicted values of the fitted model for the test-set rows
y_pred=result.predict(test)
# confusion matrix of the fitted model on the training data
# (the four cells in the output sum to 891, the number of training rows)
result.pred_table()

array([[467.,  82.],
       [106., 236.]])

In [42]:
from sklearn.metrics import accuracy_score

# Threshold the predicted probabilities at 0.5 and score them against the
# reference labels. accuracy_score takes (y_true, y_pred); the original call
# passed them the other way round. Accuracy is symmetric, so the number is
# unchanged, but the call was misleading and would give wrong results if it
# were swapped for an asymmetric metric.
y_pred_labels = (y_pred > 0.5).astype(int)
accuracy_score(submission['Survived'], y_pred_labels)

0.8133971291866029

In [49]:
# to build the models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Logistic-regression baseline, scored by ROC-AUC on the training rows and on
# the reference labels in `submission`.
# y_train is an (n, 1) column vector (see the shape printout). scikit-learn
# expects a 1-D target and otherwise emits a DataConversionWarning, which is
# hidden by the global warnings filter, so the target is flattened explicitly.
logit_model = LogisticRegression()
logit_model.fit(X_train, y_train.ravel())

pred = logit_model.predict_proba(X_train)
print('Logit train roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
pred = logit_model.predict_proba(test)
print('Logit test roc-auc: {}'.format(roc_auc_score(submission, pred[:,1])))

Logit train roc-auc: 0.8579634423033906
Logit test roc-auc: 0.9621586861891571


In [51]:
# Random-forest baseline, scored by ROC-AUC on the training rows and on the
# reference labels in `submission`.
# random_state pins the forest's random sampling so the scores are
# reproducible across runs; the (n, 1) target is flattened to the 1-D shape
# scikit-learn expects.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train.ravel())

pred = rf_model.predict_proba(X_train)
print('RF train roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
pred = rf_model.predict_proba(test)
print('RF test roc-auc: {}'.format(roc_auc_score(submission, pred[:,1])))

RF train roc-auc: 0.9955767530544638
RF test roc-auc: 0.885313612979818


In [52]:
# AdaBoost baseline, scored by ROC-AUC on the training rows and on the
# reference labels in `submission`.
# random_state is fixed for reproducibility; the (n, 1) target is flattened
# to the 1-D shape scikit-learn expects.
ada_model = AdaBoostClassifier(random_state=42)
ada_model.fit(X_train, y_train.ravel())

pred = ada_model.predict_proba(X_train)
print('Adaboost train roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
pred = ada_model.predict_proba(test)
print('Adaboost test roc-auc: {}'.format(roc_auc_score(submission, pred[:,1])))

Adaboost train roc-auc: 0.897016372138604
Adaboost test roc-auc: 0.9448209339137317


In [53]:
# Gradient-boosted trees baseline, scored by ROC-AUC on the training rows and
# on the reference labels in `submission`.
# eval_metric is set on the estimator rather than passed to fit(): recent
# xgboost releases no longer accept it as a fit() argument.
xgb_model = xgb.XGBClassifier(eval_metric="auc")

# The evaluation set is only monitored during training; no early stopping is
# requested, so it does not change the fitted model.
eval_set = [(test, submission)]
xgb_model.fit(X_train, y_train, eval_set=eval_set, verbose=False)

pred = xgb_model.predict_proba(X_train)
print('xgb train roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
pred = xgb_model.predict_proba(test)
print('xgb test roc-auc: {}'.format(roc_auc_score(submission, pred[:,1])))

xgb train roc-auc: 0.9299923305531589
xgb test roc-auc: 0.9295854768500198
