In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from mlxtend.feature_selection import ExhaustiveFeatureSelector
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# ****DATA PREPROCESSING****

Let's read in the training dataset into a dataframe using pandas and take a look at a summary of what it consists of

In [3]:
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Now let's do the same for the test dataset

In [4]:
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


We can see we have missing values in the age, fare and embarked columns so we need to clean this up first.

The following is a nice function written by @SIVA HEMANG that replaces the age and fare null values with their respective medians when grouped by Pclass and sex.

As there are only 2 samples in the training dataset wiht missing embarked features we can afford to omit these from the dataset.

In [5]:
def cleanData(data):
    
    # Data missing Case2
    data['Age'] = data.groupby(['Pclass','Sex'])['Age'].transform(lambda x: x.fillna(x.median()))
    
    # FARE Data missing in test
    data['Fare'] = data.groupby(['Pclass','Sex'])['Fare'].transform(lambda x: x.fillna(x.median()))

    # Data missing Case3
    data.dropna(axis=0, subset=['Embarked'], inplace=True)
    
    return data

Create nice clean training and testing data using the above function and take a look at a summary of them both to verify all null vaules have been dealt with

In [6]:
clean_train = cleanData(train_data)
clean_test = cleanData(test_data)

In [7]:
clean_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    object 
 5   Age          889 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Cabin        202 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 90.3+ KB


In [8]:
clean_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          418 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         418 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 39.2+ KB


Looks good :-)

Now let's choose our feature columns. I don't think passengerID, name, ticket or cabin would be predictive of survival so I am going to choose Pclass, sex, age, sibsp, parch, fare and embarked.

I will seperate out the y_train (target) and x_train (features) and one hot encode the sex and embarked columns and check the summary info to verify this has all worked as expected.

In [9]:
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
y_train = clean_train["Survived"]
X_train = pd.get_dummies(clean_train[features])
X_train.info()
print(f"\n Number of samples in y = {len(y_train)}")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      889 non-null    int64  
 1   Age         889 non-null    float64
 2   SibSp       889 non-null    int64  
 3   Parch       889 non-null    int64  
 4   Fare        889 non-null    float64
 5   Sex_female  889 non-null    uint8  
 6   Sex_male    889 non-null    uint8  
 7   Embarked_C  889 non-null    uint8  
 8   Embarked_Q  889 non-null    uint8  
 9   Embarked_S  889 non-null    uint8  
dtypes: float64(2), int64(3), uint8(5)
memory usage: 46.0 KB

 Number of samples in y = 889


Now I'm going to create equivalent X_test data by selecting the same features and perform the same one hot encoding

In [10]:
X_test = pd.get_dummies(clean_test[features])
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      418 non-null    int64  
 1   Age         418 non-null    float64
 2   SibSp       418 non-null    int64  
 3   Parch       418 non-null    int64  
 4   Fare        418 non-null    float64
 5   Sex_female  418 non-null    uint8  
 6   Sex_male    418 non-null    uint8  
 7   Embarked_C  418 non-null    uint8  
 8   Embarked_Q  418 non-null    uint8  
 9   Embarked_S  418 non-null    uint8  
dtypes: float64(2), int64(3), uint8(5)
memory usage: 21.6 KB


Final bit of preprocessing I am going to scale the train and test features with a robust scaler

I have chosen robust scaling as I belevie minmax scaling will not handle the outliers and the data is not normally distributed so not well suited to a standard scaler.

In [11]:
X_sc = RobustScaler()
X_train = X_sc.fit_transform(X_train)
X_test= X_sc.transform(X_test)

# ****MODEL TRAINING****

Finally, let's train some models and see which one works best with it's "out of the box" default hyperparameters

I'll be carrying out exhaustive feature selection on each model individually using 5 fold cross validation on the whole training dataset. Assessing performance using the best mean CV accuracy score for each of the following models:
1. Decision Tree
2. Random Forest
3. K Nearest Neighbors
4. Logistic Regression
5. Gradient Boosting classifier
6. Light Gradient Boosting classifier
7. EXtreme Gradient Boosting classifier

In [12]:
DT_model = DecisionTreeClassifier()
EFS_DT_model = ExhaustiveFeatureSelector(DT_model, 
                           min_features=1,
                           max_features=10,
                           scoring='accuracy',
                           print_progress=True,
                           cv=5)

EFS_DT_model = EFS_DT_model.fit(X_train, y_train)

print('Best accuracy score: %.3f' % EFS_DT_model.best_score_)
print('Best subset (indices):', EFS_DT_model.best_idx_)

Features: 1023/1023

Best accuracy score: 0.819
Best subset (indices): (0, 4, 5, 8)


In [13]:
RF_model = RandomForestClassifier()
EFS_RF_model = ExhaustiveFeatureSelector(RF_model, 
                           min_features=1,
                           max_features=10,
                           scoring='accuracy',
                           print_progress=True,
                           cv=5)

EFS_RF_model = EFS_RF_model.fit(X_train, y_train)

print('Best accuracy score: %.3f' % EFS_RF_model.best_score_)
print('Best subset (indices):', EFS_RF_model.best_idx_)

Features: 1023/1023

Best accuracy score: 0.819
Best subset (indices): (0, 1, 2, 3, 4, 5, 6, 8)


In [14]:
KNN_model = KNeighborsClassifier()
EFS_KNN_model = ExhaustiveFeatureSelector(KNN_model, 
                           min_features=1,
                           max_features=10,
                           scoring='accuracy',
                           print_progress=True,
                           cv=5)

EFS_KNN_model = EFS_KNN_model.fit(X_train, y_train)

print('Best accuracy score: %.3f' % EFS_KNN_model.best_score_)
print('Best subset (indices):', EFS_KNN_model.best_idx_)

Features: 1023/1023

Best accuracy score: 0.818
Best subset (indices): (0, 1, 4, 5, 6, 9)


In [15]:
LogReg_model = LogisticRegression()
EFS_LogReg_model = ExhaustiveFeatureSelector(LogReg_model, 
                           min_features=1,
                           max_features=10,
                           scoring='accuracy',
                           print_progress=True,
                           cv=5)

EFS_LogReg_model = EFS_LogReg_model.fit(X_train, y_train)

print('Best accuracy score: %.3f' % EFS_LogReg_model.best_score_)
print('Best subset (indices):', EFS_LogReg_model.best_idx_)

Features: 1023/1023

Best accuracy score: 0.808
Best subset (indices): (0, 1, 2, 5, 7)


In [16]:
GB_model = GradientBoostingClassifier()
EFS_GB_model = ExhaustiveFeatureSelector(GB_model, 
                           min_features=1,
                           max_features=10,
                           scoring='accuracy',
                           print_progress=True,
                           cv=5)

EFS_GB_model = EFS_GB_model.fit(X_train, y_train)

print('Best accuracy score: %.3f' % EFS_GB_model.best_score_)
print('Best subset (indices):', EFS_GB_model.best_idx_)

Features: 1023/1023

Best accuracy score: 0.832
Best subset (indices): (0, 1, 2, 3, 4, 5, 6, 8)


In [17]:
LGB_model = LGBMClassifier()
EFS_LGB_model = ExhaustiveFeatureSelector(LGB_model, 
                           min_features=1,
                           max_features=10,
                           scoring='accuracy',
                           print_progress=True,
                           cv=5)

EFS_LGB_model = EFS_LGB_model.fit(X_train, y_train)

print('Best accuracy score: %.3f' % EFS_LGB_model.best_score_)
print('Best subset (indices):', EFS_LGB_model.best_idx_)

Features: 1023/1023

Best accuracy score: 0.829
Best subset (indices): (0, 1, 2, 4, 5, 7, 8, 9)


In [18]:
XGB_model = XGBClassifier()
EFS_XGB_model = ExhaustiveFeatureSelector(XGB_model, 
                           min_features=1,
                           max_features=10,
                           scoring='accuracy',
                           print_progress=True,
                           cv=5)

EFS_XGB_model = EFS_XGB_model.fit(X_train, y_train)

print('Best accuracy score: %.3f' % EFS_XGB_model.best_score_)
print('Best subset (indices):', EFS_XGB_model.best_idx_)

Features: 1023/1023

Best accuracy score: 0.829
Best subset (indices): (0, 1, 2, 3, 5)


# ****MODEL TUNING****

The gradient boosting classifier won!

Now I want to tune this model using its optimal selected features to see if I can improve it's accuracy.

I will use exhaustive gridsearch on the following 3 hyperparameters:
1. Loss function used
2. Number of estimators
3. Maximum depth

Each permutation of these hyperparameters will be assessed using 5 fold cross validation as before for consistency


In [19]:
X_train_select = EFS_GB_model.transform(X_train)
X_test_select = EFS_GB_model.transform(X_test)

In [20]:
param_grid={'loss' : ["log_loss", "deviance", "exponential"], 
                         'n_estimators' : range(1, 150), 
                         'max_depth' : range(1, 8)}
                        
grid = GridSearchCV(estimator=GB_model, param_grid = param_grid, 
                    refit = "true", verbose =1, n_jobs=-1)
grid.fit(X_train_select, y_train)
print("\n Best parameters are: " +str(grid.best_params_))
print("\n Best score is: " +str(grid.best_score_))

Fitting 5 folds for each of 3129 candidates, totalling 15645 fits


5215 fits failed out of a total of 15645.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5215 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.7/site-packages/sklearn/ensemble/_gb.py", line 525, in fit
    self._check_params()
  File "/opt/conda/lib/python3.7/site-packages/sklearn/ensemble/_gb.py", line 282, in _check_params
    raise ValueError("Loss '{0:s}' not supported. ".format(self.loss))
ValueError: Loss 'log_loss' not supported. 




 Best parameters are: {'loss': 'exponential', 'max_depth': 6, 'n_estimators': 61}

 Best score is: 0.840297086269282


In [21]:
GB_scores = cross_val_score(grid.best_estimator_, X_train_select, y_train, cv=5)
print(GB_scores)
print(GB_scores.mean())
print(GB_scores.std())

[0.81460674 0.83146067 0.87078652 0.81460674 0.85875706]
0.8380435472608392
0.022991536254466146


# ****PREDICTIONS AND OUTPUT****

I'm pleased to see an improvement from a mean accuracy score of 0.8245 to 0.8414 and a reduction in the standard deviation from 0.0215 to 0.0200

Let's use this tuned model to make our predictions.......

In [22]:
predictions = grid.predict(X_test_select)

In [23]:
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
