# Wrap Up

This last notebook in the research environment covers all the steps followed in the four previous notebooks and summarizes them in a single one.

In [117]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Show every column when a wide DataFrame is displayed (no column truncation).
# Uses the public pd.set_option entry point rather than the pd.pandas alias.
pd.set_option('display.max_columns', None)

# Preprocessing and Feature engineering
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import MinMaxScaler

# Feature selection
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# Model building
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

#Persist the model and the scaler
import joblib

# Warnings
import warnings
warnings.simplefilter(action='ignore')

## Drop Uninformative Variables

In the data analysis we found out that *PassengerId, Name, Ticket and Cabin* are variables that do not provide useful information to predict the likelihood of surviving.

In [118]:
# Load the raw training data from 'train.csv' in the working directory
data = pd.read_csv('train.csv')
# Report the (rows, columns) size; the last expression previews the first rows
print(data.shape)
data.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Split the dataset 

In [119]:
# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# The whole frame is passed as the first argument, so X_train / X_test still
# contain the 'Survived' column alongside the predictors.
X_train, X_test, y_train, y_test = train_test_split(data,
                                                    data['Survived'],
                                                    test_size=0.2,
                                                    random_state=0) 

## Selected Features

In [120]:
# Read the header-less file of selected features and keep its first column
# as a plain Python list
features = list(pd.read_csv('selected_features.csv', header=None)[0])

## Missing Values

### Categorical Variables

In [121]:
# Object-typed columns of the full dataset that have at least one missing value
cat_var_na = [
    col for col in data.columns
    if data[col].dtypes == 'O' and data[col].isnull().any()
]

# Fraction of missing values in each of those columns
data[cat_var_na].isnull().mean()

Cabin       0.771044
Embarked    0.002245
dtype: float64

In [122]:
# Give missing categorical values their own explicit label, "Missing",
# in both partitions
for part in (X_train, X_test):
    part[cat_var_na] = part[cat_var_na].fillna('Missing')

# Confirm that no missing values remain (every fraction should be 0.0)
for part in (X_train, X_test):
    display(part[cat_var_na].isnull().mean())

Cabin       0.0
Embarked    0.0
dtype: float64

Cabin       0.0
Embarked    0.0
dtype: float64

### Numerical variables

We will handle missing values in the numerical variables in two steps:

1. add a binary missing value indicator variable
2. replace the missing values in the original variable with the mode

In [123]:
# make a list with the numerical variables that contain missing values.
# The columns are taken from X_train itself, so the cell runs on a fresh
# kernel without depending on a frame defined outside this notebook.
num_var_na = [
    var for var in X_train.columns
    if X_train[var].isnull().sum() > 0 and X_train[var].dtypes != 'O'
]

# print percentage of missing values per variable
display(X_train[num_var_na].isnull().mean())
display(X_test[num_var_na].isnull().mean())

Age    0.198034
dtype: float64

Age    0.201117
dtype: float64

In [124]:
for var in num_var_na:

    # The fill value is learned from the train partition only
    mode_val = X_train[var].mode()[0]

    for partition in (X_train, X_test):
        # flag rows whose value was originally missing (1) or present (0)
        partition[var + '_na'] = np.where(partition[var].isnull(), 1, 0)

        # then fill the gaps with the training mode
        partition[var] = partition[var].fillna(mode_val)

# Both partitions should now report zero missing values
display(X_train[num_var_na].isnull().sum())
display(X_test[num_var_na].isnull().sum())

Age    0
dtype: int64

Age    0
dtype: int64

## Variable Transformation

In the data analysis notebook there were two numerical variables that are not normally distributed: *Age* and *Fare*.

*Age* can be approximated to a normal distribution using a log transformation. 

However, the *Fare* variable does not admit a log transformation, as it contains 0 and negative values. Other transformations, such as squaring the variable, do not work either. We will limit ourselves to transforming *Fare* into a non-negative variable (zero or positive).

In [125]:
# Apply the natural logarithm to Age in both partitions
for part in (X_train, X_test):
    part['Age'] = np.log(part['Age'])

In [126]:
# Make Fare non-negative by taking its absolute value in both partitions
X_train['Fare'] = X_train['Fare'].abs()
X_test['Fare'] = X_test['Fare'].abs()

In [127]:
# Engineered columns of the train set that still hold missing values
# (an empty list means there are none)
[col for col in ('Age', 'Fare') if X_train[col].isnull().any()]

[]

In [128]:
# check the test set does not contain missing values in the engineered variables.
# The comprehension variable and the column tested are the same name, so each
# of 'Age' and 'Fare' is really inspected (and the builtin `vars` is not shadowed).
[var for var in ['Age', 'Fare'] if X_test[var].isnull().sum() > 0]

[]

## Encode Categorical Variables

In this section, the categorical variables will be encoded to turn the strings into numbers. The idea is to capture the monotonic relationship between the label and the target.

To achieve that, each category of a categorical variable is assigned a lower or higher integer depending on whether it decreases or increases the likelihood of surviving.

In [129]:
def transform_cat(train, test, var, target):
    """Replace the labels of a categorical column with ordinal integers.

    Labels are ranked by the mean of ``target`` observed in ``train``: the
    label with the lowest mean becomes 0, the next one 1, and so on. The
    resulting mapping is applied to column ``var`` of both ``train`` and
    ``test``. Both frames are modified in place and nothing is returned.

    A label that is not part of the mapping (for example one that only
    appears in ``test``) is turned into NaN by ``Series.map``.
    """
    # mean of the target per label, sorted from lowest to highest
    target_mean_by_label = train.groupby([var])[target].mean().sort_values()

    # label -> 0-based position in that ordering
    label_to_rank = dict(
        zip(target_mean_by_label.index, range(len(target_mean_by_label)))
    )

    # swap the strings for their integer rank in both partitions
    for frame in (train, test):
        frame[var] = frame[var].map(label_to_rank)

In [130]:
# list of categorical variables to encode.
# The columns are read from X_train itself, so the cell runs on a fresh
# kernel without depending on a frame defined outside this notebook.
# PassengerId, Name, Ticket and Cabin were flagged as uninformative in the
# data analysis and are skipped: labels of such columns that appear only in
# the test partition would be absent from the training mapping and be
# mapped to NaN.
uninformative = ['PassengerId', 'Name', 'Ticket', 'Cabin']

cat_var = [
    var for var in X_train.columns
    if X_train[var].dtypes == 'O' and var not in uninformative
]

for var in cat_var:
    transform_cat(X_train, X_test, var, 'Survived')

In [131]:
# Columns of the training set that still contain missing values
# (an empty list means there are none)
X_train.columns[X_train.isnull().any()].tolist()

[]

In [132]:
# Columns of the test set that still contain missing values
# (an empty list means there are none)
X_test.columns[X_test.isnull().any()].tolist()

[]

## Feature Scaling

Linear models need the features to be scaled or normalised. In this case, we will scale the features so they are all between the same minimum and maximum values. To do that we make use of Scikit-Learn.

In [133]:
# capture the target from each partition's own 'Survived' column;
# this rebinds the y_train / y_test names returned by train_test_split
y_train = X_train['Survived']
y_test = X_test['Survived']

In [134]:
# Create the scaler and fit it on the selected features of the train
# partition only
scaler = MinMaxScaler().fit(X_train[features])

# Persist the fitted scaler for future use
joblib.dump(scaler, 'scaler.pkl')

# Replace both partitions with the scaled values of the selected features.
# NOTE: X_train / X_test are rebound here, so re-running this cell without
# re-running the earlier ones will most likely fail.
X_train, X_test = (
    scaler.transform(X_train[features]),
    scaler.transform(X_test[features]),
)

## Feature Selection

As a rule of thumb, you do not include the feature selection stage in your production code, as it has some drawbacks. So, even though we performed the feature selection in the previous notebook, we will not show it here.

## Build the Model

In [135]:
# Set up the model (random_state fixed at 0)
clf = LogisticRegression(random_state=0)

# train the model on the training features and target
clf.fit(X_train, y_train)

# Persist the fitted model for future use; joblib.dump is the last expression
# of the cell, so its return value is what the cell displays
joblib.dump(clf, 'logistic_regressor.pkl')

['logistic_regressor.pkl']

In [136]:
# Obtain the predicted class for every row of the test partition
pred = clf.predict(X_test)

In [137]:
# Evaluate the model on the test partition with a classification report
# and a confusion matrix

# per-class precision / recall / f1-score and support
print(classification_report(y_test, pred))

# confusion matrix of true labels (y_test) against predictions (pred)
cm = confusion_matrix(y_test, pred)
print('Confusion Matrix\n', cm)

              precision    recall  f1-score   support

           0       0.83      0.83      0.83       110
           1       0.73      0.74      0.73        69

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179

Confusion Matrix
 [[91 19]
 [18 51]]
