## Imports

In [1]:
import numpy    as np
import pandas   as pd
import seaborn  as sb
import matplotlib.pyplot as plt
import sklearn  as skl

from sklearn import pipeline      # Pipeline
from sklearn import preprocessing # OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import linear_model    # LogisticRegression
from sklearn import set_config

set_config(display='diagram') # Show estimators (e.g. the pipelines built below) as diagrams when displayed

# Print the versions of the main libraries this notebook was run with.
print("Pandas  ", pd.__version__)
print("Sklearn ", skl.__version__)

Pandas   1.2.4
Sklearn  0.24.1


## Get the dataset
- **CLOUD = True**: Download the dataset from Kaggle. Necessary for cloud environments like Colab. **Provide your [Kaggle credentials](https://www.kaggle.com/docs/api)** (never commit them to the notebook).
- **CLOUD = False**: Get the dataset from your local machine. **Specify the data path**.

In [2]:
CLOUD = False

if CLOUD:
    import os
    os.environ['KAGGLE_USERNAME'] = "pathirao"
    os.environ['KAGGLE_KEY']      = "ac072a8fdca7ad179bb77bb7f8b6b1e4"  # See https://www.kaggle.com/docs/api
    #!pip install --upgrade kaggle
    !kaggle competitions download -c titanic
    DATA_PATH = "./"

else:
    DATA_PATH = "../../titanic/"

## Load data

In [3]:
# Read the train and test CSV files, both indexed by PassengerId.
df, df_test = (
    pd.read_csv(DATA_PATH + csv_name, index_col='PassengerId')
    for csv_name in ("train.csv", "test.csv")
)

print("Train DataFrame:", df.shape)
print("Test DataFrame: ", df_test.shape)
df.head(5)

Train DataFrame: (891, 11)
Test DataFrame:  (418, 10)


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Check missing values

In [4]:
# Fraction of missing values per column (mean of the boolean null mask).
df.isnull().mean()

Survived    0.000000
Pclass      0.000000
Name        0.000000
Sex         0.000000
Age         0.198653
SibSp       0.000000
Parch       0.000000
Ticket      0.000000
Fare        0.000000
Cabin       0.771044
Embarked    0.002245
dtype: float64

In [5]:
# Fraction of missing values per column. Fare has missing values only in the TEST set!
df_test.isna().mean()

Pclass      0.000000
Name        0.000000
Sex         0.000000
Age         0.205742
SibSp       0.000000
Parch       0.000000
Ticket      0.000000
Fare        0.002392
Cabin       0.782297
Embarked    0.000000
dtype: float64

# Preprocessing
For X data:
- We drop Survived because it is the target variable.
- We drop Name for simplicity. (We could extract the title: Mr, Mrs, ...)
- We drop Ticket because it has too many distinct values to be informative as-is (see `df.Ticket.nunique()`).
- We drop Cabin because most of its values are missing (77%).

Then, we identify the **numerical** and **categorical** variables.

In [6]:
# Columns left out of the feature matrix in both the train and the test data.
unused_cols = ['Name', 'Ticket', 'Cabin']

y = df["Survived"]  # Target: 0 = No, 1 = Yes
x = df.drop(columns=["Survived"] + unused_cols)  # X data (will be split into train + validation)

x_test = df_test.drop(columns=unused_cols)  # X_test data (new data)

In [11]:
# Number of missing values per feature column in x.
x.isna().sum()

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [12]:
# Number of missing values per feature column in x_test.
x_test.isna().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

# Validation Strategy

Simple split:
- 80% for train
- 20% for validation

In [7]:
# Hold out 20% of the rows for validation.
# - stratify=y is always recommended for better validation.
# - a fixed random_state is recommended for reproducibility.
split_options = dict(test_size=0.2, stratify=y, random_state=4)
x_train, x_val, y_train, y_val = model_selection.train_test_split(x, y, **split_options)

# Prepro

In [8]:
# Columns handled as categories by the preprocessing pipeline.
cat_vars  = ['Sex', 'Embarked']
# Columns handled as numbers; note that the class code Pclass is treated as numeric here.
num_vars  = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

# Show both lists so the grouping can be checked at a glance.
print("\nNumerical features:\n", num_vars)
print("\nCategorical features:\n", cat_vars)


Numerical features:
 ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

Categorical features:
 ['Sex', 'Embarked']


In [None]:
# Numerical columns: fill missing values with the column mean, then standardize.
num_preprocessing = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='mean', add_indicator=False)), # alternative strategy: 'median'
    ('scaler', preprocessing.StandardScaler())
])

# add_indicator: when True, SimpleImputer appends a binary "was missing" column for
# each feature that had missing values at fit time. It is left False here so the
# numeric output keeps exactly one column per entry of num_vars.

# Categorical columns: replace missing values with the literal category 'missing',
# then one-hot encode. handle_unknown='ignore' encodes categories that were not
# seen during fit as all zeros instead of raising an error.
cat_preporcessing = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', preprocessing.OneHotEncoder(handle_unknown='ignore'))
])

# Route each group of columns to its own sub-pipeline; outputs are concatenated
# in the order listed (numerical first, then the one-hot columns).
prepro = compose.ColumnTransformer(transformers=[
    ('num', num_preprocessing, num_vars),
    ('cat', cat_preporcessing, cat_vars),
], remainder='drop') # Drop other vars not specified in num_vars or cat_vars

# Display the (still unfitted) preprocessor.
prepro

In [None]:
# Fit the preprocessor on the training split only, to avoid data leaks, and transform it.
x_train_prepro  = prepro.fit_transform(x_train)  # ONLY FIT THE PREPROCESSOR ON TRAIN TO AVOID DATA LEAKS !!!
# The validation split is only transformed, using what was learned from train.
x_val_prepro    = prepro.transform(x_val)

# Compare the resulting shapes of both splits.
x_train_prepro.shape, x_val_prepro.shape

# Check our preprocessed data

In [None]:
# Names of the one-hot encoded columns, in the order the encoder emits them.
onehot_encoder = prepro.named_transformers_["cat"]["onehot"]
# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2 in favour
# of get_feature_names_out; use whichever one the installed version provides.
if hasattr(onehot_encoder, "get_feature_names_out"):
    cat_vars_oh = onehot_encoder.get_feature_names_out(cat_vars).tolist()
else:
    cat_vars_oh = onehot_encoder.get_feature_names(cat_vars).tolist()
cat_vars_oh

In [None]:
pd.DataFrame(data = x_train_prepro, columns=num_vars+cat_vars_oh).head() # Preprocessed TRAIN data

In [None]:
pd.DataFrame(data = x_val_prepro, columns=num_vars+cat_vars_oh).head() # Preprocessed VALIDATION data

# Train Model

In [None]:
# Logistic regression trained on the preprocessed training features.
# max_iter=2000 sets the solver's iteration limit; random_state=1 fixes the seed.
model = linear_model.LogisticRegression(max_iter=2000, random_state=1)
model.fit(x_train_prepro, y_train)

## Full Pipeline: Prepro + Model

In [None]:
# Chain the preprocessor and the classifier into a single estimator that takes
# the raw feature columns as input.
# NOTE(review): this wraps the same `prepro` and `model` objects used above, so it
# relies on them having been fitted already; nothing is fitted in this cell.
full_model = pipeline.Pipeline([('preprocessor', prepro), ('model', model)])
full_model

# Validate

See performance metrics like:
- Accuracy
- Balanced Accuracy
- Confusion matrix

In [None]:
# Get Predictions
y_pred       = model.predict(x_val_prepro)
y_pred_proba = model.predict_proba(x_val_prepro)[:, 1]

y_pred[:3], y_pred_proba[:3]

In [None]:
# See metrics
print("Accuracy:          ", metrics.accuracy_score(y_val, y_pred)*100)
print("Balanced accuracy: ", metrics.balanced_accuracy_score(y_val, y_pred)*100)
print("Log loss:          ", metrics.log_loss(y_val, y_pred_proba))
print("AUC:               ", metrics.roc_auc_score(y_val, y_pred_proba)*100) # Area Under ROC Curve


# Confusion matrices: raw counts (left) and normalized over each true class (right).
# metrics.plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# Building the plot from confusion_matrix + ConfusionMatrixDisplay works on both the
# old and the current API, and y_pred already holds model.predict(x_val_prepro).
fig, axes = plt.subplots(1, 2, figsize=(8, 4))
panels = ((None, "Counts"), ("true", "Normalized by true class"))
for ax, (normalize, title) in zip(axes, panels):
    cm = metrics.confusion_matrix(y_val, y_pred, normalize=normalize)
    display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm,
                                             display_labels=model.classes_)
    display.plot(cmap=plt.cm.Blues, ax=ax)
    ax.set_title(title)
fig.tight_layout()

## See coefficients (Only for linear models)

In [None]:
# One coefficient per preprocessed feature, sorted in ascending order.
feature_names = num_vars + cat_vars_oh
coefs = (
    pd.DataFrame({"Coefs": model.coef_[0].tolist()}, index=feature_names)
    .sort_values(by="Coefs")
)
coefs

In [None]:
# Horizontal bar chart of the coefficients, in the sorted order of `coefs`;
# the trailing ';' hides the returned Axes repr.
coefs.plot.barh();