# 2. Evaluating models

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report

X_raw = pd.read_pickle("data/X_raw.pkl")
y_raw = pd.read_pickle("data/y_raw.pkl")


NUMERIC_FEATURES = ["CreditScore", "Age", "Balance", "Tenure", "EstimatedSalary"]
CATEGORICAL_FEATURES = ["Geography", "Gender", "NumOfProducts", "HasCrCard", "IsActiveMember"]

## Converting and generating features

In [2]:
def get_features(X_raw):
    return pd.concat([
            pd.get_dummies(X_raw[CATEGORICAL_FEATURES]).astype(float),
            X_raw[NUMERIC_FEATURES].astype(float),
        ], axis=1)

X = get_features(X_raw)
X.head()

Unnamed: 0_level_0,NumOfProducts,HasCrCard,IsActiveMember,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male,CreditScore,Age,Balance,Tenure,EstimatedSalary
CustomerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
15634602,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,619.0,42.0,0.0,2.0,101348.88
15647311,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,608.0,41.0,83807.86,1.0,112542.58
15619304,3.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,502.0,42.0,159660.8,8.0,113931.57
15701354,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,699.0,39.0,0.0,1.0,93826.63
15737888,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,850.0,43.0,125510.82,2.0,79084.1


## Train-test split


In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y_raw["Exited"].to_numpy().flatten(), train_size=8000, random_state=42)

print(f"""
Training set: {X_train.shape}, labels: {y_train.shape},
Test set:     {X_test.shape}, labels: {y_test.shape}
""")


Training set: (8000, 13), labels: (8000,),
Test set:     (2000, 13), labels: (2000,)



In [4]:
assert np.isclose(y_train.sum() / y_train.shape[0], y_test.sum() / y_test.shape[0], rtol=0.05)

## Models
### Baseline model

In [5]:
from sklearn.linear_model import LogisticRegression

pipe_logistic_regression = make_pipeline(
    StandardScaler(),
    LogisticRegression(penalty="none")
)

pipe_logistic_regression.fit(X_train, y_train)


outputs_LR = {
    "y_train": y_train,
    "y_test": y_test,
}
outputs_LR["y_train_pred"] = pipe_logistic_regression.predict(X_train)
outputs_LR["y_test_pred"] = pipe_logistic_regression.predict(X_test)

In [6]:
def full_report(outputs, pipe):
    target_names = ["stayed", "churned"]
    print(f"Model: {pipe[-1].__class__.__name__}")
    print("=" * 60)
    print("Training:\n", classification_report(outputs["y_train"], outputs["y_train_pred"], target_names=target_names, digits=4))
    print("Test:\n", classification_report(outputs["y_test"], outputs["y_test_pred"], target_names=target_names, digits=4))
    
full_report(outputs_LR, pipe_logistic_regression)

Model: LogisticRegression
Training:
               precision    recall  f1-score   support

      stayed     0.8268    0.9644    0.8903      6356
     churned     0.6143    0.2190    0.3229      1644

    accuracy                         0.8113      8000
   macro avg     0.7206    0.5917    0.6066      8000
weighted avg     0.7831    0.8113    0.7737      8000

Test:
               precision    recall  f1-score   support

      stayed     0.8309    0.9602    0.8909      1607
     churned     0.5524    0.2010    0.2948       393

    accuracy                         0.8110      2000
   macro avg     0.6917    0.5806    0.5928      2000
weighted avg     0.7762    0.8110    0.7737      2000



### Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier

pipe_decision_tree = make_pipeline(
    StandardScaler(),
    DecisionTreeClassifier()
)

pipe_decision_tree.fit(X_train, y_train)


outputs_DT = {
    "y_train": y_train,
    "y_test": y_test,
}
outputs_DT["y_train_pred"] = pipe_decision_tree.predict(X_train)
outputs_DT["y_test_pred"] = pipe_decision_tree.predict(X_test)

full_report(outputs_DT, pipe_decision_tree)

Model: DecisionTreeClassifier
Training:
               precision    recall  f1-score   support

      stayed     1.0000    1.0000    1.0000      6356
     churned     1.0000    1.0000    1.0000      1644

    accuracy                         1.0000      8000
   macro avg     1.0000    1.0000    1.0000      8000
weighted avg     1.0000    1.0000    1.0000      8000

Test:
               precision    recall  f1-score   support

      stayed     0.8753    0.8475    0.8612      1607
     churned     0.4482    0.5064    0.4755       393

    accuracy                         0.7805      2000
   macro avg     0.6618    0.6770    0.6684      2000
weighted avg     0.7914    0.7805    0.7854      2000



### Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier

pipe_random_forest = make_pipeline(
    StandardScaler(),
    RandomForestClassifier()
)

pipe_random_forest.fit(X_train, y_train)


outputs_RF = {
    "y_train": y_train,
    "y_test": y_test,
}
outputs_RF["y_train_pred"] = pipe_random_forest.predict(X_train)
outputs_RF["y_test_pred"] = pipe_random_forest.predict(X_test)

full_report(outputs_RF, pipe_random_forest)

Model: RandomForestClassifier
Training:
               precision    recall  f1-score   support

      stayed     0.9998    1.0000    0.9999      6356
     churned     1.0000    0.9994    0.9997      1644

    accuracy                         0.9999      8000
   macro avg     0.9999    0.9997    0.9998      8000
weighted avg     0.9999    0.9999    0.9999      8000

Test:
               precision    recall  f1-score   support

      stayed     0.8826    0.9633    0.9212      1607
     churned     0.7602    0.4758    0.5853       393

    accuracy                         0.8675      2000
   macro avg     0.8214    0.7196    0.7532      2000
weighted avg     0.8585    0.8675    0.8552      2000



### XGBoost model

In [9]:
from xgboost import XGBClassifier

pipe_xgb = make_pipeline(
    StandardScaler(),
    XGBClassifier()
)

pipe_xgb.fit(X_train, y_train)


outputs_XGB = {
    "y_train": y_train,
    "y_test": y_test,
}
outputs_XGB["y_train_pred"] = pipe_xgb.predict(X_train)
outputs_XGB["y_test_pred"] = pipe_xgb.predict(X_test)

full_report(outputs_XGB, pipe_xgb)





Model: XGBClassifier
Training:
               precision    recall  f1-score   support

      stayed     0.9533    0.9931    0.9728      6356
     churned     0.9681    0.8120    0.8832      1644

    accuracy                         0.9559      8000
   macro avg     0.9607    0.9026    0.9280      8000
weighted avg     0.9564    0.9559    0.9544      8000

Test:
               precision    recall  f1-score   support

      stayed     0.8870    0.9477    0.9164      1607
     churned     0.7032    0.5064    0.5888       393

    accuracy                         0.8610      2000
   macro avg     0.7951    0.7270    0.7526      2000
weighted avg     0.8509    0.8610    0.8520      2000



## Summary(?)

In [10]:
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

outputs = [
    outputs_LR, outputs_DT, outputs_RF, outputs_XGB
]

pipes = [
    pipe_logistic_regression,
    pipe_decision_tree,
    pipe_random_forest,
    pipe_xgb,
]

models_cols = [p[-1].__class__.__name__ for p in pipes]


def metrics_report(outputs, metrics_fn):
    df = pd.concat([
        pd.DataFrame([
            metrics_fn(o["y_train"], o[f"y_train_pred"]) for o in outputs
        ], columns=["train"], index=models_cols).T,
        pd.DataFrame([
            metrics_fn(o["y_test"], o[f"y_test_pred"]) for o in outputs
        ], columns=["test"], index=models_cols).T,
    ], axis=0).T
    df.name = metrics_fn.__name__
    return df

In [11]:
metrics_report(outputs, recall_score)

Unnamed: 0,train,test
LogisticRegression,0.218978,0.201018
DecisionTreeClassifier,1.0,0.506361
RandomForestClassifier,0.999392,0.475827
XGBClassifier,0.812044,0.506361


In [12]:
metrics_report(outputs, precision_score)

Unnamed: 0,train,test
LogisticRegression,0.614334,0.552448
DecisionTreeClassifier,1.0,0.448198
RandomForestClassifier,1.0,0.760163
XGBClassifier,0.968093,0.70318


In [13]:
metrics_report(outputs, f1_score)

Unnamed: 0,train,test
LogisticRegression,0.32287,0.294776
DecisionTreeClassifier,1.0,0.475508
RandomForestClassifier,0.999696,0.58529
XGBClassifier,0.883229,0.588757


## Experiment - Reducing features

In [14]:
DEMOGRAPHIC_FEATURES = [c for c in X.columns if c.startswith("Geography") or c.startswith("Gender")] + ["Age"]

X_v1 = X[DEMOGRAPHIC_FEATURES + ["IsActiveMember", "NumOfProducts"]]

X_train, X_test, y_train, y_test = train_test_split(X_v1, y_raw["Exited"].to_numpy().flatten(), train_size=8000, random_state=42)

print(f"""
Training set: {X_train.shape}, labels: {y_train.shape},
Test set:     {X_test.shape}, labels: {y_test.shape}
""")

pipe_xgb_v1 = make_pipeline(
    StandardScaler(),
    XGBClassifier()
)

pipe_xgb_v1.fit(X_train, y_train)


outputs_XGB_v1 = {
    "y_train": y_train,
    "y_test": y_test,
}
outputs_XGB_v1["y_train_pred"] = pipe_xgb_v1.predict(X_train)
outputs_XGB_v1["y_test_pred"] = pipe_xgb_v1.predict(X_test)

outputs.append(outputs_XGB_v1)
pipes.append(pipe_xgb_v1)
models_cols = [p[-1].__class__.__name__ for p in pipes]

metrics_report(outputs, recall_score)


Training set: (8000, 8), labels: (8000,),
Test set:     (2000, 8), labels: (2000,)







Unnamed: 0,train,test
LogisticRegression,0.218978,0.201018
DecisionTreeClassifier,1.0,0.506361
RandomForestClassifier,0.999392,0.475827
XGBClassifier,0.812044,0.506361
XGBClassifier,0.500608,0.475827


No... the other features still seem to be useful.

### Experiment 2 - adding regularization

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y_raw["Exited"].to_numpy().flatten(), train_size=8000, random_state=42)

print(f"""
Training set: {X_train.shape}, labels: {y_train.shape},
Test set:     {X_test.shape}, labels: {y_test.shape}
""")

pipe_xgb_v2 = make_pipeline(
    StandardScaler(),
    XGBClassifier(reg_lambda=0.001)
)

pipe_xgb_v2.fit(X_train, y_train)


outputs_XGB_v2 = {
    "y_train": y_train,
    "y_test": y_test,
}
outputs_XGB_v2["y_train_pred"] = pipe_xgb_v2.predict(X_train)
outputs_XGB_v2["y_test_pred"] = pipe_xgb_v2.predict(X_test)

outputs.append(outputs_XGB_v2)
pipes.append(pipe_xgb_v2)
models_cols = [p[-1].__class__.__name__ for p in pipes]

metrics_report(outputs, recall_score)


Training set: (8000, 13), labels: (8000,),
Test set:     (2000, 13), labels: (2000,)







Unnamed: 0,train,test
LogisticRegression,0.218978,0.201018
DecisionTreeClassifier,1.0,0.506361
RandomForestClassifier,0.999392,0.475827
XGBClassifier,0.812044,0.506361
XGBClassifier,0.500608,0.475827
XGBClassifier,0.864964,0.503817


Not really either... Tried with both L1 and L2 regularizaiton, but it fails to increase the test set recall.
It only increases the training set recall or other metrics.

I am so focused on recall because, I believe this metrics has the most business value.
While a model may be better in terms of f1 score or precision (not to mention the accuracy, which is a bad metrics here anyway),
recall will allow us to pick the customers we believe may be about to churn.
Therefore, it shoudl cost the company less to offer some benefits to prospective non-churners (focusing on precision)
rather than not to detect a genuine churner.
Since recall (sensitivity) is the ratio of the true positives to a sum of true positives and false negatives,
this is the metrics that focuses on _detection_ of the prospective churners.