# Load Libraries


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    roc_curve,
    confusion_matrix,
)

# Data preprocessing


## Load data


In [2]:
# Read the training and test CSV files, in that order, into two DataFrames.
train_data, test_data = map(
    pd.read_csv, ("./data/train.csv", "./data/test.csv")
)

In [3]:
# Preview the first five rows of the training frame.
train_data.head(n=5)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [4]:
# Preview the first five rows of the test frame.
test_data.head(n=5)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,15773898,Lucchese,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75
1,165035,15782418,Nott,683,France,Female,46.0,2,0.0,1,1.0,0.0,72549.27
2,165036,15807120,K?,656,France,Female,34.0,7,0.0,2,1.0,0.0,138882.09
3,165037,15808905,O'Donnell,681,France,Male,36.0,8,0.0,1,1.0,0.0,113931.57
4,165038,15607314,Higgins,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.0


## Data validation


In [5]:
# Number of missing values in each training column.
train_data.isnull().sum(axis=0)

id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [6]:
# Number of missing values in each test column.
test_data.isnull().sum(axis=0)

id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
dtype: int64

In [7]:
# How many training rows fall into each value of the target column.
train_data.loc[:, "Exited"].value_counts()

0    130113
1     34921
Name: Exited, dtype: int64

In [8]:
# Undersample the majority class: remove N randomly chosen rows whose
# Exited value is 0, keeping every other row of train_data.
N = 90000
train_data_balanced = train_data.drop(
    # The draw is seeded (same value as the train/validation split) so that
    # Restart & Run All discards the same rows every time; without it the
    # fitted models, metrics and submission differ from run to run.
    train_data[train_data["Exited"].eq(0)].sample(N, random_state=42).index
)

In [9]:
# Target distribution after the undersampling step.
train_data_balanced.loc[:, "Exited"].value_counts()

0    40113
1    34921
Name: Exited, dtype: int64

In [10]:
# Preview the first five rows that survived undersampling.
train_data_balanced.head(n=5)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
5,5,15771669,Genovese,588,Germany,Male,36.0,4,131778.58,1,1.0,0.0,136024.31,1
7,7,15669611,Chukwuebuka,678,Spain,Male,37.0,1,138476.41,1,1.0,0.0,106851.6,0
17,17,15679804,Esquivel,636,France,Male,36.0,4,117559.05,2,1.0,0.0,111573.3,0


In [11]:
# Remove the three identifier-style columns from the balanced frame.
train_data_balanced = train_data_balanced.drop(
    ["id", "CustomerId", "Surname"], axis="columns"
)

In [12]:
# Confirm the identifier columns are gone.
train_data_balanced.head(n=5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
5,588,Germany,Male,36.0,4,131778.58,1,1.0,0.0,136024.31,1
7,678,Spain,Male,37.0,1,138476.41,1,1.0,0.0,106851.6,0
17,636,France,Male,36.0,4,117559.05,2,1.0,0.0,111573.3,0


# Modeling


## Preprocessing


### Create pipeline


In [13]:
# Keep the label as its own Series, then leave only feature columns in the
# frame (which keeps its existing name for the cells that follow).
target_data = train_data_balanced["Exited"]
train_data_balanced = train_data_balanced.drop(columns="Exited")

In [14]:
# Columns routed to the categorical (one-hot) branch of the preprocessor.
categorical_cols = [
    "Geography", "Gender", "Tenure",
    "HasCrCard", "IsActiveMember",
]
# Every remaining feature column goes to the continuous branch.
continuous_cols = train_data_balanced.columns.difference(
    other=categorical_cols
)

In [15]:
# Display the list of columns that will be one-hot encoded.
categorical_cols

['Geography', 'Gender', 'Tenure', 'HasCrCard', 'IsActiveMember']

In [16]:
# Display the columns left over for the continuous (imputed + scaled) branch.
continuous_cols

Index(['Age', 'Balance', 'CreditScore', 'EstimatedSalary', 'NumOfProducts'], dtype='object')

In [17]:
# Continuous branch: mean-impute, then apply StandardScaler.
continuous_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: impute with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [18]:
# Route each column group through its own transformer pipeline.
preprocessor = ColumnTransformer(
    [
        ("num", continuous_transformer, continuous_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

### Split data


In [19]:
# Hold out 20% of the balanced rows for validation; the fixed seed keeps the
# partition identical between runs.
X_train, X_val, Y_train, Y_val = train_test_split(
    train_data_balanced,
    target_data,
    random_state=42,
    test_size=0.2,
)

### Create model pipeline


In [20]:
# Logistic regression on top of the shared preprocessor.
lr_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", LogisticRegression()),
    ]
)

In [21]:
# Random forest (150 trees) on top of the shared preprocessor.
rf_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        (
            "model",
            # Seeded so the bootstrap samples and feature draws, and hence
            # the reported metrics, are the same on every run.
            RandomForestClassifier(n_estimators=150, random_state=42),
        ),
    ]
)

In [22]:
# Support-vector classifier on top of the shared preprocessor.
svc_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        (
            "model",
            # probability=True enables predict_proba; random_state seeds the
            # data shuffling used for those probability estimates so the
            # ROC AUC and the submitted probabilities are reproducible.
            SVC(probability=True, random_state=42),
        ),
    ]
)

## Training


In [23]:
# Fit preprocessing + logistic regression on the training split.
lr_pipeline.fit(X=X_train, y=Y_train)

In [24]:
# Fit preprocessing + random forest on the training split.
rf_pipeline.fit(X=X_train, y=Y_train)

In [25]:
# Fit preprocessing + SVC on the training split.
svc_pipeline.fit(X=X_train, y=Y_train)

In [26]:
# Default score of the logistic-regression pipeline on the validation split.
lr_pipeline.score(X=X_val, y=Y_val)

0.7502498833877523

In [27]:
# Default score of the random-forest pipeline on the validation split.
rf_pipeline.score(X=X_val, y=Y_val)

0.7906976744186046

In [28]:
# Default score of the SVC pipeline on the validation split.
svc_pipeline.score(X=X_val, y=Y_val)

0.7963616978743253

## Evaluation

In [29]:
def evaluate_model(model_pipeline, X_val, Y_val):
    """Summarise a fitted classifier pipeline on a validation set.

    Parameters
    ----------
    model_pipeline
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    X_val
        Validation features, passed unchanged to the pipeline.
    Y_val
        True labels corresponding to ``X_val``.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns ``F1-Score``, ``Precision`` and
        ``Recall`` (the report's "weighted avg" entries), ``ROC AUC``
        (computed from the second ``predict_proba`` column) and ``Score``
        (accuracy of the hard predictions).

    Notes
    -----
    ``Score`` is read from the classification report instead of calling
    ``model_pipeline.score``. For scikit-learn classifiers ``score`` is the
    mean accuracy of ``predict``, so the value is the same while the
    validation set is only pushed through ``predict`` once.
    """
    predictions = model_pipeline.predict(X_val)

    report = classification_report(Y_val, predictions, output_dict=True)
    weighted = report["weighted avg"]

    # Column 1 of predict_proba is used as the score of the positive class.
    positive_proba = model_pipeline.predict_proba(X_val)[:, 1]
    roc_auc = roc_auc_score(Y_val, positive_proba)

    metrics_df = pd.DataFrame(
        {
            "F1-Score": [weighted["f1-score"]],
            "Precision": [weighted["precision"]],
            "Recall": [weighted["recall"]],
            "ROC AUC": [roc_auc],
            # Accuracy already computed by the report; avoids a second,
            # redundant prediction pass over X_val.
            "Score": [report["accuracy"]],
        }
    )

    return metrics_df

In [30]:
# Validation metrics for the logistic-regression pipeline.
evaluate_model(model_pipeline=lr_pipeline, X_val=X_val, Y_val=Y_val)

Unnamed: 0,F1-Score,Precision,Recall,ROC AUC,Score
0,0.749646,0.749893,0.75025,0.819232,0.75025


In [31]:
# Validation metrics for the random-forest pipeline.
evaluate_model(model_pipeline=rf_pipeline, X_val=X_val, Y_val=Y_val)

Unnamed: 0,F1-Score,Precision,Recall,ROC AUC,Score
0,0.790465,0.790474,0.790698,0.870125,0.790698


In [32]:
# Validation metrics for the SVC pipeline.
evaluate_model(model_pipeline=svc_pipeline, X_val=X_val, Y_val=Y_val)

Unnamed: 0,F1-Score,Precision,Recall,ROC AUC,Score
0,0.796159,0.796156,0.796362,0.862077,0.796362


# Prediction

In [33]:
# Drop the same identifier columns that were removed from the training
# frame, then take the second predict_proba column from the SVC pipeline.
X_test = test_data.drop(["id", "CustomerId", "Surname"], axis="columns")
test_predicted_proba = svc_pipeline.predict_proba(X_test)[:, 1]

# Pair each test id with its predicted probability.
submission_columns = {
    "id": test_data["id"],
    "Exited": test_predicted_proba,
}
submission_df = pd.DataFrame(submission_columns)
submission_df

Unnamed: 0,id,Exited
0,165034,0.147446
1,165035,0.834280
2,165036,0.128012
3,165037,0.669400
4,165038,0.778753
...,...,...
110018,275052,0.048241
110019,275053,0.168833
110020,275054,0.151117
110021,275055,0.343587


In [34]:
# Write the submission without the DataFrame index column. `index` is a
# boolean flag, so pass False explicitly instead of relying on None being
# falsy.
submission_df.to_csv("submission.csv", index=False)