# Homework 04


In [1]:
from pathlib import Path

import altair as alt
import numpy as np
import polars as pl
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, train_test_split
from tqdm.auto import tqdm

from ml_zoomcamp.churn import (
    get_churn_score_dv_pipeline,
)
from ml_zoomcamp.utils import clean_column_names, load_data

# Project root is taken to be the parent of the current working directory;
# all data files are resolved relative to <root>/data.
ROOT_DIR = Path.cwd().parent
DATA_DIR = ROOT_DIR / "data"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Single seed reused for the data splits, the logistic regression and the K-fold splitter.
SEED = 1

### Getting the data


In [3]:
# Load the semicolon-separated CSV via the project's load_data helper, then pass the
# frame through the project's clean_column_names helper.
csv_path = DATA_DIR / "bank_marketing/bank/bank-full.csv"
df = clean_column_names(load_data(csv_path, DATA_DIR, separator=";"))

#### Cleanup Columns


In [4]:
# Columns kept for the rest of the notebook: the feature columns plus the target `y`.
base = [
    "age",
    "job",
    "marital",
    "education",
    "balance",
    "housing",
    "contact",
    "day",
    "month",
    "duration",
    "campaign",
    "pdays",
    "previous",
    "poutcome",
    "y",
]

In [5]:
# Restrict the frame to the columns listed in `base` (names are passed directly).
df = df.select(base)

### Data Preparation


In [6]:
# Column names and dtypes after the column selection.
df.schema

Schema([('age', Int64),
        ('job', String),
        ('marital', String),
        ('education', String),
        ('balance', Int64),
        ('housing', String),
        ('contact', String),
        ('day', Int64),
        ('month', String),
        ('duration', Int64),
        ('campaign', Int64),
        ('pdays', Int64),
        ('previous', Int64),
        ('poutcome', String),
        ('y', String)])

In [7]:
# Compact preview: one line per column with its dtype and first values.
df.glimpse()

Rows: 45211
Columns: 15
$ age       <i64> 58, 44, 33, 47, 33, 35, 28, 42, 58, 43
$ job       <str> 'management', 'technician', 'entrepreneur', 'blue-collar', 'unknown', 'management', 'management', 'entrepreneur', 'retired', 'technician'
$ marital   <str> 'married', 'single', 'married', 'married', 'single', 'married', 'single', 'divorced', 'married', 'single'
$ education <str> 'tertiary', 'secondary', 'secondary', 'unknown', 'unknown', 'tertiary', 'tertiary', 'tertiary', 'primary', 'secondary'
$ balance   <i64> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593
$ housing   <str> 'yes', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes'
$ contact   <str> 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown'
$ day       <i64> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5
$ month     <str> 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may'
$ duration  <i64> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55
$ campaign  <i64> 1, 1, 1, 1, 

In [None]:
# Summary statistics for every column (count, null count, mean, std, min, quartiles, max).
df.describe()

statistic,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
str,f64,str,str,str,f64,str,str,f64,str,f64,f64,f64,f64,str,str
"""count""",45211.0,"""45211""","""45211""","""45211""",45211.0,"""45211""","""45211""",45211.0,"""45211""",45211.0,45211.0,45211.0,45211.0,"""45211""","""45211"""
"""null_count""",0.0,"""0""","""0""","""0""",0.0,"""0""","""0""",0.0,"""0""",0.0,0.0,0.0,0.0,"""0""","""0"""
"""mean""",40.93621,,,,1362.272058,,,15.806419,,258.16308,2.763841,40.197828,0.580323,,
"""std""",10.618762,,,,3044.765829,,,8.322476,,257.527812,3.098021,100.128746,2.303441,,
"""min""",18.0,"""admin.""","""divorced""","""primary""",-8019.0,"""no""","""cellular""",1.0,"""apr""",0.0,1.0,-1.0,0.0,"""failure""","""no"""
"""25%""",33.0,,,,72.0,,,8.0,,103.0,1.0,-1.0,0.0,,
"""50%""",39.0,,,,448.0,,,16.0,,180.0,2.0,-1.0,0.0,,
"""75%""",48.0,,,,1428.0,,,21.0,,319.0,3.0,-1.0,0.0,,
"""max""",95.0,"""unknown""","""single""","""unknown""",102127.0,"""yes""","""unknown""",31.0,"""sep""",4918.0,63.0,871.0,275.0,"""unknown""","""yes"""


In [9]:
# Per-column null counts, keeping only columns that contain at least one null,
# largest count first. An empty result means nothing is missing.
(
    df.null_count()
    .transpose(include_header=True, column_names=["null_count"])
    .filter(pl.col("null_count") > 0)
    .sort("null_count", descending=True)
)

column,null_count
str,u32


There are no missing values in any column.


In [10]:
# Feature columns grouped by type (string vs. integer columns); the target `y` is in neither list.
categorical = ["job", "marital", "education", "housing", "contact", "month", "poutcome"]
numerical = ["age", "balance", "day", "duration", "campaign", "pdays", "previous"]

## EDA


In [None]:
# Class balance of the target as proportions, most frequent class first.
df["y"].value_counts(sort=True, normalize=True)

y,proportion
str,f64
"""no""",0.883015
"""yes""",0.116985


The dataset is imbalanced: most clients (about 88%) did not subscribe to a term deposit.


## Target encoding


In [12]:
# Encode the target as 1 for "yes" and 0 otherwise, keeping the column name `y`.
# Run once per fresh `df`: the comparison assumes `y` still holds the original strings.
df = df.with_columns(pl.col("y").eq("yes").cast(pl.Int8))

## Setting Up Validation Framework


In [13]:
# 60/20/20 split: 20% is held out for test, then 25% of the remaining 80%
# (= 20% of all rows) becomes the validation set.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=SEED)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=SEED)

# Sanity check: the three parts must account for every row.
assert sum(len(part) for part in (df_train, df_val, df_test)) == len(df)

In [14]:
# Row counts of the train / validation / test parts, in that order.
tuple(len(part) for part in (df_train, df_val, df_test))

(27126, 9042, 9043)

In [15]:
# Pull the target out of each split as a NumPy array (train, validation, test).
y_train, y_val, y_test = (
    part["y"].to_numpy() for part in (df_train, df_val, df_test)
)

In [16]:
# Remove the target from the feature frames so it cannot leak into the model inputs.
df_train, df_val, df_test = (
    part.drop("y") for part in (df_train, df_val, df_test)
)

## 1. ROC AUC feature importance


In [17]:
# Score each numerical feature on its own by using its raw values as the ranking score.
auc_scores = []
for feature in numerical:
    values = df_train[feature]
    score = roc_auc_score(y_train, values)
    # An AUC below 0.5 means the feature is negatively related to the target;
    # negate the values so every feature is reported on the same (>= 0.5) scale.
    if score < 0.5:
        score = roc_auc_score(y_train, -values)

    auc_scores.append((feature, score))
    print(feature, f"{score:.3f}")

age 0.512
balance 0.589
day 0.526
duration 0.815
campaign 0.571
pdays 0.590
previous 0.599


In [18]:
# One row per numerical feature with its ROC AUC, built from the (feature, score) tuples.
df_auc_scores = pl.DataFrame(auc_scores, schema=["feature", "auc_score"], orient="row")
df_auc_scores

feature,auc_score
str,f64
"""age""",0.512186
"""balance""",0.588831
"""day""",0.525958
"""duration""",0.8147
"""campaign""",0.571454
"""pdays""",0.590128
"""previous""",0.598565


In [19]:
# Keep the row(s) whose AUC equals the column maximum, i.e. the strongest single feature.
df_auc_scores.filter(pl.col("auc_score") == pl.col("auc_score").max())

feature,auc_score
str,f64
"""duration""",0.8147


`duration`


## 2. Train model


#### Encoding


In [20]:
dv = DictVectorizer(sparse=False)

# Fit the vectorizer on the training rows only, then reuse it unchanged for validation.
dicts_train = df_train.select(categorical + numerical).to_dicts()
X_train = dv.fit_transform(dicts_train, y_train)

dicts_val = df_val.select(categorical + numerical).to_dicts()
X_val = dv.transform(dicts_val)

#### Model training


In [21]:
# Logistic regression with the liblinear solver; C is the inverse regularisation strength.
model = LogisticRegression(
    solver="liblinear",
    C=1.0,
    max_iter=1000,
    random_state=SEED,
)
model.fit(X_train, y_train)

In [22]:
# Column 1 of predict_proba holds the predicted probability of the positive class (y == 1).
y_pred = model.predict_proba(X_val)[:, 1]

In [23]:
# Validation ROC AUC rounded to 3 decimals. The builtin round() is used instead of the
# NumPy-scalar method .round(), because roc_auc_score is documented to return a plain
# float; round() works for both a Python float and a NumPy float.
round(roc_auc_score(y_val, y_pred), 3)

np.float64(0.9)

## 3. Precision and Recall


In [24]:
# Precision and recall on the validation set for thresholds 0.00, 0.01, ..., 1.00.
precision_recall_scores = []
thresholds = np.linspace(0, 1, 101)

# The ground-truth masks do not depend on the threshold, so build them once.
actual_positive = y_val == 1
actual_negative = y_val == 0

for t in thresholds:
    predict_positive = y_pred >= t
    predict_negative = y_pred < t

    tp = (predict_positive & actual_positive).sum()
    fp = (predict_positive & actual_negative).sum()
    fn = (predict_negative & actual_positive).sum()

    # Precision is undefined when nothing is predicted positive (this happens at the
    # highest thresholds). Record NaN explicitly instead of dividing 0 by 0, which
    # yields the same NaN but also emits a RuntimeWarning.
    predicted_positive_count = tp + fp
    actual_positive_count = tp + fn
    precision = tp / predicted_positive_count if predicted_positive_count > 0 else np.nan
    # Same guard for recall, which is undefined when there are no actual positives.
    recall = tp / actual_positive_count if actual_positive_count > 0 else np.nan

    precision_recall_scores.append((t, precision, recall))

  precision = tp / (tp + fp)


In [25]:
# One row per threshold with its precision and recall, built from the collected tuples.
df_precision_recall_scores = pl.DataFrame(
    precision_recall_scores, schema=["threshold", "precision", "recall"], orient="row"
)
df_precision_recall_scores

threshold,precision,recall
f64,f64,f64
0.0,0.121433,1.0
0.01,0.135095,0.996357
0.02,0.15882,0.995446
0.03,0.186385,0.989982
0.04,0.219392,0.972678
…,…,…
0.96,0.638298,0.027322
0.97,0.625,0.022769
0.98,0.677419,0.019126
0.99,0.6875,0.010018


In [26]:
# Precision and recall as two lines over the threshold. transform_fold reshapes the two
# metric columns into long form (`key` = metric name, `value` = score); explicit titles
# replace the raw `key`/`value` names so the figure can be read on its own.
alt.Chart(df_precision_recall_scores).transform_fold(
    ["precision", "recall"]
).mark_line().encode(
    alt.X("threshold:Q", title="Threshold"),
    alt.Y("value:Q", title="Score"),
    alt.Color("key:N", title="Metric"),
).properties(title="Precision and recall by decision threshold")

In [27]:
# Zoom in on thresholds 0.26-0.27, where precision overtakes recall; both metrics are
# rounded to 3 decimals for readability.
df_precision_recall_scores.filter(
    pl.col("threshold").is_between(0.26, 0.27)
).with_columns(pl.col("precision", "recall").round(3))

threshold,precision,recall
f64,f64,f64
0.26,0.556,0.564
0.27,0.563,0.549


In [28]:
# Threshold(s) on the grid where precision and recall are closest to each other.
(
    df_precision_recall_scores
    .with_columns(diff=(pl.col("precision") - pl.col("recall")).abs())
    .filter(pl.col("diff") == pl.col("diff").min())
)

threshold,precision,recall,diff
f64,f64,f64,f64
0.26,0.556155,0.563752,0.007598


The precision and recall curves intersect between `0.2` and `0.3`, at a threshold of roughly `0.265`.


## 4. F1 score


In [29]:
# F1 = 2 * P * R / (P + R), the harmonic mean of precision and recall, per threshold.
df_f1_scores = df_precision_recall_scores.with_columns(
    f1=2
    * (pl.col("precision") * pl.col("recall"))
    / (pl.col("precision") + pl.col("recall"))
)

In [30]:
# Row(s) whose F1 equals the column maximum, i.e. the best threshold on the grid.
df_f1_scores.filter(pl.col("f1") == pl.col("f1").max())

threshold,precision,recall,f1
f64,f64,f64,f64
0.22,0.528913,0.624772,0.57286


`0.22`


## 5. 5-Fold CV


In [31]:
# Shuffled 5-fold splitter; the fixed seed keeps the fold assignment reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=SEED)

In [32]:
# One score per fold, returned by the project helper get_churn_score_dv_pipeline for the
# given train/validation row indices of df_full_train.
auc_scores = [
    get_churn_score_dv_pipeline(
        df_full_train,
        train_idx,
        val_idx,
        categorical,
        numerical,
        seed=SEED,
    )
    for train_idx, val_idx in kfold.split(df_full_train)
]

In [33]:
# Mean and standard deviation of the per-fold scores, formatted to 3 decimals.
f"mean_score: {np.mean(auc_scores):.3f} +- {np.std(auc_scores):.3f}"

'mean_score: 0.906 +- 0.006'

## 6. Hyperparameter Tuning


In [34]:
# Candidate values passed as `C` to the project helper.
list_C = [0.000001, 0.001, 1]

# One splitter for all candidates: with a fixed seed every call to split() yields the
# same folds, so each C is scored on identical data.
kfold = KFold(n_splits=5, shuffle=True, random_state=SEED)

auc_scores_with_C = []
for C in tqdm(list_C):
    auc_scores = [
        get_churn_score_dv_pipeline(
            df_full_train,
            train_idx,
            val_idx,
            categorical,
            numerical,
            C=C,
            seed=SEED,
        )
        for train_idx, val_idx in kfold.split(df_full_train)
    ]
    # Summarise the five fold scores for this C.
    auc_scores_with_C.append((C, np.mean(auc_scores), np.std(auc_scores)))

df_auc_scores_with_C = pl.DataFrame(
    auc_scores_with_C, schema=["C", "mean_auc_score", "std_auc_score"], orient="row"
)

100%|██████████| 3/3 [00:15<00:00,  5.07s/it]


In [35]:
# Mean and standard deviation of the cross-validated score for each C.
df_auc_scores_with_C

C,mean_auc_score,std_auc_score
f64,f64,f64
1e-06,0.701445,0.009444
0.001,0.860843,0.00718
1.0,0.905865,0.005653


In [36]:
# Row(s) with the highest mean cross-validated score, i.e. the selected C.
df_auc_scores_with_C.filter(pl.col("mean_auc_score") == pl.col("mean_auc_score").max())

C,mean_auc_score,std_auc_score
f64,f64,f64
1.0,0.905865,0.005653


`1.0`
