# Homework 03


In [1]:
from pathlib import Path

import altair as alt
import numpy as np
import polars as pl
import polars.selectors as cs
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from ml_zoomcamp.utils import clean_column_names, load_data
from ml_zoomcamp.churn import prepare_X, get_churn_score_pipeline

# Project paths, resolved relative to the directory the kernel was started in.
ROOT_DIR = Path.cwd().parent
DATA_DIR = ROOT_DIR / "data"

In [2]:
# Random seed reused by both data splits and by the logistic regression below.
seed: int = 42

### Getting the data


In [3]:
# Read the semicolon-separated bank marketing CSV and pass it through the
# project's column-name cleaner (presumably normalises the headers — see
# ml_zoomcamp.utils for the exact rules).
csv_path = DATA_DIR / "bank_marketing/bank/bank-full.csv"
df = clean_column_names(load_data(csv_path, DATA_DIR, separator=";"))

#### Cleanup Columns


In [4]:
# Columns kept for the analysis; the last entry, "y", is the target.
base = [
    "age", "job", "marital", "education", "balance",
    "housing", "contact", "day", "month", "duration",
    "campaign", "pdays", "previous", "poutcome",
    "y",
]

In [5]:
# Keep only the columns listed in `base`, in that order.
df = df.select(base)

### Data Preparation


In [6]:
# Show every column name together with its dtype.
df.schema

Schema([('age', Int64),
        ('job', String),
        ('marital', String),
        ('education', String),
        ('balance', Int64),
        ('housing', String),
        ('contact', String),
        ('day', Int64),
        ('month', String),
        ('duration', Int64),
        ('campaign', Int64),
        ('pdays', Int64),
        ('previous', Int64),
        ('poutcome', String),
        ('y', String)])

In [7]:
# Compact preview: row/column counts and the first few values of each column.
df.glimpse()

Rows: 45211
Columns: 15
$ age       <i64> 58, 44, 33, 47, 33, 35, 28, 42, 58, 43
$ job       <str> 'management', 'technician', 'entrepreneur', 'blue-collar', 'unknown', 'management', 'management', 'entrepreneur', 'retired', 'technician'
$ marital   <str> 'married', 'single', 'married', 'married', 'single', 'married', 'single', 'divorced', 'married', 'single'
$ education <str> 'tertiary', 'secondary', 'secondary', 'unknown', 'unknown', 'tertiary', 'tertiary', 'tertiary', 'primary', 'secondary'
$ balance   <i64> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593
$ housing   <str> 'yes', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes'
$ contact   <str> 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown'
$ day       <i64> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5
$ month     <str> 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may'
$ duration  <i64> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55
$ campaign  <i64> 1, 1, 1, 1, 

In [None]:
# Summary statistics (count, nulls, mean, std, min, quartiles, max) per column.
df.describe()

statistic,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
str,f64,str,str,str,f64,str,str,f64,str,f64,f64,f64,f64,str,str
"""count""",45211.0,"""45211""","""45211""","""45211""",45211.0,"""45211""","""45211""",45211.0,"""45211""",45211.0,45211.0,45211.0,45211.0,"""45211""","""45211"""
"""null_count""",0.0,"""0""","""0""","""0""",0.0,"""0""","""0""",0.0,"""0""",0.0,0.0,0.0,0.0,"""0""","""0"""
"""mean""",40.93621,,,,1362.272058,,,15.806419,,258.16308,2.763841,40.197828,0.580323,,
"""std""",10.618762,,,,3044.765829,,,8.322476,,257.527812,3.098021,100.128746,2.303441,,
"""min""",18.0,"""admin.""","""divorced""","""primary""",-8019.0,"""no""","""cellular""",1.0,"""apr""",0.0,1.0,-1.0,0.0,"""failure""","""no"""
"""25%""",33.0,,,,72.0,,,8.0,,103.0,1.0,-1.0,0.0,,
"""50%""",39.0,,,,448.0,,,16.0,,180.0,2.0,-1.0,0.0,,
"""75%""",48.0,,,,1428.0,,,21.0,,319.0,3.0,-1.0,0.0,,
"""max""",95.0,"""unknown""","""single""","""unknown""",102127.0,"""yes""","""unknown""",31.0,"""sep""",4918.0,63.0,871.0,275.0,"""unknown""","""yes"""


In [None]:
# One row per column that contains at least one null, largest count first.
# An empty result means no column has missing values.
(
    df.null_count()
    .transpose(include_header=True, column_names=["null_count"])
    .filter(pl.col("null_count") > 0)
    .sort("null_count", descending=True)
)

column,null_count
str,u32


There are no missing values in any column.


In [10]:
# Feature groups used throughout the notebook (the target "y" is in neither).
# String-typed columns:
categorical = [
    "job",
    "marital",
    "education",
    "housing",
    "contact",
    "month",
    "poutcome",
]
# Integer-typed columns:
numerical = [
    "age",
    "balance",
    "day",
    "duration",
    "campaign",
    "pdays",
    "previous",
]

## EDA


In [11]:
# Share of each target class, most frequent first.
df.get_column("y").value_counts(sort=True, normalize=True)

y,proportion
str,f64
"""no""",0.883015
"""yes""",0.116985


The dataset is imbalanced: only about 11.7% of the observations have `y = "yes"`.


### 1. Most frequent observation (mode) for the column `education`


In [12]:
# Most frequent value(s) of `education`; ties would yield more than one row.
df.select(pl.col("education").mode())

education
str
"""secondary"""


### 2. Correlation matrix


In [13]:
# Pairwise correlation matrix of the numerical features. `corr()` returns one
# column per feature but no row labels, so attach them as an "index" column.
corr_matrix = df.select(numerical).corr()
df_corr = corr_matrix.with_columns(pl.Series("index", numerical))

# Long format (index, variable, value): one row per pair of features.
df_corr_long = df_corr.unpivot(index="index")

In [None]:
# Display the wide correlation matrix; the "index" column holds the row labels.
df_corr

age,balance,day,duration,campaign,pdays,previous,index
f64,f64,f64,f64,f64,f64,f64,str
1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288,"""age"""
0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674,"""balance"""
-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171,"""day"""
-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203,"""duration"""
0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855,"""campaign"""
-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482,"""pdays"""
0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0,"""previous"""


In [None]:
# Heatmap of the pairwise correlations between the numerical features.
alt.Chart(df_corr_long).mark_rect().encode(
    x=alt.X("index:N", title="feature"),
    y=alt.Y("variable:N", title="feature"),
    # Diverging palette on a fixed [-1, 1] domain: colour then shows both the
    # sign and the magnitude of a correlation, and stays comparable across runs.
    color=alt.Color(
        "value:Q",
        title="correlation",
        scale=alt.Scale(scheme="redblue", domain=[-1, 1]),
    ),
    tooltip=["index:N", "variable:N", alt.Tooltip("value:Q", format=".3f")],
).properties(title="Correlation matrix of the numerical features")

In [16]:
# Six largest off-diagonal correlations. The matrix is symmetric, so every
# pair shows up twice (once per orientation).
off_diagonal = pl.col("index") != pl.col("variable")
df_corr_long.filter(off_diagonal).sort("value", descending=True).head(6)

index,variable,value
str,str,f64
"""previous""","""pdays""",0.45482
"""pdays""","""previous""",0.45482
"""campaign""","""day""",0.16249
"""day""","""campaign""",0.16249
"""age""","""balance""",0.097783
"""balance""","""age""",0.097783


The two most strongly correlated features are `pdays` and `previous` (correlation ≈ 0.45).


### Target encoding


In [17]:
# Replace the string target with a binary one: "yes" -> 1, anything else -> 0.
# NOTE: this overwrites `y` in place, so the cell can only be run once per
# freshly loaded `df`.
y_is_yes = pl.col("y") == "yes"
df = df.with_columns(pl.when(y_is_yes).then(1).otherwise(0).alias("y"))

### Setting Up Validation Framework


In [18]:
# 60/20/20 split: hold out 20% for testing, then take 25% of the remaining
# 80% (i.e. 20% of the whole) for validation. Both splits share `seed`.
df_full_train, df_test = train_test_split(
    df,
    random_state=seed,
    test_size=0.2,
)
df_train, df_val = train_test_split(
    df_full_train,
    random_state=seed,
    test_size=0.25,
)

In [19]:
# Pull the target out of each split as a NumPy array.
y_train, y_val, y_test = (
    part.get_column("y").to_numpy() for part in (df_train, df_val, df_test)
)

In [20]:
# Remove the target from the feature frames so it cannot leak into training.
df_train, df_val, df_test = (
    part.drop("y") for part in (df_train, df_val, df_test)
)

### 3. Mutual information score


In [21]:
# Mutual information between each categorical feature and the training target,
# highest first. Each score is computed directly and the (column, score) frame
# is assembled in one step, so no scalar-returning UDF or transpose is needed.
pl.DataFrame(
    {
        "column": categorical,
        "score": [mutual_info_score(df_train[c], y_train) for c in categorical],
    }
).sort("score", descending=True)

column,score
str,f64
"""poutcome""",0.029533
"""month""",0.02509
"""contact""",0.013356
"""housing""",0.010343
"""job""",0.007316
"""education""",0.002697
"""marital""",0.00205


### 4. Train logistic regression


#### Encoding


In [22]:
# Learn the category vocabulary from the training split only; dense output
# so the encoded block is a regular array rather than a sparse matrix.
ohe = OneHotEncoder(sparse_output=False)
ohe.fit(df_train.select(categorical))

In [23]:
# Build the design matrices for the training and validation splits with the
# encoder fitted above (see ml_zoomcamp.churn.prepare_X for the exact layout).
X_train, X_val = (
    prepare_X(part, ohe, categorical, numerical) for part in (df_train, df_val)
)

#### Model training


In [24]:
# Logistic regression settings for the baseline model.
logreg_params = dict(
    solver="liblinear",
    C=1.0,
    max_iter=1000,
    random_state=seed,
)
model = LogisticRegression(**logreg_params)
model.fit(X_train, y_train)

In [25]:
# Keep only the second probability column, i.e. the probability of the
# second entry of `model.classes_` (the target was encoded as 0/1 above).
y_pred = model.predict_proba(X_val)[:, 1]

In [26]:
# Hard decision at the 0.5 probability threshold.
churn_decision = np.greater_equal(y_pred, 0.5)

In [27]:
# Validation accuracy: share of predictions that match the label, to 2 decimals.
np.round(np.mean(y_val == churn_decision), 2)

np.float64(0.9)

### 5. Feature elimination


In [28]:
# Reference score with every feature included. Presumably the validation
# accuracy of the same pipeline as above — confirm in ml_zoomcamp.churn.
base_score = get_churn_score_pipeline(
    df_train,
    df_val,
    y_train,
    y_val,
    categorical,
    numerical,
)
base_score

np.float64(0.9009068790090687)

In [29]:
# Leave-one-feature-out: score the pipeline once per feature, each time with
# that single feature removed from whichever group it belongs to.
scores_without_feat = {
    dropped: get_churn_score_pipeline(
        df_train,
        df_val,
        y_train,
        y_val,
        [col for col in categorical if col != dropped],
        [col for col in numerical if col != dropped],
    )
    for dropped in categorical + numerical
}

# One row per removed feature, plus the absolute change from the score
# obtained with all features.
df_scores = pl.DataFrame(
    {
        "column": list(scores_without_feat.keys()),
        "score": list(scores_without_feat.values()),
    }
).with_columns((pl.col("score") - pl.lit(base_score)).abs().alias("score_abs_diff"))

In [30]:
# Smallest absolute change first: features whose removal affects the score least.
df_scores.sort("score_abs_diff")

column,score,score_abs_diff
str,f64,f64
"""job""",0.900907,0.0
"""age""",0.900907,0.0
"""balance""",0.900796,0.000111
"""marital""",0.901017,0.000111
"""education""",0.901017,0.000111
…,…,…
"""day""",0.901349,0.000442
"""housing""",0.900243,0.000664
"""month""",0.899801,0.001106
"""poutcome""",0.893276,0.007631


`job` and `age` are the least important features: removing either one leaves the validation score unchanged.


### 6. Regularized logistic regression


In [31]:
# Candidate values for the `C` argument of the scoring pipeline (presumably
# forwarded to LogisticRegression as its inverse regularisation strength —
# confirm in ml_zoomcamp.churn).
list_C = [0.01, 0.1, 1, 10, 100]

# Score per candidate, rounded to 3 decimals and keyed by the candidate's
# string form.
scores_with_C = {
    str(C): get_churn_score_pipeline(
        df_train, df_val, y_train, y_val, categorical, numerical, C=C
    ).round(3)
    for C in list_C
}

df_scores_with_C = pl.DataFrame(
    {
        "C": list(scores_with_C.keys()),
        "score": list(scores_with_C.values()),
    }
)

In [32]:
# Best score first. Several values of C tie after rounding, so break ties by
# the numeric value of C (ascending): the first row is then always the
# smallest C that reaches the best score, independent of the sort's tie order.
df_scores_with_C.sort(
    pl.col("score"),
    pl.col("C").cast(pl.Float64),
    descending=[True, False],
)

C,score
str,f64
"""0.1""",0.901
"""1""",0.901
"""10""",0.901
"""100""",0.901
"""0.01""",0.898
