# Homework 03


In [1]:
from pathlib import Path

import altair as alt
import numpy as np
import polars as pl
import polars.selectors as cs
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    LabelBinarizer,
    LabelEncoder,
    OneHotEncoder,
    StandardScaler,
)

from ml_zoomcamp.utils import clean_column_names, load_data

ROOT_DIR = Path.cwd().parent
DATA_DIR = ROOT_DIR.joinpath("data")

In [2]:
seed = 42

### Getting the data


In [3]:
csv_path = DATA_DIR.joinpath("bank_marketing/bank/bank-full.csv")
df = load_data(csv_path, DATA_DIR, separator=";")
df = clean_column_names(df)

#### Cleanup Columns


In [4]:
base = [
    "age",
    "job",
    "marital",
    "education",
    "balance",
    "housing",
    "contact",
    "day",
    "month",
    "duration",
    "campaign",
    "pdays",
    "previous",
    "poutcome",
    "y",
]

In [5]:
df = df.select(pl.col(base))

### Data Preparation


In [6]:
df.schema

Schema([('age', Int64),
        ('job', String),
        ('marital', String),
        ('education', String),
        ('balance', Int64),
        ('housing', String),
        ('contact', String),
        ('day', Int64),
        ('month', String),
        ('duration', Int64),
        ('campaign', Int64),
        ('pdays', Int64),
        ('previous', Int64),
        ('poutcome', String),
        ('y', String)])

In [None]:
df.glimpse()

Rows: 45211
Columns: 15
$ age       <i64> 58, 44, 33, 47, 33, 35, 28, 42, 58, 43
$ job       <str> 'management', 'technician', 'entrepreneur', 'blue-collar', 'unknown', 'management', 'management', 'entrepreneur', 'retired', 'technician'
$ marital   <str> 'married', 'single', 'married', 'married', 'single', 'married', 'single', 'divorced', 'married', 'single'
$ education <str> 'tertiary', 'secondary', 'secondary', 'unknown', 'unknown', 'tertiary', 'tertiary', 'tertiary', 'primary', 'secondary'
$ balance   <i64> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593
$ housing   <str> 'yes', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes'
$ contact   <str> 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown'
$ day       <i64> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5
$ month     <str> 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may'
$ duration  <i64> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55
$ campaign  <i64> 1, 1, 1, 1, 

In [None]:
df.describe()

statistic,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
str,f64,str,str,str,f64,str,str,f64,str,f64,f64,f64,f64,str,str
"""count""",45211.0,"""45211""","""45211""","""45211""",45211.0,"""45211""","""45211""",45211.0,"""45211""",45211.0,45211.0,45211.0,45211.0,"""45211""","""45211"""
"""null_count""",0.0,"""0""","""0""","""0""",0.0,"""0""","""0""",0.0,"""0""",0.0,0.0,0.0,0.0,"""0""","""0"""
"""mean""",40.93621,,,,1362.272058,,,15.806419,,258.16308,2.763841,40.197828,0.580323,,
"""std""",10.618762,,,,3044.765829,,,8.322476,,257.527812,3.098021,100.128746,2.303441,,
"""min""",18.0,"""admin.""","""divorced""","""primary""",-8019.0,"""no""","""cellular""",1.0,"""apr""",0.0,1.0,-1.0,0.0,"""failure""","""no"""
"""25%""",33.0,,,,72.0,,,8.0,,103.0,1.0,-1.0,0.0,,
"""50%""",39.0,,,,448.0,,,16.0,,180.0,2.0,-1.0,0.0,,
"""75%""",48.0,,,,1428.0,,,21.0,,319.0,3.0,-1.0,0.0,,
"""max""",95.0,"""unknown""","""single""","""unknown""",102127.0,"""yes""","""unknown""",31.0,"""sep""",4918.0,63.0,871.0,275.0,"""unknown""","""yes"""


In [9]:
df.null_count().transpose(include_header=True, column_names=["null_count"]).filter(
    pl.col("null_count") > 0
).sort(pl.col("null_count"), descending=True)

column,null_count
str,u32


There are no missing features


In [10]:
categorical = ["job", "marital", "education", "housing", "contact", "month", "poutcome"]
numerical = ["age", "balance", "day", "duration", "campaign", "pdays", "previous"]

## EDA


In [11]:
df["y"].value_counts(sort=True, normalize=True)

y,proportion
str,f64
"""no""",0.883015
"""yes""",0.116985


Dataset is imbalanced


### 1. Most frequent observation (mode) for the column `education`


In [12]:
df.select(pl.col("education").mode())

education
str
"""secondary"""


### 2. Correlation matrix


In [13]:
df_corr = (
    df.select(pl.col(numerical))
    .corr()
    .with_columns(pl.Series(numerical).alias("index"))
    .unpivot(index="index")
    .filter(pl.col("index") != pl.col("variable"))
    .select(
        (pl.col("index") + "-" + pl.col("variable")).alias("vars"),
        pl.col("value"),
        pl.col("value").abs().alias("abs_value"),
    )
    .unique(pl.col("vars"))
    .sort("abs_value", descending=True)
)

In [14]:
df.select(pl.col(numerical)).corr().with_columns(pl.Series(numerical).alias("index"))

age,balance,day,duration,campaign,pdays,previous,index
f64,f64,f64,f64,f64,f64,f64,str
1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288,"""age"""
0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674,"""balance"""
-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171,"""day"""
-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203,"""duration"""
0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855,"""campaign"""
-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482,"""pdays"""
0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0,"""previous"""


In [15]:
df_corr

vars,value,abs_value
str,f64,f64
"""previous-pdays""",0.45482,0.45482
"""pdays-previous""",0.45482,0.45482
"""day-campaign""",0.16249,0.16249
"""campaign-day""",0.16249,0.16249
"""age-balance""",0.097783,0.097783
…,…,…
"""duration-pdays""",-0.001565,0.001565
"""age-previous""",0.001288,0.001288
"""previous-age""",0.001288,0.001288
"""duration-previous""",0.001203,0.001203


### Target encoding


In [16]:
le = LabelEncoder()
y = le.fit_transform(df["y"])

### Setting Up Validation Framework


In [17]:
df = df.with_columns(pl.lit(y).alias("y"))

In [18]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=seed)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=seed)

In [19]:
y_train = df_train["y"].to_numpy()
y_val = df_val["y"].to_numpy()
y_test = df_test["y"].to_numpy()

In [20]:
df_train = df_train.drop("y")
df_val = df_val.drop("y")
df_test = df_test.drop("y")

### 3. Mutual information score


In [21]:
df_train.select(
    [
        pl.col(c).map_batches(lambda x: mutual_info_score(x, y_train))
        for c in categorical
    ]
).transpose(include_header=True, column_names=["score"]).sort("score", descending=True)

column,score
str,f64
"""poutcome""",0.029533
"""month""",0.02509
"""contact""",0.013356
"""housing""",0.010343
"""job""",0.007316
"""education""",0.002697
"""marital""",0.00205
