In [None]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np

# Single seed reused by every stochastic step further down.
random_state = 67

# Fetch UCI repository dataset id 222 and pull out its features and targets.
bank_marketing = fetch_ucirepo(id=222)
dataset_frames = bank_marketing.data
X, y = dataset_frames.features, dataset_frames.targets

# EDA

### Target

In [None]:
# Inspect the container type, dimensions and column dtypes of the target.
for target_property in (type(y), y.shape, y.dtypes):
    print(target_property)

In [None]:
# The target object is two-dimensional; take its first column as a Series.
y_series = y.iloc[:, 0]

# Class balance: absolute counts first, then relative frequencies.
for as_fraction in (False, True):
    print(y_series.value_counts(normalize=as_fraction))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the class counts in the target column.
target_ax = sns.countplot(data=y, x=y_series, palette='hls')
target_ax.set_title("Distribution of y")
plt.show()
plt.close()

Target jest mocno niezbalansowany, dlatego przy ocenie modeli nie będziemy opierać się na accuracy (baseline ~0.88 dla klasy 'no'); będziemy raczej patrzeć na recall/F1 dla klasy 'yes'. Domyślny próg decyzyjny 0.5 również nie będzie dobry przy takim rozkładzie y.

## Features

In [None]:
# Work on a copy so the exploration below never modifies the raw feature frame.
X_eda = X.copy()

# First rows, column dtypes and overall dimensions.
for feature_overview in (X_eda.head(), X_eda.dtypes, X_eda.shape):
    print(feature_overview)

### Missing values

In [None]:
# Number of missing entries in each feature column.
na_per_column = X_eda.isna().sum()
print(na_per_column)

In [None]:
# Labels of the columns that contain at least one missing value.
na_counts = X_eda.isna().sum()
missing_cols = na_counts[na_counts > 0].index

# For each of those columns, report whether the literal 'unknown' already occurs.
for col in missing_cols:
    has_unknown = 'unknown' in X_eda.loc[:, col].values
    print(has_unknown)

Kolumny, w których są brakujące dane, mają dtype = object i nie zawierają domyślnie wartości 'unknown', więc zamienimy brakujące wartości na 'unknown'.

In [None]:
# Fill the gaps in the affected columns with an explicit 'unknown' label,
# then confirm that no missing values remain.
filled_columns = X_eda.loc[:, missing_cols].fillna('unknown')
X_eda.loc[:, missing_cols] = filled_columns
print(X_eda.isna().sum())

### Outliers

In [None]:
# Partition the feature names by dtype: numeric vs. object columns.
num_cols = list(X_eda.select_dtypes('number').columns)
cat_cols = list(X_eda.select_dtypes('object').columns)

# Summary statistics of the numeric features.
numeric_summary = X_eda[num_cols].describe()
print(numeric_summary)

Duration mówi nam o czasie trwania rozmowy, a znamy go dopiero po jej zakończeniu, dlatego usuwamy tę kolumnę.

In [None]:
# Features plus the target in one frame, without `duration` when it is present.
df_with_y = X_eda.assign(y=y_series).drop(columns=['duration'], errors='ignore')

num_cols_without_duration = list(df_with_y.select_dtypes('number').columns)

# One violin plot per remaining numeric feature, split by target class.
for feature in num_cols_without_duration:
    violin_ax = sns.violinplot(data=df_with_y, x="y", y=feature, cut=0, inner='quartile')
    violin_ax.set_title(f"Distribution of {feature}")
    plt.show()
    plt.close()

In [None]:
# Keep only the rows whose pdays differs from the -1 placeholder before plotting.
has_real_pdays = df_with_y['pdays'].ne(-1)
df_pdays = df_with_y.loc[has_real_pdays].copy()

pdays_ax = sns.violinplot(data=df_pdays, x='y', y='pdays', cut=0, inner='quartile')
pdays_ax.set_title('Pdays without -1')
plt.show()
plt.close()

### Correlations

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Encode the target as 1 for "yes" and 0 otherwise.
y_bin = y_series.eq("yes").astype('int')

# Mutual information between each numeric feature (duration excluded) and the target.
mi_scores = mutual_info_classif(
    X_eda[num_cols_without_duration],
    y=y_bin,
    random_state=random_state,
)
mi_num = pd.Series(mi_scores, index=num_cols_without_duration).sort_values(ascending=False)
print(mi_num.head())

In [None]:
from scipy.stats import chi2_contingency

def cramer_v(x, y):
    """Compute Cramér's V between two categorical variables.

    Parameters
    ----------
    x, y : array-like
        Categorical observations of equal length; they are cross-tabulated
        with ``pd.crosstab``.

    Returns
    -------
    tuple
        ``(v, chi2, p, dof, r, k)`` where ``v`` is Cramér's V, ``chi2``, ``p``
        and ``dof`` come from the chi-square test of independence, and
        ``r`` / ``k`` are the numbers of rows / columns of the contingency
        table. ``v`` is NaN when either variable has a single level, because
        the statistic is undefined for such a table.
    """
    ct = pd.crosstab(x, y)
    # Yates' continuity correction (the SciPy default for 2x2 tables) shrinks
    # chi2, so a perfect 2x2 association would yield V < 1. Turn it off so
    # that V is computed from the plain Pearson statistic.
    chi2, p, dof, _ = chi2_contingency(ct, correction=False)
    n = ct.to_numpy().sum()
    r, k = ct.shape
    min_dim = min(r - 1, k - 1)
    if min_dim == 0:
        # A one-row or one-column table would otherwise divide 0 by 0.
        v = float("nan")
    else:
        v = np.sqrt((chi2 / n) / min_dim)
    return v, chi2, p, dof, r, k

# Score every categorical feature against the binary target.
rows = []
for feature in cat_cols:
    strength, chi2_stat, p_value, _dof, n_levels, _n_classes = cramer_v(X_eda[feature], y_bin)
    rows.append({
        "feature": feature,
        "cramers_v": strength,
        "chi2": chi2_stat,
        "p-value": p_value,
        "n_categories": n_levels,
    })

# Strongest associations first; show the top ten.
cramers_rank = pd.DataFrame(rows).sort_values(by="cramers_v", ascending=False)
print(cramers_rank.head(10))

## EDA summary

- Niezbalansowany target (~12% yes)
- Duration znane dopiero po rozmowie
- Braki danych w kolumnach kategorycznych
- Specjalna flaga -1 dla pdays jeżeli ktoś nie był kontaktowany
- Kolumny numeryczne mają duże zakresy, dla modeli liniowych można użyć scalerów
- Z kategorycznych największy związek z targetem ma poutcome, month, contact, housing, job
- Kategoryczne wymagają OneHotEncoding

# Preprocessing

In [None]:
# Start from the raw features; `drop` returns a new frame, so X itself stays untouched.
X_prep = X.drop(columns=['duration'], errors='ignore')

if 'pdays' in X_prep.columns:
    # -1 in pdays is a placeholder rather than a real day count: turn it into
    # an explicit 0/1 indicator plus a day count that is missing for those rows.
    was_contacted = X_prep['pdays'].ne(-1)
    X_prep = X_prep.assign(
        prev_contacted=was_contacted.astype('int'),
        pdays_clean=X_prep['pdays'].replace(-1, np.nan),
    ).drop(columns=['pdays'])

# Column dtypes, dimensions and first rows of the prepared frame.
for prepared_overview in (X_prep.dtypes, X_prep.shape, X_prep.head()):
    print(prepared_overview)

In [None]:
from sklearn.model_selection import train_test_split

# Both splits hold out 20% of their input with the same seed and stratify on the target.
split_options = {"test_size": 0.2, "random_state": random_state}

# Hold-out test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_prep, y_bin, stratify=y_bin, **split_options
)

# Validation set carved out of the remaining training portion.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, stratify=y_train, **split_options
)

# Column groups by dtype, taken from the training frame.
num_cols = list(X_train.select_dtypes('number').columns)
cat_cols = list(X_train.select_dtypes('object').columns)

In [None]:
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Numeric branch: median-impute missing values, then scale with RobustScaler.
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

# Categorical branch: encode missing values as their own 'unknown' level,
# matching the decision taken during the EDA. Imputing the most frequent
# level instead would relabel every missing entry as that level and erase
# the information that the value was absent.
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    # Levels seen fewer than 20 times are grouped together; categories unseen
    # during fit are ignored at transform time instead of raising.
    ('ohe', OneHotEncoder(handle_unknown='ignore', min_frequency=20))
])

# Route each column group through its own branch.
preprocess = ColumnTransformer(transformers=[
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
])

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Full pipeline: preprocessing followed by logistic regression.
# random_state makes the liblinear fits reproducible.
pipe_lr = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', LogisticRegression(random_state=random_state))
])

# Regularisation strengths on a log scale.
C_grid = np.logspace(-4, 1, 12)
# max_iter has to be an integer: np.linspace returns floats, which
# scikit-learn's parameter validation rejects, so every fit would fail.
n_iter = np.linspace(500, 5000, 12).round().astype(int)
# NOTE(review): max_iter only caps the solver's iterations, yet searching over
# it multiplies the number of candidates by 12 — consider a single fixed value.

# NOTE(review): choosing the penalty through l1_ratio alone (no `penalty`
# argument) presumably relies on a recent scikit-learn release — confirm
# against the installed version.
params_lr = [
    {
        "model__solver": ["lbfgs"],
        "model__C": C_grid,
        "model__l1_ratio": [0],
        "model__class_weight": [None, "balanced"],
        "model__max_iter": n_iter,
    },
    {
        "model__solver": ["liblinear"],
        "model__C": C_grid,
        "model__l1_ratio": [0, 1],
        "model__class_weight": [None, "balanced"],
        "model__max_iter": n_iter,
    },
]

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

# Average precision is used for model selection because the target is imbalanced.
gs_lr = GridSearchCV(
    pipe_lr,
    param_grid=params_lr,
    scoring='average_precision',
    cv=cv,
    n_jobs=-1
)

gs_lr.fit(X_tr, y_tr)

best_lr = gs_lr.best_estimator_

In [None]:
# Display the best estimator selected by the grid search.
best_lr