In [None]:
import pandas as pd

In [None]:
# Load the bank marketing data; the file uses ";" as its field separator.
df_bank = pd.read_csv("data/bank-full.csv", sep=";")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as a test set; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(
    df_bank,
    random_state=123,
    test_size=0.25,
)

In [None]:
# Preview the first rows of the training split.
train_df.head()

In [None]:
# Summarise the columns of the training split (dtypes and non-null counts).
train_df.info()

In [None]:
# Distribution of the target

In [None]:
# Share of each target class ("y") in the training split.
target_share = train_df["y"].value_counts(normalize=True)
target_share

In [None]:
# Distributions of categorical and numeric features

In [None]:
# Split the feature names by dtype for the EDA plots below.
# Categorical features: object (string) columns, excluding the target "y".
categorical_cols = (
    train_df.drop(columns=["y"]).select_dtypes(include=["object"]).columns.tolist()
)
# Numeric features: select every numeric dtype rather than only int64, so that
# float (or narrower integer) columns are not silently left out of the EDA.
numerical_cols = train_df.select_dtypes(include="number").columns.tolist()

In [None]:
import altair as alt

# Use the VegaFusion data transformer for the charts in this notebook.
alt.data_transformers.enable("vegafusion")

# One horizontal count bar chart per categorical feature, three charts per row.
category_counts = alt.Chart(train_df).mark_bar().encode(
    x=alt.X("count()"),
    y=alt.Y(alt.repeat(), type="nominal"),
)
category_counts.repeat(categorical_cols, columns=3)

In [None]:
# One histogram (at most 40 bins) per numeric feature, three charts per row.
numeric_hist = alt.Chart(train_df).mark_bar().encode(
    x=alt.X(alt.repeat(), type="quantitative", bin=alt.Bin(maxbins=40)),
    y=alt.Y("count()"),
)
numeric_hist.repeat(numerical_cols, columns=3)

In [None]:
# Correlations between numeric features

In [None]:
# Pearson correlation: measures the linear relationship between numeric features.
# Reshape the square correlation matrix into long form (one row per feature pair).
pearson_corr_df = train_df[numerical_cols].corr("pearson").unstack().reset_index()
pearson_corr_df.columns = ["num_variable_0", "num_variable_1", "correlation"]

# Coloured cells: one rectangle per feature pair.
corr_heatmap = alt.Chart(pearson_corr_df).mark_rect().encode(
    x=alt.X("num_variable_0").title("Numerical Variable"),
    y=alt.Y("num_variable_1").title("Numerical Variable"),
    color="correlation:Q"
).properties(
    width=250,
    height=250
)

# Numeric labels drawn on top of the cells.
text = alt.Chart(pearson_corr_df).mark_text().encode(
    x=alt.X("num_variable_0").title("Numerical Variable"),
    y=alt.Y("num_variable_1").title("Numerical Variable"),
    text=alt.Text("correlation:Q", format=".2f")
)

# Title the layered chart itself instead of the text overlay layer, so the
# title belongs to the whole figure.
(corr_heatmap + text).properties(title="Pearson Correlation")

In [None]:
# measure monotonic (incl. non-linear) relationship
# FIXME: does it matter? If we apply linear models, we're more afraid of collinearity, i.e. linear relationship
spearman_corr_df = (
    train_df[numerical_cols]
    .corr("spearman")
    .unstack()
    .reset_index()
    .set_axis(["num_variable_0", "num_variable_1", "correlation"], axis=1)
)

# Both layers share the same axis encodings.
x_axis = alt.X("num_variable_0").title("Numerical Variable")
y_axis = alt.Y("num_variable_1").title("Numerical Variable")

# Coloured cells: one rectangle per feature pair.
corr_heatmap = (
    alt.Chart(spearman_corr_df, title="Spearman Correlation")
    .mark_rect()
    .encode(x=x_axis, y=y_axis, color="correlation:Q")
    .properties(width=250, height=250)
)

# Numeric labels drawn on top of the cells.
text = (
    alt.Chart(spearman_corr_df)
    .mark_text()
    .encode(x=x_axis, y=y_axis, text=alt.Text("correlation:Q", format=".2f"))
)

corr_heatmap + text

In [None]:
# checking for correlation between pdays and previous
# FIXME: there is no linear relationship here
scatter_base = alt.Chart(train_df).mark_point()

# Full range of `previous`.
pdays_prev = scatter_base.encode(
    x="pdays",
    y="previous",
).properties(title="pdays vs previous")

# Same plot with the y-axis limited to [0, 50]; clamp=True keeps out-of-range
# points on the plot at the domain edge.
pdays_prev_clamped = scatter_base.encode(
    x="pdays",
    y=alt.Y("previous", scale=alt.Scale(domain=(0, 50), clamp=True)),
).properties(title="pdays vs previous (previous <= 50)")

pdays_prev | pdays_prev_clamped

### Summary and Recommendations from EDA
- Bar charts were created for the categorical variables and histograms for the numerical variables to illustrate their distributions. Correlation heatmaps based on two different metrics (Pearson and Spearman) were generated to investigate the relationships between numerical variables. A scatter plot of `pdays` vs `previous` was also created.
- Judging from the proportion of each class in the target, the dataset is imbalanced.
- `job`, `education`, `contact` and `poutcome` contain unknown values. We do not have enough information on the dataset to impute these values properly. Note that these values are not null values, but strings called "unknown". **# FIXME: it seems only `contact` and `poutcome` have "unknown"**
- Out of the columns mentioned that contain unknown values, `contact` and `poutcome` need to be dropped since they contain too many unknown examples. We cannot just drop the unknowns from these columns since we would be dropping too many examples, especially considering the size of the data. **# FIXME: `poutcome` is the outcome of the previous marketing campaign. Most of the "unknown" values occur because `previous = 0`. If the person has never been contacted for marketing before, it makes sense to say "unknown" for this field, as a "previous marketing" campaign doesn't exist. But for those who have been reached out to before, `poutcome` might be informative! On the other hand, I think `contact` could be kept, because it does not actually contain that many "unknown" values. Also, our task is to find out the importance of features. There should be no harm in keeping it, but I would expect the result to tell me that `contact` is not important.**
- `job` and `education` can be kept. We can just drop the unknowns from these features. **# FIXME: they might contain information too?**
- The distributions of `pdays` and `previous` are heavily skewed. These variables are also correlated with 0.99 Spearman correlation score and 0.44 Pearson correlation score.
- However, upon visual inspection with a scatter plot, `pdays` and `previous` do not seem to be too correlated to be an issue. We can keep them both as features.
- Overall recommendations:
   - Drop `contact` and `poutcome`
   - Drop unknown values from `job` and `education`
   - Ordinal encode `education`
   - One-hot encode other categorical variables
   - Standardize numerical columns

In [None]:
# train_df = train_df.loc[(train_df["job"] != "unknown") & (train_df["education"] != "unknown")]
# test_df = test_df.loc[(test_df["job"] != "unknown") & (test_df["education"] != "unknown")]

# X_train = train_df.drop(columns=["y"])
# y_train = train_df["y"]
# X_test = test_df.drop(columns=["y"])
# y_test = test_df["y"]

In [None]:
# from sklearn.compose import make_column_transformer
# from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# # recommended transformations
# categorical_feats = ["job", "marital", "default", "housing", "loan", "month"]
# ordinal_feats = ["education"]
# drop_feats = ["contact", "poutcome"]
# numerical_feats = numerical_cols

# education_levels = ["primary", "secondary", "tertiary"]

# col_transformer = make_column_transformer((OneHotEncoder(sparse_output=False, drop="if_binary"), categorical_feats),
#                                           (OrdinalEncoder(categories=[education_levels], dtype=int), ordinal_feats),
#                                           ("drop", drop_feats),
#                                           (StandardScaler(), numerical_feats))

In [None]:
# Separate the features from the target column "y" for both splits.
X_train = train_df.drop(columns=["y"])
y_train = train_df["y"]

X_test = test_df.drop(columns=["y"])
y_test = test_df["y"]

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Feature groups, one per transformation.
categorical_feats = ["job", "marital", "default", "housing", "loan", "contact", "day", "month", "poutcome"]
ordinal_feats = ["education"]
numeric_feats = ["age", "balance", "duration", "campaign", "previous", "pdays"]

# Explicit ordering for the ordinal encoder; "unknown" is placed lowest.
education_levels = ["unknown", "primary", "secondary", "tertiary"]

preprocessor = make_column_transformer(
    # handle_unknown="ignore": a category absent from the fitting data (for
    # example a rare value missing from a cross-validation training fold) is
    # encoded as all zeros instead of raising an error at transform time.
    (
        OneHotEncoder(sparse_output=False, drop="if_binary", handle_unknown="ignore"),
        categorical_feats,
    ),
    (OrdinalEncoder(categories=[education_levels], dtype=int), ordinal_feats),
    (StandardScaler(), numeric_feats)
)

# logistic regression

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

# Preprocessing followed by a logistic regression classifier.
logreg = LogisticRegression(max_iter=2000, random_state=123)
lr_pipe = make_pipeline(preprocessor, logreg)

lr_pipe.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

# Confusion matrix of the logistic regression pipeline on the training data.
ConfusionMatrixDisplay.from_estimator(estimator=lr_pipe, X=X_train, y=y_train)

In [None]:
# Training-set predictions of the logistic regression pipeline (reused by later cells).
y_pred = lr_pipe.predict(X_train)

lr_train_report = classification_report(y_train, y_pred)
print(lr_train_report)

In [None]:
from sklearn.metrics import PrecisionRecallDisplay, average_precision_score, recall_score, precision_score
import matplotlib.pyplot as plt

# Precision-recall curve of the logistic regression pipeline on the training data.
PrecisionRecallDisplay.from_estimator(estimator=lr_pipe, X=X_train, y=y_train)

# Mark the precision/recall reached by the hard predictions in y_pred.
lr_recall = recall_score(y_train, y_pred, pos_label="yes")
lr_precision = precision_score(y_train, y_pred, pos_label="yes")
plt.plot(lr_recall, lr_precision, "ro", markersize=5)  # red circle


In [None]:
# Raw coefficients of the fitted logistic regression step.
lr_pipe['logisticregression'].coef_

In [None]:
# Fitted transformers inside the column transformer, keyed by name.
lr_pipe['columntransformer'].named_transformers_

In [None]:
# Recover the transformed feature names from each fitted transformer, in the
# same order the transformers were passed to the column transformer.
fitted_transformers = lr_pipe.named_steps['columntransformer'].named_transformers_
categorical_cols = fitted_transformers['onehotencoder'].get_feature_names_out().tolist()
ordinal_cols = fitted_transformers['ordinalencoder'].get_feature_names_out().tolist()
numeric_cols = fitted_transformers['standardscaler'].get_feature_names_out().tolist()

# Pair every transformed feature with its logistic regression coefficient.
lr_coefs = lr_pipe.named_steps['logisticregression'].coef_[0].tolist()
feature_importance = pd.DataFrame({
    'feature': categorical_cols + ordinal_cols + numeric_cols,
    'coef': lr_coefs,
})
feature_importance

In [None]:
# Show up to 100 rows so the whole coefficient table is visible.
pd.options.display.max_rows = 100

# Features ordered from the most positive to the most negative coefficient.
feature_importance.sort_values(by='coef', ascending=False)

In [None]:
# Rank features by the magnitude of their logistic regression coefficient.
# At this point the table only has 'feature' and 'coef' columns, so sorting by
# 'importance' directly raises KeyError on a fresh run; derive it from |coef|.
(
    feature_importance
    .assign(importance=lambda table: table['coef'].abs())
    .sort_values('importance', ascending=False)
)

# decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Same preprocessing, followed by a decision tree limited to depth 5.
tree_clf = DecisionTreeClassifier(max_depth=5, random_state=123)
dt_pipe = make_pipeline(preprocessor, tree_clf)

dt_pipe.fit(X_train, y_train)

In [None]:
# from sklearn.tree import plot_tree
# plt.figure(figsize=(12,12))
# plot_tree(dt_pipe.named_steps['decisiontreeclassifier'], feature_names=categorical_cols + ordinal_cols + numeric_cols, fontsize=10)
# plt.show()

In [None]:
# Confusion matrix of the decision tree pipeline on the training data.
ConfusionMatrixDisplay.from_estimator(estimator=dt_pipe, X=X_train, y=y_train)

In [None]:
# Per-class precision/recall/F1 of the decision tree pipeline on the training data.
dt_train_pred = dt_pipe.predict(X_train)
print(classification_report(y_train, dt_train_pred))

In [None]:
# Overlay the training-set precision-recall curves of both models on one axis.
_, ax = plt.subplots()

# Predict once per model here, rather than calling predict repeatedly or
# relying on a `y_pred` variable left over from an earlier cell.
model_specs = (
    # (pipeline, hard predictions, legend name, marker format)
    (dt_pipe, dt_pipe.predict(X_train), "Decision tree", "ro"),          # red circle
    (lr_pipe, lr_pipe.predict(X_train), "Logistic regression", "bs"),    # blue square
)

for pipe, train_pred, label, marker in model_specs:
    # Name each curve explicitly so the legend distinguishes the two pipelines.
    PrecisionRecallDisplay.from_estimator(pipe, X_train, y_train, name=label, ax=ax)

    # Mark the precision/recall reached by the model's hard predictions.
    ax.plot(
        recall_score(y_train, train_pred, pos_label="yes"),
        precision_score(y_train, train_pred, pos_label="yes"),
        marker,
        markersize=5,
    )

In [None]:
# Pair every transformed feature with its importance in the fitted decision tree.
dt_importances = dt_pipe.named_steps['decisiontreeclassifier'].feature_importances_.tolist()
transformed_feature_names = categorical_cols + ordinal_cols + numeric_cols

feature_importance = pd.DataFrame({
    'feature': transformed_feature_names,
    'importance': dt_importances,
})
feature_importance.sort_values(by='importance', ascending=False)

# cross-validation on precision

In [None]:
# (Removed a leftover interactive `?cross_validate` help lookup: it appeared
# before `cross_validate` is imported and is not part of the analysis.
# See the scikit-learn documentation for `cross_validate`.)

In [None]:
import numpy as np

#(lambda y: pd.Series(np.where(y, "yes", "no"), name="y"))
# Boolean view of the target: True where the label is "yes".
y_train.eq("yes")

In [None]:
# Inspect the raw string-valued target.
y_train

In [None]:
# Boolean target: True where the label is "yes".
y_train == "yes"

In [None]:
from sklearn.model_selection import cross_validate

lr_pipe = make_pipeline(preprocessor, LogisticRegression(random_state=123, max_iter=2000))

# 10-fold cross-validation of the logistic regression pipeline. The target is
# passed as a boolean ("yes" -> True) for the precision and recall scorers.
lr_cv_results = cross_validate(
    lr_pipe,
    X_train,
    y_train == "yes",
    cv=10,
    scoring=["accuracy", "precision", "recall"],
    return_train_score=True,
)

# Mean of every timing and score column across the folds, one row per metric.
pd.DataFrame(lr_cv_results).agg(['mean']).round(3).T

In [None]:
# 10-fold cross-validation of the decision tree pipeline. The target is passed
# as a boolean ("yes" -> True) for the precision and recall scorers.
dt_cv_results = cross_validate(
    dt_pipe,
    X_train,
    y_train == "yes",
    cv=10,
    scoring=["accuracy", "precision", "recall"],
    return_train_score=True,
)

# Mean of every timing and score column across the folds, one row per metric.
pd.DataFrame(dt_cv_results).agg(['mean']).round(3).T