In [None]:
import pandas as pd

In [None]:
# The bank marketing file uses semicolons, not commas, as the field separator.
df_bank = pd.read_csv("data/bank-full.csv", sep=";")

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows as the test set; the fixed seed makes the split
# reproducible across kernel restarts.
train_df, test_df = train_test_split(
    df_bank,
    test_size=0.25,
    random_state=123,
)

In [None]:
# Print the column names, dtypes and non-null counts of the training split.
train_df.info()

In [None]:
# Preview the first five rows of the training split.
train_df.head(n=5)

In [None]:
# Share of each target class, as proportions rather than raw counts.
train_df.loc[:, "y"].value_counts(normalize=True)

In [None]:
# Feature names grouped by kind, derived from the training split only.
# The target `y` is removed first so it can never end up in either list.
feature_df = train_df.drop(columns=["y"])

# Text features: match the classic object dtype as well as pandas' dedicated
# string dtype, so text columns are found however they happen to be stored.
categorical_cols = list(feature_df.select_dtypes(include=["object", "string"]).columns)

# Numeric features: select every numeric dtype rather than only int64, so that
# float (or narrower integer) columns are not silently left out.
numerical_cols = list(feature_df.select_dtypes(include="number").columns)

In [None]:
import altair as alt

# Enable the VegaFusion data transformer for every chart created from here on.
# NOTE(review): presumably needed to get around Altair's default row limit for
# embedded data — confirm.
alt.data_transformers.enable("vegafusion")

# One bar chart per categorical feature, stacked in rows; each bar is the
# number of training rows in that category.
(
    alt.Chart(train_df)
    .mark_bar()
    .encode(
        x=alt.X(alt.repeat("row"), type="nominal"),
        y=alt.Y("count()"),
    )
    .repeat(row=categorical_cols)
)

In [None]:
# One histogram per numerical feature, stacked in rows; values are binned into
# at most 40 bins and each bar is the number of training rows in that bin.
(
    alt.Chart(train_df)
    .mark_bar()
    .encode(
        x=alt.X(alt.repeat("row"), type="quantitative", bin=alt.Bin(maxbins=40)),
        y=alt.Y("count()"),
    )
    .repeat(row=numerical_cols)
)

In [None]:
# Long-format Spearman correlation table: one row per ordered pair of
# numerical variables, with the pair's correlation in the third column.
spearman_corr_df = (
    train_df[numerical_cols]
    .corr(method="spearman")
    .unstack()
    .reset_index()
    .set_axis(["num_variable_0", "num_variable_1", "correlation"], axis=1)
)

# Heatmap layer: one coloured cell per variable pair.
corr_heatmap = (
    alt.Chart(spearman_corr_df)
    .mark_rect()
    .encode(
        x=alt.X("num_variable_0", title="Numerical Variable"),
        y=alt.Y("num_variable_1", title="Numerical Variable"),
        color=alt.Color("correlation:Q"),
    )
    .properties(title="Spearman Correlation", width=250, height=250)
)

# Text layer: the correlation value, rounded to two decimals, on top of each cell.
text = (
    alt.Chart(spearman_corr_df)
    .mark_text()
    .encode(
        x=alt.X("num_variable_0", title="Numerical Variable"),
        y=alt.Y("num_variable_1", title="Numerical Variable"),
        text=alt.Text("correlation:Q", format=".2f"),
    )
)

alt.layer(corr_heatmap, text)

In [None]:
# Long-format Pearson correlation table: one row per ordered pair of
# numerical variables, with the pair's correlation in the third column.
pearson_corr_df = train_df[numerical_cols].corr("pearson").unstack().reset_index()
pearson_corr_df.columns = ["num_variable_0", "num_variable_1", "correlation"]
# This frame used to be bound under the misspelled name `person_corr_df`;
# keep that name bound as well so any cell still referring to it keeps working.
person_corr_df = pearson_corr_df

# Heatmap layer: one coloured cell per variable pair. The chart title lives on
# this layer, mirroring how the Spearman heatmap is built.
corr_heatmap = alt.Chart(pearson_corr_df, title="Pearson Correlation").mark_rect().encode(
    x=alt.X("num_variable_0").title("Numerical Variable"),
    y=alt.Y("num_variable_1").title("Numerical Variable"),
    color="correlation:Q"
).properties(
    width=250,
    height=250
)

# Text layer: the correlation value, rounded to two decimals, on top of each cell.
text = alt.Chart(pearson_corr_df).mark_text().encode(
    x=alt.X("num_variable_0").title("Numerical Variable"),
    y=alt.Y("num_variable_1").title("Numerical Variable"),
    text=alt.Text("correlation:Q", format=".2f")
)

corr_heatmap + text

In [None]:
# Checking for correlation between pdays and previous with two side-by-side
# scatter plots of the training split.
pdays_prev = (
    alt.Chart(train_df)
    .mark_point()
    .encode(x=alt.X("pdays"), y=alt.Y("previous"))
    .properties(title="pdays vs previous")
)

# Same scatter with the y scale restricted to [0, 50]. With clamp=True, points
# whose `previous` lies outside that domain are drawn at the domain edge rather
# than being removed from the chart.
pdays_prev_clamped = (
    alt.Chart(train_df)
    .mark_point()
    .encode(
        x=alt.X("pdays"),
        y=alt.Y("previous", scale=alt.Scale(domain=(0, 50), clamp=True)),
    )
    .properties(title="pdays vs previous (previous <= 50)")
)

alt.hconcat(pdays_prev, pdays_prev_clamped)

### Summary and Recommendations from EDA
- Bar charts were created for the categorical variables and histograms for the numerical variables to illustrate their distributions. Correlation heatmaps based on two different metrics (Spearman and Pearson) were generated to investigate the relationships between numerical variables. A scatter plot of `pdays` vs `previous` was also created.
- Judging from the proportion of each class in the target, the dataset is imbalanced.
- `job`, `education`, `contact` and `poutcome` contain unknown values. We do not have enough information about the dataset to impute these values properly. Note that these are not null values but the literal string "unknown".
- Of the columns that contain unknown values, `contact` and `poutcome` need to be dropped entirely because too many of their entries are unknown. We cannot simply drop the rows with unknowns in these columns, since that would discard too many examples, especially considering the size of the data.
- `job` and `education` can be kept. We can simply drop the rows in which these features are unknown.
- The distributions of `pdays` and `previous` are heavily skewed. These variables are also correlated, with a Spearman correlation of 0.99 and a Pearson correlation of 0.44.
- However, upon visual inspection of the scatter plot, `pdays` and `previous` do not appear to be correlated strongly enough to be an issue. We can keep them both as features.
- Overall recommendations:
   - Drop `contact` and `poutcome`
   - Drop rows with unknown values in `job` and `education`
   - Ordinal encode `education`
   - One-hot encode other categorical variables
   - Standardize numerical columns

In [None]:
def _without_unknowns(frame, columns=("job", "education")):
    """Return the rows of ``frame`` in which none of ``columns`` equals "unknown"."""
    has_unknown = frame[list(columns)].eq("unknown").any(axis=1)
    return frame.loc[~has_unknown]


# Apply the same row filter to both splits.
train_df = _without_unknowns(train_df)
test_df = _without_unknowns(test_df)

# Separate the features from the target column `y` for each split.
X_train, y_train = train_df.drop(columns=["y"]), train_df["y"]
X_test, y_test = test_df.drop(columns=["y"]), test_df["y"]

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Recommended transformations from the EDA, grouped by how each column is treated.
categorical_feats = ["job", "marital", "default", "housing", "loan", "month"]
ordinal_feats = ["education"]
drop_feats = ["contact", "poutcome"]
numerical_feats = numerical_cols

# Category order handed to the ordinal encoder: position in this list becomes
# the encoded integer for `education`.
education_levels = ["primary", "secondary", "tertiary"]

col_transformer = make_column_transformer(
    (
        # Dense one-hot output; two-level features collapse to a single column.
        # handle_unknown="ignore" keeps transform() from raising when a split or
        # CV fold contains a category that was not present during fit.
        OneHotEncoder(sparse_output=False, drop="if_binary", handle_unknown="ignore"),
        categorical_feats,
    ),
    (OrdinalEncoder(categories=[education_levels], dtype=int), ordinal_feats),
    ("drop", drop_feats),
    (StandardScaler(), numerical_feats),
)