In [1]:
import pandas as pd

In [2]:
# Load the semicolon-separated bank dataset from the local data directory.
df_bank = pd.read_csv(
    "data/bank-full.csv",
    sep=";",
)

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as the test split; the fixed seed keeps the split
# reproducible across kernel restarts.
train_df, test_df = train_test_split(
    df_bank,
    test_size=0.25,
    random_state=123,
)

In [37]:
# Preview the first five rows of the training split.
train_df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
26999,32,unemployed,single,secondary,no,2706,no,no,cellular,21,nov,462,3,-1,0,unknown,no
16168,37,admin.,married,secondary,no,1396,yes,no,cellular,22,jul,199,2,-1,0,unknown,no
12338,22,blue-collar,married,secondary,no,-295,yes,no,unknown,26,jun,150,2,-1,0,unknown,no
6074,36,blue-collar,married,secondary,no,-870,yes,no,unknown,26,may,102,2,-1,0,unknown,no
7385,50,admin.,married,primary,no,429,no,no,unknown,29,may,60,2,-1,0,unknown,no


In [4]:
# Summarize column dtypes and non-null counts of the training split.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 33908 entries, 26999 to 15725
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        33908 non-null  int64 
 1   job        33908 non-null  object
 2   marital    33908 non-null  object
 3   education  33908 non-null  object
 4   default    33908 non-null  object
 5   balance    33908 non-null  int64 
 6   housing    33908 non-null  object
 7   loan       33908 non-null  object
 8   contact    33908 non-null  object
 9   day        33908 non-null  int64 
 10  month      33908 non-null  object
 11  duration   33908 non-null  int64 
 12  campaign   33908 non-null  int64 
 13  pdays      33908 non-null  int64 
 14  previous   33908 non-null  int64 
 15  poutcome   33908 non-null  object
 16  y          33908 non-null  object
dtypes: int64(7), object(10)
memory usage: 4.7+ MB


In [None]:
# Distribution of the target

In [6]:
# Relative frequency of each target class in the training split.
target_share = train_df["y"].value_counts(normalize=True)
target_share

y
no     0.882358
yes    0.117642
Name: proportion, dtype: float64

In [None]:
# Distributions of categorical and numeric features

In [7]:
# Group feature names by dtype: object-typed columns (with the target `y`
# excluded) are treated as categorical, int64 columns as numerical.
categorical_cols = (
    train_df.drop(columns=["y"])
    .select_dtypes(include=["object"])
    .columns.tolist()
)
numerical_cols = train_df.select_dtypes(include=["int64"]).columns.tolist()

In [17]:
import altair as alt
alt.data_transformers.enable("vegafusion")

# One horizontal bar chart of category counts per categorical feature,
# laid out three charts per row.
alt.Chart(train_df).mark_bar().encode(
    x=alt.X("count()"),
    y=alt.Y(alt.repeat(), type="nominal"),
).repeat(
    repeat=categorical_cols,
    columns=3,
)

In [18]:
# One histogram (at most 40 bins) per numerical feature, three charts per row.
alt.Chart(train_df).mark_bar().encode(
    x=alt.X(alt.repeat(), type="quantitative", bin=alt.Bin(maxbins=40)),
    y=alt.Y("count()"),
).repeat(
    repeat=numerical_cols,
    columns=3,
)

In [None]:
# Correlations between numeric features

In [11]:
# Pearson correlation measures the linear relationship between each pair of
# numerical features. The square correlation matrix is reshaped to long form
# (one row per pair of variables) so that it can be plotted.
pearson_corr_df = train_df[numerical_cols].corr("pearson").unstack().reset_index()
pearson_corr_df.columns = ["num_variable_0", "num_variable_1", "correlation"]
# Backward-compatible alias for the original (misspelled) variable name, so
# any cell still referring to `person_corr_df` keeps working.
person_corr_df = pearson_corr_df

# The title lives on the heatmap layer (as in the Spearman cell) so that
# `corr_heatmap` is still titled when displayed without the text overlay.
corr_heatmap = alt.Chart(pearson_corr_df, title="Pearson Correlation").mark_rect().encode(
    x=alt.X("num_variable_0").title("Numerical Variable"),
    y=alt.Y("num_variable_1").title("Numerical Variable"),
    color="correlation:Q"
).properties(
    width=250,
    height=250
)

# Overlay each cell with its correlation value, rounded to two decimals.
text = alt.Chart(pearson_corr_df).mark_text().encode(
    x=alt.X("num_variable_0").title("Numerical Variable"),
    y=alt.Y("num_variable_1").title("Numerical Variable"),
    text=alt.Text("correlation:Q", format=".2f")
)

corr_heatmap + text

In [10]:
# Spearman correlation captures monotonic (including non-linear) relationships.
# FIXME: does it matter? If we apply linear models, we're more afraid of collinearity, i.e. linear relationship
spearman_corr_df = (
    train_df[numerical_cols]
    .corr(method="spearman")
    .unstack()
    .reset_index()
    .set_axis(["num_variable_0", "num_variable_1", "correlation"], axis="columns")
)

# Heatmap layer: one coloured cell per pair of numerical variables.
corr_heatmap = (
    alt.Chart(spearman_corr_df, title="Spearman Correlation")
    .mark_rect()
    .encode(
        x=alt.X("num_variable_0", title="Numerical Variable"),
        y=alt.Y("num_variable_1", title="Numerical Variable"),
        color=alt.Color("correlation:Q"),
    )
    .properties(width=250, height=250)
)

# Text layer: the correlation value printed in each cell with two decimals.
text = (
    alt.Chart(spearman_corr_df)
    .mark_text()
    .encode(
        x=alt.X("num_variable_0", title="Numerical Variable"),
        y=alt.Y("num_variable_1", title="Numerical Variable"),
        text=alt.Text("correlation:Q", format=".2f"),
    )
)

alt.layer(corr_heatmap, text)

In [12]:
# Scatter plots of `pdays` against `previous` to inspect their relationship directly.
# FIXME: there is no linear relationship here
pdays_prev = (
    alt.Chart(train_df, title="pdays vs previous")
    .mark_point()
    .encode(x=alt.X("pdays"), y=alt.Y("previous"))
)

# Same scatter with the y-axis domain restricted to [0, 50].
# NOTE(review): with clamp=True, out-of-domain points are presumably drawn at
# the axis limit rather than removed -- confirm against the Vega-Lite scale docs.
pdays_prev_clamped = (
    alt.Chart(train_df, title="pdays vs previous (previous <= 50)")
    .mark_point()
    .encode(
        x=alt.X("pdays"),
        y=alt.Y("previous", scale=alt.Scale(domain=(0, 50), clamp=True)),
    )
)

alt.hconcat(pdays_prev, pdays_prev_clamped)

### Summary and Recommendations from EDA
- Bar charts were created for the categorical variables and histograms for the numerical variables to illustrate their distributions. Correlation heatmaps based on two different metrics (Pearson and Spearman) were generated to investigate the relationships between numerical variables. A scatter plot of `pdays` vs `previous` was also created.
- Judging from the proportion of each class in the target (about 88% `no` vs 12% `yes`), the dataset is imbalanced.
- `job`, `education`, `contact` and `poutcome` contain unknown values. We do not have enough information on the dataset to impute these values properly. Note that these values are not null values, but strings called "unknown". **# FIXME: it seems only `contact` and `poutcome` have "unknown"**
- Out of the columns mentioned that contain unknown values, `contact` and `poutcome` need to be dropped since they contain too many unknown examples. We cannot just drop the unknowns from these columns since we would be dropping too many examples, especially considering the size of the data. **# FIXME: `poutcome` is the outcome of the previous marketing campaign. Most of the "unknown" values occur because `previous = 0`. If the person has never been contacted in a marketing campaign before, it makes sense to record "unknown" for this field, as a "previous campaign" doesn't exist. But for those who have been reached out to before, `poutcome` might be informative! On the other hand, I think `contact` could be kept, because it does not actually contain that many "unknown" values. Also, our task is to find out the importance of features. There should be no harm in keeping it, but I would expect the result to tell me that `contact` is not important.**
- `job` and `education` can be kept. We can just drop the unknowns from these features. **# FIXME: they might contain information too?**
- The distributions of `pdays` and `previous` are heavily skewed. These variables are also correlated with 0.99 Spearman correlation score and 0.44 Pearson correlation score.
- However, upon visual inspection with a scatter plot, `pdays` and `previous` do not seem to be too correlated to be an issue. We can keep them both as features.
- Overall recommendations:
   - Drop `contact` and `poutcome`
   - Drop unknown values from `job` and `education`
   - Ordinal encode `education`
   - One-hot encode other categorical variables
   - Standardize numerical columns

In [13]:
# Keep only rows where neither `job` nor `education` is the literal string
# "unknown"; the same filter is applied to both splits.
train_df = train_df.loc[train_df[["job", "education"]].ne("unknown").all(axis=1)]
test_df = test_df.loc[test_df[["job", "education"]].ne("unknown").all(axis=1)]

# Separate the features from the target column `y` for each split.
X_train, y_train = train_df.drop(columns=["y"]), train_df["y"]
X_test, y_test = test_df.drop(columns=["y"]), test_df["y"]

In [14]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Feature groups for the recommended transformations.
categorical_feats = ["job", "marital", "default", "housing", "loan", "month"]
ordinal_feats = ["education"]
drop_feats = ["contact", "poutcome"]
numerical_feats = numerical_cols

# Category order passed to the ordinal encoder: primary < secondary < tertiary.
education_levels = ["primary", "secondary", "tertiary"]

col_transformer = make_column_transformer(
    (
        # handle_unknown="ignore" keeps transform() from raising when the test
        # split (or new data) contains a category that was absent from the
        # training split; such rows are encoded as all zeros for that feature.
        OneHotEncoder(sparse_output=False, drop="if_binary", handle_unknown="ignore"),
        categorical_feats,
    ),
    (OrdinalEncoder(categories=[education_levels], dtype=int), ordinal_feats),
    ("drop", drop_feats),
    (StandardScaler(), numerical_feats),
)