In [None]:
import pandas as pd

In [None]:
# Load the semicolon-separated bank marketing data, relative to the notebook location.
df_bank = pd.read_csv("../data/bank-full.csv", sep=";")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as the test split; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(
    df_bank,
    test_size=0.25,
    random_state=123,
)

In [None]:
# Preview the first rows of the training split.
train_df.head()

In [None]:
# Column dtypes and non-null counts of the training split.
train_df.info()

In [None]:
# Distribution of the target

In [None]:
# Relative frequency of each target class in the training split.
train_df["y"].value_counts(normalize=True)

In [None]:
# Distributions of categorical and numeric features

In [None]:
# Feature columns grouped by dtype for the EDA charts below.
# The target `y` is dropped explicitly so it is not listed as a categorical feature.
categorical_cols = train_df.drop(columns=["y"]).select_dtypes(include=["object"]).columns.tolist()
# Select every numeric dtype rather than only int64, so float or narrower-integer
# columns are not silently left out of the histograms and correlation tables.
numerical_cols = train_df.select_dtypes(include="number").columns.tolist()

In [None]:
import altair as alt

# Enable the VegaFusion data transformer before charting the full training frame.
alt.data_transformers.enable("vegafusion")

# One horizontal bar chart of row counts per categorical feature, laid out three per row.
alt.Chart(train_df).mark_bar().encode(
    x=alt.X("count()"),
    y=alt.Y(alt.repeat(), type="nominal"),
).repeat(repeat=categorical_cols, columns=3)

In [None]:
# One histogram (at most 40 bins) per numeric feature, laid out three per row.
alt.Chart(train_df).mark_bar().encode(
    x=alt.X(alt.repeat(), type="quantitative", bin=alt.Bin(maxbins=40)),
    y=alt.Y("count()"),
).repeat(repeat=numerical_cols, columns=3)

In [None]:
# Correlations between numeric features

In [None]:
# measure linear relationship
# Long-format table: one row per (variable, variable) pair with its Pearson correlation.
pearson_corr_df = train_df[numerical_cols].corr("pearson").unstack().reset_index()
pearson_corr_df.columns = ["num_variable_0", "num_variable_1", "correlation"]

# Heatmap layer; it carries the chart title, matching the Spearman chart.
corr_heatmap = alt.Chart(pearson_corr_df, title="Pearson Correlation").mark_rect().encode(
    x=alt.X("num_variable_0").title("Numerical Variable"),
    y=alt.Y("num_variable_1").title("Numerical Variable"),
    color="correlation:Q"
).properties(
    width=250,
    height=250
)

# Text layer printing each coefficient (two decimals) on top of its cell.
text = alt.Chart(pearson_corr_df).mark_text().encode(
    x=alt.X("num_variable_0").title("Numerical Variable"),
    y=alt.Y("num_variable_1").title("Numerical Variable"),
    text=alt.Text("correlation:Q", format=".2f")
)

corr_heatmap + text

In [None]:
# measure monotonic (incl. non-linear) relationship
# Long-format table: one row per (variable, variable) pair with its Spearman correlation.
spearman_corr_df = (
    train_df[numerical_cols]
    .corr(method="spearman")
    .unstack()
    .reset_index()
)
spearman_corr_df.columns = ["num_variable_0", "num_variable_1", "correlation"]

# Heatmap layer (carries the title and the fixed 250x250 size).
corr_heatmap = (
    alt.Chart(spearman_corr_df, title="Spearman Correlation")
    .mark_rect()
    .encode(
        x=alt.X("num_variable_0", title="Numerical Variable"),
        y=alt.Y("num_variable_1", title="Numerical Variable"),
        color="correlation:Q",
    )
    .properties(width=250, height=250)
)

# Text layer printing each coefficient (two decimals) on top of its cell.
text = (
    alt.Chart(spearman_corr_df)
    .mark_text()
    .encode(
        x=alt.X("num_variable_0", title="Numerical Variable"),
        y=alt.Y("num_variable_1", title="Numerical Variable"),
        text=alt.Text("correlation:Q", format=".2f"),
    )
)

alt.layer(corr_heatmap, text)

In [None]:
# checking for correlation between pdays and previous
# as indicated in the chart, there is no linear relationship here
# Left panel: full range of `previous`.
pdays_prev = (
    alt.Chart(train_df, title="pdays vs previous")
    .mark_point()
    .encode(x=alt.X("pdays"), y=alt.Y("previous"))
)

# Right panel: same scatter with the y-axis clamped to [0, 50].
pdays_prev_clamped = (
    alt.Chart(train_df, title="pdays vs previous (previous <= 50)")
    .mark_point()
    .encode(
        x=alt.X("pdays"),
        y=alt.Y("previous", scale=alt.Scale(domain=(0, 50), clamp=True)),
    )
)

alt.hconcat(pdays_prev, pdays_prev_clamped)

In [None]:
# Checking data with its description

In [None]:
import numpy as np
# check all 31 days of the month exist
# np.unique returns the distinct values already sorted in ascending order.
np.unique(train_df["day"])

In [None]:
# check that campaign >= 1; it is the number of contacts with this client during this campaign, including the last contact
train_df["campaign"].describe()

In [None]:
# As indicated, all points > 0. (The red line indicates the minimum.)
alt.layer(
    # Boxplot of `campaign`, with the x-axis clamped to [-5, 60].
    alt.Chart(train_df)
    .mark_boxplot()
    .encode(alt.X("campaign", scale=alt.Scale(domain=(-5, 60), clamp=True)))
    .properties(height=50),
    # Red rule at the minimum observed `campaign` value.
    alt.Chart(train_df)
    .mark_rule()
    .encode(x="min(campaign)", color=alt.value("red")),
)

In [None]:
# check that previous >= 0; it is the number of contacts with this client before this campaign
train_df["previous"].describe()

In [None]:
# As indicated, all points >= 0. (The red line indicates the minimum.)
alt.layer(
    # Boxplot of `previous`, with the x-axis clamped to [-5, 50].
    alt.Chart(train_df)
    .mark_boxplot()
    .encode(alt.X("previous", scale=alt.Scale(domain=(-5, 50), clamp=True)))
    .properties(height=50),
    # Red rule at the minimum observed `previous` value.
    alt.Chart(train_df)
    .mark_rule()
    .encode(x="min(previous)", color=alt.value("red")),
)

In [None]:
# check that if previous = 0, then pdays = -1
# (the check passes when -1 is the only unique value returned)
train_df.loc[train_df["previous"] == 0, "pdays"].unique()

In [None]:
# check that if previous = 0, then poutcome = "unknown"
# (the check passes when "unknown" is the only unique value returned)
train_df.loc[train_df["previous"] == 0, "poutcome"].unique()

### Summary and Recommendations from EDA
- Bar charts were created for the categorical variables and histograms for the numerical variables to illustrate their distributions. Correlation heatmaps based on two different metrics (Pearson and Spearman) were generated to investigate the relationships between numerical variables. A scatter plot specifically for `pdays` vs `previous` was also created.
- Judging from the proportion of each class in the target, the dataset is imbalanced.
- `job`, `education`, `contact` and `poutcome` contain unknown values. We do not have enough information on the dataset to impute these values properly. Note that these values are not null values, but strings called "unknown". Out of these columns, `contact` and `poutcome` have significant numbers of "unknown" values.
- `poutcome` is the outcome of the previous marketing campaign. Most of its "unknown" values occur because `previous = 0`. If a person has never been contacted for marketing before, it makes sense for this field to be "unknown", as no previous campaign exists. But for those who have been contacted before, `poutcome` might be informative! `contact` could also be kept, because it does not actually contain that many "unknown" values. Also, our task is to find out the importance of the features. There should be no harm in keeping it, but we would expect the result to tell us that `contact` is not important.
- The distributions of `pdays` and `previous` are heavily skewed. These variables are also correlated with 0.99 Spearman correlation score and 0.44 Pearson correlation score.
- However, upon visual inspection with a scatter plot, `pdays` and `previous` do not seem to be correlated strongly enough to be an issue. We can keep them both as features.
- Overall recommendations:
   - Ordinal encode `education`
   - One-hot encode categorical variables
   - Standardize numerical columns

### Discussion
 - In this study, we would like to understand which factors most affect whether a client subscribes to the term deposit.
 - positive label: `y = "yes"`; negative label: `y = "no"`
 - We want the clients that the model identifies as subscribers to be genuine subscribers. Therefore, we would like to reduce the Type I error (false positives) as much as possible, i.e., clients who are predicted to subscribe to our term deposit but actually did not.
 - We will select a model that gives us a robust **precision**, so that the model can better explain clients' motivation for subscribing. We can then place more trust in the feature importances suggested by the model.

In [None]:
# Separate the features from the target column `y` for both splits.
X_train = train_df.drop(columns="y")
y_train = train_df["y"]
X_test = test_df.drop(columns="y")
y_test = test_df["y"]

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Feature groups, one per preprocessing strategy.
categorical_feats = ["job", "marital", "default", "housing", "loan", "contact", "day", "month", "poutcome"]
ordinal_feats = ["education"]
numeric_feats = ["age", "balance", "duration", "campaign", "previous", "pdays"]

# Category order used for the ordinal encoding of `education` (first entry gets code 0).
education_levels = ["unknown", "primary", "secondary", "tertiary"]

preprocessor = make_column_transformer(
    # handle_unknown="ignore" keeps transform from raising when a validation fold or the
    # test set contains a category that was absent from the data the encoder was fitted on
    # (e.g. a rarely occurring `day` or `job` value).
    (OneHotEncoder(sparse_output=False, drop="if_binary", handle_unknown="ignore"), categorical_feats),
    (OrdinalEncoder(categories=[education_levels], dtype=int), ordinal_feats),
    (StandardScaler(), numeric_feats)
)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Candidate models keyed by display name. The baseline is used without the
# preprocessor; the two real models each get the preprocessor as their first step.
model_pipes = {}
model_pipes["Baseline"] = DummyClassifier(strategy="most_frequent", random_state=522)
model_pipes["DecisionTree"] = make_pipeline(
    preprocessor,
    DecisionTreeClassifier(max_depth=5, random_state=522),
)
model_pipes["LogisticRegression"] = make_pipeline(
    preprocessor,
    LogisticRegression(max_iter=2000, random_state=522),
)

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, precision_score

# Precision scorer with zero_division=0, so a model that predicts no positives scores 0.
mod_precision_score = make_scorer(precision_score, zero_division=0)

classification_metrics = dict(
    accuracy="accuracy",
    precision=mod_precision_score,
    recall="recall",
)
cross_val_results = {}

# 5-fold cross-validation per model on a boolean target (True means y == "yes");
# each entry stores the mean and std of every fit-time/score column, rounded to 3 decimals.
for name, pipe in model_pipes.items():
    fold_scores = cross_validate(
        pipe,
        X_train,
        y_train == "yes",
        scoring=classification_metrics,
        cv=5,
        return_train_score=True,
    )
    cross_val_results[name] = pd.DataFrame(fold_scores).agg(["mean", "std"]).round(3).T


In [None]:
# Mean cross-validation results with one column per model, shaded on a single shared colour scale.
cv_summary = pd.concat(cross_val_results, axis=1)
cv_mean_scores = cv_summary.xs("mean", axis=1, level=1)
cv_mean_scores.style.format(precision=2).background_gradient(axis=None)

### Observation:
 - `LogisticRegression` has slightly better test precision than the `DecisionTreeClassifier`, and it also has a smaller gap between train scores and test scores, so `LogisticRegression` is more likely to generalize.
 - Limitation: there is still room for improvement in the precision.
   - In the future, if we ever need to predict subscriptions, we could raise the decision threshold to obtain a better precision, so this is not a major concern here.
   - If we are really keen on improving the precision at the model level, RandomForest would be a candidate, but not everyone on the team is familiar with RandomForest yet.

In [None]:
# Fit the logistic-regression pipeline on the full training split (string labels "yes"/"no").
# `fit` returns the pipeline itself, so `lr_pipe` is the same object stored in `model_pipes`.
lr_pipe = model_pipes["LogisticRegression"].fit(X_train, y_train)
lr_pipe

In [None]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 of the fitted pipeline on the held-out test split.
# The report is a multi-line string, so it is printed rather than displayed as a repr.
test_report = classification_report(y_test, lr_pipe.predict(X_test))
print(test_report)

### Observation
 - The test precision is similar to both the validation and the train precision. Therefore, we believe that the feature-importance conclusions drawn from this model will generalize to future data.

In [None]:
# Output feature names of each fitted transformer, collected in the same order in which
# the transformers were passed to the column transformer (one-hot, ordinal, scaler).
fitted_transformers = lr_pipe.named_steps['columntransformer'].named_transformers_
# Stored as `onehot_cols` so the EDA list `categorical_cols` (raw column names used by the
# bar charts above) is not overwritten with one-hot output names.
onehot_cols = fitted_transformers['onehotencoder'].get_feature_names_out().tolist()
ordinal_cols = fitted_transformers['ordinalencoder'].get_feature_names_out().tolist()
numeric_cols = fitted_transformers['standardscaler'].get_feature_names_out().tolist()

# One row per transformed feature, paired with its logistic-regression coefficient.
feature_importance = pd.DataFrame({
    'feature': onehot_cols + ordinal_cols + numeric_cols,
    'coef': lr_pipe.named_steps['logisticregression'].coef_[0].tolist()
})

### Observation
 - According to the dataframe below, `poutcome`, `month` and `duration` are the top 3 features most strongly related to whether a client subscribes to the term deposit.
   - If `poutcome == "success"`, the client has already experienced the bank's good service, so they are more willing to subscribe to new products.
   - If `month == "mar"`, we are not sure why clients tend to accept the marketing and subscribe to the term deposit in March. This pattern also exists in the test set, so it is unlikely to be over-fitting. We thought it might be related to the financial year, tax period or bonus release time in Portugal, but it seems that the financial year-end in Portugal is December (it is March in some other countries, e.g. China). So we cannot make sense of it yet.
   - If `duration` is longer, the client was more interested in the term deposit product and more likely to stay on the call, which gave the salesperson time to do more pitching and so increased the chance of a successful subscription.

In [None]:
# Coefficients from largest to smallest, shaded on a diverging colour map fixed to [-2, 2].
ranked_importance = feature_importance.sort_values(by="coef", ascending=False)
ranked_importance.style.format(precision=3).background_gradient(
    cmap="PiYG", vmin=-2, vmax=2, axis=None
)