In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mutual_info_score
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("./data/bank-full.csv", sep=";")

In [3]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [5]:
# Columns kept for the analysis: the model features followed by the target "y".
# The order is the same as in the raw file.
cols = (
    ["age", "job", "marital", "education", "balance", "housing", "contact"]
    + ["day", "month", "duration", "campaign", "pdays", "previous", "poutcome"]
    + ["y"]
)

In [6]:
# Keep only the columns of interest, as an explicit copy. Without `.copy()` pandas
# may track the selection as a potential view of the full frame, and the later
# in-place assignment to `df["y"]` can then emit a SettingWithCopyWarning.
df = df[cols].copy()

In [7]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

# Question 1

In [8]:
df["education"].mode()

0    secondary
Name: education, dtype: object

In [9]:
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

# Question 2

In [10]:
# Get all the numerical features.
# - `select_dtypes(include="number")` matches every numeric dtype, whereas comparing
#   against "int" only matches the platform's default integer width.
# - The target "y" is dropped explicitly so it can never end up in the feature list,
#   even if this cell is re-run after "y" has been encoded as 0/1.
numerical_features = (
    df.drop(columns=["y"], errors="ignore")
    .select_dtypes(include="number")
    .columns.to_list()
)

In [11]:
# Pairwise correlation matrix of the numerical features.
# Absolute values are taken because only the strength of the relationship matters
# here, not its sign.
numeric_frame = df[numerical_features]
corr = numeric_frame.corr().abs()
corr.style.background_gradient(cmap="Reds")

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,0.00912,0.004648,0.00476,0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,0.014578,0.003435,0.016674
day,0.00912,0.004503,1.0,0.030206,0.16249,0.093044,0.05171
duration,0.004648,0.02156,0.030206,1.0,0.08457,0.001565,0.001203
campaign,0.00476,0.014578,0.16249,0.08457,1.0,0.088628,0.032855
pdays,0.023758,0.003435,0.093044,0.001565,0.088628,1.0,0.45482
previous,0.001288,0.016674,0.05171,0.001203,0.032855,0.45482,1.0


It's clear that `pdays` and `previous` have the largest correlation.

In [12]:
# Encode the target as 0/1 ("yes" -> 1, anything else -> 0).
# The dtype guard makes the cell safe to re-run: without it, a second execution
# would compare the already-encoded integers against "yes" and silently turn every
# label into 0.
if not pd.api.types.is_integer_dtype(df["y"]):
    df["y"] = (df["y"] == "yes").astype(int)

In [68]:
def train_test_val_split(
    df: pd.DataFrame,
    val_frac: float = 0.2,
    test_frac: float = 0.2,
    random_state: int = 42,
    target: str = "y",
):
    """Split ``df`` into train / validation / test parts and separate the target.

    Parameters
    ----------
    df : pd.DataFrame
        Full dataset, including the ``target`` column.
    val_frac : float
        Fraction of the *whole* dataset to put in the validation part.
    test_frac : float
        Fraction of the *whole* dataset to put in the test part.
    random_state : int
        Seed passed to both ``train_test_split`` calls.
    target : str
        Name of the target column that is split off from the features.

    Returns
    -------
    tuple
        ``(df_train, df_val, df_test, y_train, y_val, y_test)``. The frames no
        longer contain ``target``; every frame and its matching target Series share
        the same fresh ``0..n-1`` index, so they stay aligned row by row.

    Raises
    ------
    ValueError
        If the fractions are not strictly between 0 and 1, or leave no data for
        the training part.
    """
    if not (0 < val_frac < 1 and 0 < test_frac < 1 and val_frac + test_frac < 1):
        raise ValueError(
            "val_frac and test_frac must each be in (0, 1) and sum to less than 1; "
            f"got val_frac={val_frac}, test_frac={test_frac}"
        )

    df_full_train, df_test = train_test_split(
        df, test_size=test_frac, random_state=random_state
    )
    # The second split only sees the remaining (1 - test_frac) share of the data,
    # so val_frac is rescaled to still represent a fraction of the whole dataset.
    df_train, df_val = train_test_split(
        df_full_train, test_size=val_frac / (1 - test_frac), random_state=random_state
    )

    frames, targets = [], []
    for part in (df_train, df_val, df_test):
        # Reset the index *before* separating the target so that features and
        # target end up with identical indexes.
        part = part.reset_index(drop=True)
        targets.append(part[target])
        frames.append(part.drop(columns=[target]))
    return (*frames, *targets)

In [69]:
df_train, df_val, df_test, y_train, y_val, y_test = train_test_val_split(df)

0.25


In [66]:
len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

In [50]:
# Categorical features = every non-numeric column of the training frame (the target
# has already been removed from it). Selecting by "not a number" instead of comparing
# the dtype with "object" also covers text columns stored with a dedicated string
# dtype. NOTE: this would also pick up boolean/datetime columns if any were present.
categorical_features = df_train.select_dtypes(exclude="number").columns.to_list()

In [51]:
categorical_features

['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

In [53]:
def mutual_info(series: pd.Series) -> float:
    """Return the mutual information score between ``series`` and ``y_train``.

    ``y_train`` is read from the notebook's global namespace, so ``series`` is
    expected to be a column of the training frame (same number of rows).
    """
    score = mutual_info_score(series, y_train)
    return score

In [54]:
mi = df_train[categorical_features].apply(mutual_info)

In [67]:
mi.sort_values(ascending=False)

poutcome     0.029533
month        0.025090
contact      0.013356
housing      0.010343
job          0.007316
education    0.002697
marital      0.002050
dtype: float64

# Question 4

In [92]:
# Turn the train / validation rows into lists of {feature: value} records, the
# input format expected by DictVectorizer. Both frames use the same column list.
q4_columns = categorical_features + numerical_features
train_dicts = df_train[q4_columns].to_dict(orient="records")
val_dicts = df_val[q4_columns].to_dict(orient="records")

In [93]:
# Learn the feature encoding from the training records only, then apply that same
# encoding to both the training and the validation records.
dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)
X_train, X_val = dv.transform(train_dicts), dv.transform(val_dicts)

In [94]:
X_train.shape

(27126, 47)

Sanity check: the total number of columns should equal the number of numerical features plus the total number of unique values across all categorical features (one one-hot column per category).

In [95]:
len(numerical_features) + df_train[categorical_features].nunique().sum()

47

In [96]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

In [97]:
model.fit(X_train, y_train)

In [98]:
y_pred = model.predict(X_val)

In [99]:
acc = (y_pred == y_val).sum() / len(y_val)

In [100]:
acc_r = np.round(acc, 2)
acc_r

0.9

In [101]:
y_pred_prob = model.predict_proba(X_val)[:, 1]

In [102]:
((y_pred_prob >= 0.5).astype(int) == y_val).mean()

0.9009068790090687

# Question 5

In this interpretation of the question we assume that we should start with a model that includes _all_ features and then perform feature elimination.

In [118]:
features = sorted(numerical_features + categorical_features)

In [119]:
test_features = ["age", "balance", "marital", "previous"]

In [120]:
# Feature elimination against the full model: for each candidate, retrain on every
# feature except that one and record the resulting validation accuracy.
accuracies = {}
for feature in test_features:
    # Same pipeline as the full model, minus the column under test.
    reduced_train = df_train[features].drop(columns=[feature])
    reduced_val = df_val[features].drop(columns=[feature])
    train_dicts = reduced_train.to_dict(orient="records")
    val_dicts = reduced_val.to_dict(orient="records")

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)
    X_val = dv.transform(val_dicts)

    model = LogisticRegression(
        solver="liblinear", C=1.0, max_iter=1000, random_state=42
    ).fit(X_train, y_train)
    y_pred = model.predict(X_val)

    # Share of validation rows predicted correctly.
    acc_f = (y_pred == y_val).mean()
    accuracies[feature] = acc_f

age 0.9013492590134926 0.9013492590134926
balance 0.9010174740101747 0.9010174740101747
marital 0.9009068790090687 0.9009068790090687
previous 0.9009068790090687 0.9009068790090687


In [107]:
accuracies = pd.Series(accuracies)

In [111]:
with pd.option_context("display.float_format", "{:0.20f}".format):
    print((accuracies - acc).abs().sort_values(ascending=True))

marital    0.00000000000000000000
previous   0.00000000000000000000
balance    0.00011059500110599529
age        0.00044238000442387015
dtype: float64


`marital` and `previous` both give the same difference (0!) with respect to the original model.

# Question 5: Alternate interpretation

In this interpretation of the question, we assume that we should train the model first with just the features mentioned in the question, and then perform feature elimination on this set of features.

In [126]:
# Baseline for the alternate interpretation: a model trained on the candidate
# features only. Its validation accuracy is stored in `acc_n`.
train_dicts = df_train[test_features].to_dict(orient="records")
val_dicts = df_val[test_features].to_dict(orient="records")

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

model = LogisticRegression(
    solver="liblinear", C=1.0, max_iter=1000, random_state=42
).fit(X_train, y_train)
y_pred = model.predict(X_val)

acc_n = (y_pred == y_val).mean()

In [127]:
# Feature elimination within the candidate set: drop one candidate at a time,
# retrain on the remaining candidates and record the validation accuracy.
accuracies = {}
for feature in test_features:
    reduced_train = df_train[test_features].drop(columns=[feature])
    reduced_val = df_val[test_features].drop(columns=[feature])
    train_dicts = reduced_train.to_dict(orient="records")
    val_dicts = reduced_val.to_dict(orient="records")

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)
    X_val = dv.transform(val_dicts)

    model = LogisticRegression(
        solver="liblinear", C=1.0, max_iter=1000, random_state=42
    ).fit(X_train, y_train)
    y_pred = model.predict(X_val)

    # Share of validation rows predicted correctly.
    acc_f = (y_pred == y_val).mean()
    accuracies[feature] = acc_f

In [128]:
accuracies = pd.Series(accuracies)
with pd.option_context("display.float_format", "{:0.20f}".format):
    print((accuracies - acc_n).abs().sort_values(ascending=True))

balance    0.00000000000000000000
marital    0.00011059500110588427
age        0.00011059500110599529
previous   0.00132714001327138842
dtype: float64


We see that `balance` makes the smallest difference.

# Question 6

In [37]:
# Rebuild the design matrices from the full feature list, since the variables were
# overwritten by the feature-elimination cells above.
train_dicts = df_train[features].to_dict(orient="records")
val_dicts = df_val[features].to_dict(orient="records")

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [38]:
# Sweep the regularisation parameter C and collect the validation accuracy of each
# model, rounded to three decimals, in `acs` (same order as `Cs`).
Cs = [0.01, 0.1, 1, 10, 100]
acs = []
for c in Cs:
    model = LogisticRegression(
        solver="liblinear", C=c, max_iter=1000, random_state=42
    ).fit(X_train, y_train)
    y_pred = model.predict(X_val)

    n_correct = (y_pred == y_val).sum()
    acc_f = n_correct / len(y_val)
    print(c, acc_f)  # unrounded value, for reference
    acs.append(np.round(acc_f, 3))

0.01 0.8979208139792081
0.1 0.9007962840079629
1 0.9009068790090687
10 0.9009068790090687
100 0.9006856890068569


In [39]:
acs

[0.898, 0.901, 0.901, 0.901, 0.901]