# Binary Classification

In this project, the classification target is the `y` variable: whether or not the client has subscribed to a term deposit.


In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
# The source file uses ';' as its field separator rather than a comma
df = pd.read_csv('bank-full.csv', delimiter=';')
# Preview the first rows to sanity-check the parse
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


## EDA

In [3]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [4]:
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [5]:
# For every column: its name, number of distinct values, and up to 12 of them
for column_name, column in df.items():
    print(column_name, column.nunique())
    print(column.unique()[:12])
    print()

age 77
[58 44 33 47 35 28 42 43 41 29 53 57]

job 12
['management' 'technician' 'entrepreneur' 'blue-collar' 'unknown'
 'retired' 'admin.' 'services' 'self-employed' 'unemployed' 'housemaid'
 'student']

marital 3
['married' 'single' 'divorced']

education 4
['tertiary' 'secondary' 'unknown' 'primary']

default 2
['no' 'yes']

balance 7168
[2143   29    2 1506    1  231  447  121  593  270  390    6]

housing 2
['yes' 'no']

loan 2
['no' 'yes']

contact 3
['unknown' 'cellular' 'telephone']

day 31
[ 5  6  7  8  9 12 13 14 15 16 19 20]

month 12
['may' 'jun' 'jul' 'aug' 'oct' 'nov' 'dec' 'jan' 'feb' 'mar' 'apr' 'sep']

duration 1573
[261 151  76  92 198 139 217 380  50  55 222 137]

campaign 48
[ 1  2  3  5  4  6  7  8  9 10 11 12]

pdays 559
[ -1 151 166  91  86 143 147  89 140 176 101 174]

previous 41
[ 0  3  1  4  2 11 16  6  5 10 12  7]

poutcome 4
['unknown' 'failure' 'other' 'success']

y 2
['no' 'yes']



In [6]:
# Columns kept for modelling ('default' and 'loan' are left out); 'y' is the target
chosen_features = [
    'age', 'job', 'marital', 'education', 'balance',
    'housing', 'contact', 'day', 'month', 'duration',
    'campaign', 'pdays', 'previous', 'poutcome', 'y',
]
    

In [7]:
# Take an explicit copy so that later column assignments (such as re-encoding
# `y`) write to an independent frame instead of a slice of the loaded data,
# which is what triggers pandas' SettingWithCopyWarning.
df = df[chosen_features].copy()

In [8]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [9]:
df['education'].mode()

0    secondary
Name: education, dtype: object

In [10]:
(df.dtypes != object)

age           True
job          False
marital      False
education    False
balance       True
housing      False
contact      False
day           True
month        False
duration      True
campaign      True
pdays         True
previous      True
poutcome     False
y            False
dtype: bool

In [11]:
# Model input columns with a numeric (int64) dtype
numerical_features = [
    'age',
    'balance',
    'day',
    'duration',
    'campaign',
    'pdays',
    'previous',
]

In [12]:
# Model input columns holding string categories (the target 'y' is excluded)
categorical_features = [
    'job',
    'marital',
    'education',
    'housing',
    'contact',
    'month',
    'poutcome',
]

In [13]:
correlation_matrix = df[numerical_features].corr()
correlation_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


## Target encoding

In [14]:
df.y.value_counts()

y
no     39922
yes     5289
Name: count, dtype: int64

In [15]:
df.y[39009:39018]

39009    yes
39010    yes
39011     no
39012     no
39013     no
39014     no
39015     no
39016     no
39017    yes
Name: y, dtype: object

In [16]:
# Encode the target as 1 for 'yes' and 0 for 'no'.
# The dtype guard keeps this cell idempotent: once `y` is already numeric,
# comparing it to 'yes' again would silently turn every label into 0.
if not pd.api.types.is_integer_dtype(df['y']):
    df['y'] = (df['y'] == 'yes').astype(int)

In [17]:
df.y[39009:39018]

39009    1
39010    1
39011    0
39012    0
39013    0
39014    0
39015    0
39016    0
39017    1
Name: y, dtype: int32

## Split data

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# 60/20/20 train/validation/test split:
# first hold out 20% of all rows for the final test set ...
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
# ... then take 25% of the remaining 80% (i.e. 20% of all rows) for validation.
# The fixed random_state makes both splits reproducible.
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [20]:
len(df_train), len(df_val), len(df_test), len(df_val)+ len(df_test)+len(df_train) == len(df)

(27126, 9042, 9043, True)

In [21]:
# Pull the encoded target out of each split as a plain NumPy array
y_train, y_val, y_test = (split.y.values for split in (df_train, df_val, df_test))


In [22]:
df_full_train.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y             int32
dtype: object

## MI score

In [23]:
from sklearn.metrics import mutual_info_score

In [24]:
def mutual_information_score_series(series):
    """Return the mutual information between ``series`` and the target.

    The target is read from the notebook-level ``df_full_train.y``, so
    ``series`` is expected to be aligned row-for-row with ``df_full_train``
    (for example, one of its columns).
    """
    target = df_full_train.y
    return mutual_info_score(series, target)

In [25]:
mi = df_full_train[categorical_features].apply(mutual_information_score_series)
mi.sort_values(ascending=False).round(2)

poutcome     0.03
month        0.02
contact      0.01
housing      0.01
job          0.01
education    0.00
marital      0.00
dtype: float64

## One-hot encoding

In [26]:
from sklearn.feature_extraction import DictVectorizer

In [27]:
# sparse=False asks the vectorizer for a dense array instead of a sparse matrix
dv = DictVectorizer(sparse=False)
# One dict per training row, containing only the model inputs (the target 'y' is not included)
train_dict = df_train[numerical_features + categorical_features].to_dict(orient= 'records')
# Inspect the first record to check the structure handed to the vectorizer
train_dict[0]

{'age': 32,
 'balance': 1100,
 'day': 11,
 'duration': 67,
 'campaign': 1,
 'pdays': -1,
 'previous': 0,
 'job': 'technician',
 'marital': 'single',
 'education': 'tertiary',
 'housing': 'yes',
 'contact': 'cellular',
 'month': 'aug',
 'poutcome': 'unknown'}

In [28]:
X_train = dv.fit_transform(train_dict)

In [29]:
# Build the validation records with the same column order as the training
# records, and only *transform* them: the encoding learned from the training
# set is reused rather than refitted on validation data.
val_dict = df_val[numerical_features + categorical_features].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [30]:
print(X_train.shape)
print(X_val.shape)

(27126, 47)
(9042, 47)


## Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression
# Baseline logistic regression; random_state is fixed so the fit is repeatable
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
# Train on the encoded training matrix and the 0/1 training target
model.fit(X_train, y_train)

In [32]:
y_pred = model.predict_proba(X_val)
y_pred.shape

(9042, 2)

In [33]:
X_val.shape

(9042, 47)

In [34]:
y_val.shape

(9042,)

In [39]:
# Column 1 of predict_proba is the probability of the positive class (y == 1);
# predict "subscribed" when that probability is at least 0.5.
y_decision = (y_pred[:,1] >= 0.5)
y_decision.shape


(9042,)

In [42]:
# Validation accuracy: fraction of rows where the thresholded prediction matches the label
base_model_accuracy = ( y_val == y_decision).mean()
# Rounded for display only; `base_model_accuracy` itself keeps full precision
base_model_accuracy.round(2)

0.9

## Feature elimination: finding the least useful feature

In [54]:
from sklearn.base import clone

# Leave-one-feature-out check: retrain without each original feature and
# record how much validation accuracy changes relative to the baseline model.
accr = {}

feature_names = dv.get_feature_names_out()
X_train_df = pd.DataFrame(X_train, columns=feature_names)
X_val_df = pd.DataFrame(X_val, columns=feature_names)

# Group encoded columns by the original feature they came from: one-hot
# columns are named '<feature>=<value>', numerical columns are just '<feature>'.
feature_groups = {}
for feature in X_train_df.columns:
    prefix = feature.split('=')[0]
    feature_groups.setdefault(prefix, []).append(feature)

for feature_group, group_columns in feature_groups.items():
    X_train_subset = X_train_df.drop(columns=group_columns)
    X_val_subset = X_val_df.drop(columns=group_columns)

    # Fit an unfitted copy with the same hyperparameters. Refitting `model`
    # itself would leave the baseline trained on a reduced feature set once
    # this cell finishes.
    model_subset = clone(model)
    model_subset.fit(X_train_subset, y_train)
    y_pred_subset = model_subset.predict_proba(X_val_subset)
    y_decision_i = (y_pred_subset[:, 1] >= 0.5)

    accuracy_i = (y_val == y_decision_i).mean()

    # Positive difference: accuracy dropped when the feature was removed,
    # i.e. the feature was useful. Values near zero mark the least useful ones.
    accuracy_diff = base_model_accuracy - accuracy_i
    accr[feature_group] = accuracy_diff
    print(f"Accuracy without {feature_group}: {accuracy_i}, Difference: {accuracy_diff}")


Accuracy without age: 0.9010174740101747, Difference: 0.0
Accuracy without balance: 0.9006856890068569, Difference: 0.00033178500331787486
Accuracy without campaign: 0.9006856890068569, Difference: 0.00033178500331787486
Accuracy without contact: 0.9000221190002212, Difference: 0.0009953550099535136
Accuracy without day: 0.9013492590134926, Difference: -0.00033178500331787486
Accuracy without duration: 0.8900685689006856, Difference: 0.010948905109489093
Accuracy without education: 0.9009068790090687, Difference: 0.0001105950011059953
Accuracy without housing: 0.9012386640123866, Difference: -0.00022119000221187957
Accuracy without job: 0.9011280690112807, Difference: -0.0001105950011059953
Accuracy without marital: 0.9011280690112807, Difference: -0.0001105950011059953
Accuracy without month: 0.8999115239991152, Difference: 0.0011059500110595089
Accuracy without pdays: 0.9009068790090687, Difference: 0.0001105950011059953
Accuracy without poutcome: 0.893607608936076, Difference: 0.007

## Regularized logistic regression

In [67]:
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score

In [69]:
# Sweep the inverse regularisation strength C (smaller C = stronger penalty)
# and record the validation accuracy obtained with each value.
c_values = [0.01, 0.1, 1, 10, 100]
accr = {}

for c in c_values:
    # Use the same solver settings as the baseline model so that C is the only
    # thing that varies, and pin random_state so liblinear runs are repeatable.
    model = LogisticRegression(C=c, solver='liblinear', max_iter=1000, random_state=42)
    model.fit(X_train_df, y_train)
    y_pred = model.predict(X_val_df)

    accuracy = accuracy_score(y_val, y_pred)
    accr[c] = {'Accuracy': accuracy}

# NOTE: after the loop `model` is the fit for the last C in `c_values`,
# not necessarily the best-scoring one.
for c, metrics in accr.items():
    print(f"C: {c}, Accuracy: {metrics['Accuracy']:.4f}")
    

C: 0.01, Accuracy: 0.8980
C: 0.1, Accuracy: 0.9009
C: 1, Accuracy: 0.9010
C: 10, Accuracy: 0.9010
C: 100, Accuracy: 0.9011
