In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('bank-additional-full.csv')

df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [3]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
categorical_cols.remove('y')  # Exclude the target column
categorical_cols

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'day_of_week',
 'poutcome']

In [4]:
# Encode categorical variables using onehot encoding
df_onehot = df.copy()
df_onehot[categorical_cols] = df_onehot[categorical_cols].astype('category')

# Perform one-hot encoding on categorical variables
df_onehot = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [5]:
# Encode the target column using ordinal encoding
ordinal_encoder = OrdinalEncoder(categories=[['no', 'yes']])
df_onehot['y'] = ordinal_encoder.fit_transform(df[['y']])

In [6]:
# Print the encoded DataFrame
df_onehot

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success
0,56,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,0,0,0,1,0,0,0,1,0
1,57,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,0,0,0,1,0,0,0,1,0
2,37,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,0,0,0,1,0,0,0,1,0
3,40,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,0,0,0,1,0,0,0,1,0
4,56,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,334,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,...,0,1,0,0,0,0,0,0,1,0
41184,46,383,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,...,0,1,0,0,0,0,0,0,1,0
41185,56,189,2,999,0,-1.1,94.767,-50.8,1.028,4963.6,...,0,1,0,0,0,0,0,0,1,0
41186,44,442,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,...,0,1,0,0,0,0,0,0,1,0


In [7]:
# Split the data into train and test sets
X = df_onehot.drop('y', axis=1)
y = df_onehot['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [8]:
model = LogisticRegression(max_iter=40000)
model.fit(X_train, y_train)


LogisticRegression(max_iter=40000)

In [14]:
# Get the feature importances or coefficients of the logistic regression model
importances = model.coef_[0]  # Coefficients for each feature

# Get the column names sorted by importance in descending order
sorted_columns = X_train.columns[np.argsort(-importances)]  # Sort in descending order

top5 = 8

# Print the column names with importance in decreasing order
print(' Top 5 columns in decreasing order of importance:')
for column in sorted_columns:
    print(column)
    
    top5 -= 1
    if top5 == 0:
        break

 Top 5 columns in decreasing order of importance:
cons.price.idx
month_mar
poutcome_nonexistent
month_aug
education_university.degree
month_jul
month_jun
marital_single


In [10]:
# Predict on the test set
y_pred = model.predict(X_test)

In [11]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.9098486687707372


In [12]:
# Identify the most important variable for a specific observation
observation_index = 3  # Observation number 4 (Python indexing starts from 0)
observation_importances = importances * X_test.iloc[observation_index]  # Importance for the specific observation
most_important_variable = X_train.columns[np.argmax(observation_importances)]

# Print the most important variable for the specific observation
print('Most Important Variable for Observation', observation_index + 1, ':', most_important_variable)


Most Important Variable for Observation 4 : cons.price.idx


In [13]:
# Identify the most important variable for a specific observation
observation_index = 19  # Observation number 4 (Python indexing starts from 0)
observation_importances = importances * X_test.iloc[observation_index]  # Importance for the specific observation
most_important_variable = X_train.columns[np.argmax(observation_importances)]

# Print the most important variable for the specific observation
print('Most Important Variable for Observation', observation_index + 1, ':', most_important_variable)


Most Important Variable for Observation 20 : cons.price.idx
