In [188]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings


# Ignore warnings
warnings.filterwarnings('ignore')

## Data Preparation

In [189]:
file_path = "/content/bank-full.csv"
df = pd.read_csv(file_path, sep=';')

df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [190]:
df = df[["age","job", "marital", "education", "balance",
         "housing", "contact", "day", "month", "duration",
         "campaign", "pdays", "previous", "poutcome", "y"]]

# Missing values
df.isnull().sum()

Unnamed: 0,0
age,0
job,0
marital,0
education,0
balance,0
housing,0
contact,0
day,0
month,0
duration,0


## Question 1
What is the most frequent observation (mode) for the column education?

In [191]:
df['education'].mode()

Unnamed: 0,education
0,secondary


## Question 2
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

In [192]:
# numerical columns
numerical_columns = ['age', 'balance', 'day', 'duration',
                     'campaign', 'pdays', 'previous']

# categorical columns
categorical_columns = ['job', 'marital', 'education', 'housing',
                       'contact', 'month', 'poutcome', 'y']

correlation_matrix = df[numerical_columns].corr()


correlation_matrix.style.background_gradient(cmap='coolwarm')

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


## Target encoding
Now we want to encode the y variable.
Let's replace the values yes/no with 1/0.
Split the data
Split your data in train/val/test sets with 60%/20%/20% distribution.
Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
Make sure that the target value y is not in your dataframe.

In [193]:
# Target variable
y = df['y']

# Feature variables
X = df.drop('y', axis=1)

# numerical columns
numerical_columns = ['age', 'balance', 'day', 'duration',
                     'campaign', 'pdays', 'previous']

# categorical columns
categorical_columns = ['job', 'marital', 'education', 'housing',
                       'contact', 'month', 'poutcome']

In [194]:
# Encode the target variable
le = LabelEncoder()
y = le.fit_transform(df['y'])


In [195]:
# Split the data into training and testing sets
X_full_train, X_test, y_full_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [196]:
# Split the data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(X_full_train, y_full_train, test_size=0.25, random_state=42)


In [197]:
# Feature variables shape
print(X_train.shape, X_test.shape, X_val.shape)

# Target variable shape
print(y_train.shape, y_test.shape, y_val.shape)

(27126, 14) (9043, 14) (9042, 14)
(27126,) (9043,) (9042,)


## Question 3
- > Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
- > Round the scores to 2 decimals using round(score, 2).

In [198]:
""" Calculate the mutual information score between y_train and
X_train[categorical_columns]"""

from sklearn.metrics import mutual_info_score

def mutual_info_y_categorical_features(X_train, y_train, categorical_columns):
    mi_scores = {}
    for col in categorical_columns:
        mi_scores[col] = round(mutual_info_score(y_train, X_train[col]), 2)

    return mi_scores

# Calculate MI scores
mi_scores = mutual_info_y_categorical_features(X_train, y_train, categorical_columns)

# Print the scores
for feature, score in mi_scores.items():
    print(f"Mutual Information Score between y_train and {feature}: {score}")


Mutual Information Score between y_train and job: 0.01
Mutual Information Score between y_train and marital: 0.0
Mutual Information Score between y_train and education: 0.0
Mutual Information Score between y_train and housing: 0.01
Mutual Information Score between y_train and contact: 0.01
Mutual Information Score between y_train and month: 0.03
Mutual Information Score between y_train and poutcome: 0.03


## Question 4
Now let's train a logistic regression.
Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
Fit the model on the training dataset.
To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
What accuracy did you get?

In [199]:
# One-hot encoding for categorical columns
dv = DictVectorizer(sparse=False)

train_dict = X_train[categorical_columns + numerical_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = X_val[categorical_columns + numerical_columns].to_dict(orient='records')
X_val = dv.transform(val_dict)

test_dict = X_test[categorical_columns + numerical_columns].to_dict(orient='records')
X_test = dv.transform(test_dict)


In [201]:
# Train model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [202]:
y_pred = model.predict(X_val)

In [203]:
(y_pred == y_val).mean()

0.9009068790090687

In [204]:
baseline_accuracy = accuracy_score(y_val, model.predict(X_val))

In [205]:
round(baseline_accuracy, 2)

0.9

## Question 5
Let's find the least useful feature using the feature elimination technique.
Train a model with all these features (using the same parameters as in Q4).
Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
Which of following feature has the smallest difference?

In [211]:
# List of features to evaluate
features_to_evaluate = categorical_columns + numerical_columns

# Original model with all features
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
original_accuracy = model.score(X_val, y_val)

# Dictionary to store accuracy without each feature
accuracy_differences = {}

# Loop through features and exclude one at a time
for feature in features_to_evaluate:
    # Create a new DataFrame without the current feature
    X_train_excluded = X_train[:, dv.get_feature_names_out() != feature]
    X_val_excluded = X_val[:, dv.get_feature_names_out() != feature]

    # Train the model without the excluded feature
    model.fit(X_train_excluded, y_train)
    new_accuracy = model.score(X_val_excluded, y_val)

    # Calculate the accuracy difference
    accuracy_difference = original_accuracy - new_accuracy
    accuracy_differences[feature] = accuracy_difference

# Display the accuracy differences
for feature, difference in accuracy_differences.items():
    print(f"Accuracy difference when excluding {feature}: {difference}")

# Find the feature with the smallest accuracy difference
least_useful_feature = min(accuracy_differences, key=accuracy_differences.get)
print(f"The least useful feature is: {least_useful_feature}")

Accuracy difference when excluding job: 0.0
Accuracy difference when excluding marital: 0.0
Accuracy difference when excluding education: 0.0
Accuracy difference when excluding housing: 0.0
Accuracy difference when excluding contact: 0.0
Accuracy difference when excluding month: 0.0
Accuracy difference when excluding poutcome: 0.0
Accuracy difference when excluding age: -0.00044238000442387015
Accuracy difference when excluding balance: -0.0001105950011059953
Accuracy difference when excluding day: -0.00044238000442387015
Accuracy difference when excluding duration: 0.011170095111700862
Accuracy difference when excluding campaign: 0.0006635700066356387
Accuracy difference when excluding pdays: 0.0
Accuracy difference when excluding previous: 0.0
The least useful feature is: age


## Question 6
Now let's train a regularized logistic regression.
- > Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
- > Train models using all the features as in Q4.
- > Calculate the accuracy on the validation dataset and round it to 3 decimal digits.
- > Which of these C leads to the best accuracy on the validation set?

In [214]:
# List of C values to evaluate
C_values = [0.01, 0.1, 1, 10, 100]

# Dictionary to store accuracy for each C value
accuracy_results = {}

# Loop through each C value
for C in C_values:
    # Train the model with the current C value
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    # Calculate accuracy on the validation set
    accuracy = model.score(X_val, y_val)

    # Store the rounded accuracy
    accuracy_results[C] = accuracy

# Display the accuracy results for each C value
for C, accuracy in accuracy_results.items():
    print(f"C: {C}, Accuracy: {accuracy}")

# Find the C value that leads to the best accuracy
best_C = max(accuracy_results, key=accuracy_results.get)
best_accuracy = accuracy_results[best_C]
print(f"The best C value is: {best_C} with an accuracy of: {best_accuracy}")


C: 0.01, Accuracy: 0.8979208139792081
C: 0.1, Accuracy: 0.9007962840079629
C: 1, Accuracy: 0.9009068790090687
C: 10, Accuracy: 0.9009068790090687
C: 100, Accuracy: 0.9006856890068569
The best C value is: 1 with an accuracy of: 0.9009068790090687
