# Homework 3: 

##  Machine Learning for Classification
___

#### Import the necessary libraries, and load the data
____

In [6]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mutual_info_score
from sklearn.model_selection import train_test_split

In [7]:
# loading the data 
file = r'/Users/teslim/OneDrive/mlzoomcamp/bank/bank-full.csv'
df = pd.read_csv(file, sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [8]:
# checking the data types
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [9]:
df.columns 

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [185]:
interested_columns = [
    'age', 
    'job', 
    'marital', 
    'education',  
    'balance', 
    'housing',
    'contact', 
    'day', 
    'month', 
    'duration', 
    'campaign', 
    'pdays',
    'previous', 
    'poutcome', 
    'y']


df = df[interested_columns]
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


#### Data Preparation and Preprocessing
_____

In [186]:
# checking for missing values
df.isnull().sum()   

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [187]:
# checking the value of target varaible 
df.y.value_counts()

y
no     39922
yes     5289
Name: count, dtype: int64

In [188]:
# converting the target column to binary
df.y = (df.y == 'yes').astype(int)

In [189]:
df.y.head(10)

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: y, dtype: int64

In [190]:
df

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,825,no,cellular,17,nov,977,3,-1,0,unknown,1
45207,71,retired,divorced,primary,1729,no,cellular,17,nov,456,2,-1,0,unknown,1
45208,72,retired,married,secondary,5715,no,cellular,17,nov,1127,5,184,3,success,1
45209,57,blue-collar,married,secondary,668,no,telephone,17,nov,508,4,-1,0,unknown,0


#### Question 1: 
____


What is the most frequent observation (mode) for the column education?

In [191]:
df['education'].mode()

0    secondary
Name: education, dtype: object

#### Question 2: 
____

Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.   

What are the two features that have the biggest correlation?

`age` and `balance`  
`day` and `campaign`  
`da`y and `pdays`  
`pdays` and `previous`  

**Target encoding**   

- Now we want to encode the y variable.
- Let's replace the values yes/no with 1/0.

**Split the data**

- Split your data in train/val/test sets with 60%/20%/20% distribution.
- Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
- Make sure that the target value y is not in your dataframe.


In [205]:
# Get all the numerical features
numerical_features = df.columns[(df.dtypes == "int").values].to_list()

In [203]:
# Compute the pairwise correlation matrix
# Note the abs!
corr = df[numerical_features].corr().abs()
corr.style.background_gradient(cmap="coolwarm")

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y
age,1.0,0.097783,0.00912,0.004648,0.00476,0.023758,0.001288,0.025155
balance,0.097783,1.0,0.004503,0.02156,0.014578,0.003435,0.016674,0.052838
day,0.00912,0.004503,1.0,0.030206,0.16249,0.093044,0.05171,0.028348
duration,0.004648,0.02156,0.030206,1.0,0.08457,0.001565,0.001203,0.394521
campaign,0.00476,0.014578,0.16249,0.08457,1.0,0.088628,0.032855,0.073172
pdays,0.023758,0.003435,0.093044,0.001565,0.088628,1.0,0.45482,0.103621
previous,0.001288,0.016674,0.05171,0.001203,0.032855,0.45482,1.0,0.093236
y,0.025155,0.052838,0.028348,0.394521,0.073172,0.103621,0.093236,1.0


From the correlation matrix, we can see that there is no strong correlation between pydays and previous. Therefore, we can assume that the assumption of multicollinearity is satisfied.

#### Question 3: 
____

Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
Round the scores to 2 decimals using round(score, 2).


In [194]:
# checking the length of the data
len(df)

45211

In [195]:
# checking the shape of the data
df.shape

(45211, 15)

In [11]:

# splitting the data into training and testing
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)


# splitting the training data into training and validation
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)


# reseting the index after splitting 
df_train_full = df_train_full.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)


# assigning the target variables to the y_train and y_val
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values


# deleting the target variables from the data
del df_train['y']
del df_val['y']
del df_test['y']


In [197]:
# checking the length of the data
len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

In [206]:
# categorical features
categorical = [
    'job', 
    'marital', 
    'education', 
    'housing', 
    'contact', 
    'month', 
    'poutcome'
]
categorical


['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

In [216]:
numerical = [
    'age',
    'balance',
    'day', 
    'duration', 
    'campaign', 
    'pdays', 
    'previous']

numerical

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [215]:
# mutual info score function  between y and dependents
def calculate_mi(series):
    return mutual_info_score(series, df_train_full.y)



# calculate mutual info score for all categorical columns
mi = df_train_full[categorical].apply(calculate_mi)
mi.sort_values(ascending=False).round(2)

poutcome     0.03
month        0.02
contact      0.01
housing      0.01
job          0.01
education    0.00
marital      0.00
dtype: float64

The variable with the highest score is: poutcome

#### Question 4: 
____

- Now let's train a logistic regression.   
- Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.  
- Fit the model on the training dataset.  
- To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
- model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)  
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.  

In [217]:
# One-hot encoding

# Initialize DictVectorizer
dv = DictVectorizer(sparse=False)

# Convert DataFrame to list of dictionaries
train_dict = df_train[categorical+ numerical].to_dict(orient='records')
val_dict =  df_val[categorical + numerical].to_dict(orient='records')


# Transform the list of dictionaries into a matrix
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

# # Get feature names
feature_names = dv.get_feature_names_out()

# Display the encoded matrix and feature names
print(pd.DataFrame(X_train, columns=feature_names))

        age  balance  campaign  contact=cellular  contact=telephone  \
0      32.0   1100.0       1.0               1.0                0.0   
1      38.0      0.0       1.0               1.0                0.0   
2      49.0   3309.0       2.0               1.0                0.0   
3      37.0   2410.0       1.0               1.0                0.0   
4      31.0   3220.0       4.0               1.0                0.0   
...     ...      ...       ...               ...                ...   
27121  27.0    167.0       2.0               1.0                0.0   
27122  40.0    693.0       1.0               1.0                0.0   
27123  54.0      0.0       1.0               0.0                0.0   
27124  25.0   2311.0       2.0               1.0                0.0   
27125  30.0     15.0       2.0               1.0                0.0   

       contact=unknown   day  duration  education=primary  \
0                  0.0  11.0      67.0                0.0   
1                  0.0  1

In [218]:
list(feature_names)

['age',
 'balance',
 'campaign',
 'contact=cellular',
 'contact=telephone',
 'contact=unknown',
 'day',
 'duration',
 'education=primary',
 'education=secondary',
 'education=tertiary',
 'education=unknown',
 'housing=no',
 'housing=yes',
 'job=admin.',
 'job=blue-collar',
 'job=entrepreneur',
 'job=housemaid',
 'job=management',
 'job=retired',
 'job=self-employed',
 'job=services',
 'job=student',
 'job=technician',
 'job=unemployed',
 'job=unknown',
 'marital=divorced',
 'marital=married',
 'marital=single',
 'month=apr',
 'month=aug',
 'month=dec',
 'month=feb',
 'month=jan',
 'month=jul',
 'month=jun',
 'month=mar',
 'month=may',
 'month=nov',
 'month=oct',
 'month=sep',
 'pdays',
 'poutcome=failure',
 'poutcome=other',
 'poutcome=success',
 'poutcome=unknown',
 'previous']

In [219]:
# fitting the model
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [220]:
# model intercept
model.intercept_[0]

-0.9897986105241079

In [221]:
# model coef
model.coef_[0].round(3)

array([ 1.000e-03,  0.000e+00, -7.800e-02,  2.550e-01,  8.400e-02,
       -1.329e+00,  9.000e-03,  4.000e-03, -4.430e-01, -2.450e-01,
       -6.800e-02, -2.330e-01, -1.470e-01, -8.430e-01,  8.900e-02,
       -2.270e-01, -2.700e-01, -3.370e-01, -9.000e-02,  2.770e-01,
       -3.030e-01, -1.300e-01,  2.920e-01, -1.500e-01,  3.200e-02,
       -1.750e-01, -3.430e-01, -4.790e-01, -1.690e-01, -8.000e-03,
       -7.320e-01,  3.940e-01, -3.400e-01, -1.193e+00, -1.052e+00,
        3.180e-01,  1.490e+00, -5.020e-01, -9.810e-01,  8.050e-01,
        8.100e-01, -1.000e-03, -8.080e-01, -6.230e-01,  1.499e+00,
       -1.058e+00,  8.000e-03])

In [222]:
# model prediction focusing on the soft prediction
y_pred = model.predict_proba(X_val)[:, 1]
y_pred

array([0.01280984, 0.00972527, 0.15368043, ..., 0.05339037, 0.00903052,
       0.28018338])

In [237]:
y_decision = (y_pred >= 0.5)


In [238]:
# checking the accuracy of the model
(y_val == y_decision).mean()

0.9012386640123866

In [239]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = y_decision.astype(int)
df_pred['actual'] = y_val

In [240]:
df_pred

Unnamed: 0,probability,prediction,actual
0,0.012810,0,0
1,0.009725,0,0
2,0.153680,0,1
3,0.231043,0,0
4,0.442013,0,1
...,...,...,...
9037,0.022346,0,0
9038,0.267929,0,1
9039,0.053390,0,0
9040,0.009031,0,0


In [241]:
df_pred['correct'] = df_pred.prediction == df_pred.actual
df_pred

Unnamed: 0,probability,prediction,actual,correct
0,0.012810,0,0,True
1,0.009725,0,0,True
2,0.153680,0,1,False
3,0.231043,0,0,True
4,0.442013,0,1,False
...,...,...,...,...
9037,0.022346,0,0,True
9038,0.267929,0,1,False
9039,0.053390,0,0,True
9040,0.009031,0,0,True


In [242]:
df_pred.correct.mean()

0.9012386640123866

#### Question 5: 
____

- Let's find the least useful feature using the feature elimination technique.    
- Train a model with all these features (using the same parameters as in Q4).   
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.   
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.   


In [13]:
# Define numerical and categorical features
numerical = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
categorical = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']
features = numerical + categorical


In [14]:
# Convert the DataFrames to lists of dictionaries for DictVectorizer
train_dict = df_train[features].to_dict(orient='records')
val_dict = df_val[features].to_dict(orient='records')


In [15]:
# Initialize DictVectorizer and transform the data
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)


In [16]:
# Train the model with all features
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)


In [18]:
# Ensure y_val is in binary format
y_val_binary = (y_val == 'yes').astype(int)

# Predict on the validation set and calculate the accuracy with all features
y_pred_all = model.predict_proba(X_val)[:, 1]
y_decision_all = (y_pred_all >= 0.5).astype(int)
original_accuracy = accuracy_score(y_val_binary, y_decision_all)


In [19]:
# List of test features to evaluate
test_features = ["age", "balance", "marital", "previous"]
accuracy_diffs = {}


In [23]:
# Ensure y_val is in binary format
y_val_binary = (y_val == 'yes').astype(int)

# Evaluate the accuracy difference when removing each feature
for feature in test_features:
    # Exclude the feature and train the model again
    reduced_features = [f for f in features if f != feature]
    
    # Convert the DataFrames to lists of dictionaries for DictVectorizer
    train_dict_reduced = df_train[reduced_features].to_dict(orient='records')
    val_dict_reduced = df_val[reduced_features].to_dict(orient='records')
    
    # Initialize DictVectorizer and transform the data
    dv_reduced = DictVectorizer(sparse=False)
    X_train_reduced = dv_reduced.fit_transform(train_dict_reduced)
    X_val_reduced = dv_reduced.transform(val_dict_reduced)
    
    # Train the model with the reduced feature set
    model_reduced = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_reduced.fit(X_train_reduced, y_train)
    
    # Predict on the validation set and calculate accuracy
    y_pred_reduced = model_reduced.predict_proba(X_val_reduced)[:, 1]
    y_decision_reduced = (y_pred_reduced >= 0.5).astype(int)
    reduced_accuracy = accuracy_score(y_val_binary, y_decision_reduced)
    
    # Calculate the difference in accuracy
    accuracy_diff = original_accuracy - reduced_accuracy
    accuracy_diffs[feature] = accuracy_diff




#### Question 6: 
_____

- Now let's train a regularized logistic regression.
- Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
- Train models using all the features as in Q4.
- Calculate the accuracy on the validation dataset and round it to 3 decimal digits.
- Which of these C leads to the best accuracy on the validation set?

* 0.01
* 0.1
* 1
* 10
* 100
Note: If there are multiple options, select the smallest C.


In [25]:
# Proceed to Question 6: Testing different values of C
C_values = [0.01, 0.1, 1, 10, 100]
C_accuracies = {}

# Ensure y_val is in binary format
y_val_binary = (y_val == 'yes').astype(int)

for C in C_values:
    # Train the model with the specified C value
    model_C = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model_C.fit(X_train, y_train)
    
    # Predict and calculate accuracy
    y_pred_C = model_C.predict_proba(X_val)[:, 1]
    y_decision_C = (y_pred_C >= 0.5).astype(int)
    accuracy_C = accuracy_score(y_val_binary, y_decision_C)
    
    # Round to 3 decimal digits
    C_accuracies[C] = round(accuracy_C, 3)

# Display the accuracy differences and regularized logistic regression results
print("Feature Elimination Results")
print(pd.DataFrame.from_dict(accuracy_diffs, orient='index', columns=['Accuracy Difference']))
print("Regularized Logistic Regression Results")
print(pd.DataFrame.from_dict(C_accuracies, orient='index', columns=['Accuracy']))

# Display the accuracy differences and regularized logistic regression results
accuracy_diffs, C_accuracies

Feature Elimination Results
          Accuracy Difference
age                  0.000111
balance              0.000111
marital              0.000000
previous             0.000332
Regularized Logistic Regression Results
        Accuracy
0.01       0.898
0.10       0.901
1.00       0.901
10.00      0.901
100.00     0.900


({'age': 0.00011059500110588427,
  'balance': 0.00011059500110588427,
  'marital': 0.0,
  'previous': 0.00033178500331787486},
 {0.01: 0.898, 0.1: 0.901, 1: 0.901, 10: 0.901, 100: 0.9})

The best accuracy on the validation set is achieved with C = 0.01