# model Implementation

Since we are developing two predictive models based on two aproches
    1. random forest
    2. Nueral network

we will prepare the dataset that optimize the performance for each.


In [85]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split,cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,StandardScaler,MinMaxScaler
from sklearn.compose import ColumnTransformer

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix,roc_auc_score, roc_curve
 
from imblearn.pipeline import Pipeline 

from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout





## Random forest

### benchmark model for random forest

In [148]:
data = pd.read_csv('data/bank_rf.csv',index_col=None)
data.columns

Index(['age', 'default', 'balance', 'housing', 'loan', 'day', 'campaign',
       'pdays', 'previous', 'y', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'job_unknown', 'marital_married', 'marital_single', 'contact_telephone',
       'contact_unknown', 'poutcome_other', 'poutcome_success',
       'poutcome_unknown', 'month_aug', 'month_dec', 'month_feb', 'month_jan',
       'month_jul', 'month_jun', 'month_mar', 'month_may', 'month_nov',
       'month_oct', 'month_sep', 'education_secondary', 'education_tertiary',
       'education_unknown'],
      dtype='object')

In [149]:

X = data.drop('y', axis=1)
y = data['y']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [124]:





rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC score: ",roc_auc_score(y_test,y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8937299568727193
ROC-AUC score:  0.6046560066171548
Confusion Matrix:
 [[7838  114]
 [ 847  244]]
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.99      0.94      7952
           1       0.68      0.22      0.34      1091

    accuracy                           0.89      9043
   macro avg       0.79      0.60      0.64      9043
weighted avg       0.88      0.89      0.87      9043



Due to the class imbalance issue the recall for the minority class (yes ) is low. this is a critical issue since the practical utility of the preditive model is to narrow down potential customers form a pool of customers. high number of false negatives would means missing out on potential revenue.


To moitigate that we can try oversampling and undersampling tecniques to see the potential difference

### Upsampling with ramdom sampling and smote

In [125]:
from imblearn.over_sampling import RandomOverSampler, SMOTE


### Random sampling with sklearn resampler

In [79]:
from sklearn.utils import resample


train_data = pd.concat([X_train, y_train], axis=1)


majority_class = train_data[train_data['y'] == 0]
minority_class = train_data[train_data['y'] == 1]

minority_class_oversampled = resample(minority_class,
                                      replace=True,  # Sample with replacement
                                      n_samples=len(majority_class),  # Match the majority class size
                                      random_state=42)


oversampled_data = pd.concat([majority_class, minority_class_oversampled])


X_train_ros = oversampled_data.drop('y', axis=1)
y_train_ros = oversampled_data['y']


rf_ros = RandomForestClassifier(random_state=42)
rf_ros.fit(X_train_ros, y_train_ros)

# Evaluate the model
y_pred_ros = rf_ros.predict(X_test)
print("Random Oversampling Results with resample:")
print(classification_report(y_test, y_pred_ros))
print(confusion_matrix(y_test, y_pred_ros))


Random Oversampling Results with resample:
              precision    recall  f1-score   support

           0       0.91      0.96      0.94      7952
           1       0.54      0.31      0.39      1091

    accuracy                           0.88      9043
   macro avg       0.73      0.64      0.66      9043
weighted avg       0.87      0.88      0.87      9043

[[7665  287]
 [ 754  337]]


In [128]:


ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

# Train and evaluate the Random Forest model
print("Random Oversampling Results:")
rf_ros = RandomForestClassifier(random_state=42,n_estimators=200)
rf_ros.fit(X_train_ros, y_train_ros)
y_pred_ros = rf_ros.predict(X_test)


print("ROC-AUC score: ",roc_auc_score(y_test,y_pred_ros))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_ros))
print("Classification Report:\n", classification_report(y_test, y_pred_ros))


Random Oversampling Results:
ROC-AUC score:  0.6382594374680715
Confusion Matrix:
 [[7680  272]
 [ 752  339]]
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.97      0.94      7952
           1       0.55      0.31      0.40      1091

    accuracy                           0.89      9043
   macro avg       0.73      0.64      0.67      9043
weighted avg       0.87      0.89      0.87      9043



In [129]:

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("\nSMOTE Results:")
rf_smote = RandomForestClassifier(random_state=42,n_estimators=200)
rf_smote.fit(X_train_smote, y_train_smote)
y_pred_smote = rf_smote.predict(X_test)

print("ROC-AUC score: ",roc_auc_score(y_test,y_pred_smote))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_smote))
print("Classification Report:\n", classification_report(y_test, y_pred_smote))


SMOTE Results:
ROC-AUC score:  0.635988767158404
Confusion Matrix:
 [[7571  381]
 [ 742  349]]
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.95      0.93      7952
           1       0.48      0.32      0.38      1091

    accuracy                           0.88      9043
   macro avg       0.69      0.64      0.66      9043
weighted avg       0.86      0.88      0.86      9043



### Undersampling tecniques

In [131]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Create and train a Balanced Random Forest
brf = BalancedRandomForestClassifier(n_estimators=200, random_state=42)
brf.fit(X_train, y_train)

# Evaluate the model
y_pred_brf = brf.predict(X_test)


print("ROC-AUC score: ",roc_auc_score(y_test,y_pred_brf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_brf))
print("Classification Report:\n", classification_report(y_test, y_pred_brf))


ROC-AUC score:  0.7298923006416131
Confusion Matrix:
 [[6958  994]
 [ 453  638]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.88      0.91      7952
           1       0.39      0.58      0.47      1091

    accuracy                           0.84      9043
   macro avg       0.66      0.73      0.69      9043
weighted avg       0.87      0.84      0.85      9043



#### Hyper parameter tuning

In [143]:
from sklearn.metrics import make_scorer, recall_score


recall_minority_scorer = make_scorer(recall_score, pos_label=1)


In [None]:
from sklearn.model_selection import GridSearchCV


# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],

}


brf = BalancedRandomForestClassifier(random_state=42)

# Set up GridSearchCV using recall of the minority class as the scoring metric
grid_search = GridSearchCV(estimator=brf, param_grid=param_grid, 
                           scoring=recall_minority_scorer, cv=5, verbose=3, n_jobs=-1)

# Fit the model to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters: ", grid_search.best_params_)
print("Best Recall Score for Minority Class: ", grid_search.best_score_)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV 3/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.631 total time=   3.0s
[CV 2/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.662 total time=   3.1s
[CV 4/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.601 total time=   3.2s
[CV 5/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.592 total time=   3.5s
[CV 1/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.617 total time=   4.0s
[CV 2/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.652 total time=   5.9s
[CV 3/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.637 total time=   6.2s
[CV 1/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.611 total time= 

In [152]:

best_brf = grid_search.best_estimator_

y_pred_brf = best_brf.predict(X_test)

# Evaluate the model
print("ROC-AUC score: ", roc_auc_score(y_test, y_pred_brf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_brf))
print("Classification Report:\n", classification_report(y_test, y_pred_brf))

# Calculate and print recall specifically for the minority class
recall_minority = recall_score(y_test, y_pred_brf, pos_label=1)
print(f"Recall for Minority Class: {recall_minority}")


ROC-AUC score:  0.715993313224904
Confusion Matrix:
 [[6511 1441]
 [ 422  669]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.82      0.87      7952
           1       0.32      0.61      0.42      1091

    accuracy                           0.79      9043
   macro avg       0.63      0.72      0.65      9043
weighted avg       0.86      0.79      0.82      9043

Recall for Minority Class: 0.613198900091659


In [150]:
X_train.shape

(36168, 41)

#### ensamble of random forests with unsersampling

In [27]:
# Handle class imbalance using bagging (undersample majority class)
def balanced_bagging(X, y, n_estimators=10):
    models = []
    for _ in range(n_estimators):
        # Combine X and y
        data = pd.concat([X, y], axis=1)

        # Separate majority and minority classes
        majority = data[data['y'] == 0]
        minority = data[data['y'] == 1]

        # Undersample majority class
        majority_downsampled = resample(
            majority,
            replace=False,
            n_samples=len(minority),  # Match minority class size
            random_state=np.random.randint(1000)
        )

        # Combine undersampled majority with minority
        balanced_data = pd.concat([majority_downsampled, minority])
                

        # Separate features and target
        X_balanced = balanced_data.drop('y', axis=1)
        y_balanced = balanced_data['y']
        

        # Train a random forest model on balanced data
        model = RandomForestClassifier(random_state=42,n_estimators=200)
        model.fit(X_balanced, y_balanced)
        
        models.append(model)
        

    return models

# Train models using balanced bagging
models = balanced_bagging(X_train, y_train, n_estimators=10)


In [28]:
# Predict with an ensemble approach
def predict_with_bagging(models, X):
    predictions = np.zeros((len(X), len(models)))
    for i, model in enumerate(models):
        predictions[:, i] = model.predict(X)
    # Use majority voting
    final_predictions = np.round(np.mean(predictions, axis=1))
    return final_predictions

# Evaluate on test data
y_pred = predict_with_bagging(models, X_test)

In [29]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[6385 1567]
 [ 379  712]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.80      0.87      7952
           1       0.31      0.65      0.42      1091

    accuracy                           0.78      9043
   macro avg       0.63      0.73      0.65      9043
weighted avg       0.87      0.78      0.81      9043



## Nerual netwrok

In [132]:
df = pd.read_csv('data/bank_nn.csv')

df.columns

Index(['age', 'education', 'default', 'balance', 'housing', 'loan', 'day',
       'campaign', 'pdays', 'previous', 'y', 'job_admin.', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'marital_divorced', 'marital_married',
       'marital_single', 'contact_cellular', 'contact_telephone',
       'contact_unknown', 'poutcome_failure', 'poutcome_other',
       'poutcome_success', 'poutcome_unknown', 'month_sin', 'month_cos'],
      dtype='object')

### Standered Nural netwrok without modifications

In [133]:
X = df.drop('y', axis=1)
y = df['y']
numerical_features = ['age', 'balance', 'day', 'campaign', 'pdays', 'previous']
scaler = StandardScaler()
X[numerical_features] = scaler.fit_transform(X[numerical_features])


In [134]:


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:

model = Sequential([
    Dense(64, input_dim=X.shape[1], activation='relu'),    
    Dense(32, activation='relu'), 
    Dense(16, activation='relu'),  
    Dense(1, activation='sigmoid')  
])


model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# Predict
y_pred = (model.predict(X_test) > 0.3).astype("int32")



print("ROC-AUC score: ",roc_auc_score(y_test,y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8894 - loss: 0.3347 - val_accuracy: 0.8900 - val_loss: 0.3097
Epoch 2/30
[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8958 - loss: 0.2940 - val_accuracy: 0.8908 - val_loss: 0.3050
Epoch 3/30
[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8917 - loss: 0.3006 - val_accuracy: 0.8907 - val_loss: 0.3040
Epoch 4/30
[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8954 - loss: 0.2876 - val_accuracy: 0.8913 - val_loss: 0.3088
Epoch 5/30
[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8947 - loss: 0.2906 - val_accuracy: 0.8905 - val_loss: 0.3022
Epoch 6/30
[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8990 - loss: 0.2791 - val_accuracy: 0.8919 - val_loss: 0.3021
Epoch 7/30
[1m905/905[0m [32m━━━━━━━

In [136]:
print("ROC-AUC score: ",roc_auc_score(y_test,y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


ROC-AUC score:  0.650656862808381
Confusion Matrix:
 [[7469  483]
 [ 696  395]]
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.94      0.93      7952
           1       0.45      0.36      0.40      1091

    accuracy                           0.87      9043
   macro avg       0.68      0.65      0.66      9043
weighted avg       0.86      0.87      0.86      9043



## Over sampling with nueral netwroks

#### Random oversampling

In [137]:
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

In [138]:
model = Sequential([
    Dense(64, input_dim=X.shape[1], activation='relu'),    
    Dense(32, activation='relu'), 
    Dense(16, activation='relu'),  
    Dense(1, activation='sigmoid')  
])


model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# Predict
y_pred = (model.predict(X_test) > 0.3).astype("int32")

print("ROC-AUC score: ",roc_auc_score(y_test,y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8845 - loss: 0.3466 - val_accuracy: 0.8905 - val_loss: 0.3075
Epoch 2/30
[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8964 - loss: 0.2956 - val_accuracy: 0.8883 - val_loss: 0.3126
Epoch 3/30
[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8926 - loss: 0.3030 - val_accuracy: 0.8900 - val_loss: 0.3047
Epoch 4/30
[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8947 - loss: 0.2912 - val_accuracy: 0.8904 - val_loss: 0.3049
Epoch 5/30
[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8987 - loss: 0.2858 - val_accuracy: 0.8916 - val_loss: 0.3051
Epoch 6/30
[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8998 - loss: 0.2848 - val_accuracy: 0.8929 - val_loss: 0.3045
Epoch 7/30
[1m905/905[0m [32m━━━━━━━

### smote 

In [141]:

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [142]:

model = Sequential([
    Dense(64, input_dim=X.shape[1], activation='relu'),    
    Dense(32, activation='relu'), 
    Dense(16, activation='relu'),  
    Dense(1, activation='sigmoid')  
])


model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# Predict
y_pred = (model.predict(X_test) > 0.3).astype("int32")


print("ROC-AUC score: ",roc_auc_score(y_test,y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8555 - loss: 0.3661 - val_accuracy: 0.8897 - val_loss: 0.3124
Epoch 2/30
[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8979 - loss: 0.2939 - val_accuracy: 0.8912 - val_loss: 0.3064
Epoch 3/30
[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8965 - loss: 0.2906 - val_accuracy: 0.8898 - val_loss: 0.3046
Epoch 4/30
[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8985 - loss: 0.2857 - val_accuracy: 0.8905 - val_loss: 0.3040
Epoch 5/30
[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8925 - loss: 0.2961 - val_accuracy: 0.8911 - val_loss: 0.3011
Epoch 6/30
[1m905/905[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8935 - loss: 0.2887 - val_accuracy: 0.8905 - val_loss: 0.3007
Epoch 7/30
[1m905/905[0m [32m━━━━━━━