In [115]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder




In [116]:
# Load the raw dataset; the file uses ';' as its field separator.
df = pd.read_csv("data/bank-additional-full.csv", sep=";")



In [117]:
# Drop the call-duration column before modelling.
# NOTE(review): presumably removed because duration is only known once the call
# has ended (target leakage) — confirm against the dataset documentation.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second execution is a no-op rather than a
# KeyError on the already-missing column.
df = df.drop(columns=['duration'], errors='ignore')

In [118]:
# Display the frame's current column labels.
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'campaign', 'pdays', 'previous',
       'poutcome', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
       'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [119]:
# Peek at the raw day_of_week values before the column is one-hot encoded.
df["day_of_week"]

0        mon
1        mon
2        mon
3        mon
4        mon
        ... 
41183    fri
41184    fri
41185    fri
41186    fri
41187    fri
Name: day_of_week, Length: 41188, dtype: object

In [120]:
# One-hot encode the multi-level categorical columns. drop_first=True omits
# the first level of every column, which then acts as the implicit baseline.
categorical_cols = [
    'job', 'marital', 'education', 'contact',
    'month', 'poutcome', 'day_of_week',
]
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)



In [121]:
# Display the column labels again to check the result of the one-hot encoding.
df.columns

Index(['age', 'default', 'housing', 'loan', 'campaign', 'pdays', 'previous',
       'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m',
       'nr.employed', 'y', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'job_unknown', 'marital_married', 'marital_single', 'marital_unknown',
       'education_basic.6y', 'education_basic.9y', 'education_high.school',
       'education_illiterate', 'education_professional.course',
       'education_university.degree', 'education_unknown', 'contact_telephone',
       'month_aug', 'month_dec', 'month_jul', 'month_jun', 'month_mar',
       'month_may', 'month_nov', 'month_oct', 'month_sep',
       'poutcome_nonexistent', 'poutcome_success', 'day_of_week_mon',
       'day_of_week_thu', 'day_of_week_tue', 'day_of_week_wed'],
      dtype='object')

In [122]:
# Integer-encode the remaining string columns, one LabelEncoder per column.
# NOTE(review): the previous comment called these columns "binary"; in the UCI
# bank-additional data they appear to also contain 'unknown', which would make
# this a 3-level code (LabelEncoder sorts its classes, so no < unknown < yes) —
# confirm with df[label_cols].nunique().
label_cols = ['default', 'housing', 'loan']

# Keep every fitted encoder so each column can be decoded later with
# label_encoders[col].inverse_transform(...). A single shared encoder only
# remembers the classes of the last column it was fitted on.
label_encoders = {}
for col in label_cols:
    # After the loop `label_encoder` is the encoder fitted on the last column,
    # exactly as before, so any later use of that name keeps working.
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

In [123]:
# Split the frame into the binary target (y) and the feature matrix (X).
target_codes = {'no': 0, 'yes': 1}  # 'no' -> 0, 'yes' -> 1
y = df['y'].map(target_codes)
X = df.drop('y', axis=1)




In [124]:
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import pandas as pd

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Undersample the majority class (y=0)
def undersample_majority_class(X, y, random_state=42):
    """Balance a binary dataset by randomly downsampling the larger class.

    Rows labelled 0 and rows labelled 1 are separated, the larger group is
    sampled without replacement down to the size of the smaller group, and the
    two groups are stacked again (class 0 rows first, then class 1 rows).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Target aligned with ``X`` on the index, holding the labels 0 and 1.
        Its name does not have to be ``'y'``; an unnamed Series is treated as
        if it were named ``'y'``.
    random_state : int, default 42
        Seed forwarded to ``resample`` so the selection is repeatable.

    Returns
    -------
    X_balanced : pandas.DataFrame
        Features of the retained rows.
    y_balanced : pandas.Series
        Labels of the retained rows, in the same row order as ``X_balanced``.

    Raises
    ------
    ValueError
        If the target's name is already used by a column of ``X``.
    """
    # Use the Series' own name instead of a hard-coded 'y' so that any target
    # column name works.
    target = y.name if y.name is not None else 'y'
    if target in X.columns:
        raise ValueError(
            f"target name {target!r} collides with a feature column"
        )

    # Combine features and target so both are filtered and sampled together.
    data = pd.concat([X, y.rename(target)], axis=1)

    class_0 = data[data[target] == 0]
    class_1 = data[data[target] == 1]

    # Downsample whichever class is larger. When class 0 is the larger (or an
    # equally sized) class this matches the previous behaviour exactly.
    if len(class_0) >= len(class_1):
        class_0 = resample(
            class_0,
            replace=False,
            n_samples=len(class_1),
            random_state=random_state,
        )
    else:
        class_1 = resample(
            class_1,
            replace=False,
            n_samples=len(class_0),
            random_state=random_state,
        )

    balanced_data = pd.concat([class_0, class_1])

    # Separate features and target again.
    X_balanced = balanced_data.drop(columns=target)
    y_balanced = balanced_data[target]

    return X_balanced, y_balanced

# Balance only the training split; the test split keeps its original class mix.
X_train_balanced, y_train_balanced = undersample_majority_class(X_train, y_train)

# Bagging ensemble of 10 members, each of them a 100-tree random forest.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
bagging = BaggingClassifier(
    estimator=rf,
    n_estimators=10,
    bootstrap=True,
    random_state=42,
)

# Fit on the balanced training data.
model = bagging.fit(X_train_balanced, y_train_balanced)

# Predict on the untouched test split.
y_pred = model.predict(X_test)

# Report the results.
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", report)


Confusion Matrix:
 [[9040 1928]
 [ 472  917]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.82      0.88     10968
           1       0.32      0.66      0.43      1389

    accuracy                           0.81     12357
   macro avg       0.64      0.74      0.66     12357
weighted avg       0.88      0.81      0.83     12357



In [125]:
# Train/test split, using the same arguments and seed as the earlier split in
# this notebook so both models are scored on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Handle class imbalance by re-weighting instead of resampling:
# class_weight='balanced' weights the classes inversely proportional to their
# frequencies in the training labels.
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

# Wrap the weighted forest in a bagging ensemble. BaggingClassifier draws plain
# bootstrap samples of the training rows; it does NOT oversample the minority
# class, so the class weights above are the only imbalance handling in this cell.
bagging = BaggingClassifier(estimator=rf, n_estimators=10, random_state=42, bootstrap=True)

# Train on the original (unbalanced) training split.
model = bagging.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Confusion Matrix:
 [[10733   235]
 [ 1025   364]]

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.98      0.94     10968
           1       0.61      0.26      0.37      1389

    accuracy                           0.90     12357
   macro avg       0.76      0.62      0.66     12357
weighted avg       0.88      0.90      0.88     12357

