In [57]:
import numpy as np
import pandas as pd
import sklearn.model_selection as ms
import sklearn.ensemble as en
import sklearn.metrics as met

campaign = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank.csv')

In [58]:
X = pd.get_dummies(campaign.drop(columns=["y"]), drop_first=True)
y = campaign["y"]

X_train, X_test, y_train, y_test = ms.train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
    stratify=y
)

baseline_clf = en.RandomForestClassifier(
    n_estimators=600,
    max_depth=15,
    min_samples_leaf=10,
    random_state=42,
    class_weight={"no": 1, "yes": 5},
    max_features="sqrt",
    bootstrap=True,
    n_jobs=2
)

baseline_clf.fit(X_train, y_train)
baseline_pred = baseline_clf.predict(X_test)

print("\n===============================")
print("BASELINE MODEL RESULTS (default threshold = 0.50)")
print("===============================")
print(met.classification_report(y_test, baseline_pred))
print(met.confusion_matrix(y_test, baseline_pred))

param_dist = {
    "n_estimators": [300, 600, 900, 1200],
    "max_depth": [None, 8, 12, 15, 18, 22],
    "min_samples_leaf": [1, 2, 5, 10, 20],
    "min_samples_split": [2, 5, 10, 20, 50],
    "max_features": ["sqrt", "log2", 0.3, 0.5, 0.8],
    "bootstrap": [True, False]
}

rf_for_search = en.RandomForestClassifier(
    random_state=42,
    class_weight={"no": 1, "yes": 5},
    n_jobs=2
)

search = ms.RandomizedSearchCV(
    estimator=rf_for_search,
    param_distributions=param_dist,
    n_iter=1,          # increase this for more searching (ex: 80)
    scoring="f1",       # balances precision and recall
    cv=3,
    random_state=42,
    verbose=1,
    n_jobs=2
)

search.fit(X_train, y_train)

print("\n===============================")
print("BEST MODEL FROM RANDOMIZED SEARCH")
print("===============================")
print("Best Parameters:", search.best_params_)
print("Best CV F1:", round(search.best_score_, 4))

best_clf = search.best_estimator_

# Evaluate best model using default predict() first
best_pred_default = best_clf.predict(X_test)

print("\n===============================")
print("BEST MODEL RESULTS (default threshold = 0.50)")
print("===============================")
print(met.classification_report(y_test, best_pred_default))
print(met.confusion_matrix(y_test, best_pred_default))


# -----------------------------
# 4) OPTION 1: Threshold tuning (ON BEST MODEL)
# -----------------------------
proba_yes = best_clf.predict_proba(X_test)[:, 1]

# Convert y_test into 0/1 (1 = yes, 0 = no)
y_true = (y_test == "yes").astype(int)

best_thr = 0.5
best_f1 = -1

# Try many thresholds and pick best F1 score
for thr in np.arange(0.10, 0.91, 0.01):
    y_pred_thr = (proba_yes >= thr).astype(int)
    f1 = met.f1_score(y_true, y_pred_thr)
    if f1 > best_f1:
        best_f1 = f1
        best_thr = thr

print("\n===============================")
print("THRESHOLD TUNING ON BEST MODEL")
print("===============================")
print("Best threshold:", round(best_thr, 2))
print("Best F1:", round(best_f1, 4))

# Final predictions using best threshold
y_pred_best = (proba_yes >= best_thr).astype(int)

print("\n===============================")
print("FINAL RESULTS (best model + best threshold)")
print("===============================")
print(met.classification_report(y_true, y_pred_best))
print(met.confusion_matrix(y_true, y_pred_best))



BASELINE MODEL RESULTS (default threshold = 0.50)
              precision    recall  f1-score   support

          no       0.94      0.91      0.93      4930
         yes       0.45      0.59      0.51       631

    accuracy                           0.87      5561
   macro avg       0.70      0.75      0.72      5561
weighted avg       0.89      0.87      0.88      5561

[[4474  456]
 [ 261  370]]
Fitting 3 folds for each of 1 candidates, totalling 3 fits





BEST MODEL FROM RANDOMIZED SEARCH
Best Parameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 0.5, 'max_depth': 8, 'bootstrap': True}
Best CV F1: nan

BEST MODEL RESULTS (default threshold = 0.50)
              precision    recall  f1-score   support

          no       0.94      0.91      0.93      4930
         yes       0.45      0.58      0.51       631

    accuracy                           0.87      5561
   macro avg       0.70      0.75      0.72      5561
weighted avg       0.89      0.87      0.88      5561

[[4487  443]
 [ 265  366]]

THRESHOLD TUNING ON BEST MODEL
Best threshold: 0.52
Best F1: 0.512

FINAL RESULTS (best model + best threshold)
              precision    recall  f1-score   support

           0       0.94      0.91      0.93      4930
           1       0.46      0.58      0.51       631

    accuracy                           0.88      5561
   macro avg       0.70      0.75      0.72      5561
weighted avg       0

In [59]:
final_pred_df = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank_holdout_test_mini.csv')
final_pred_df = pd.get_dummies(final_pred_df)

# This part ensures that the final_pred_df has the same colums as the training columns
train_cols = X_train.columns
final_pred_df = final_pred_df.reindex(columns=train_cols, fill_value=0)
final_pred = clf.predict(final_pred_df)
# Convert the NumPy Array to a pandas Array
final_pred = pd.DataFrame({
    'prediction': final_pred
})

# Map the 'yes' and 'no' to 1 and 0
final_pred['prediction'] = final_pred['prediction'].map({
    'yes': 1,
    'no': 0
})

# Save as a csv
final_pred.to_csv('predictions.csv', index=False)