In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# Load datasets (ensure you already have cleaned the data as per the previous steps)
file_path = r"C:\Users\samia\OneDrive\Documents\GitHub\springboard\Project Proposal\bank+marketing\bank-additional\bank-additional\bank-additional-full.csv"
file_path1 = r"C:\Users\samia\OneDrive\Documents\GitHub\springboard\Project Proposal\bank+marketing\bank-additional\bank-additional\bank-additional.csv"

# Check if files exist
if os.path.exists(file_path):
    bank_additional_full = pd.read_csv(file_path)
else:
    print(f"File not found: {file_path}")

if os.path.exists(file_path1):
    bank_full = pd.read_csv(file_path1)
else:
    print(f"File not found: {file_path1}")

# Step 1: Handle missing values (if any)
bank_additional_full = bank_additional_full.dropna()  # Or fill with bank_additional_full.fillna() if you prefer

# Step 2: Create dummy/indicator features for categorical variables
categorical_columns = bank_additional_full.select_dtypes(include=['object']).columns
bank_additional_full = pd.get_dummies(bank_additional_full, columns=categorical_columns, drop_first=True, sparse=True)

# Step 3: Identify numeric columns and standardize them
numeric_columns = bank_additional_full.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
bank_additional_full[numeric_columns] = scaler.fit_transform(bank_additional_full[numeric_columns])

# Step 4: Split data into training and testing datasets
X = bank_additional_full.drop('y', axis=1)  # Drop the target variable 'y'
y = bank_additional_full['y']  # The target variable

# Step 5: Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Model 1 - Logistic Regression
logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(X_train, y_train)

# Step 7: Model 2 - Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Step 8: Model 3 - Gradient Boosting Classifier
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)

# Step 9: Model Evaluation - Accuracy, Precision, Recall, F1 Score, and AUC-ROC

# Predict on the test set
logreg_preds = logreg_model.predict(X_test)
rf_preds = rf_model.predict(X_test)
gb_preds = gb_model.predict(X_test)

# Calculate metrics for Logistic Regression
print("Logistic Regression Classification Report:")
print(classification_report(y_test, logreg_preds))
print(f"Logistic Regression AUC-ROC: {roc_auc_score(y_test, logreg_preds)}")
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, logreg_preds)}")

# Calculate metrics for Random Forest
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, rf_preds))
print(f"Random Forest AUC-ROC: {roc_auc_score(y_test, rf_preds)}")
print(f"Random Forest Accuracy: {accuracy_score(y_test, rf_preds)}")

# Calculate metrics for Gradient Boosting
print("\nGradient Boosting Classification Report:")
print(classification_report(y_test, gb_preds))
print(f"Gradient Boosting AUC-ROC: {roc_auc_score(y_test, gb_preds)}")
print(f"Gradient Boosting Accuracy: {accuracy_score(y_test, gb_preds)}")

# Step 10: Hyperparameter Tuning for Random Forest and Gradient Boosting using GridSearchCV
# Hyperparameter tuning for Random Forest
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf_grid_search = GridSearchCV(estimator=rf_model, param_grid=rf_param_grid, cv=3, n_jobs=-1, verbose=2)
rf_grid_search.fit(X_train, y_train)
print(f"Best Parameters for Random Forest: {rf_grid_search.best_params_}")

# Hyperparameter tuning for Gradient Boosting
gb_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5, 7]
}

gb_grid_search = GridSearchCV(estimator=gb_model, param_grid=gb_param_grid, cv=3, n_jobs=-1, verbose=2)
gb_grid_search.fit(X_train, y_train)
print(f"Best Parameters for Gradient Boosting: {gb_grid_search.best_params_}")

# Step 11: Final Model Evaluation
# After grid search, we can evaluate the final best models
rf_best_model = rf_grid_search.best_estimator_
gb_best_model = gb_grid_search.best_estimator_

# Evaluate final best models on test set
rf_best_preds = rf_best_model.predict(X_test)
gb_best_preds = gb_best_model.predict(X_test)

# Print classification reports for the best models
print("\nRandom Forest (After Grid Search) Classification Report:")
print(classification_report(y_test, rf_best_preds))

print("\nGradient Boosting (After Grid Search) Classification Report:")
print(classification_report(y_test, gb_best_preds))

# AUC-ROC and Accuracy for Best Models
print(f"Best Random Forest AUC-ROC: {roc_auc_score(y_test, rf_best_preds)}")
print(f"Best Gradient Boosting AUC-ROC: {roc_auc_score(y_test, gb_best_preds)}")

# Conclusion: Based on evaluation, choose the best model considering trade-offs


ValueError: at least one array or dtype is required