# Required Libraries

In [1]:
import random
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import zscore
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier

# Load Preprocessed Data

In [2]:
X_train = pd.read_csv("X_train.csv")
X_valid = pd.read_csv("X_valid.csv")
X_test = pd.read_csv("X_test.csv")
y_train = pd.read_csv("y_train.csv")
y_valid = pd.read_csv("y_valid.csv")
y_test = pd.read_csv("y_test.csv")
# Reshape the target arrays
y_train = np.ravel(y_train)
y_val = np.ravel(y_valid)
y_test = np.ravel(y_test)

In [3]:
# Display the shapes of the resulting sets
print("Training Data - X_train shape:", X_train.shape)
print("Training Data - y_train shape:", y_train.shape)
print("Cross Validation Data - X_val shape:", X_valid.shape)
print("Cross Validation Data - y_val shape:", y_valid.shape)
print("Testing Data - X_test shape:", X_test.shape)
print("Testing Data - y_test shape:", y_test.shape)

Training Data - X_train shape: (101923, 25)
Training Data - y_train shape: (101923,)
Cross Validation Data - X_val shape: (25481, 25)
Cross Validation Data - y_val shape: (25481, 1)
Testing Data - X_test shape: (31852, 25)
Testing Data - y_test shape: (31852,)


# Basic Models

## Bagging

In [None]:
# Initialize the base classifier (Decision Tree in this case)
#base_classifier = DecisionTreeClassifier(random_state=random_seed)

# Initialize the BaggingClassifier
bagging_classifier = BaggingClassifier(n_estimators=200)

# Train the BaggingClassifier on the training data
bagging_classifier.fit(X_train, y_train)

# Predictions on the validation set
y_valid_pred = bagging_classifier.predict(X_valid)

# Evaluate the performance on the validation set
accuracy_valid = accuracy_score(y_valid, y_valid_pred)
print(f"Accuracy on the validation set: {accuracy_valid:.2%}")

# You can also print additional evaluation metrics if needed
print("\nClassification Report on Validation Set:")
print(classification_report(y_valid, y_valid_pred))

# Confusion Matrix on Validation Set
conf_matrix_valid = confusion_matrix(y_valid, y_valid_pred)
print("\nConfusion Matrix on Validation Set:")
print(conf_matrix_valid)

## Boosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the base classifier (Decision Tree in this case)
#base_classifier = DecisionTreeClassifier(random_state=random_seed)

# Initialize the AdaBoostClassifier
adaboost_classifier = AdaBoostClassifier(n_estimators=200)

# Train the AdaBoostClassifier on the training data
adaboost_classifier.fit(X_train, y_train)

# Predictions on the validation set
y_valid_pred_boosting = adaboost_classifier.predict(X_valid)

# Evaluate the performance on the validation set
accuracy_valid_boosting = accuracy_score(y_valid, y_valid_pred_boosting)
print(f"Accuracy on the validation set (Boosting): {accuracy_valid_boosting:.2%}")

# Additional evaluation metrics
print("\nClassification Report on Validation Set (Boosting):")
print(classification_report(y_valid, y_valid_pred_boosting))

# Confusion Matrix on Validation Set
conf_matrix_valid_boosting = confusion_matrix(y_valid, y_valid_pred_boosting)
print("\nConfusion Matrix on Validation Set (Boosting):")
print(conf_matrix_valid_boosting)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
base_classifier = DecisionTreeClassifier()

# Initialize the RandomForestClassifier
random_forest_classifier = RandomForestClassifier(n_estimators=200)

# Train the RandomForestClassifier on the training data
random_forest_classifier.fit(X_train, y_train)

# Predictions on the validation set
y_valid_pred_forest = random_forest_classifier.predict(X_valid)

# Evaluate the performance on the validation set
accuracy_valid_forest = accuracy_score(y_valid, y_valid_pred_forest)
print(f"Accuracy on the validation set (Random Forests): {accuracy_valid_forest:.2%}")

# Additional evaluation metrics
print("\nClassification Report on Validation Set (Random Forests):")
print(classification_report(y_valid, y_valid_pred_forest))

# Confusion Matrix on Validation Set
conf_matrix_valid_forest = confusion_matrix(y_valid, y_valid_pred_forest)
print("\nConfusion Matrix on Validation Set (Random Forests):")
print(conf_matrix_valid_forest)

# Hyperparameter Tuning

In [4]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Function for model evaluation
def evaluate_model(model, X, y, dataset_name):
    y_pred = model.predict(X)
    
    # Accuracy
    accuracy = accuracy_score(y, y_pred)
    print(f"Accuracy on {dataset_name}: {accuracy:.2%}")

    # Additional metrics
    print(f"\nClassification Report on {dataset_name}:")
    print(classification_report(y, y_pred))

    # Confusion Matrix
    conf_matrix = confusion_matrix(y, y_pred)
    print(f"\nConfusion Matrix on {dataset_name}:")
    print(conf_matrix)

## Grid Search

In [None]:
# Bagging with Hyperparameter Tuning (Grid Search)
#bagging_base_classifier = DecisionTreeClassifier(random_state=random_seed)
bagging_model = BaggingClassifier(random_state=42)

bagging_param_grid = {
    'n_estimators': [200],
    'max_samples': [0.5],
    'max_features': [0.7],
}

bagging_grid_search = GridSearchCV(bagging_model, param_grid=bagging_param_grid, cv=5)
bagging_grid_search.fit(X_train, y_train)

print("Best Hyperparameters for Bagging:")
print(bagging_grid_search.best_params_)

# Evaluate Bagging on Validation Set
evaluate_model(bagging_grid_search, X_valid, y_valid, "Validation Set (Bagging)")

In [23]:
# Boosting with Hyperparameter Tuning (Grid Search)
#boosting_base_classifier = DecisionTreeClassifier(random_state=random_seed)
boosting_model = AdaBoostClassifier(random_state=42)

boosting_param_grid = {
    'n_estimators': [50, 100, 150, 500],
    'learning_rate': [0.01, 0.1, 1.0],
}

boosting_grid_search = GridSearchCV(boosting_model, param_grid=boosting_param_grid, cv=5)
boosting_grid_search.fit(X_train, y_train)

print("\nBest Hyperparameters for Boosting:")
print(boosting_grid_search.best_params_)

# Evaluate Boosting on Validation Set
evaluate_model(boosting_grid_search, X_valid, y_valid, "Validation Set (Boosting)")


Best Hyperparameters for Boosting:
{'learning_rate': 1.0, 'n_estimators': 500}
Accuracy on Validation Set (Boosting): 76.72%

Classification Report on Validation Set (Boosting):
              precision    recall  f1-score   support

           0       0.85      0.72      0.78     14415
           1       0.69      0.83      0.76     11066

    accuracy                           0.77     25481
   macro avg       0.77      0.77      0.77     25481
weighted avg       0.78      0.77      0.77     25481


Confusion Matrix on Validation Set (Boosting):
[[10356  4059]
 [ 1872  9194]]


In [24]:
# Random Forests with Hyperparameter Tuning (Grid Search)
forest_model = RandomForestClassifier(random_state=42)

forest_param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

forest_grid_search = GridSearchCV(forest_model, param_grid=forest_param_grid, cv=5)
forest_grid_search.fit(X_train, y_train)

print("\nBest Hyperparameters for Random Forests:")
print(forest_grid_search.best_params_)

# Evaluate Random Forests on Validation Set
evaluate_model(forest_grid_search, X_valid, y_valid, "Validation Set (Random Forests)")


Best Hyperparameters for Random Forests:
{'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500}
Accuracy on Validation Set (Random Forests): 76.56%

Classification Report on Validation Set (Random Forests):
              precision    recall  f1-score   support

           0       0.84      0.72      0.78     14415
           1       0.69      0.83      0.75     11066

    accuracy                           0.77     25481
   macro avg       0.77      0.77      0.77     25481
weighted avg       0.78      0.77      0.77     25481


Confusion Matrix on Validation Set (Random Forests):
[[10346  4069]
 [ 1903  9163]]


## Randomized Search

In [29]:
# Bagging with Hyperparameter Tuning (Randomized Search)
bagging_randomized_search = RandomizedSearchCV(bagging_model, param_distributions=bagging_param_grid, n_iter=10, cv=5, random_state=42)
bagging_randomized_search.fit(X_train, y_train)

print("\nBest Hyperparameters for Bagging (Randomized Search):")
print(bagging_randomized_search.best_params_)

# Evaluate Bagging on Validation Set
evaluate_model(bagging_randomized_search, X_valid, y_valid, "Validation Set (Bagging - Randomized)")


Best Hyperparameters for Bagging (Randomized Search):
{'n_estimators': 200, 'max_samples': 0.5, 'max_features': 0.7}
Accuracy on Validation Set (Bagging - Randomized): 76.45%

Classification Report on Validation Set (Bagging - Randomized):
              precision    recall  f1-score   support

           0       0.84      0.73      0.78     14415
           1       0.70      0.81      0.75     11066

    accuracy                           0.76     25481
   macro avg       0.77      0.77      0.76     25481
weighted avg       0.77      0.76      0.77     25481


Confusion Matrix on Validation Set (Bagging - Randomized):
[[10484  3931]
 [ 2071  8995]]


In [30]:
# Boosting with Hyperparameter Tuning (Randomized Search)
boosting_randomized_search = RandomizedSearchCV(boosting_model, param_distributions=boosting_param_grid, n_iter=10, cv=5, random_state=42)
boosting_randomized_search.fit(X_train, y_train)

print("\nBest Hyperparameters for Boosting (Randomized Search):")
print(boosting_randomized_search.best_params_)

# Evaluate Boosting on Validation Set
evaluate_model(boosting_randomized_search, X_valid, y_valid, "Validation Set (Boosting - Randomized)")


Best Hyperparameters for Boosting (Randomized Search):
{'n_estimators': 500, 'learning_rate': 1.0}
Accuracy on Validation Set (Boosting - Randomized): 76.72%

Classification Report on Validation Set (Boosting - Randomized):
              precision    recall  f1-score   support

           0       0.85      0.72      0.78     14415
           1       0.69      0.83      0.76     11066

    accuracy                           0.77     25481
   macro avg       0.77      0.77      0.77     25481
weighted avg       0.78      0.77      0.77     25481


Confusion Matrix on Validation Set (Boosting - Randomized):
[[10356  4059]
 [ 1872  9194]]


In [31]:
# Random Forests with Hyperparameter Tuning (Randomized Search)
forest_randomized_search = RandomizedSearchCV(forest_model, param_distributions=forest_param_grid, n_iter=10, cv=5, random_state=42)
forest_randomized_search.fit(X_train, y_train)

print("\nBest Hyperparameters for Random Forests (Randomized Search):")
print(forest_randomized_search.best_params_)

# Evaluate Random Forests on Validation Set
evaluate_model(forest_randomized_search, X_valid, y_valid, "Validation Set (Random Forests - Randomized)")


Best Hyperparameters for Random Forests (Randomized Search):
{'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 20}
Accuracy on Validation Set (Random Forests - Randomized): 76.56%

Classification Report on Validation Set (Random Forests - Randomized):
              precision    recall  f1-score   support

           0       0.84      0.72      0.78     14415
           1       0.69      0.83      0.75     11066

    accuracy                           0.77     25481
   macro avg       0.77      0.77      0.77     25481
weighted avg       0.78      0.77      0.77     25481


Confusion Matrix on Validation Set (Random Forests - Randomized):
[[10346  4069]
 [ 1903  9163]]


In [32]:
# Final System: Choose the Best Performing Models
best_bagging_model = bagging_randomized_search.best_estimator_
best_boosting_model = boosting_randomized_search.best_estimator_
best_forest_model = forest_randomized_search.best_estimator_

# Evaluate on Test Set
evaluate_model(best_bagging_model, X_test, y_test, "Test Set (Bagging)")
evaluate_model(best_boosting_model, X_test, y_test, "Test Set (Boosting)")
evaluate_model(best_forest_model, X_test, y_test, "Test Set (Random Forests)")

Accuracy on Test Set (Bagging): 77.00%

Classification Report on Test Set (Bagging):
              precision    recall  f1-score   support

           0       0.84      0.73      0.78     17936
           1       0.70      0.82      0.76     13916

    accuracy                           0.77     31852
   macro avg       0.77      0.78      0.77     31852
weighted avg       0.78      0.77      0.77     31852


Confusion Matrix on Test Set (Bagging):
[[13181  4755]
 [ 2572 11344]]
Accuracy on Test Set (Boosting): 77.00%

Classification Report on Test Set (Boosting):
              precision    recall  f1-score   support

           0       0.84      0.72      0.78     17936
           1       0.70      0.83      0.76     13916

    accuracy                           0.77     31852
   macro avg       0.77      0.78      0.77     31852
weighted avg       0.78      0.77      0.77     31852


Confusion Matrix on Test Set (Boosting):
[[12996  4940]
 [ 2387 11529]]
Accuracy on Test Set (Random 

#  Bayesian method: 

## Tree-structured Parzen Estimator (TPE) algorithm

In [None]:
from hyperopt import hp, tpe, Trials, fmin
from sklearn.model_selection import cross_val_score
random_seed = 7370 + 7524
# Define the search space for hyperparameters
space = {
    'n_estimators': hp.choice('n_estimators', [50, 100, 150]),
    'max_depth': hp.choice('max_depth', [None, 10, 20]),
    'min_samples_split': hp.choice('min_samples_split', [2, 5, 10]),
    'min_samples_leaf': hp.choice('min_samples_leaf', [1, 2, 4]),
}

# Objective function for hyperopt to minimize (negative accuracy)
def objective(params):
    model = RandomForestClassifier(random_state=random_seed, **params)
    score = cross_val_score(model, X_train, y_train, cv=5).mean()
    return -score  # Minimize negative accuracy

# Perform hyperparameter tuning using Bayesian optimization with TPE algorithm
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50, trials=trials, rstate=np.random.RandomState(random_seed))

# Get the best hyperparameters found by Bayesian optimization
best_params = space_eval(space, best)
print("Best Hyperparameters for Random Forests (Bayesian Optimization):")
print(best_params)

# Train the model with the best hyperparameters on the entire training set
best_forest_model_bayesian = RandomForestClassifier(random_state=random_seed, **best_params)
best_forest_model_bayesian.fit(X_train, y_train)

# Evaluate the best model on the test set
evaluate_model(best_forest_model_bayesian, X_test, y_test, "Test Set (Random Forests - Bayesian Optimization)")

## (Tree-structured Parzen Estimator (TPE) algorithm)

In [None]:
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Define the search space for hyperparameters
space = [
    Integer(10 600, name='n_estimators'),
    Integer(1, 20, name='max_depth'),
    Integer(1, 30 name='min_samples_split'),
    Integer(1, 30 name='min_samples_leaf')
]

# Objective function for skopt to minimize (negative accuracy)
@use_named_args(space)
def objective(**params):
    model = RandomForestClassifier(random_state=42, **params)
    score = cross_val_score(model, X_train, y_train, cv=5).mean()
    return -score  # Minimize negative accuracy

# Perform hyperparameter tuning using Gaussian Process optimization
result = gp_minimize(objective, space, n_calls=50, random_state=42)

# Get the best hyperparameters found by Gaussian Process optimization
best_params = dict(zip(['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf'], result.x))
print("Best Hyperparameters for Random Forests (Gaussian Process Optimization):")
print(best_params)

# Train the model with the best hyperparameters on the entire training set
best_forest_model_gp = RandomForestClassifier(random_state=42, **best_params)
best_forest_model_gp.fit(X_train, y_train)

# Evaluate the best model on the test set
evaluate_model(best_forest_model_gp, X_test, y_test, "Test Set (Random Forests - Gaussian Process Optimization)")

# Final System Testing

In [35]:
# Evaluate Best Bagging Model on Test Set
print("\nEvaluation of Best Bagging Model on Test Set:")
evaluate_model(best_bagging_model, X_test, y_test, "Test Set (Best Bagging)")

# Evaluate Best Boosting Model on Test Set
print("\nEvaluation of Best Boosting Model on Test Set:")
evaluate_model(best_boosting_model, X_test, y_test, "Test Set (Best Boosting)")

# Evaluate Best Random Forest Model on Test Set
print("\nEvaluation of Best Random Forest Model on Test Set:")
evaluate_model(best_forest_model, X_test, y_test, "Test Set (Best Random Forest)")

# Compare Models
accuracy_bagging = accuracy_score(y_test, best_bagging_model.predict(X_test))
accuracy_boosting = accuracy_score(y_test, best_boosting_model.predict(X_test))
accuracy_forest = accuracy_score(y_test, best_forest_model.predict(X_test))

print("\nComparison of Model Performances on Test Set:")
print(f"Accuracy - Bagging: {accuracy_bagging:.2%}")
print(f"Accuracy - Boosting: {accuracy_boosting:.2%}")
print(f"Accuracy - Random Forest: {accuracy_forest:.2%}")

best_model = max([(accuracy_bagging, 'Bagging'), (accuracy_boosting, 'Boosting'), (accuracy_forest, 'Random Forest')],
                 key=lambda x: x[0])


Evaluation of Best Bagging Model on Test Set:
Accuracy on Test Set (Best Bagging): 77.00%

Classification Report on Test Set (Best Bagging):
              precision    recall  f1-score   support

           0       0.84      0.73      0.78     17936
           1       0.70      0.82      0.76     13916

    accuracy                           0.77     31852
   macro avg       0.77      0.78      0.77     31852
weighted avg       0.78      0.77      0.77     31852


Confusion Matrix on Test Set (Best Bagging):
[[13181  4755]
 [ 2572 11344]]

Evaluation of Best Boosting Model on Test Set:
Accuracy on Test Set (Best Boosting): 77.00%

Classification Report on Test Set (Best Boosting):
              precision    recall  f1-score   support

           0       0.84      0.72      0.78     17936
           1       0.70      0.83      0.76     13916

    accuracy                           0.77     31852
   macro avg       0.77      0.78      0.77     31852
weighted avg       0.78      0.77    

In [19]:
print(f"\nThe best-performing model on the test set is: {best_model[1]} with an accuracy of {best_model[0]:.2%}")


The best-performing model on the test set is: Random Forest with an accuracy of 77.06%
