# Required Libraries

In [103]:
import random
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import zscore
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import make_classification
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [104]:
X_train = pd.read_csv("X_train.csv")
X_valid = pd.read_csv("X_valid.csv")
X_test = pd.read_csv("X_test.csv")
y_train = pd.read_csv("y_train.csv")
y_valid = pd.read_csv("y_valid.csv")
y_test = pd.read_csv("y_test.csv")
# Reshape the target arrays
y_train = np.ravel(y_train)
y_val = np.ravel(y_valid)
y_test = np.ravel(y_test)

In [105]:
# Display the shapes of the resulting sets
print("Training Data - X_train shape:", X_train.shape)
print("Training Data - y_train shape:", y_train.shape)
print("Cross Validation Data - X_val shape:", X_valid.shape)
print("Cross Validation Data - y_val shape:", y_valid.shape)
print("Testing Data - X_test shape:", X_test.shape)
print("Testing Data - y_test shape:", y_test.shape)

Training Data - X_train shape: (101923, 25)
Training Data - y_train shape: (101923,)
Cross Validation Data - X_val shape: (25481, 25)
Cross Validation Data - y_val shape: (25481, 1)
Testing Data - X_test shape: (31852, 25)
Testing Data - y_test shape: (31852,)


In [106]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

# Bagging

In [13]:
class BaggingClassifier:
    def __init__(self, n_estimators=50, max_features=0.7, max_depth=None):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_depth = max_depth
        self.estimators = []
        
    def fit(self, X, y):
        for _ in range(self.n_estimators):
            indices = np.random.choice(len(X), len(X), replace=True)
            X_bootstrap = X.iloc[indices]
            y_bootstrap = y[indices]
            estimator = DecisionTreeClassifier(max_features=self.max_features, max_depth=self.max_depth).fit(X_bootstrap, y_bootstrap)
            self.estimators.append(estimator)
            
    def predict(self, X):
        # Make predictions using all the base classifiers
        predictions = [estimator.predict(X) for estimator in self.estimators]
        # Aggregate predictions using majority voting
        majority_votes = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=predictions)

        return majority_votes
    
    def get_params(self, deep=True):
        return {
            'n_estimators': self.n_estimators,
            'max_features': self.max_features,
            'max_depth': self.max_depth
        }

    def set_params(self, **params):
        for param, value in params.items():
            setattr(self, param, value)
        return self

In [6]:
bagging_classifier_basic = BaggingClassifier()
# Fit BaggingClassifier on training data
bagging_classifier_basic.fit(X_train, y_train)

# Predict on validation set
y_pred = bagging_classifier_basic.predict(X_valid)
# Evaluate accuracy
accuracy = accuracy_score(y_valid, y_pred)
print(f"Accuracy on the validation set (Bagging): {accuracy:.2%}")

Accuracy on the validation set (Bagging): 77.34%


In [76]:
class AdaBoostClassifier:
    def __init__(self, n_estimators=50, learning_rate=1.0):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.estimators = []
        self.weights = []

    def fit(self, X, y):
        # Initialize Equal Weights 
        self.weights = np.ones(len(X)) / len(X)
        for _ in range(self.n_estimators):
            # Train a decision stump
            estimator = DecisionTreeClassifier(max_depth=1).fit(X, y, sample_weight=self.weights)
  
            # Compute Error rate
            predictions = estimator.predict(X)
            incorrect = (predictions != y) 
            error_rate = np.dot(self.weights, incorrect)
 
            # Compute Alpha_t
            alpha = self.learning_rate * np.log((1 - error_rate) / error_rate)
            self.estimators.append((estimator, alpha))
 
            # Reweighting
            self.weights *= np.exp(-y * alpha * predictions)
        
            # Normalization >> SUMMATION = 1
            self.weights /= np.sum(self.weights)

    def predict(self, X):
        predictions = np.empty(len(X))
        for estimator, alpha in self.estimators:
            predictions += alpha * estimator.predict(X)
        return np.sign(predictions)
    
    def get_params(self, deep=True):
        return {'n_estimators': self.n_estimators, 'learning_rate': self.learning_rate}

    def set_params(self, **params):
        if 'n_estimators' in params:
            self.n_estimators = params['n_estimators']
        if 'learning_rate' in params:
            self.learning_rate = params['learning_rate']
        return self

In [40]:
boosting_classifier_basic = AdaBoostClassifier()

# Fit BaggingClassifier on training data
boosting_classifier_basic.fit(X_train, y_train)

# Predict on validation set
y_pred = boosting_classifier_basic.predict(X_valid)
# Evaluate accuracy
accuracy = accuracy_score(y_valid, y_pred)
print(f"Accuracy on the validation set (Boosting): {accuracy:.2%}")

Accuracy on the validation set (Boosting): 73.42%


In [107]:
class RandomForestClassifier:
    def __init__(self, n_estimators=500, max_features='auto', max_depth=20, min_samples_split=10, min_samples_leaf=1):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_depth = max_depth
        self.estimators = []
        
    def fit(self, X, y):
        for _ in range(self.n_estimators):
            indices = np.random.choice(len(X), len(X), replace=True)
            X_bootstrap = X.iloc[indices]
            y_bootstrap = y[indices]
            estimator = DecisionTreeClassifier(max_features=self.max_features, max_depth=self.max_depth, min_samples_split = self.min_samples_split, min_samples_leaf = self.min_samples_leaf).fit(X_bootstrap, y_bootstrap)
            self.estimators.append(estimator)
            
    def predict(self, X):
        # Make predictions using all the base classifiers
        predictions = [estimator.predict(X) for estimator in self.estimators]
        # Aggregate predictions using majority voting
        majority_votes = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=predictions)
        return majority_votes
    
    def get_params(self, deep=True):
        return {
            'n_estimators': self.n_estimators,
            'max_features': self.max_features,
            'min_samples_split': self.min_samples_split,
            'min_samples_leaf': self.min_samples_leaf,
            'max_depth': self.max_depth
        }

    def set_params(self, **params):
        for param, value in params.items():
            setattr(self, param, value)
        return self

In [85]:
random_forest_classifier_basic = RandomForestClassifier()

# Fit BaggingClassifier on training data
random_forest_classifier_basic.fit(X_train, y_train)

# Predict on validation set
y_pred = random_forest_classifier_basic.predict(X_valid)
# Evaluate accuracy
accuracy = accuracy_score(y_valid, y_pred)
print(f"Accuracy on the validation set (Random Forest): {accuracy:.2%}")

Accuracy on the validation set (Random Forest): 77.68%


# Hyperparameters Tuning

## Grid Search

In [91]:
def grid_search_tuning(model, param_grid, X_train, y_train):
    scoring = make_scorer(accuracy_score)  # Use accuracy as the scoring metric
    grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, scoring=scoring)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

### Bagging

In [7]:
# Define parameter grids for Grid Search 
param_grid = {
    'n_estimators': [20, 50, 100, 200],
    'max_features': [0.5, 0.7, 0.9],
    'max_depth': [None, 10, 20]
}

In [8]:
# Create an instance
bagging_clf = BaggingClassifier()

# Perform hyperparameter tuning using Grid Search
best_bagging_model_gs = grid_search_tuning(bagging_clf, param_grid, X_train, y_train)

# Print best hyperparameters for Grid Search
print("Best hyperparameters for Bagging Classifier (Grid Search):")
print(best_bagging_model_gs.get_params())
print("--------------------------------------------------------------------------")

# Print Accuracy on test set
bagging_accuracy_gs = evaluate_model(best_bagging_model_gs, X_valid, y_valid)
print(f"Bagging Classifier Grid Search Accuracy: {bagging_accuracy_gs:.2%}")

Best hyperparameters for Bagging Classifier (Grid Search):
{'n_estimators': 50, 'max_features': 0.7, 'max_depth': None}
--------------------------------------------------------------------------
Bagging Classifier Grid Search Accuracy: 77.63%


### Boosting

In [92]:
# Define parameter grids for Grid Search 
param_grid = {
    'n_estimators': [10, 20, 50, 100, 500],
    'learning_rate': [0.001, 0.1, 1.0]
}

In [79]:
# Create an instance
adaboost_clf = AdaBoostClassifier()

# Perform hyperparameter tuning using Grid Search
best_adaboost_model_gs = grid_search_tuning(adaboost_clf, param_grid, X_train, y_train)

# Print best hyperparameters for Grid Search
print("\nBest hyperparameters for AdaBoost Classifier (Grid Search):")
print(best_adaboost_model_gs.get_params())
print("--------------------------------------------------------------------------")

adaboost_accuracy_gs = evaluate_model(best_adaboost_model_gs, X_valid, y_valid)
print(f"Boosting Classifier Grid Search Accuracy: {adaboost_accuracy_gs:.2%}")


Best hyperparameters for AdaBoost Classifier (Grid Search):
{'n_estimators': 50, 'learning_rate': 1.0}
--------------------------------------------------------------------------
Boosting Classifier Grid Search Accuracy: 72.10%


### Random Forest

In [93]:
param_grid = {
    'n_estimators': [500, 600],
    'max_depth': [20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [94]:
randomforest_clf = RandomForestClassifier()

best_randomforest_model_gs = grid_search_tuning(randomforest_clf, param_grid, X_train, y_train)


print("\nBest hyperparameters for Random Forest Classifier (Grid Search):")
print(best_randomforest_model_gs.get_params())
print("--------------------------------------------------------------------------")

randomforest_accuracy_gs = evaluate_model(best_randomforest_model_gs, X_valid, y_valid)
print(f"Random Forest Classifier Grid Search Accuracy: {randomforest_accuracy_gs:.2%}")


Best hyperparameters for Random Forest Classifier (Grid Search):
{'n_estimators': 600, 'max_features': 'auto', 'min_samples_split': 15, 'min_samples_leaf': 1, 'max_depth': 40}
--------------------------------------------------------------------------
Random Forest Classifier Grid Search Accuracy: 77.96%


## Randomized Search

In [108]:
def randomized_search_tuning(model, param_distributions, X_train, y_train):
    scoring = make_scorer(accuracy_score)  # Use accuracy as the scoring metric
    randomized_search = RandomizedSearchCV(model, param_distributions, n_iter=1, cv=5, n_jobs=-1, random_state=42, scoring=scoring)
    randomized_search.fit(X_train, y_train)
    return randomized_search.best_estimator_

### Bagging

In [15]:
param_distributions = {
    'n_estimators': [50, 100, 200, 500],
    'max_features': [0.5, 0.7, 0.9],
    'max_depth': [None, 10, 20]
}

In [16]:
best_bagging_model_rs = randomized_search_tuning(bagging_clf, param_distributions, X_train, y_train)

print("\nBest hyperparameters for Bagging Classifier (Randomized Search):")
print(best_bagging_model_rs.get_params())
print("--------------------------------------------------------------------------")

bagging_accuracy_rs = evaluate_model(best_bagging_model_rs, X_valid, y_valid)
print(f"Bagging Classifier Randomized Search Accuracy: {bagging_accuracy_rs:.2%}")


Best hyperparameters for Bagging Classifier (Randomized Search):
{'n_estimators': 500, 'max_features': 0.9, 'max_depth': 20}
--------------------------------------------------------------------------
Bagging Classifier Randomized Search Accuracy: 77.89%


### Boosting

In [216]:
param_distributions = {
    'n_estimators': [20, 100, 250, 500],
    'learning_rate': [1.0]
}

In [217]:
best_adaboost_model_rs = randomized_search_tuning(adaboost_clf, param_distributions, X_train, y_train)

print("\nBest hyperparameters for AdaBoost Classifier (Randomized Search):")
print(best_adaboost_model_rs.get_params())
print("--------------------------------------------------------------------------")

adaboost_accuracy_rs = evaluate_model(best_adaboost_model_rs, X_valid, y_valid)
print(f"Boosting Classifier Randomized Search Accuracy: {adaboost_accuracy_rs:.2%}")


Best hyperparameters for AdaBoost Classifier (Randomized Search):
{'n_estimators': 20, 'learning_rate': 1.0}
--------------------------------------------------------------------------
Boosting Classifier Randomized Search Accuracy: 74.78%


### Random Forest

In [109]:
param_grid = {
    'n_estimators': [600,800],
    'max_depth': [50],
    'min_samples_split': [20],
    'min_samples_leaf': [1]
}

In [110]:
best_randomforest_model_rs = randomized_search_tuning(randomforest_clf, param_distributions, X_train, y_train)

print("\nBest hyperparameters for Random Forest Classifier (Randomized Search):")
print(best_randomforest_model_rs.get_params())
print("--------------------------------------------------------------------------")

randomforest_accuracy_rs = evaluate_model(best_randomforest_model_rs, X_valid, y_valid)
print(f"Random Forest Classifier Randomized Search Accuracy: {randomforest_accuracy_rs:.2%}")


Best hyperparameters for Random Forest Classifier (Randomized Search):
{'n_estimators': 500, 'max_features': 0.9, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 20}
--------------------------------------------------------------------------
Random Forest Classifier Randomized Search Accuracy: 77.78%


## Bayesian method

In [251]:
def bayesian_optimization_tuning(model, search_space, X_train, y_train):
    bayes_search = BayesSearchCV(
        model,
        search_space,
        n_iter=100,
        cv=5,
        random_state=42,
        n_jobs=-1,
        scoring='accuracy'  
    )
    bayes_search.fit(X_train, y_train)
    return bayes_search.best_estimator_, bayes_search.best_params_

### Bagging

In [240]:
search_space = {
    'n_estimators': [50, 100, 200, 500],
    'max_features': [0.5, 0.7, 0.9],
    'max_depth': [5, 10, 20]
}

In [241]:
# Create an instance of BaggingClassifier
bagging_clf = BaggingClassifier()

# Perform hyperparameter tuning using Bayesian optimization
best_bagging_model_bo, best_bagging_params = bayesian_optimization_tuning(bagging_clf, search_space, X_train, y_train)

# Print the best parameters found for Bagging Classifier
print("Best parameters for Bagging Classifier:")
print(best_bagging_params)
print("-------------------------------------------------------")

bagging_accuracy_bo = evaluate_model(best_bagging_model_bo, X_valid, y_valid)
print("Bagging Classifier Bayesian Optimization Accuracy:", bagging_accuracy_bo * 100, "%")





Best parameters for Bagging Classifier:
OrderedDict([('max_depth', 20), ('max_features', 0.7), ('n_estimators', 500)])
-------------------------------------------------------
Bagging Classifier Bayesian Optimization Accuracy: 76.64141909658177 %


### Boosting

In [252]:
search_space = {
    'n_estimators': [20, 100, 250, 500],
    'learning_rate': [0.01, 1.0]
}

In [253]:
# Create an instance of AdaBoostClassifier
adaboost_clf = AdaBoostClassifier()

# Perform hyperparameter tuning using Bayesian optimization
best_adaboost_model_bo, best_adaboost_params = bayesian_optimization_tuning(adaboost_clf, search_space, X_train, y_train)

# Print the best parameters found for AdaBoost Classifier
print("Best parameters for AdaBoost Classifier:")
print(best_adaboost_params)
print("-------------------------------------------------------")

adaboost_accuracy_bo = evaluate_model(best_adaboost_model_bo, X_valid, y_valid)
print("AdaBoost Classifier Bayesian Optimization Accuracy:", adaboost_accuracy_bo * 100, "%")

Best parameters for AdaBoost Classifier:
OrderedDict([('learning_rate', 1.0), ('n_estimators', 20)])
-------------------------------------------------------
AdaBoost Classifier Bayesian Optimization Accuracy: 74.77728503590912 %


### Random Forest

In [261]:
search_space = {
    'n_estimators': [400, 500, 600],
    'max_depth': [20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [263]:
# Create an instance of RandomForestClassifier
randomforest_clf = RandomForestClassifier()

# Perform hyperparameter tuning using Bayesian optimization
best_randomforest_model_bo, best_randomforest_params = bayesian_optimization_tuning(randomforest_clf, search_space, X_train, y_train)

# Print the best parameters found for Random Forest Classifier
print("Best parameters for Random Forest Classifier:")
print(best_randomforest_params)
print("-------------------------------------------------------")

randomforest_accuracy_bo = evaluate_model(best_randomforest_model_bo, X_valid, y_valid)
print("Random Forest Classifier Bayesian Optimization Accuracy:", randomforest_accuracy_bo*100, "%")

Best parameters for Random Forest Classifier:
OrderedDict([('max_depth', 50), ('min_samples_leaf', 1), ('min_samples_split', 10), ('n_estimators', 500)])
-------------------------------------------------------
Random Forest Classifier Bayesian Optimization Accuracy: 76.25289431341 %


# Final System

In [111]:
print("-----------------------------------------------------------------------------")
print(f"The best model is Random Forest:")      
print(best_randomforest_model_gs.get_params())

randomforest_accuracy_gs = evaluate_model(best_randomforest_model_gs, X_test, y_test)
print(f"Random Forest Classifier On test set Accuracy: {randomforest_accuracy_gs:.2%}")
print("-----------------------------------------------------------------------------")

-----------------------------------------------------------------------------
The best model is Random Forest:
{'n_estimators': 600, 'max_features': 'auto', 'min_samples_split': 15, 'min_samples_leaf': 1, 'max_depth': 40}
Random Forest Classifier On test set Accuracy: 78.19%
-----------------------------------------------------------------------------
