# 1. Load the Data

* First connect the drive to the Lab before any processes.
* Load all the available data that we have available to our project.
* It may impede the process and speed but I think reducing the bactch number will help.
* Splitting data into training, validation, and test sets.



In [15]:
import os
import random
import shutil
import numpy as np
import pandas as pd
import torch
import tensorflow as tf
import torch.nn as nn
import torch.optim as optim
import torchvision
import cv2

In [43]:
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow import keras
from tensorflow.keras.applications import MobileNetV2
from torchvision import datasets, transforms
from torch.utils.data import random_split, DataLoader, Dataset, WeightedRandomSampler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from PIL import Image
from collections import Counter

In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
root_folders = [
    "/content/drive/MyDrive/CoWorkStuff/Data/01_test_and_val_dataset",
    "/content/drive/MyDrive/CoWorkStuff/Data/02_test_and_val_dataset_improved",
    "/content/drive/MyDrive/CoWorkStuff/Data/03_test_and_val_dataset_resampled",
    "/content/drive/MyDrive/CoWorkStuff/Data/04_test_and_val_dataset_resampled_resized",
    "/content/drive/MyDrive/CoWorkStuff/Data/05_train_dataset",
    "/content/drive/MyDrive/CoWorkStuff/Data/06_train_deduplicated",
    "/content/drive/MyDrive/CoWorkStuff/Data/07_train_deduplicated_dataset",
    "/content/drive/MyDrive/CoWorkStuff/Data/08_train_data",
    "/content/drive/MyDrive/CoWorkStuff/Data/09_train_data_resized",
    "/content/drive/MyDrive/CoWorkStuff/Data/10_train_pal_data",
    "/content/drive/MyDrive/CoWorkStuff/Data/11_test_pal_data",
    "/content/drive/MyDrive/CoWorkStuff/Data/12_val_pal_data"
    ]


# Define a combined folder to store the merged data
combined_folder = "/content/drive/MyDrive/CoWorkStuff/CombinedData"

# Create the combined dataset folder if it doesn't exist
os.makedirs(combined_folder, exist_ok=True)

In [None]:
# Define a dictionary to map subfolder names to class labels
class_mapping = {
    'subfolder1': 'Cercospora',
    'subfolder2': 'Healthy',
    'subfolder3': 'Miner',
    'subfolder4': 'Phoma',
    'subfolder5': 'Rust'
    }

# Move subfolder content into the combined folder
def move_subfolder_content(src, dst):
  for item in os.listdir(src):
    s = os.path.join(src, item)
    d = os.path.join(dst, item)
    if os.path.isdir(s):
      os.makedirs(d, exist_ok=True)
      move_subfolder_content(s, d)
    else:
      shutil.move(s, d)

# Combine data from different root folders into the combined folder
for root_folder in root_folders:
  for subfolder in os.listdir(root_folder):
    subfolder_path = os.path.join(root_folder, subfolder)
    if os.path.isdir(subfolder_path):
      # Get the class label from the dictionary
      class_label = class_mapping.get(subfolder, 'Other')
      # Create a target folder based on the class label
      target_folder = os.path.join(combined_folder, class_label)
      # Create the target folder if it doesn't exist
      os.makedirs(target_folder, exist_ok=True)
      # Move the subfolder content to the target folder
      move_subfolder_content(subfolder_path, target_folder)

In [None]:
# Recursively collect image files from the folder(s)
def collect_image_files(folder):
  image_files = []
  for root, _, files in os.walk(folder):
    for file in files:
      if file.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp', '.gif')):
        image_files.append(os.path.join(root, file))
  return image_files

# Shuffle and split data for each class
def shuffle_and_split_data(root_folder, target_root, train_ratio, val_ratio):
  for class_label in os.listdir(root_folder):
    class_folder = os.path.join(root_folder, class_label)

    if os.path.isdir(class_folder):
      # List all files in the class folder
      class_data = collect_image_files(class_folder)
      random.shuffle(class_data)

      # Split the data into train, validation, and test sets
      num_samples = len(class_data)
      num_train = int(train_ratio * num_samples)
      num_val = int(val_ratio * num_samples)

      train_data = class_data[:num_train]
      val_data = class_data[num_train:num_train + num_val]
      test_data = class_data[num_train + num_val:]

      # Create target folders
      train_folder = os.path.join(target_root, 'train', class_label)
      val_folder = os.path.join(target_root, 'val', class_label)
      test_folder = os.path.join(target_root, 'test', class_label)

      os.makedirs(train_folder, exist_ok=True)
      os.makedirs(val_folder, exist_ok=True)
      os.makedirs(test_folder, exist_ok=True)

      # Move data to respective folders
      move_data(train_data, class_folder, train_folder)
      move_data(val_data, class_folder, val_folder)
      move_data(test_data, class_folder, test_folder)

# Define move_data
def move_data(data_list, source_folder, dest_folder):
  for data in data_list:
    src_path = os.path.join(source_folder, data)
    dst_path = os.path.join(dest_folder, data)
    if os.path.exists(src_path):
      shutil.move(src_path, dst_path)

# Define the split ratios
train_ratio = 0.65
val_ratio = 0.15
test_ratio = 0.20

# Define the target root folder for the train, val, and test sets
target_root = "/content/drive/MyDrive/CoWorkStuff/SplitData"

# Create the train, validation, and test folders
os.makedirs(os.path.join(target_root, 'train'), exist_ok=True)
os.makedirs(os.path.join(target_root, 'val'), exist_ok=True)
os.makedirs(os.path.join(target_root, 'test'), exist_ok=True)

# Shuffle and split data for each class
shuffle_and_split_data(combined_folder, target_root, train_ratio, val_ratio)

#######################################################################################################

# 2. Load the **Models Pre-trained** by our talented model building team

* Select as many models that were created.
* If possible, check the best performing one or possibly fine tunning them.
* Preferrably ensamble the models to make it more solid and stronger.
* Without any particular order or favouritism, for the pre-trained models; credit, thanks and props goes out to the following collaborators:
  - Darshan
  - Lucas
  - Dimitra
  - Juan

In [4]:
# Load .h5 model (TensorFlow/Keras)
model_01 = keras.models.load_model(
    "/content/drive/MyDrive/CoWorkStuff/Models/model_CNN1_BRACOL.h5")

In [5]:
model_02 = keras.models.load_model(
    "/content/drive/MyDrive/CoWorkStuff/Models/baseline_resnet50.h5")

In [6]:
model_03 = keras.models.load_model(
    "/content/drive/MyDrive/CoWorkStuff/Models/without_cersc_and_healthy_resnet50_deduplicated_mix_val_train_67acc.h5")

In [7]:
model_04 = keras.models.load_model(
    "/content/drive/MyDrive/CoWorkStuff/Models/without_cersc_and_healthy_resnet50_deduplicated.h5")

In [8]:
model_05 = keras.models.load_model(
    "/content/drive/MyDrive/CoWorkStuff/Models/without_cersc_and_healthy_resnet50.h5")

In [9]:
model_06 = keras.models.load_model(
    "/content/drive/MyDrive/CoWorkStuff/Models/without_cersc_resnet50_deduplicated_mix_val_train_75acc.h5")

In [10]:
# Load .pth model (PyTorch)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
torch.cuda.empty_cache()

class CoffeeLeafClassifier(nn.Module):
  def __init__(self):
    super(CoffeeLeafClassifier, self).__init__()

    # Convolutional layers
    self.conv_layers = nn.Sequential(
        nn.Conv2d(3, 32, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),

        nn.Conv2d(32, 64, kernel_size=3),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),

        nn.Conv2d(64, 128, kernel_size=3),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),
        )
    # Fully connected layers
    self.fc_layers = nn.Sequential(
        nn.Linear(128 * 30 * 30, 512),
        nn.ReLU(),
        nn.Dropout(0.5),

        nn.Linear(512, 256),
        nn.ReLU(),
        nn.Dropout(0.5),

        nn.Linear(256, 128),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(128, 5) # 5 classes
        )

  def forward(self, x):
    x = self.conv_layers(x)
    x = x.view(x.size(0), -1) # Flatten the output
    x = self.fc_layers(x)
    return x

model_path = os.path.join(
    "/content/drive/MyDrive/CoWorkStuff/Models/cnn_strategy1_weighted_loss",
    "/content/drive/MyDrive/CoWorkStuff/Models/cnn_strategy1_weighted_loss/coffee_leaf_classifier.pth")
model_07 = torch.load(model_path, map_location=torch.device('cpu'))
model_07.to(device)
model_07.eval()

cuda


CoffeeLeafClassifier(
  (conv_layers): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1))
    (7): ReLU()
    (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc_layers): Sequential(
    (0): Linear(in_features=115200, out_features=512, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=512, out_features=256, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=256, out_features=128, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.5, inplace=False)
    (9): Linear(in_features=128, out_features=5, bias=True)
  )
)

#######################################################################################################

# 3. Current Models' Performance

In [23]:
from keras.applications.vgg16 import preprocess_input
from PIL import Image
import numpy as np
import os
import cv2

# Define the folder containing your test data
val_data_folder = "/content/drive/MyDrive/CoWorkStuff/SplitData/val"

# Define lists to store model predictions
predictions_model_01 = []
predictions_model_02 = []
predictions_model_03 = []
predictions_model_04 = []
predictions_model_05 = []
predictions_model_06 = []
predictions_model_07 = []

# Define a list to store true labels
true_labels = []

# Class Mapping
class_mapping = {
    'Cercospora': 0,
    'Healthy': 1,
    'Miner': 2,
    'Phoma': 3,
    'Rust': 4
}

# Define a function to load and preprocess an image
def load_and_preprocess_image(image_path, target_size):
  try:
    img = Image.open(image_path)
    img = img.resize(target_size)
    img_array = np.array(img)

    # Preprocessing the img_array
    img_array = preprocess_input(img_array)
    return img_array

  except Exception as e:
    print(f"Error processing {image_path}: {e}")
    return None

# Iterate through the val data folder
for class_label in os.listdir(val_data_folder):
  if class_label in class_mapping:
    class_folder = os.path.join(val_data_folder, class_label)
    true_class = class_mapping[class_label]

    # Iterate through the images in the class folder
    for image_name in os.listdir(class_folder):
      image_path = os.path.join(class_folder, image_name)

      # Load and preprocess the image
      img = load_and_preprocess_image(image_path, target_size=(256, 256))

      # Check if the image was loaded successfully
      if img is not None:
        img_02 = load_and_preprocess_image(image_path, target_size=(224, 224))

        img_03 = cv2.imread(image_path)
        img_03 = cv2.resize(img_03, (256, 256)).astype(np.float32)
        img_03 = cv2.cvtColor(img_03, cv2.COLOR_BGR2RGB)
        img_03 = preprocess_input(img_03)
        img_03 = img_03 / 255.0

        # Store the true label
        true_labels.append(true_class)

        # Make predictions using each model
        pred_model_01 = model_01.predict(np.expand_dims(img, axis=0))
        pred_model_02 = model_02.predict(np.expand_dims(img_02, axis=0))
        pred_model_03 = model_03.predict(np.expand_dims(img_02, axis=0))
        pred_model_04 = model_04.predict(np.expand_dims(img_02, axis=0))
        pred_model_05 = model_05.predict(np.expand_dims(img_02, axis=0))
        pred_model_06 = model_06.predict(np.expand_dims(img_02, axis=0))

        img_tensor = img_tensor.to(device)
        with torch.no_grad():
          pred_model_07 = model_07(img_tensor)
          pred_model_07 = pred_model_07.cpu()

        # Append predictions to respective lists
        predictions_model_01.append(np.argmax(pred_model_01))
        predictions_model_02.append(np.argmax(pred_model_02))
        predictions_model_03.append(np.argmax(pred_model_03))
        predictions_model_04.append(np.argmax(pred_model_04))
        predictions_model_05.append(np.argmax(pred_model_05))
        predictions_model_06.append(np.argmax(pred_model_06))
        predictions_model_07.append(np.argmax(pred_model_07))











In [25]:
# model predictions evaluation (e.g., accuracy)
accuracy_model_01 = accuracy_score(true_labels, predictions_model_01)
accuracy_model_02 = accuracy_score(true_labels, predictions_model_02)
accuracy_model_03 = accuracy_score(true_labels, predictions_model_03)
accuracy_model_04 = accuracy_score(true_labels, predictions_model_04)
accuracy_model_05 = accuracy_score(true_labels, predictions_model_05)
accuracy_model_06 = accuracy_score(true_labels, predictions_model_06)
accuracy_model_07 = accuracy_score(true_labels, predictions_model_07)

print("Accuracy (Model_01):", accuracy_model_01)
print("Accuracy (Model_02):", accuracy_model_02)
print("Accuracy (Model_03):", accuracy_model_03)
print("Accuracy (Model_04):", accuracy_model_04)
print("Accuracy (Model_05):", accuracy_model_05)
print("Accuracy (Model_06):", accuracy_model_06)
print("Accuracy (Model_07):", accuracy_model_07)

Accuracy (Model_01): 0.19946808510638298
Accuracy (Model_02): 0.3045212765957447
Accuracy (Model_03): 0.14627659574468085
Accuracy (Model_04): 0.16356382978723405
Accuracy (Model_05): 0.16356382978723405
Accuracy (Model_06): 0.08909574468085106
Accuracy (Model_07): 0.09042553191489362


In [26]:
# DataFrame to store predictions
original_predictions_df = pd.DataFrame({
    'True Labels': true_labels,
    'Model 01 Predictions': predictions_model_01,
    'Model 02 Predictions': predictions_model_02,
    'Model 03 Predictions': predictions_model_03,
    'Model 04 Predictions': predictions_model_04,
    'Model 05 Predictions': predictions_model_05,
    'Model 06 Predictions': predictions_model_06,
    'Model 07 Predictions': predictions_model_07,
    })

In [37]:
file_path_01 = '/content/drive/MyDrive/CoWorkStuff/original_predictions.csv'
original_predictions_df.to_csv(file_path_01, index=False)

######################################################################################################

# 4. Weighted Averaging of the models
- The weighting was done according to the performance of each pretrained model on the chosen test data

In [31]:
# Define the weights for each model based on the above "Current Performance"
weights = [0.17, 0.26, 0.13, 0.14, 0.14, 0.08, 0.08]

# Create an empty list to store the ensemble predictions
weighted_ave_predictions = []

# Iterate through the true labels and predictions
for true_label, pred_1, pred_2, pred_3, pred_4, pred_5, pred_6, pred_7 in zip(
    true_labels, predictions_model_01, predictions_model_02, predictions_model_03,
    predictions_model_04, predictions_model_05, predictions_model_06,
    predictions_model_07
    ):

  # Calculate the weighted average prediction for each class label
  weighted_ave_prediction = (
      weights[0] * pred_1 + weights[1] * pred_2 + weights[2] * pred_3 +
      weights[3] * pred_4 + weights[4] * pred_5 + weights[5] * pred_6 +
      weights[6] * pred_7
      )
  # Append the ensemble prediction to the list
  weighted_ave_predictions.append(weighted_ave_prediction)

# Convert the ensemble predictions to class labels (index of maximum value)
ensemble_weighted_ave_predictions = [np.argmax(pred) for pred in weighted_ave_predictions]

# Evaluate the ensemble predictions (e.g., accuracy, classification report)
ensemble_weighted_ave_accuracy = accuracy_score(true_labels, ensemble_weighted_ave_predictions)
ensemble_weighted_ave_classification_report = classification_report(true_labels, ensemble_weighted_ave_predictions, zero_division=1)

print("Ensemble Accuracy:", ensemble_weighted_ave_accuracy)

Ensemble Accuracy: 0.09042553191489362


In [32]:
print("Ensamble Weighted Average Classification Report:\n", ensemble_weighted_ave_classification_report)

Ensamble Weighted Average Classification Report:
               precision    recall  f1-score   support

           0       0.09      1.00      0.17        68
           1       1.00      0.00      0.00       165
           2       1.00      0.00      0.00       146
           3       1.00      0.00      0.00       123
           4       1.00      0.00      0.00       250

    accuracy                           0.09       752
   macro avg       0.82      0.20      0.03       752
weighted avg       0.92      0.09      0.01       752



In [38]:
# DataFrame to store predictions
ensemble_weighted_ave_predictions_df = pd.DataFrame({
    'True Labels': true_labels,
    'Ensemble Weighted Average Predictions': ensemble_weighted_ave_predictions
    })

file_path_02 = '/content/drive/MyDrive/CoWorkStuff/Ensemble_weighted_ave_predictions.csv'
ensemble_weighted_ave_predictions_df.to_csv(file_path_02, index=False)

######################################################################################################

# 4. Data Preperation for the rest.

In [34]:
# path to test data folder
train_data_folder = "/content/drive/MyDrive/CoWorkStuff/SplitData/train"

# lists to store images and their labels
train_images = []
train_labels = []

# list to store bad image paths
bad_file_list = []
bad_count = 0

# list of models
models = [
    ('model_01', model_01),
    ('model_02', model_02),
    ('model_03', model_03),
    ('model_04', model_04),
    ('model_05', model_05),
    ('model_06', model_06),
    ('model_07', model_07)
    ]

# batch size for loading and preprocessing images
batch_size = 32

for class_label in os.listdir(train_data_folder):
  class_folder = os.path.join(train_data_folder, class_label)

  # load and preprocess images in batches
  image_paths = [os.path.join(class_folder, image_file) for image_file in os.listdir(class_folder)]
  num_images = len(image_paths)
  num_batches = (num_images + batch_size - 1) // batch_size

  for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, num_images)
    batch_paths = image_paths[start_idx:end_idx]

    try:
      images = np.array([img_to_array(load_img(image_path, target_size=(224, 224))) for image_path in batch_paths])
      images = images / 255.0
      train_images.extend(images)
      train_labels.extend([class_label] * len(images))

    except Exception as e:
      # Handle bad images by appending their paths to the bad_file_list
      bad_file_list.extend(batch_paths)
      bad_count += len(batch_paths)

In [40]:
# Convert the Train lists to NumPy arrays
X_train = np.array(train_images)
y_train = np.array(train_labels)

# Save X_train and y_train
file_path_03 = '/content/drive/MyDrive/CoWorkStuff/'

np.save(os.path.join(file_path_03, "X_train.npy"), X_train)
np.save(os.path.join(file_path_03, "y_train.npy"), y_train)

In [41]:
# path to test data folder
test_data_folder = "/content/drive/MyDrive/CoWorkStuff/SplitData/test"

# lists to store images and their labels
test_images = []
test_labels = []

# Iterate through the subfolders (class labels) the test data folder
for class_label in os.listdir(test_data_folder):
  lass_folder = os.path.join(test_data_folder, class_label)

  # NumPy to efficiently load and preprocess images in batches
  image_paths = [os.path.join(class_folder, image_file) for image_file in os.listdir(class_folder)]
  num_images = len(image_paths)
  num_batches = (num_images + batch_size - 1) // batch_size

  for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, num_images)
    batch_paths = image_paths[start_idx:end_idx]

    try:
      images = np.array([img_to_array(load_img(image_path, target_size=(224, 224))) for image_path in batch_paths])
      images = images / 255.0
      test_images.extend(images)
      test_labels.extend([class_label] * len(images))

    except Exception as e:
      # Handle bad images by appending their paths to the bad_file_list
      bad_file_list.extend(batch_paths)
      bad_count += len(batch_paths)

In [42]:
# Convert the test lists to NumPy arrays
X_test = np.array(test_images)
y_test = np.array(test_labels)

# Save X_test and y_test
file_path_04 = '/content/drive/MyDrive/CoWorkStuff/'

np.save(os.path.join(file_path_04, "X_test.npy"), X_test)
np.save(os.path.join(file_path_04, "y_test.npy"), y_test)

######################################################################################################

# 5. Stacking

- Tried many different techniques using most types of classifiers.
- Took those classifiers and stacked them together using the Stacking classifier as the meta model.
- There has been tramendous improvement in the above models this time.
- Among these models, the highest performing model has an accuracy of about 54%.
- The lowest performing model has an accuracy of about 45%.
- So there is more or less a 9% deviation of performance among these models.
- And there is about 16% deviation in performance from the below models and the original models already trained by our collaborators.
- There is also an approximate 47% performance deviation to the below models with the original models weighted together.

In [44]:
# Create a feature matrix by stacking the predictions from different models
X = np.column_stack((predictions_model_01, predictions_model_02,
                     predictions_model_03, predictions_model_04,
                     predictions_model_05, predictions_model_06,
                     predictions_model_07))

# Split your data into training and testing sets (you can load your own data)
X_train_01, X_test_01, y_train_01, y_test_01 = train_test_split(X, true_labels,
                                                                test_size=0.2,
                                                                random_state=42)

# Create a RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the classifier on the training data
rf_classifier.fit(X_train_01, y_train_01)

# Make predictions on the test data
rf_predictions = rf_classifier.predict(X_test_01)

# Calculate the accuracy of the RandomForestClassifier
accuracy = accuracy_score(y_test_01, rf_predictions)
print(f"Random Forest Classifier Accuracy: {accuracy:.2f}")

Random Forest Classifier Accuracy: 0.50


In [74]:
rf_classification_report = classification_report(y_test_01, rf_predictions, zero_division=1)
print("Random Forest Classifier Classification Report:\n", rf_classification_report)

Random Forest Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.20      0.06      0.09        17
           1       0.76      0.91      0.83        35
           2       0.26      0.29      0.27        21
           3       0.33      0.18      0.24        22
           4       0.46      0.57      0.51        56

    accuracy                           0.50       151
   macro avg       0.40      0.40      0.39       151
weighted avg       0.46      0.50      0.46       151



In [46]:
# Create a feature matrix by stacking the predictions from different models
X_02 = np.column_stack((predictions_model_01, predictions_model_02,
                     predictions_model_03, predictions_model_04,
                     predictions_model_05, predictions_model_06,
                     predictions_model_07))

# Split your data into training and testing sets (you can load your own data)
X_train_02, X_test_02, y_train_02, y_test_02 = train_test_split(X_02, true_labels,
                                                    test_size=0.2,
                                                    random_state=42)

# Create a GradientBoostingClassifier
gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Fit the classifier on the training data
gb_classifier.fit(X_train_02, y_train_02)

# Make predictions on the test data
gb_predictions = gb_classifier.predict(X_test_02)

# Calculate the accuracy of the GradientBoostingClassifier
accuracy = accuracy_score(y_test_02, gb_predictions)
print(f"Gradient Boosting Classifier Accuracy: {accuracy:.2f}")


Gradient Boosting Classifier Accuracy: 0.50


In [75]:
gb_predictions_report = classification_report(y_test_02, gb_predictions, zero_division=1)
print("Gradient Boosting Classifier Classification Report:\n", gb_predictions_report)

Gradient Boosting Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.40      0.12      0.18        17
           1       0.77      0.94      0.85        35
           2       0.29      0.29      0.29        21
           3       0.23      0.14      0.17        22
           4       0.45      0.55      0.50        56

    accuracy                           0.50       151
   macro avg       0.43      0.41      0.40       151
weighted avg       0.46      0.50      0.47       151



In [48]:
from sklearn.svm import SVC

# Create a feature matrix by stacking the predictions from different models
X_04 = np.column_stack((predictions_model_01, predictions_model_02,
                     predictions_model_03, predictions_model_04,
                     predictions_model_05, predictions_model_06,
                     predictions_model_07))

# Split your data into training and testing sets (you can load your own data)
X_train_04, X_test_04, y_train_04, y_test_04 = train_test_split(X_04, true_labels,
                                                                test_size=0.2,
                                                                random_state=42)

# Create an SVC classifier
svc_classifier = SVC(kernel='linear', probability=True, random_state=42)

# Fit the classifier on the training data
svc_classifier.fit(X_train_04, y_train_04)

# Make predictions on the test data
svc_predictions = svc_classifier.predict(X_test_04)

# Calculate the accuracy of the SVC classifier
accuracy = accuracy_score(y_test_04, svc_predictions)
print(f"SVC Classifier Accuracy: {accuracy:.2f}")

SVC Classifier Accuracy: 0.54


In [76]:
svc_predictions_report = classification_report(y_test_04, svc_predictions, zero_division=1)
print("SVC Classifier Classification Report:\n", svc_predictions_report)

SVC Classifier Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.00      0.00        17
           1       0.69      0.94      0.80        35
           2       0.38      0.24      0.29        21
           3       1.00      0.00      0.00        22
           4       0.48      0.77      0.59        56

    accuracy                           0.54       151
   macro avg       0.71      0.39      0.34       151
weighted avg       0.65      0.54      0.44       151



In [60]:
from sklearn.neighbors import KNeighborsClassifier

# Create a feature matrix by stacking the predictions from different models
X_05 = np.column_stack((predictions_model_01, predictions_model_02,
                     predictions_model_03, predictions_model_04,
                     predictions_model_05, predictions_model_06,
                     predictions_model_07))

# Split your data into training and testing sets
X_train_05, X_test_05, y_train_05, y_test_05 = train_test_split(X_05, true_labels,
                                                                test_size=0.2,
                                                                random_state=42)

# Create a K-Nearest Neighbors (KNN) classifier
knn_classifier = KNeighborsClassifier(n_neighbors=100)

# Fit the KNN classifier on the training data
knn_classifier.fit(X_train_05, y_train_05)

# Make predictions on the test data
knn_predictions = knn_classifier.predict(X_test_05)

# Calculate the accuracy of the KNN classifier
accuracy = accuracy_score(y_test_05, knn_predictions)
print(f"K-Nearest Neighbors (KNN) Accuracy: {accuracy:.2f}")

K-Nearest Neighbors (KNN) Accuracy: 0.54


In [77]:
knn_predictions_classification_report = classification_report(y_test_05, knn_predictions, zero_division=1)
print("K-Nearest Neighbors (KNN) Classification Report:\n", knn_predictions_classification_report)

K-Nearest Neighbors (KNN) Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.00      0.00        17
           1       0.71      0.91      0.80        35
           2       0.45      0.24      0.31        21
           3       0.38      0.36      0.37        22
           4       0.50      0.66      0.57        56

    accuracy                           0.54       151
   macro avg       0.61      0.44      0.41       151
weighted avg       0.58      0.54      0.49       151



In [64]:
from sklearn.tree import DecisionTreeClassifier

# Create a feature matrix by stacking the predictions from different models
X_06 = np.column_stack((predictions_model_01, predictions_model_02,
                     predictions_model_03, predictions_model_04,
                     predictions_model_05, predictions_model_06,
                     predictions_model_07))

# Split your data into training and testing sets
X_train_06, X_test_06, y_train_06, y_test_06 = train_test_split(X_06,
                                                                true_labels,
                                                                test_size=0.2,
                                                                random_state=42)

# Create a Decision Tree Classifier
decision_tree_classifier = DecisionTreeClassifier(random_state=42)

# Fit the classifier on the training data
decision_tree_classifier.fit(X_train_06, y_train_06)

# Make predictions on the test data
decision_tree_predictions = decision_tree_classifier.predict(X_test_06)

# Calculate the accuracy of the Decision Tree Classifier
accuracy = accuracy_score(y_test_06, decision_tree_predictions)
print(f"Decision Tree Classifier Accuracy: {accuracy:.2f}")

Decision Tree Classifier Accuracy: 0.45


In [78]:
decision_tree_classification_report = classification_report(y_test_06, decision_tree_predictions, zero_division=1)
print("Decision Tree Classifier Classification Report:\n", decision_tree_classification_report)

Decision Tree Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.10      0.06      0.07        17
           1       0.79      0.94      0.86        35
           2       0.21      0.29      0.24        21
           3       0.22      0.09      0.13        22
           4       0.43      0.46      0.44        56

    accuracy                           0.45       151
   macro avg       0.35      0.37      0.35       151
weighted avg       0.41      0.45      0.42       151



In [51]:
from sklearn.ensemble import AdaBoostClassifier

# Create a feature matrix by stacking the predictions from different models
X_07 = np.column_stack((predictions_model_01, predictions_model_02,
                     predictions_model_03, predictions_model_04,
                     predictions_model_05, predictions_model_06,
                     predictions_model_07))

# Split your data into training and testing sets
X_train_07, X_test_07, y_train_07, y_test_07 = train_test_split(X_07,
                                                                true_labels,
                                                                test_size=0.2,
                                                                random_state=42)

# Create an AdaBoost Classifier
adaboost_classifier = AdaBoostClassifier(random_state=42)

# Fit the classifier on the training data
adaboost_classifier.fit(X_train_07, y_train_07)

# Make predictions on the test data
adaboost_predictions = adaboost_classifier.predict(X_test_07)

# Calculate the accuracy of the AdaBoost Classifier
accuracy = accuracy_score(y_test_07, adaboost_predictions)
print(f"AdaBoost Classifier Accuracy: {accuracy:.2f}")

AdaBoost Classifier Accuracy: 0.48


In [79]:
adaboost_classification_report = classification_report(y_test_07, adaboost_predictions, zero_division=1)
print("AdaBoost Classifier Classification Report:\n", adaboost_classification_report)

AdaBoost Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.24      0.24      0.24        17
           1       0.71      0.91      0.80        35
           2       0.21      0.24      0.22        21
           3       0.41      0.32      0.36        22
           4       0.52      0.45      0.48        56

    accuracy                           0.48       151
   macro avg       0.42      0.43      0.42       151
weighted avg       0.47      0.48      0.47       151



In [52]:
from xgboost import XGBClassifier

# Create a feature matrix by stacking the predictions from different models
X_08 = np.column_stack((predictions_model_01, predictions_model_02,
                     predictions_model_03, predictions_model_04,
                     predictions_model_05, predictions_model_06,
                     predictions_model_07))

# Split your data into training and testing sets
X_train_08, X_test_08, y_train_08, y_test_08 = train_test_split(X_08,
                                                                true_labels,
                                                                test_size=0.2,
                                                                random_state=42)

# Create an XGBoost Classifier
xgb_classifier = XGBClassifier(random_state=42)

# Fit the classifier on the training data
xgb_classifier.fit(X_train_08, y_train_08)

# Make predictions on the test data
xgb_predictions = xgb_classifier.predict(X_test_08)

# Calculate the accuracy of the XGBoost Classifier
accuracy = accuracy_score(y_test_08, xgb_predictions)
print(f"XGBoost Classifier Accuracy: {accuracy:.2f}")

XGBoost Classifier Accuracy: 0.51


In [80]:
xgb_classification_report = classification_report(y_test_08, xgb_predictions, zero_division=1)
print("XGBoost  Classifier Classification Report:\n", xgb_classification_report)

XGBoost  Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.20      0.06      0.09        17
           1       0.79      0.94      0.86        35
           2       0.27      0.33      0.30        21
           3       0.40      0.18      0.25        22
           4       0.47      0.57      0.52        56

    accuracy                           0.51       151
   macro avg       0.43      0.42      0.40       151
weighted avg       0.47      0.51      0.48       151



In [53]:
from lightgbm import LGBMClassifier

# Create a feature matrix by stacking the predictions from different models
X_09 = np.column_stack((predictions_model_01, predictions_model_02,
                     predictions_model_03, predictions_model_04,
                     predictions_model_05, predictions_model_06,
                     predictions_model_07))

# Split your data into training and testing sets
X_train_09, X_test_09, y_train_09, y_test_09 = train_test_split(X_09,
                                                                true_labels,
                                                                test_size=0.2,
                                                                random_state=42)

# Create a LightGBM Classifier
lgbm_classifier = LGBMClassifier(random_state=42)

# Fit the classifier on the training data
lgbm_classifier.fit(X_train_09, y_train_09)

# Make predictions on the test data
lgbm_predictions = lgbm_classifier.predict(X_test_09)

# Calculate the accuracy of the LightGBM Classifier
accuracy = accuracy_score(y_test_09, lgbm_predictions)
print(f"LightGBM Classifier Accuracy: {accuracy:.2f}")

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23
[LightGBM] [Info] Number of data points in the train set: 601, number of used features: 6
[LightGBM] [Info] Start training from score -2.466769
[LightGBM] [Info] Start training from score -1.531060
[LightGBM] [Info] Start training from score -1.570281
[LightGBM] [Info] Start training from score -1.783474
[LightGBM] [Info] Start training from score -1.130737
LightGBM Classifier Accuracy: 0.52


In [81]:
lgbm_classification_report = classification_report(y_test_09, lgbm_predictions, zero_division=1)
print("LightGBM  Classifier Classification Report:\n", lgbm_classification_report)

LightGBM  Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        17
           1       0.75      0.94      0.84        35
           2       0.25      0.33      0.29        21
           3       0.43      0.14      0.21        22
           4       0.49      0.62      0.55        56

    accuracy                           0.52       151
   macro avg       0.38      0.41      0.38       151
weighted avg       0.45      0.52      0.47       151



In [54]:
from sklearn.neural_network import MLPClassifier

# Create a feature matrix by stacking the predictions from different models
X_10 = np.column_stack((predictions_model_01, predictions_model_02,
                     predictions_model_03, predictions_model_04,
                     predictions_model_05, predictions_model_06,
                     predictions_model_07))

# Split your data into training and testing sets
X_train_10, X_test_10, y_train_10, y_test_10 = train_test_split(X_10,
                                                                true_labels,
                                                                test_size=0.2,
                                                                random_state=42)

# Create an MLP Classifier
mlp_classifier = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500,
                               random_state=42)

# Fit the classifier on the training data
mlp_classifier.fit(X_train_10, y_train_10)

# Make predictions on the test data
mlp_predictions = mlp_classifier.predict(X_test_10)

# Calculate the accuracy of the MLP Classifier
accuracy = accuracy_score(y_test_10, mlp_predictions)
print(f"MLP Classifier Accuracy: {accuracy:.2f}")

MLP Classifier Accuracy: 0.45


In [82]:
mlp_classification_report = classification_report(y_test_10, mlp_predictions, zero_division=1)
print("MLP Classifier Classification Report:\n", mlp_classification_report)

MLP Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.14      0.06      0.08        17
           1       0.77      0.94      0.85        35
           2       0.20      0.43      0.27        21
           3       0.40      0.27      0.32        22
           4       0.47      0.34      0.40        56

    accuracy                           0.45       151
   macro avg       0.40      0.41      0.38       151
weighted avg       0.46      0.45      0.44       151



In [65]:
# Combination of individual model predictions into a single feature matrix
X_03 = np.array([predictions_model_01, predictions_model_02, predictions_model_03,
              predictions_model_04, predictions_model_05, predictions_model_06,
              predictions_model_07]).T

# Convert true_labels into a NumPy array
y_03 = np.array(true_labels)

# Split data into training and testing sets
X_train_03, X_test_03, y_train_03, y_test_03 = train_test_split(X_03, y_03, test_size=0.2,
                                                    random_state=42)

# meta-model (Logistic Regression)
meta_model = LogisticRegression()

# Stacking classifier with all the classifyers models and meta-model
stacking_classifier = StackingClassifier(
    estimators=[
        ('Gradient Boosting Classifier', GradientBoostingClassifier(n_estimators=100,
                                                                    random_state=42)),
        ('Random Forest Classifier', RandomForestClassifier(n_estimators=100,
                                                             random_state=42)),
        ("SVC Classifier Accuracy", SVC(kernel='linear', probability=True,
                                        random_state=42)),
        ("K-Nearest Neighbors (KNN)", KNeighborsClassifier(n_neighbors=100)),
        ("Decision Tree Classifier", DecisionTreeClassifier(random_state=42)),
        ("AdaBoost Classifier", AdaBoostClassifier(random_state=42)),
        ("XGBoost Classifier", XGBClassifier(random_state=42)),
        ("LightGBM Classifier", LGBMClassifier(random_state=42)),
        ("MLP Classifier", MLPClassifier(hidden_layer_sizes=(100, 50),
                                         max_iter=500,
                                         random_state=42))
         ],
        final_estimator=meta_model
                )

# Train the stacking classifier
stacking_classifier.fit(X_train_03, y_train_03)

# Make predictions using the stacking classifier
y_pred_03 = stacking_classifier.predict(X_test_03)

# Calculate accuracy on the test set
accuracy = accuracy_score(y_test_03, y_pred_03)
print("Ensemble Stacking Accuracy:", accuracy)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 23
[LightGBM] [Info] Number of data points in the train set: 601, number of used features: 6
[LightGBM] [Info] Start training from score -2.466769
[LightGBM] [Info] Start training from score -1.531060
[LightGBM] [Info] Start training from score -1.570281
[LightGBM] [Info] Start training from score -1.783474
[LightGBM] [Info] Start training from score -1.130737
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22
[LightGBM] [Info] Number of data points in the train set: 480, number of used features: 6
[LightGBM] [Info] Start training from score -2.484907
[LightGBM] [Info] Start training from score -1.529395
[LightGBM] [Info] Start training from score -1.568616
[LightGBM] [Info] Start training from score -1.779337
[LightGBM] [Info] Start tr

In [83]:
ensemble_stacking_classification_report = classification_report(y_test_03, y_pred_03, zero_division=1)
print("Ensemble Stacking Classification Report:\n", ensemble_stacking_classification_report)

Ensemble Stacking Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        17
           1       0.80      0.94      0.87        35
           2       0.22      0.19      0.21        21
           3       0.42      0.23      0.29        22
           4       0.47      0.66      0.55        56

    accuracy                           0.52       151
   macro avg       0.38      0.40      0.38       151
weighted avg       0.45      0.52      0.48       151



######################################################################################################

# Thank You!