In [1]:
# Step between the numeric suffixes of the saved segment files
# (`train_0.npy`, `train_128.npy`, ...) when they are loaded further down.
BATCH_SIZE = 128
# NOTE(review): not referenced anywhere in the visible cells; presumably the
# token sequence length used when the DistilBERT states were extracted — confirm.
MAX_LENGTH = 128

# Ensemble Model Run

In this notebook, we will run ensemble models on the segmented data. We will use the following models:
1. Random Forest
2. XGBoost
3. LightGBM

In [2]:
import numpy as np
import pandas as pd

In [3]:
LABEL_COLUMN = "Req/Not Req"
LABEL_ENCODING = {"Req": 1, "Not_Req": 0}


def load_labels(csv_path):
    """Load the binary requirement labels of one PURE split.

    Parameters
    ----------
    csv_path : str
        Path to a split CSV that contains a ``"Req/Not Req"`` column.

    Returns
    -------
    numpy.ndarray
        One entry per CSV row: 1 for ``"Req"`` and 0 for ``"Not_Req"``.

    Raises
    ------
    ValueError
        If the column holds anything other than ``"Req"`` / ``"Not_Req"``
        (missing cells included).
    """
    raw_labels = pd.read_csv(csv_path, usecols=[LABEL_COLUMN])[LABEL_COLUMN]
    encoded = raw_labels.map(LABEL_ENCODING)

    # `Series.map` turns every value that is absent from the mapping into NaN.
    # Fail here with the offending values instead of handing NaN labels to a
    # classifier several cells later.
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = sorted(raw_labels[unmapped].astype(str).unique())
        raise ValueError(f"Unexpected label value(s) in {csv_path}: {unexpected}")

    return encoded.values


train_labels = load_labels("../../../Datasets/RegExpPURE/PURE_train.csv")
test_labels = load_labels("../../../Datasets/RegExpPURE/PURE_test.csv")
valid_labels = load_labels("../../../Datasets/RegExpPURE/PURE_valid.csv")

In [4]:
# Number of samples in each split, taken from the length of its label array.
train_feature_size, test_feature_size, valid_feature_size = (
    len(split_labels) for split_labels in (train_labels, test_labels, valid_labels)
)

In [5]:
STATES_DIR = "../../../States/DistilBert_Pretrained"


def load_segments(split_dir, file_prefix, n_samples, batch_size=BATCH_SIZE):
    """Load and concatenate the saved ``.npy`` segment files of one split.

    Files are expected at ``{STATES_DIR}/{split_dir}/{file_prefix}_{i}.npy``
    for ``i = 0, batch_size, 2 * batch_size, ...`` while ``i < n_samples``.

    Parameters
    ----------
    split_dir : str
        Sub-directory of ``STATES_DIR`` holding the split (e.g. ``"Train"``).
    file_prefix : str
        File-name prefix placed before the ``_{i}.npy`` suffix.
    n_samples : int
        Number of samples in the split; bounds the file indices.
    batch_size : int, optional
        Step between consecutive file indices. Defaults to ``BATCH_SIZE``.

    Returns
    -------
    numpy.ndarray
        All loaded arrays concatenated along their first axis.

    Raises
    ------
    ValueError
        If ``n_samples`` is not positive, so there is no file to load.
    """
    batch_paths = [
        f"{STATES_DIR}/{split_dir}/{file_prefix}_{start}.npy"
        for start in range(0, n_samples, batch_size)
    ]
    if not batch_paths:
        # np.concatenate would also raise ValueError on an empty list, but
        # without saying which split was empty.
        raise ValueError(f"No segment files to load for split '{split_dir}' (n_samples={n_samples})")
    return np.concatenate([np.load(path) for path in batch_paths])


train_segments = load_segments("Train", "train", train_feature_size)
test_segments = load_segments("Test", "test", test_feature_size)
# The validation files use a capitalised prefix on disk, unlike train/test.
valid_segments = load_segments("Validation", "Validation", valid_feature_size)

In [6]:
# Reshape the segments such that each segment is a 1D array
# Collapse everything after the sample axis so each split becomes a 2-D
# (n_samples, n_features) matrix.
train_segments = np.reshape(train_segments, (train_feature_size, -1))
test_segments = np.reshape(test_segments, (test_feature_size, -1))
valid_segments = np.reshape(valid_segments, (valid_feature_size, -1))

In [7]:
# Display the (n_samples, n_features) shape of the train, test and validation matrices.
tuple(split.shape for split in (train_segments, test_segments, valid_segments))

((5306, 98304), (1534, 98304), (905, 98304))

In [8]:
# from sklearn.decomposition import PCA

# # Create a PCA object
# pca = PCA(n_components=0.97)  # Retain 97% of the variance

# # Fit PCA on the train segments
# train_segments_pca = pca.fit_transform(train_segments)
# test_segments_pca = pca.transform(test_segments)
# valid_segments_pca = pca.transform(valid_segments)

# # Print the shape of the transformed segments
# print("Train segments shape after PCA:", train_segments_pca.shape)
# print("Validation segments shape after PCA:", valid_segments_pca.shape)
# print("Test segments shape after PCA:", test_segments_pca.shape)
# print("Explained variance ratio:", pca.explained_variance_ratio_.sum())

In [9]:
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

## Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier

In [11]:
# Baseline random forest on the flattened segment features.
# n_jobs=-1 builds the trees on all available cores; with a fixed random_state
# the fitted forest is the same as in the single-threaded run, it just finishes
# sooner on this very wide feature matrix.
rf_clf = RandomForestClassifier(random_state=42, n_jobs=-1)

rf_clf.fit(train_segments, train_labels)

# Test set performance
test_preds = rf_clf.predict(test_segments)
print("Test set performance: \n", classification_report(test_labels, test_preds))

# Validation set performance
valid_preds = rf_clf.predict(valid_segments)
print("Validation set performance: \n", classification_report(valid_labels, valid_preds))

# NOTE(review): in the last saved run the test split is mostly class 1
# (1058 of 1534) while the validation split is mostly class 0 (650 of 905),
# and test accuracy is only 0.48. Verify that the test segments and labels are
# aligned and that the splits come from the same distribution.

Test set performance: 
               precision    recall  f1-score   support

           0       0.37      0.92      0.52       476
           1       0.88      0.28      0.43      1058

    accuracy                           0.48      1534
   macro avg       0.62      0.60      0.47      1534
weighted avg       0.72      0.48      0.46      1534

Validation set performance: 
               precision    recall  f1-score   support

           0       0.86      0.79      0.82       650
           1       0.55      0.66      0.60       255

    accuracy                           0.75       905
   macro avg       0.70      0.73      0.71       905
weighted avg       0.77      0.75      0.76       905



In [12]:
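# # NOTE: this cell fits on `train_segments_pca`, which only exists when the
# # commented-out PCA cell above is enabled; re-enable both together.
# # Define the parameter grid for hyperparameter tuning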
# param_grid = {
#     "n_estimators": [100, 200, 300],
#     "max_depth": [None, 5, 10],
#     "min_samples_split": [2, 5, 10],
#     "min_samples_leaf": [1, 2, 4],
# }

# # Create a random forest classifier
# rf_clf = RandomForestClassifier(random_state=42)

# # Perform grid search cross-validation
# grid_search = GridSearchCV(
#     estimator=rf_clf, param_grid=param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=2
# )
# grid_search.fit(train_segments_pca, train_labels)

# # Print the best hyperparameters and the corresponding accuracy score
# print("Best Hyperparameters: ", grid_search.best_params_)
# print("Best Accuracy Score: ", grid_search.best_score_)

In [13]:
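# # NOTE: this cell needs `grid_search` from the grid-search cell above and the
# # `*_segments_pca` arrays from the commented-out PCA cell.
# rf_clf_best = RandomForestClassifier(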
#     n_estimators=grid_search.best_params_["n_estimators"],
#     max_depth=grid_search.best_params_["max_depth"],
#     min_samples_split=grid_search.best_params_["min_samples_split"],
#     min_samples_leaf=grid_search.best_params_["min_samples_leaf"],
#     random_state=42,
# )

# rf_clf_best.fit(train_segments_pca, train_labels)

# # Evaluate the Random Forest Classifier on the test set
# test_predictions = rf_clf_best.predict(test_segments_pca)
# print(classification_report(test_labels, test_predictions))

# # Evaluate the Random Forest Classifier on the validation set
# valid_predictions = rf_clf_best.predict(valid_segments_pca)
# print(classification_report(valid_labels, valid_predictions))