In [1]:
# Notebook-wide configuration.
# BATCH_SIZE is the stride used further down to enumerate the saved
# hidden-state files (train_0.npy, train_128.npy, ...).
# MAX_LENGTH is not referenced by the cells in this notebook; presumably it is
# the token sequence length used when the states were generated -- TODO confirm.
BATCH_SIZE, MAX_LENGTH = 128, 128

# Ensemble Model Run

In this notebook, we train and evaluate tree-based ensemble classifiers on the `[CLS]` embeddings taken from the saved DistilBERT hidden-state segments. We will use the following models:
1. Random Forest
2. XGBoost
3. LightGBM

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Column holding the requirement / non-requirement annotation in each PURE split.
LABEL_COLUMN = "Req/Not Req"
# Binary encoding applied to the annotation strings.
LABEL_ENCODING = {"Req": 1, "Not_Req": 0}


def load_labels(csv_path):
    """Read one PURE split and return its labels as a 0/1 NumPy array.

    Parameters
    ----------
    csv_path : str
        Path to a CSV file that contains a ``"Req/Not Req"`` column.

    Returns
    -------
    numpy.ndarray
        One entry per CSV row: 1 for ``"Req"`` and 0 for ``"Not_Req"``.

    Raises
    ------
    ValueError
        If the column contains anything other than ``"Req"`` / ``"Not_Req"``.
        ``Series.map`` would otherwise turn such rows into NaN without any
        warning and the label array would silently become float-typed.
    """
    raw = pd.read_csv(csv_path, usecols=[LABEL_COLUMN])[LABEL_COLUMN]
    encoded = raw.map(LABEL_ENCODING)
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = sorted(raw[unmapped].astype(str).unique())
        raise ValueError(f"Unexpected label(s) in {csv_path}: {unexpected}")
    return encoded.values


train_labels = load_labels("../../../Datasets/RegExpPURE/PURE_train.csv")
test_labels = load_labels("../../../Datasets/RegExpPURE/PURE_test.csv")
valid_labels = load_labels("../../../Datasets/RegExpPURE/PURE_valid.csv")

In [4]:
# Sample count of each split; these also bound the file offsets enumerated
# when the per-batch hidden-state files are loaded.
train_feature_size, test_feature_size, valid_feature_size = (
    len(split_labels) for split_labels in (train_labels, test_labels, valid_labels)
)

In [5]:
# Each split is stored as one .npy file per batch, named after the offset of
# the batch's first sample (0, BATCH_SIZE, 2 * BATCH_SIZE, ...). The paths are
# listed first, then loaded in that order and joined along the first axis.
train_batch_files = [
    f"../../../States/DistilBert_Pretrained/Train/train_{i}.npy"
    for i in range(0, train_feature_size, BATCH_SIZE)
]
train_segments = np.concatenate([np.load(path) for path in train_batch_files])

test_batch_files = [
    f"../../../States/DistilBert_Pretrained/Test/test_{i}.npy"
    for i in range(0, test_feature_size, BATCH_SIZE)
]
test_segments = np.concatenate([np.load(path) for path in test_batch_files])

valid_batch_files = [
    f"../../../States/DistilBert_Pretrained/Validation/Validation_{i}.npy"
    for i in range(0, valid_feature_size, BATCH_SIZE)
]
valid_segments = np.concatenate([np.load(path) for path in valid_batch_files])

The `[CLS]` token (position 0 of every sequence) captures the aggregate information of the input sequence, so its hidden state is used as the feature vector for training the classifiers.

In [6]:
# Keep only the first position along the second axis of every segment (the
# [CLS] slot), giving one feature row per sample.
CLS_POSITION = 0
cls_token_set_train = train_segments[:, CLS_POSITION, :]
cls_token_set_test = test_segments[:, CLS_POSITION, :]
cls_token_set_valid = valid_segments[:, CLS_POSITION, :]

# Show the resulting feature-matrix shapes (train, test, valid).
tuple(
    features.shape
    for features in (cls_token_set_train, cls_token_set_test, cls_token_set_valid)
)

((5306, 768), (1534, 768), (905, 768))

In [29]:
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

## Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
# Random forest search space: 3 * 3 * 3 * 3 = 81 candidate configurations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Seeded base estimator that the search clones for every candidate.
rf = RandomForestClassifier(random_state=42)

# Exhaustive search with 5-fold cross-validation, ranked by accuracy;
# n_jobs=-1 lets scikit-learn use every available processor.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(cls_token_set_train, train_labels)

# Report the winning configuration and its mean cross-validated accuracy.
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Accuracy Score: ", grid_search.best_score_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Hyperparameters:  {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Best Accuracy Score:  0.7491669195993546


In [14]:
# Final random forest, rebuilt with the hyperparameters the grid search
# reported as best. The values are written out literally, so this cell does
# not read `grid_search` (which the later sections reassign).
rf = RandomForestClassifier(
    max_depth=10,
    min_samples_leaf=4,
    min_samples_split=2,
    n_estimators=100,
    # Same seed as the estimator used during the search. Without it every run
    # grows a different forest, so the reports below and the saved model
    # cannot be reproduced.
    random_state=42,
    n_jobs=-1,
)

rf.fit(cls_token_set_train, train_labels)

In [21]:
# Per-class precision / recall / F1 of the random forest on the test split.
rf_test_predictions = rf.predict(cls_token_set_test)
print(classification_report(test_labels, rf_test_predictions))

              precision    recall  f1-score   support

           0       0.44      0.91      0.59       476
           1       0.92      0.48      0.63      1058

    accuracy                           0.61      1534
   macro avg       0.68      0.69      0.61      1534
weighted avg       0.77      0.61      0.62      1534



In [20]:
# Per-class precision / recall / F1 of the random forest on the validation split.
rf_valid_predictions = rf.predict(cls_token_set_valid)
print(classification_report(valid_labels, rf_valid_predictions))

              precision    recall  f1-score   support

           0       0.92      0.76      0.84       650
           1       0.58      0.84      0.69       255

    accuracy                           0.78       905
   macro avg       0.75      0.80      0.76       905
weighted avg       0.83      0.78      0.79       905



In [25]:
# Persist the fitted random forest. joblib.dump returns the list of file names
# it wrote, which the notebook displays as this cell's output.
rf_model_path = "../../../Models/RexExpPURE_Classifiers/RandomForest.joblib"
joblib.dump(rf, rf_model_path)

['../../../Models/RexExpPURE_Classifiers/RandomForest.joblib']

## XGB Classifier

In [28]:
import xgboost as xgb

In [23]:

# XGBoost search space: 3 * 3 * 3 * 3 * 2 * 2 = 324 candidate configurations.
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[100, 200, 300],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Seeded base estimator that the search clones for every candidate.
xgb_clf = xgb.XGBClassifier(random_state=42)

# Exhaustive search with 5-fold cross-validation, ranked by accuracy;
# n_jobs=-1 lets scikit-learn use every available processor.
grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(cls_token_set_train, train_labels)

# Report the winning configuration and its mean cross-validated accuracy.
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Accuracy Score: ", grid_search.best_score_)



Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best Hyperparameters:  {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
Best Accuracy Score:  0.7525503602293966


In [26]:
# Rebuild the classifier from the winning grid-search configuration. The
# best_params_ dict holds exactly the six searched keys (max_depth,
# learning_rate, n_estimators, gamma, subsample, colsample_bytree), so
# unpacking it passes the same keyword arguments as naming each one.
xgb_clf_best = xgb.XGBClassifier(**grid_search.best_params_, random_state=42)

# Train on the full training split.
xgb_clf_best.fit(cls_token_set_train, train_labels)

# Report on the test split first, then on the validation split.
print("Test Set Performance:")
xgb_test_predictions = xgb_clf_best.predict(cls_token_set_test)
print(classification_report(test_labels, xgb_test_predictions))

print("Validation Set Performance:")
xgb_valid_predictions = xgb_clf_best.predict(cls_token_set_valid)
print(classification_report(valid_labels, xgb_valid_predictions))

Test Set Performance:
              precision    recall  f1-score   support

           0       0.46      0.87      0.61       476
           1       0.90      0.55      0.68      1058

    accuracy                           0.65      1534
   macro avg       0.68      0.71      0.64      1534
weighted avg       0.77      0.65      0.66      1534

Validation Set Performance:
              precision    recall  f1-score   support

           0       0.93      0.74      0.82       650
           1       0.56      0.87      0.68       255

    accuracy                           0.77       905
   macro avg       0.75      0.80      0.75       905
weighted avg       0.83      0.77      0.78       905



In [27]:
# Persist the fitted XGBoost classifier. joblib.dump returns the list of file
# names it wrote, which the notebook displays as this cell's output.
xgb_model_path = "../../../Models/RexExpPURE_Classifiers/XGBoost.joblib"
joblib.dump(xgb_clf_best, xgb_model_path)

['../../../Models/RexExpPURE_Classifiers/XGBoost.joblib']

## LightGBM Classifier

In [30]:
import lightgbm as lgb

In [None]:
# LightGBM search space: 3 * 3 * 3 * 3 * 2 * 2 = 324 candidate configurations.
param_grid = {
    "num_leaves": [31, 50, 100],
    "learning_rate": [0.1, 0.01, 0.001],
    "n_estimators": [100, 200, 300],
    "min_child_samples": [20, 50, 100],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
}

# LightGBM only applies `subsample` (bagging_fraction) when `subsample_freq`
# (bagging_freq) is non-zero. With the default of 0 the two `subsample` values
# above would train identical models and half of the grid would be duplicated
# work, so bagging is enabled on every iteration here.
LGB_SUBSAMPLE_FREQ = 1

# Seeded base estimator that the search clones for every candidate.
lgb_clf = lgb.LGBMClassifier(random_state=42, subsample_freq=LGB_SUBSAMPLE_FREQ)

# Exhaustive search with 5-fold cross-validation, ranked by accuracy.
grid_search = GridSearchCV(
    estimator=lgb_clf,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(cls_token_set_train, train_labels)

# Print the best hyperparameters and the corresponding accuracy score
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Accuracy Score: ", grid_search.best_score_)

# Rebuild the classifier from the winning configuration. It must carry the same
# subsample_freq as the searched estimator, otherwise the tuned `subsample`
# value would be ignored in the final model.
lgb_clf_best = lgb.LGBMClassifier(
    **grid_search.best_params_,
    subsample_freq=LGB_SUBSAMPLE_FREQ,
    random_state=42,
)

# Fit the model on the training data
lgb_clf_best.fit(cls_token_set_train, train_labels)

# Evaluate the model on the train set
train_predictions = lgb_clf_best.predict(cls_token_set_train)
print("Train Set Classification Report:")
print(classification_report(train_labels, train_predictions))

# Evaluate the model on the test set
test_predictions = lgb_clf_best.predict(cls_token_set_test)
print("Test Set Classification Report:")
print(classification_report(test_labels, test_predictions))

# Evaluate the model on the validation set, as the Random Forest and XGBoost
# sections do, so the three models can be compared on the same splits.
valid_predictions = lgb_clf_best.predict(cls_token_set_valid)
print("Validation Set Classification Report:")
print(classification_report(valid_labels, valid_predictions))