In [1]:
import numpy as np
import joblib
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd

### Standalone Model Cross-Validation

In this notebook, we perform cross-validation on standalone models: in every round, the training portion of the split (80% of the training data with the 5-fold setting used below) is used to fit a freshly initialised model that has not yet seen any of the data. The resulting scores act as an evaluation of the models' parameters.

In [2]:
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
)
from xgboost import XGBClassifier

In [3]:
# Instantiate fresh, untrained classifiers with default hyper-parameters.
# Nothing is loaded from disk here: the cross-validation below needs unfitted
# estimators. A fixed seed keeps the stochastic learners reproducible across
# "Restart Kernel -> Run All".
RANDOM_STATE = 42

random_forest = RandomForestClassifier(random_state=RANDOM_STATE)
gradient_boosting = GradientBoostingClassifier(random_state=RANDOM_STATE)

ada_boost = AdaBoostClassifier(random_state=RANDOM_STATE)

xgboost = XGBClassifier(random_state=RANDOM_STATE)

In [4]:
# Use this if you already have data from the model state outputs.
# The four arrays are read, in order, from the DistilBERT model-state folder:
# train features, test features, train labels, test labels.
X_train, X_test, y_train, y_test = (
    np.loadtxt(
        "../../../Datasets/irrelevant_requirements_dataset/model_state_outputs/distilbert/"
        + csv_name
    )
    for csv_name in (
        "reshaped_X_train_last_hidden_states.csv",
        "reshaped_X_test_last_hidden_states.csv",
        "y_train.csv",
        "y_test.csv",
    )
)

In [5]:
# Sanity check: shapes of train features, train labels, test features, test labels.
tuple(array.shape for array in (X_train, y_train, X_test, y_test))

((946, 76800), (946,), (94, 76800), (94,))

In [4]:
# Load the resampled data

# X_resampled = np.loadtxt(
#     "../../../Datasets/irrelevant_requirements_dataset/distilbert_X_resampled.csv",
#     delimiter=",",
# )

# y_resampled = np.loadtxt(
#       "../../../Datasets/irrelevant_requirements_dataset/distilbert_y_resampled.csv",
#     delimiter=",",
# )

In [5]:
# X_resampled.shape, y_resampled.shape

In [6]:
# X_train, X_test, y_train, y_test = train_test_split(
#     X_resampled, y_resampled, test_size=0.2, random_state=42
# )

In [6]:
# Number of folds; passed as `cv` to every cross_validate call in this notebook.
validation_fold = 5

In [7]:
# Column-oriented accumulator for the cross-validation summary: every key maps
# to its own list, and append_scores adds one entry to each list per model.
cross_validation_result_dict = {
    column: []
    for column in (
        "model_name",
        "accuracy",
        "precision",
        "recall",
        "f1",
        "fit_time",
        "score_time",
    )
}


def append_scores(
    cross_validation_result_dict,
    model_name: str,
    accuracy: float,
    precision: float,
    recall: float,
    f1: float,
    fit_time: float,
    score_time: float,
):
    """Record one model's summary metrics in the results dictionary.

    Each value is appended to the list stored under the key of the same name
    ("model_name", "accuracy", "precision", "recall", "f1", "fit_time",
    "score_time"). The dictionary is modified in place; nothing is returned.

    Raises:
        KeyError: if one of the expected keys is missing from the dictionary.
    """
    new_entries = (
        ("model_name", model_name),
        ("accuracy", accuracy),
        ("precision", precision),
        ("recall", recall),
        ("f1", f1),
        ("fit_time", fit_time),
        ("score_time", score_time),
    )
    for column, value in new_entries:
        cross_validation_result_dict[column].append(value)

In [8]:
# End points (0, 0) and (1, 1) of the diagonal that show_roc_curve draws as the
# base line of every ROC figure.
diagonal_line = pd.DataFrame(
    dict.fromkeys(("False Positive Rate", "True Positive Rate"), [0, 1])
)


def show_roc_curve(model, model_name: str, X_test, y_test):
    """Plot the ROC curve of a fitted classifier together with the diagonal.

    Args:
        model: fitted estimator exposing ``predict_proba``; column 1 of its
            output is used as the score of the positive class.
        model_name: label shown in the legend next to the AUC value.
        X_test: features the model is evaluated on.
        y_test: true labels matching ``X_test``.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    # Score of the positive class (second column of predict_proba).
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # roc_curve returns (fpr, tpr, thresholds) -- in that order.
    false_positive_rate, true_positive_rate, _ = roc_curve(y_test, y_pred_proba)
    area_under_curve = roc_auc_score(y_test, y_pred_proba)

    # Start from the diagonal reference line, then overlay the model's curve.
    fig = px.line(
        diagonal_line,
        x="False Positive Rate",
        y="True Positive Rate",
        title="ROC Curve",
    )

    fig.update_layout(
        xaxis_title="False Positive Rate",
        yaxis_title="True Positive Rate",
        font=dict(size=16),
        legend=dict(
            bordercolor="Black",
            borderwidth=2,
            font=dict(size=18),
        ),
    )

    # Draw a black frame around the plotting area.
    fig.update_xaxes(showline=True, linewidth=2, linecolor="black", mirror=True)

    fig.update_yaxes(showline=True, linewidth=2, linecolor="black", mirror=True)

    fig.add_trace(
        go.Scatter(
            x=false_positive_rate,
            y=true_positive_rate,
            mode="lines",
            name=f"{model_name} (AUC = {area_under_curve:.4f})",
        )
    )

    fig.show()

## Cross Validation

Cross validation is a technique for assessing how the results of a statistical analysis will generalize to an independent data set. It is mainly used in settings where the goal is prediction, and one wants to estimate how accurately a predictive model will perform in practice. One round of cross-validation involves partitioning a sample of data into complementary subsets, performing the analysis on one subset (called the training set), and validating the analysis on the other subset (called the validation set or testing set). To reduce variability, multiple rounds of cross-validation are performed using different partitions, and the validation results are averaged over the rounds.

### Cross Validation for Random Forest

In [9]:
# Cross-validate the unfitted random forest on the training data, collecting
# F1, accuracy, precision and recall for each of the `validation_fold` folds.
random_forest_scores = cross_validate(
    estimator=random_forest,
    X=X_train,
    y=y_train,
    scoring=["f1", "accuracy", "precision", "recall"],
    cv=validation_fold,
    n_jobs=-1,
)

In [10]:
# Display the raw per-fold timings and scores returned by cross_validate.
random_forest_scores

{'fit_time': array([ 3.25759149, 24.67797732, 24.83983994, 24.23241949, 24.97461987]),
 'score_time': array([0.05789113, 0.0523386 , 0.05405259, 0.05294228, 0.05323362]),
 'test_f1': array([0.781893, 1.      , 1.      , 1.      , 1.      ]),
 'test_accuracy': array([0.72105263, 1.        , 1.        , 1.        , 1.        ]),
 'test_precision': array([0.64189189, 1.        , 1.        , 1.        , 1.        ]),
 'test_recall': array([1., 1., 1., 1., 1.])}

In [11]:
# Store the fold-averaged random forest metrics and timings in the summary.
append_scores(
    cross_validation_result_dict,
    model_name="Random Forest",
    accuracy=np.mean(random_forest_scores["test_accuracy"]),
    precision=np.mean(random_forest_scores["test_precision"]),
    recall=np.mean(random_forest_scores["test_recall"]),
    f1=np.mean(random_forest_scores["test_f1"]),
    fit_time=np.mean(random_forest_scores["fit_time"]),
    score_time=np.mean(random_forest_scores["score_time"]),
)

In [12]:
# Fit the model on the complete training set; this fitted instance is the one
# evaluated on the test set below and saved at the end of the notebook.
random_forest.fit(X_train, y_train)

In [13]:
# ROC curve and AUC of the fitted random forest on the test set.
show_roc_curve(random_forest, "Random Forest", X_test, y_test)

### Adaboost Cross Validation

In [14]:
# Cross-validate the unfitted AdaBoost classifier on the training data,
# collecting F1, accuracy, precision and recall for each fold.
adaboost_cross_validation_scores = cross_validate(
    estimator=ada_boost,
    X=X_train,
    y=y_train,
    scoring=["f1", "accuracy", "precision", "recall"],
    cv=validation_fold,
    n_jobs=-1,
)

# Display the raw per-fold timings and scores.
adaboost_cross_validation_scores

{'fit_time': array([  9.44754076, 482.42376614, 476.63675714, 481.56755328,
        478.69760203]),
 'score_time': array([0.05776501, 1.78437614, 1.74983597, 1.75468898, 1.64975023]),
 'test_f1': array([0.7768595 , 0.96174863, 0.99470899, 0.96703297, 0.98924731]),
 'test_accuracy': array([0.71578947, 0.96296296, 0.99470899, 0.96825397, 0.98941799]),
 'test_precision': array([0.63945578, 1.        , 1.        , 1.        , 1.        ]),
 'test_recall': array([0.98947368, 0.92631579, 0.98947368, 0.93617021, 0.9787234 ])}

In [15]:
# Store the fold-averaged AdaBoost metrics and timings in the summary.
append_scores(
    cross_validation_result_dict,
    model_name="AdaBoost",
    accuracy=np.mean(adaboost_cross_validation_scores["test_accuracy"]),
    precision=np.mean(adaboost_cross_validation_scores["test_precision"]),
    recall=np.mean(adaboost_cross_validation_scores["test_recall"]),
    f1=np.mean(adaboost_cross_validation_scores["test_f1"]),
    fit_time=np.mean(adaboost_cross_validation_scores["fit_time"]),
    score_time=np.mean(adaboost_cross_validation_scores["score_time"]),
)

In [16]:
# Fit AdaBoost on the complete training set; this fitted instance is the one
# evaluated on the test set below and saved at the end of the notebook.
ada_boost.fit(X_train, y_train)

In [17]:
# ROC curve and AUC of the fitted AdaBoost model on the test set.
show_roc_curve(ada_boost, "AdaBoost", X_test, y_test)

### XGBoost Cross Validation

In [18]:
# Cross-validate the unfitted XGBoost classifier on the training data,
# collecting F1, accuracy, precision and recall for each fold.
# Unlike the other models, no n_jobs argument is passed here.
xgboost_scores = cross_validate(
    estimator=xgboost,
    X=X_train,
    y=y_train,
    scoring=["f1", "accuracy", "precision", "recall"],
    cv=validation_fold,
)

# Display the raw per-fold timings and scores.
xgboost_scores

{'fit_time': array([25.62427688, 59.11931086, 60.6328814 , 59.07556534, 62.42699575]),
 'score_time': array([0.08681083, 0.08799744, 0.21423435, 0.08012104, 0.08718657]),
 'test_f1': array([0.781893  , 0.99470899, 1.        , 1.        , 1.        ]),
 'test_accuracy': array([0.72105263, 0.99470899, 1.        , 1.        , 1.        ]),
 'test_precision': array([0.64189189, 1.        , 1.        , 1.        , 1.        ]),
 'test_recall': array([1.        , 0.98947368, 1.        , 1.        , 1.        ])}

In [19]:
# Store the fold-averaged XGBoost metrics and timings in the summary.
append_scores(
    cross_validation_result_dict,
    model_name="XGBoost",
    accuracy=np.mean(xgboost_scores["test_accuracy"]),
    precision=np.mean(xgboost_scores["test_precision"]),
    recall=np.mean(xgboost_scores["test_recall"]),
    f1=np.mean(xgboost_scores["test_f1"]),
    fit_time=np.mean(xgboost_scores["fit_time"]),
    score_time=np.mean(xgboost_scores["score_time"]),
)

In [20]:
# Fit XGBoost on the complete training set; this fitted instance is the one
# evaluated on the test set below and saved at the end of the notebook.
xgboost.fit(X_train, y_train)

In [21]:
# ROC curve and AUC of the fitted XGBoost model on the test set.
show_roc_curve(xgboost, "XGBoost", X_test, y_test)

### Gradient Boosting Cross Validation


In [22]:
# Cross-validate the unfitted gradient boosting classifier on the training
# data, collecting F1, accuracy, precision and recall for each fold.
gradient_boosting_cross_validation_scores = cross_validate(
    estimator=gradient_boosting,
    X=X_train,
    y=y_train,
    scoring=["f1", "accuracy", "precision", "recall"],
    cv=validation_fold,
    n_jobs=-1,
)

In [None]:
# Store the fold-averaged gradient boosting metrics and timings in the summary.
append_scores(
    cross_validation_result_dict,
    model_name="Gradient Boosting",
    accuracy=np.mean(gradient_boosting_cross_validation_scores["test_accuracy"]),
    precision=np.mean(gradient_boosting_cross_validation_scores["test_precision"]),
    recall=np.mean(gradient_boosting_cross_validation_scores["test_recall"]),
    f1=np.mean(gradient_boosting_cross_validation_scores["test_f1"]),
    fit_time=np.mean(gradient_boosting_cross_validation_scores["fit_time"]),
    score_time=np.mean(gradient_boosting_cross_validation_scores["score_time"]),
)

In [None]:
# Fit gradient boosting on the complete training set; this fitted instance is
# the one evaluated on the test set below and saved at the end of the notebook.
gradient_boosting.fit(X_train, y_train)

In [None]:
# ROC curve and AUC of the fitted gradient boosting model on the test set.
# NOTE(review): the legend label here is "Gradient Boost" while the results
# table uses "Gradient Boosting" -- confirm which spelling is intended.
show_roc_curve(gradient_boosting, "Gradient Boost", X_test, y_test)

## Result Export

In [None]:
# Display the accumulated summary (one entry per model in every list).
cross_validation_result_dict

{'model_name': ['Random Forest', 'AdaBoost', 'XGBoost', 'Gradient Boosting'],
 'accuracy': [0.9435897435897436,
  0.9242845326716294,
  0.9410090984284534,
  0.939718775847808],
 'precision': [0.9278688524590164,
  0.9289256198347108,
  0.9278688524590164,
  0.9278688524590164],
 'recall': [1.0, 0.9587745587745589, 0.9948717948717949, 0.9923076923076923],
 'f1': [0.9560000000000001,
  0.935569516340243,
  0.9534025974025975,
  0.9520784313725491],
 'fit_time': [8.419963550567626,
  102.9524254322052,
  16.370035123825073,
  632.633481836319],
 'score_time': [0.01839170455932617,
  0.36302905082702636,
  0.03557438850402832,
  0.019798469543457032]}

In [None]:
from pandas import DataFrame

# Turn the summary dictionary into a table and write it to the results folder
# as CSV, leaving out the index column.
cross_validation_result_df = DataFrame(data=cross_validation_result_dict)
cross_validation_result_df.to_csv(
    path_or_buf="../../../Results/irrelevant_requirements_experiment/classifiers_with_distilbert_cross_validation_results.csv",
    index=False,
)

In [None]:
# Display the summary table that was just exported.
cross_validation_result_df

Unnamed: 0,model_name,accuracy,precision,recall,f1,fit_time,score_time
0,Random Forest,0.94359,0.927869,1.0,0.956,8.419964,0.018392
1,AdaBoost,0.924285,0.928926,0.958775,0.93557,102.952425,0.363029
2,XGBoost,0.941009,0.927869,0.994872,0.953403,16.370035,0.035574
3,Gradient Boosting,0.939719,0.927869,0.992308,0.952078,632.633482,0.019798


In [None]:
# Grouped bar chart of the cross-validation results: one group per model with
# bars for accuracy, precision, recall and F1. Every bar is annotated with its
# value (three decimals), as the chart is meant to show the scores themselves.
# `go` (plotly.graph_objects) is imported in the first cell of the notebook.
fig = go.Figure(
    data=[
        go.Bar(
            name=trace_name,
            x=cross_validation_result_df["model_name"],
            y=cross_validation_result_df[metric_column],
            texttemplate="%{y:.3f}",
            textposition="auto",
        )
        for trace_name, metric_column in (
            ("Accuracy", "accuracy"),
            ("Precision", "precision"),
            ("Recall", "recall"),
            ("F1", "f1"),
        )
    ]
)

fig.update_layout(
    barmode="group",
    xaxis_title="Model",
    yaxis_title="Score",
    font=dict(size=16),
    legend=dict(
        bordercolor="Black",
        borderwidth=2,
        font=dict(size=18),
    ),
)

# Draw a black frame around the plotting area.
fig.update_xaxes(showline=True, linewidth=2, linecolor="black", mirror=True)

fig.update_yaxes(showline=True, linewidth=2, linecolor="black", mirror=True)

fig.show()

### Saving the Models

After the cross validation is complete, we will save the models to disk for later use.

In [None]:
# Persist the four classifiers fitted above so they can be reloaded later
# with joblib.load.
joblib.dump(
    random_forest,
    "../../../Models/requirement_relevancy_experiment/classifier_models/distilbert_random_forest_classifier.joblib",
)

joblib.dump(
    gradient_boosting,
    "../../../Models/requirement_relevancy_experiment/classifier_models/distilbert_gradient_boost_classifier.joblib",
)

joblib.dump(
    ada_boost,
    "../../../Models/requirement_relevancy_experiment/classifier_models/distilbert_adaboost_classifier.joblib",
)

joblib.dump(
    xgboost,
    "../../../Models/requirement_relevancy_experiment/classifier_models/distilbert_xgboost_classifier.joblib",
)