In [1]:
import numpy as np
import joblib
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd

In [2]:
# Restore the four previously saved classifiers in one pass. The paths are
# listed in the same order as the names on the left-hand side.
# NOTE(review): joblib artifacts are pickle-based; only load trusted files.
random_forest, gradient_boosting, ada_boost, xgboost = (
    joblib.load(saved_model_path)
    for saved_model_path in (
        "../../../Models/requirement_relevancy_experiment/classifier_models/distilbert_random_forest_classifier.joblib",
        "../../../Models/requirement_relevancy_experiment/classifier_models/distilbert_gradient_boost_classifier.joblib",
        "../../../Models/requirement_relevancy_experiment/classifier_models/distilbert_adaboost_classifier.joblib",
        "../../../Models/requirement_relevancy_experiment/classifier_models/distilbert_xgboost_classifier.joblib",
    )
)

In [3]:
# Read the precomputed DistilBERT model-state outputs and their labels from
# disk (use this cell when those files already exist). The paths are listed in
# the same order as the names on the left-hand side.
X_train, X_test, y_train, y_test = (
    np.loadtxt(array_path)
    for array_path in (
        "../../../Datasets/irrelevant_requirements_dataset/model_state_outputs/distilbert/reshaped_X_train_last_hidden_states.csv",
        "../../../Datasets/irrelevant_requirements_dataset/model_state_outputs/distilbert/reshaped_X_test_last_hidden_states.csv",
        "../../../Datasets/irrelevant_requirements_dataset/model_state_outputs/distilbert/y_train.csv",
        "../../../Datasets/irrelevant_requirements_dataset/model_state_outputs/distilbert/y_test.csv",
    )
)

In [4]:
# Load the resampled data (disabled alternative to the precomputed train/test arrays loaded above)

# X_resampled = np.loadtxt(
#     "../../../Datasets/irrelevant_requirements_dataset/distilbert_X_resampled.csv",
#     delimiter=",",
# )

# y_resampled = np.loadtxt(
#       "../../../Datasets/irrelevant_requirements_dataset/distilbert_y_resampled.csv",
#     delimiter=",",
# )

In [5]:
# X_resampled.shape, y_resampled.shape

In [6]:
# X_train, X_test, y_train, y_test = train_test_split(
#     X_resampled, y_resampled, test_size=0.2, random_state=42
# )

In [7]:
# Number of folds, passed as `cv` to the cross_validate calls below.
validation_fold = 5

In [8]:
# Column-oriented accumulator: one empty list per result column, filled with
# one entry per model by append_scores.
cross_validation_result_dict = {
    column_name: []
    for column_name in (
        "model_name",
        "accuracy",
        "precision",
        "recall",
        "f1",
        "fit_time",
        "score_time",
    )
}


def append_scores(
    cross_validation_result_dict,
    model_name: str,
    accuracy: float,
    precision: float,
    recall: float,
    f1: float,
    fit_time: float,
    score_time: float,
):
    """Append one model's results to the column-oriented result dictionary.

    Each value is appended to the list stored under the matching key of
    ``cross_validation_result_dict`` ("model_name", "accuracy", "precision",
    "recall", "f1", "fit_time", "score_time"). The dictionary is mutated in
    place and nothing is returned.
    """
    new_row = {
        "model_name": model_name,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "fit_time": fit_time,
        "score_time": score_time,
    }
    for column_name, value in new_row.items():
        cross_validation_result_dict[column_name].append(value)

In [9]:
# Two end points, (0, 0) and (1, 1), of the diagonal reference line that
# show_roc_curve draws underneath each ROC curve.
diagonal_line = pd.DataFrame(
    [[0, 0], [1, 1]],
    columns=["False Positive Rate", "True Positive Rate"],
)


def show_roc_curve(model, model_name: str, X_test, y_test):
    """Plot the ROC curve of an already-fitted binary classifier.

    Parameters
    ----------
    model
        Estimator exposing ``predict_proba``; column 1 of its output is used
        as the score for the ROC computation.
    model_name : str
        Label shown in the legend together with the AUC value.
    X_test
        Feature matrix passed to ``model.predict_proba``.
    y_test
        True labels corresponding to ``X_test``.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # roc_curve returns (fpr, tpr, thresholds) in that order.
    false_positive_rate, true_positive_rate, _ = roc_curve(y_test, y_pred_proba)
    area_under_curve = roc_auc_score(y_test, y_pred_proba)

    # Start from the diagonal reference line, then overlay the model's curve.
    fig = px.line(
        diagonal_line,
        x="False Positive Rate",
        y="True Positive Rate",
        title="ROC Curve",
    )

    fig.update_layout(
        xaxis_title="False Positive Rate",
        yaxis_title="True Positive Rate",
        font=dict(size=16),
        legend=dict(
            bordercolor="Black",
            borderwidth=2,
            font=dict(size=18),
        ),
    )

    fig.update_xaxes(showline=True, linewidth=2, linecolor="black", mirror=True)

    fig.update_yaxes(showline=True, linewidth=2, linecolor="black", mirror=True)

    fig.add_trace(
        go.Scatter(
            x=false_positive_rate,
            y=true_positive_rate,
            mode="lines",
            name=f"{model_name} (AUC = {area_under_curve:.4f})",
        )
    )

    fig.show()

## Cross Validation

Cross-validation is a technique for assessing how the results of a statistical analysis will generalize to an independent data set. It is mainly used in settings where the goal is prediction, and one wants to estimate how accurately a predictive model will perform in practice. One round of cross-validation involves partitioning a sample of data into complementary subsets, performing the analysis on one subset (called the training set), and validating the analysis on the other subset (called the validation set or testing set). To reduce variability, multiple rounds of cross-validation are performed using different partitions, and the validation results are averaged over the rounds.

### Random Forest Cross Validation

In [10]:
# Cross-validate the random forest on the training arrays, collecting F1,
# accuracy, precision and recall for each of the `validation_fold` folds.
random_forest_scores = cross_validate(
    estimator=random_forest,
    X=X_train,
    y=y_train,
    scoring=["f1", "accuracy", "precision", "recall"],
    cv=validation_fold,
    n_jobs=-1,
)

In [11]:
# Show the raw per-fold timings and scores returned by cross_validate.
random_forest_scores

{'fit_time': array([ 1.22399974, 10.31923366, 10.3162365 ,  9.97817159, 10.26217628]),
 'score_time': array([0.01807499, 0.02094364, 0.01994014, 0.01600027, 0.01699948]),
 'test_f1': array([0.78, 1.  , 1.  , 1.  , 1.  ]),
 'test_accuracy': array([0.71794872, 1.        , 1.        , 1.        , 1.        ]),
 'test_precision': array([0.63934426, 1.        , 1.        , 1.        , 1.        ]),
 'test_recall': array([1., 1., 1., 1., 1.])}

In [12]:
# Record the fold-averaged metrics and timings for the random forest. The keys
# are listed in the positional order append_scores expects:
# accuracy, precision, recall, f1, fit_time, score_time.
append_scores(
    cross_validation_result_dict,
    "Random Forest",
    *(
        np.mean(random_forest_scores[result_key])
        for result_key in (
            "test_accuracy",
            "test_precision",
            "test_recall",
            "test_f1",
            "fit_time",
            "score_time",
        )
    ),
)

In [13]:
# ROC curve on X_test / y_test, using the random forest as loaded from disk.
show_roc_curve(random_forest, "Random Forest", X_test, y_test)

### AdaBoost Cross Validation

In [14]:
# Cross-validate AdaBoost on the training arrays, collecting F1, accuracy,
# precision and recall for each of the `validation_fold` folds.
adaboost_cross_validation_scores = cross_validate(
    estimator=ada_boost,
    X=X_train,
    y=y_train,
    scoring=["f1", "accuracy", "precision", "recall"],
    cv=validation_fold,
    n_jobs=-1,
)

# Show the raw per-fold timings and scores.
adaboost_cross_validation_scores

{'fit_time': array([  2.44158077, 126.2322402 , 128.55475712, 130.99494004,
        126.53860903]),
 'score_time': array([0.01700068, 0.481879  , 0.48175144, 0.34999514, 0.484519  ]),
 'test_f1': array([0.7839196 , 0.98039216, 0.97368421, 0.98039216, 0.95945946]),
 'test_accuracy': array([0.72435897, 0.98076923, 0.97435897, 0.98064516, 0.96129032]),
 'test_precision': array([0.6446281, 1.       , 1.       , 1.       , 1.       ]),
 'test_recall': array([1.        , 0.96153846, 0.94871795, 0.96153846, 0.92207792])}

In [15]:
# Record the fold-averaged metrics and timings for AdaBoost. The keys are
# listed in the positional order append_scores expects:
# accuracy, precision, recall, f1, fit_time, score_time.
append_scores(
    cross_validation_result_dict,
    "AdaBoost",
    *(
        np.mean(adaboost_cross_validation_scores[result_key])
        for result_key in (
            "test_accuracy",
            "test_precision",
            "test_recall",
            "test_f1",
            "fit_time",
            "score_time",
        )
    ),
)

In [16]:
# ROC curve on X_test / y_test, using the AdaBoost model as loaded from disk.
show_roc_curve(ada_boost, "AdaBoost", X_test, y_test)

### XGBoost Cross Validation

In [17]:
# Cross-validate XGBoost on the training arrays, collecting F1, accuracy,
# precision and recall for each of the `validation_fold` folds.
# Unlike the other models, no n_jobs argument is passed here.
xgboost_scores = cross_validate(
    estimator=xgboost,
    X=X_train,
    y=y_train,
    scoring=["f1", "accuracy", "precision", "recall"],
    cv=validation_fold,
)

# Show the raw per-fold timings and scores.
xgboost_scores

{'fit_time': array([ 7.96252656, 17.82953143, 19.33807516, 17.31403542, 19.40600705]),
 'score_time': array([0.03100014, 0.03099036, 0.05492449, 0.03096437, 0.02999258]),
 'test_f1': array([0.78      , 1.        , 1.        , 0.98701299, 1.        ]),
 'test_accuracy': array([0.71794872, 1.        , 1.        , 0.98709677, 1.        ]),
 'test_precision': array([0.63934426, 1.        , 1.        , 1.        , 1.        ]),
 'test_recall': array([1.        , 1.        , 1.        , 0.97435897, 1.        ])}

In [18]:
# Record the fold-averaged metrics and timings for XGBoost. The keys are
# listed in the positional order append_scores expects:
# accuracy, precision, recall, f1, fit_time, score_time.
append_scores(
    cross_validation_result_dict,
    "XGBoost",
    *(
        np.mean(xgboost_scores[result_key])
        for result_key in (
            "test_accuracy",
            "test_precision",
            "test_recall",
            "test_f1",
            "fit_time",
            "score_time",
        )
    ),
)

In [19]:
# ROC curve on X_test / y_test, using the XGBoost model as loaded from disk.
show_roc_curve(xgboost, "XGBoost", X_test, y_test)

### Gradient Boosting Cross Validation


In [20]:
# Cross-validate gradient boosting on the training arrays, collecting F1,
# accuracy, precision and recall for each of the `validation_fold` folds.
gradient_boosting_cross_validation_scores = cross_validate(
    estimator=gradient_boosting,
    X=X_train,
    y=y_train,
    scoring=["f1", "accuracy", "precision", "recall"],
    cv=validation_fold,
    n_jobs=-1,
)

In [21]:
# Record the fold-averaged metrics and timings for gradient boosting. The keys
# are listed in the positional order append_scores expects:
# accuracy, precision, recall, f1, fit_time, score_time.
append_scores(
    cross_validation_result_dict,
    "Gradient Boosting",
    *(
        np.mean(gradient_boosting_cross_validation_scores[result_key])
        for result_key in (
            "test_accuracy",
            "test_precision",
            "test_recall",
            "test_f1",
            "fit_time",
            "score_time",
        )
    ),
)

In [22]:
# ROC curve on X_test / y_test, using the gradient boosting model as loaded
# from disk (this cell previously re-plotted the XGBoost model by mistake).
show_roc_curve(gradient_boosting, "Gradient Boosting", X_test, y_test)

## Result Export

In [23]:
# Inspect the accumulated per-model mean scores and timings before export.
cross_validation_result_dict

{'model_name': ['Random Forest', 'AdaBoost', 'XGBoost', 'Gradient Boosting'],
 'accuracy': [0.9435897435897436,
  0.9242845326716294,
  0.9410090984284534,
  0.939718775847808],
 'precision': [0.9278688524590164,
  0.9289256198347108,
  0.9278688524590164,
  0.9278688524590164],
 'recall': [1.0, 0.9587745587745589, 0.9948717948717949, 0.9923076923076923],
 'f1': [0.9560000000000001,
  0.935569516340243,
  0.9534025974025975,
  0.9520784313725491],
 'fit_time': [8.419963550567626,
  102.9524254322052,
  16.370035123825073,
  632.633481836319],
 'score_time': [0.01839170455932617,
  0.36302905082702636,
  0.03557438850402832,
  0.019798469543457032]}

In [24]:
from pandas import DataFrame

# Tabulate the accumulated results (one row per model) and write them to CSV
# without the index column.
cross_validation_result_df = DataFrame.from_dict(cross_validation_result_dict)
cross_validation_result_df.to_csv(
    path_or_buf="../../../Results/irrelevant_requirements_experiment/classifiers_with_distilbert_cross_validation_results.csv",
    index=False,
)

In [25]:
# Display the results table that was written to CSV.
cross_validation_result_df

Unnamed: 0,model_name,accuracy,precision,recall,f1,fit_time,score_time
0,Random Forest,0.94359,0.927869,1.0,0.956,8.419964,0.018392
1,AdaBoost,0.924285,0.928926,0.958775,0.93557,102.952425,0.363029
2,XGBoost,0.941009,0.927869,0.994872,0.953403,16.370035,0.035574
3,Gradient Boosting,0.939719,0.927869,0.992308,0.952078,632.633482,0.019798


In [26]:
# Grouped bar chart of the cross-validation results: one group per model with
# accuracy, precision, recall and F1 bars, each bar labelled with its value.
# `go` is already imported in the notebook's first cell.

# Legend label -> column of cross_validation_result_df, in display order.
score_columns = {
    "Accuracy": "accuracy",
    "Precision": "precision",
    "Recall": "recall",
    "F1": "f1",
}

fig = go.Figure(
    data=[
        go.Bar(
            name=score_label,
            x=cross_validation_result_df["model_name"],
            y=cross_validation_result_df[score_column],
            # Print the score (rounded to 3 decimals) on each bar.
            text=cross_validation_result_df[score_column].round(3),
            textposition="auto",
        )
        for score_label, score_column in score_columns.items()
    ]
)

fig.update_layout(
    barmode="group",
    xaxis_title="Model",
    yaxis_title="Score",
    font=dict(size=16),
    legend=dict(
        bordercolor="Black",
        borderwidth=2,
        font=dict(size=18),
    ),
)

fig.update_xaxes(showline=True, linewidth=2, linecolor="black", mirror=True)

fig.update_yaxes(showline=True, linewidth=2, linecolor="black", mirror=True)

fig.show()