# Analysis of Results
This notebook shows the analysis of the results of the models trained on the `text` field:
* find the best models across the categories
* Kruskal-Wallis tests comparing $BERT_{base}$ against each of {$BERT_{CW}$, $BERT_{SR}$, $BERT_{RW}$}

## Check the best classifier across the categories

In [37]:
import pandas as pd
from scipy.stats import kruskal


In [38]:
# The first data row of the CSV is used as the column names, so promote it to
# the header and drop it from the body. `.copy()` makes `results` an
# independent frame instead of a slice of the raw read, so the dtype
# conversion below never writes into a view.
results = pd.read_csv('Results_text.csv')
results.columns = results.iloc[0]
results = results.iloc[1:].copy()

# Every column after the first holds scores; convert them all to float in a
# single call rather than assigning column by column.
score_columns = results.columns[1:]
results = results.astype({column: float for column in score_columns})

# Positional split: the first 16 rows are reported as the BERT-family results
# and the remaining rows as the classical ML results. Each part is copied so
# later edits to one frame cannot alias the others.
N_TRANSFORMER_ROWS = 16
BERT_results = results.iloc[:N_TRANSFORMER_ROWS].copy()
ML_results = results.iloc[N_TRANSFORMER_ROWS:].copy()

In [39]:
# Display the first 16 rows of the results table (the BERT-family slice).
BERT_results

Unnamed: 0,Models,hazard-category,product-category,hazard,product,ST1,St2
1,Bert baseline,0.747,0.757,0.581,0.17,0.753,0.382
2,Bert CW,0.76,0.761,0.671,0.28,0.762,0.491
3,Bert SR,0.77,0.754,0.666,0.275,0.764,0.478
4,Bert RW,0.752,0.757,0.651,0.275,0.756,0.467
5,DistilBert baseline,0.761,0.757,0.593,0.154,0.76,0.378
6,DistilBert CW,0.766,0.753,0.635,0.246,0.763,0.449
7,DistilBert SR,0.756,0.759,0.644,0.24,0.763,0.448
8,DistilBert RW,0.749,0.747,0.647,0.261,0.753,0.462
9,RoBerta baseline,0.76,0.753,0.579,0.123,0.755,0.356
10,RoBerta CW,0.773,0.739,0.63,0.0,0.76,0.315


In [40]:
# Display the rows after the first 16 (the classical ML slice).
ML_results

Unnamed: 0,Models,hazard-category,product-category,hazard,product,ST1,St2
17,SVM baseline,0.701,0.626,0.544,0.234,0.682,0.396
18,SVM CW,0.655,0.642,0.519,0.256,0.649,0.396
19,SVM SR,0.707,0.674,0.511,0.234,0.693,0.379
20,SVM RW,0.687,0.643,0.542,0.246,0.682,0.401
21,LR baseline,0.666,0.665,0.511,0.203,0.68,0.368
22,LR CW,0.713,0.682,0.457,0.209,0.702,0.347
23,LR SR,0.698,0.677,0.454,0.233,0.691,0.354
24,LR RW,0.666,0.676,0.522,0.216,0.673,0.38
25,DT baseline,0.542,0.445,0.405,0.012,0.484,0.208
26,DT CW,0.617,0.491,0.427,0.029,0.544,0.23


In [41]:
def find_best_model(df):
    """Print and return the best-scoring model for every score column.

    Every column after the first is treated as a score column; the model
    name is read from the ``'Models'`` column of the winning row.

    Args:
        df: DataFrame with a ``'Models'`` column and numeric score columns.

    Returns:
        dict mapping each score column name to
        ``{'Model': <model name>, 'Score': <max score>}``. On ties the first
        row holding the maximum wins (``idxmax`` semantics).
    """
    top_performers = {}
    for column in df.columns[1:]:
        top_model = df.loc[df[column].idxmax(), 'Models']
        top_score = df[column].max()
        top_performers[column] = {'Model': top_model, 'Score': top_score}
        print(f"Top performer for {column} is {top_model} with a score of {top_score}")
    # The summary used to be built and then discarded; hand it back so callers
    # can reuse the winners instead of parsing the printed text.
    return top_performers

In [42]:
# Best model per score column among the BERT-family rows.
find_best_model(BERT_results)

Top performer for hazard-category is ModernBert SR with a score of 0.79
Top performer for product-category is Bert CW with a score of 0.761
Top performer for hazard is Bert CW with a score of 0.671
Top performer for product is Bert CW with a score of 0.28
Top performer for ST1 is ModernBert baseline with a score of 0.769
Top performer for St2 is Bert CW with a score of 0.491


In [43]:
# Best model per score column among the classical ML rows.
find_best_model(ML_results)

Top performer for hazard-category is LR CW with a score of 0.713
Top performer for product-category is LR CW with a score of 0.682
Top performer for hazard is RF RW with a score of 0.567
Top performer for product is SVM CW with a score of 0.256
Top performer for ST1 is LR CW with a score of 0.702
Top performer for St2 is SVM RW with a score of 0.401


In [44]:
# Score columns used for the baseline-vs-augmentation comparison; the
# ST1 / St2 columns of the results table are not included.
labels = ['hazard-category', 'product-category', 'hazard', 'product']

In [45]:
def compare_baseline_to_augmented(df, label_columns=None, suffixes=('CW', 'SR', 'RW'), threshold=0.1):
    """Compare every baseline model with its augmented variants.

    A baseline is any row whose ``'Models'`` value ends with ``'baseline'``;
    its augmented counterparts are found by replacing ``'baseline'`` in the
    name with each suffix. An improvement larger than ``threshold`` is also
    printed.

    Args:
        df: DataFrame with a ``'Models'`` column and one numeric column per
            label.
        label_columns: Score columns to compare. Defaults to the notebook's
            module-level ``labels`` list, which the original code read
            implicitly.
        suffixes: Augmentation suffixes that replace ``'baseline'``.
        threshold: Minimum improvement (exclusive) that triggers a printed
            message.

    Returns:
        dict mapping ``"<baseline> on <label>"`` to
        ``{<augmented model>: augmented_score - baseline_score}``.

    Raises:
        IndexError: if a baseline has no row for one of its augmented names.
    """
    if label_columns is None:
        label_columns = labels  # notebook-level default, looked up at call time

    # Guard against floating-point noise: 0.4 - 0.3 evaluates to
    # 0.10000000000000003, which must not count as "more than 0.1".
    tolerance = 1e-9

    baseline_comparison = {}
    is_baseline = df['Models'].str.endswith('baseline')
    # unique() keeps first-seen order and avoids reprocessing repeated rows.
    for baseline in df.loc[is_baseline, 'Models'].unique():
        for label in label_columns:
            baseline_score = df.loc[df['Models'] == baseline, label].values[0]
            augmented_scores = {}
            for suffix in suffixes:
                augmented_model = baseline.replace('baseline', suffix)
                augmented_score = df.loc[df['Models'] == augmented_model, label].values[0]

                score_difference = augmented_score - baseline_score
                if score_difference - threshold > tolerance:
                    print(f"{augmented_model} is better than {baseline}")
                    print(f"{augmented_model} score : {augmented_score}")
                    print(f"Difference: {score_difference}")

                augmented_scores[augmented_model] = score_difference
            baseline_comparison[f"{baseline} on {label}"] = augmented_scores
    return baseline_comparison
# Score deltas (augmented minus baseline) for the BERT-family rows.
baseline_comparison = compare_baseline_to_augmented(BERT_results)

Bert CW is better than Bert baseline
Bert CW score : 0.28
Difference: 0.11000000000000001
Bert SR is better than Bert baseline
Bert SR score : 0.275
Difference: 0.10500000000000001
Bert RW is better than Bert baseline
Bert RW score : 0.275
Difference: 0.10500000000000001
DistilBert RW is better than DistilBert baseline
DistilBert RW score : 0.261
Difference: 0.10700000000000001


In [46]:
# Signed three-decimal report of every augmented-minus-baseline delta,
# grouped under its "<baseline> on <label>" heading.
print("\nBaseline vs Augmented Comparison:")
for comparison, scores in baseline_comparison.items():
    detail_lines = [f"  {model}: {score_diff:+.3f}" for model, score_diff in scores.items()]
    print(f"{comparison}:", *detail_lines, sep="\n")


Baseline vs Augmented Comparison:
Bert baseline on hazard-category:
  Bert CW: +0.013
  Bert SR: +0.023
  Bert RW: +0.005
Bert baseline on product-category:
  Bert CW: +0.004
  Bert SR: -0.003
  Bert RW: +0.000
Bert baseline on hazard:
  Bert CW: +0.090
  Bert SR: +0.085
  Bert RW: +0.070
Bert baseline on product:
  Bert CW: +0.110
  Bert SR: +0.105
  Bert RW: +0.105
DistilBert baseline on hazard-category:
  DistilBert CW: +0.005
  DistilBert SR: -0.005
  DistilBert RW: -0.012
DistilBert baseline on product-category:
  DistilBert CW: -0.004
  DistilBert SR: +0.002
  DistilBert RW: -0.010
DistilBert baseline on hazard:
  DistilBert CW: +0.042
  DistilBert SR: +0.051
  DistilBert RW: +0.054
DistilBert baseline on product:
  DistilBert CW: +0.092
  DistilBert SR: +0.086
  DistilBert RW: +0.107
RoBerta baseline on hazard-category:
  RoBerta CW: +0.013
  RoBerta SR: +0.017
  RoBerta RW: -0.003
RoBerta baseline on product-category:
  RoBerta CW: -0.014
  RoBerta SR: +0.002
  RoBerta RW: -0.

## Kruskal-Wallis tests
Each BERT variant was run 3 times; Kruskal-Wallis tests are then run on each category to compare the baseline with each augmentation.

In [47]:
# Per-run scores of the BERT variants; each model name appears once per run.
data = pd.read_csv('Results_bert.csv')
# Preview the first rows only.
data.head()

Unnamed: 0,Models,hazard-category,product-category,hazard,product
0,Bert baseline,0.747,0.757,0.581,0.17
1,Bert CW,0.76,0.761,0.671,0.28
2,Bert SR,0.77,0.754,0.666,0.275
3,Bert RW,0.752,0.757,0.651,0.275
4,Bert baseline,0.759,0.777,0.594,0.196


In [49]:
def perform_kruskal_baseline_vs_aug(data, baseline_model, augmentations, labels):
    """Run a Kruskal-Wallis test of the baseline against each augmentation.

    For every label column, the scores of ``baseline_model`` are compared
    with the scores of each augmented model in turn (two groups per test).

    Args:
        data: DataFrame with a ``'Models'`` column and one score column per
            label; rows sharing a model name form that model's sample.
        baseline_model: Value of ``'Models'`` identifying the baseline rows.
        augmentations: Values of ``'Models'`` to test against the baseline.
        labels: Score columns to test.

    Returns:
        Nested dict ``{label: {augmentation: {'statistic': H,
        'p-value': '<p formatted to 4 decimals>'}}}``. The p-value is a
        string, not a float.

    NOTE(review): SciPy documents that the chi-squared approximation behind
    ``kruskal`` needs reasonably sized groups (rule of thumb: at least 5 per
    sample); with only a few runs per model the p-values are coarse.
    """
    def _scores(model_name, label):
        # Sample of one model's scores for one label.
        return data[data['Models'] == model_name][label]

    def _versus_baseline(label):
        reference = _scores(baseline_model, label)
        outcome = {}
        for aug in augmentations:
            h_stat, p_value = kruskal(reference, _scores(aug, label))
            outcome[aug] = {'statistic': h_stat, 'p-value': f'{p_value:.4f}'}
        return outcome

    return {label: _versus_baseline(label) for label in labels}

# Test the plain BERT baseline against each of its three augmented variants.
baseline_model = 'Bert baseline'
augmentations = ['Bert CW', 'Bert RW', 'Bert SR']
# Same four score columns as the `labels` list defined earlier in the notebook.
labels = ['hazard-category', 'product-category', 'hazard', 'product']

kruskal_results_baseline_vs_aug = perform_kruskal_baseline_vs_aug(data, baseline_model, augmentations, labels)
# Bare last expression so the notebook renders the nested result dict.
kruskal_results_baseline_vs_aug


{'hazard-category': {'Bert CW': {'statistic': np.float64(0.42857142857142705),
   'p-value': '0.5127'},
  'Bert RW': {'statistic': np.float64(1.1904761904761898),
   'p-value': '0.2752'},
  'Bert SR': {'statistic': np.float64(1.1904761904761898),
   'p-value': '0.2752'}},
 'product-category': {'Bert CW': {'statistic': np.float64(1.1904761904761898),
   'p-value': '0.2752'},
  'Bert RW': {'statistic': np.float64(0.784313725490197), 'p-value': '0.3758'},
  'Bert SR': {'statistic': np.float64(3.9705882352941146),
   'p-value': '0.0463'}},
 'hazard': {'Bert CW': {'statistic': np.float64(3.857142857142854),
   'p-value': '0.0495'},
  'Bert RW': {'statistic': np.float64(3.857142857142854), 'p-value': '0.0495'},
  'Bert SR': {'statistic': np.float64(3.9705882352941146),
   'p-value': '0.0463'}},
 'product': {'Bert CW': {'statistic': np.float64(3.9705882352941146),
   'p-value': '0.0463'},
  'Bert RW': {'statistic': np.float64(3.857142857142854), 'p-value': '0.0495'},
  'Bert SR': {'statistic'

In [52]:
# Per-model mean and standard deviation of every score column across runs,
# rounded to 3 decimals (pandas' `std` is the sample std, ddof=1).
stats = data.groupby('Models').agg(['mean', 'std']).round(3)
stats

Unnamed: 0_level_0,hazard-category,hazard-category,product-category,product-category,hazard,hazard,product,product
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std
Models,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Bert CW,0.768,0.018,0.756,0.006,0.658,0.018,0.284,0.008
Bert RW,0.751,0.005,0.752,0.024,0.662,0.021,0.256,0.02
Bert SR,0.771,0.024,0.75,0.008,0.652,0.012,0.189,0.164
Bert baseline,0.757,0.009,0.769,0.01,0.594,0.013,0.186,0.014
