In [99]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [100]:
# Load the saved predictions of both models. The paths are relative, so the
# notebook must be run from the directory that contains `data/`.
bert_df, llama_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/deBERTA_result.csv', 'data/llama_result.csv')
)

In [101]:
def parse_labels(lable):
    """Map a raw classifier verdict to a numeric label.

    Parameters
    ----------
    lable : str, number or None
        Raw verdict, e.g. ``"safe"``, ``"unsafe"`` or ``"INJECTION"``.
        Matching is case-insensitive and ignores surrounding whitespace.
        (The misspelt parameter name is kept so existing callers still work.)

    Returns
    -------
    int
        ``1``  if the text contains "unsafe" or is exactly "injection",
        ``0``  if the text contains "safe" (and not "unsafe"),
        ``-1`` for anything else, including missing values (None / NaN).
        A value that already is ``-1``, ``0`` or ``1`` is returned unchanged,
        so applying this function twice to the same column is harmless.
    """
    import numbers

    # Already-parsed labels pass through; NaN never compares equal, so it
    # falls to the non-string branch below.
    if isinstance(lable, numbers.Real) and lable in (-1, 0, 1):
        return int(lable)
    # Missing or otherwise non-textual cells cannot be interpreted.
    if not isinstance(lable, str):
        return -1

    text = lable.strip().lower()
    # "unsafe" must be tested before "safe" because it contains it.
    if 'unsafe' in text or text == "injection":
        return 1
    if 'safe' in text:
        return 0
    return -1

In [102]:
# Convert each model's raw verdict text into numeric labels with parse_labels.
# Note: this replaces the raw label strings in place, so the original text is
# only recoverable by reloading the CSV files.
bert_df['Label'] = bert_df['Label'].map(parse_labels)
llama_df['Label'] = llama_df['Label'].map(parse_labels)

In [103]:
# Show the column layout of both result frames, one index per line.
print(bert_df.columns, llama_df.columns, sep='\n')

Index(['Prompt', 'source', 'target', 'Label', 'Score'], dtype='object')
Index(['Prompt', 'source', 'target', 'Label'], dtype='object')


In [104]:
def calculate_metrics_per_source(df, model_name):
    """Print accuracy, precision, recall and F1 for every `source` group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``source`` (grouping key), ``target``
        (ground truth) and ``Label`` (prediction). The positive class is 1.
        Assumes ``target`` holds only 0/1 values -- TODO confirm against the
        CSV schema.
    model_name : str
        Name used in the printed headings.

    Returns
    -------
    None
        Results are printed only.

    Notes
    -----
    * A prediction of ``-1`` (the value `parse_labels` emits for text it
      cannot interpret) is scored as the opposite of the true label, i.e.
      always as an error. Passing ``-1`` straight to the binary scorers can
      raise ``ValueError`` because the targets then look multiclass.
    * ``zero_division=0`` makes precision/recall/F1 print as 0.00 whenever
      the denominator is empty (e.g. a source with no positive targets), so
      such rows are "undefined", not necessarily "bad".
    * The "Average Metrics" block is the unweighted mean over sources, not a
      sample-weighted score.
    """
    results = []
    unparsed_counts = {}
    for source, group in df.groupby('source'):
        true_labels = group['target']
        predicted_labels = group['Label']

        # Fold unparseable predictions into the wrong binary class so the
        # scorers always see a strictly binary problem.
        unparsed = predicted_labels == -1
        if unparsed.any():
            unparsed_counts[source] = int(unparsed.sum())
            predicted_labels = predicted_labels.where(~unparsed, 1 - true_labels)

        accuracy = accuracy_score(true_labels, predicted_labels)
        precision = precision_score(true_labels, predicted_labels, zero_division=0)
        recall = recall_score(true_labels, predicted_labels, zero_division=0)
        f1 = f1_score(true_labels, predicted_labels, zero_division=0)

        results.append([source, accuracy, precision, recall, f1])

    # Create a DataFrame for the results
    results_df = pd.DataFrame(results, columns=['Source', 'Accuracy', 'Precision', 'Recall', 'F1-score'])

    # Print the results in a compressed table format
    print(f"\n{model_name} Model Metrics:")
    print(results_df.to_string(index=False, float_format="%.2f"))

    # Only mention unparseable predictions when some were actually found, so
    # the output for clean data is unchanged.
    if unparsed_counts:
        print(f"\nUnparseable {model_name} predictions scored as errors:")
        for source, count in unparsed_counts.items():
            print(f"  {source}: {count}")

    # Calculate and print the average metrics (unweighted mean over sources)
    avg_metrics = results_df[['Accuracy', 'Precision', 'Recall', 'F1-score']].mean()
    print(f"\nAverage Metrics for {model_name}:")
    print(avg_metrics.to_string(float_format="%.2f"))


In [105]:
# Per-source metrics for the DeBERTa results loaded into bert_df.
calculate_metrics_per_source(df=bert_df, model_name="BERT")


BERT Model Metrics:
              Source  Accuracy  Precision  Recall  F1-score
 forbidden_questions      1.00       1.00    1.00      1.00
infected_given_train      0.49       1.00    0.49      0.66
   jailbreak_prompts      0.77       1.00    0.77      0.87
            leetcode      0.99       0.00    0.00      0.00
     row_given_train      0.61       0.00    0.00      0.00

Average Metrics for BERT:
Accuracy    0.77
Precision   0.60
Recall      0.45
F1-score    0.51


In [106]:
# Per-source metrics for the LLaMA results loaded into llama_df.
calculate_metrics_per_source(df=llama_df, model_name="LLaMA")


LLaMA Model Metrics:
              Source  Accuracy  Precision  Recall  F1-score
infected_given_train      0.10       1.00    0.10      0.19
   jailbreak_prompts      0.30       1.00    0.30      0.46
     row_given_train      0.99       0.00    0.00      0.00

Average Metrics for LLaMA:
Accuracy    0.47
Precision   0.67
Recall      0.14
F1-score    0.22
