In [12]:
import pandas as pd
import numpy as np
from bert_score import score as bert_score
from rouge import Rouge

In [13]:
# Define correct answers
correct_tool = "Cypher"
correct_cypher_query = "MATCH (reg:Regulation) WHERE reg.ID IN ['3.2.8', '4.2.1', '5.1.1'] RETURN reg.ID AS RegulationID, reg.Description AS RegulationDescription, reg.Value AS RegulationValue, reg.Notes AS RegulationNotes, reg.`Referred Documentation` AS RegulationDocumentation"

# Function to normalize text for comparison
def normalize_text(text):
    if pd.isna(text):
        return ""
    return ' '.join(str(text).strip().split()).lower()

# Analyze the "Tool Used" column
tool_accuracy = results_df['Tool Used'].value_counts(normalize=True).get(correct_tool, 0)

# Analyze the "Cypher Query" column with normalized comparison, only if the correct tool was selected
cypher_accuracy = results_df.apply(lambda row: normalize_text(row['Cypher Query']) == normalize_text(correct_cypher_query) if row['Tool Used'] == correct_tool else False, axis=1).mean()

# Analyze the "Time The Prompt Took" column
average_time = results_df['Time The Prompt Took'].mean()
time_variance = results_df['Time The Prompt Took'].var()
time_std_dev = results_df['Time The Prompt Took'].std()

print(f"Tool Used Accuracy: {tool_accuracy * 100:.2f}%")
print(f"Cypher Query Accuracy: {cypher_accuracy * 100:.2f}%")
print(f"Average Time Taken: {average_time:.2f} seconds")
print(f"Variance in Time Taken: {time_variance:.2f}")
print(f"Standard Deviation in Time Taken: {time_std_dev:.2f}")

Tool Used Accuracy: 96.00%
Cypher Query Accuracy: 64.00%
Average Time Taken: 8.31 seconds
Variance in Time Taken: 16.36
Standard Deviation in Time Taken: 4.04


In [14]:
# Load your results CSV
file_path = 'Results.csv'
results_df = pd.read_csv(file_path)

# Expert-provided correct action plan
expert_action_plan = "Action Plan: 3.2.8: Evaluate the percentage of energy consumed by the data center that is derived from renewable or sustainable sources. Document and report this percentage in comparison to the total energy consumption. Use standardized metrics such as EN 50600-4-3 or ISO/IEC 30134-3 for accurate measurement. - 4  Notes: Standardized metrics available in EN 50600-4-3 or ISO/IEC 30134-3. Referred Documentation: EN 50600-4-3, ISO/IEC 30134-3, CLC 50600-5-1. 4.2.1: Establish processes that require approval from senior management for any new service that necessitates dedicated hardware, including servers, storage, and networking components, which do not operate on a resource-sharing platform. - 4  5.1.1: Ensure that IT equipment in cabinets shares the same airflow direction. Implement the hot/cold aisle arrangement to align airflow in and across cabinets and aisles, enhancing the design with fully blanked empty cabinets or solid doors. - 4"

# Prepare data for BERTScore and ROUGE
references = [expert_action_plan] * len(results_df)
hypotheses = results_df['Final Answer'].tolist()

# Calculate BERT Scores
P, R, F1 = bert_score(hypotheses, references, lang="en", verbose=True)
mean_bert_f1 = F1.mean().item()
variance_bert_f1 = np.var(F1.tolist())
std_dev_bert_f1 = np.std(F1.tolist())

print(f"Mean BERTScore F1: {mean_bert_f1}")
print(f"Variance of BERTScore F1: {variance_bert_f1}")
print(f"Standard Deviation of BERTScore F1: {std_dev_bert_f1}")

# Calculate ROUGE Scores
rouge = Rouge()
rouge_scores = rouge.get_scores(hypotheses, references, avg=True)

mean_rouge_1 = rouge_scores['rouge-1']['f']
mean_rouge_2 = rouge_scores['rouge-2']['f']
mean_rouge_l = rouge_scores['rouge-l']['f']

print(f"Mean ROUGE-1 F1: {mean_rouge_1}")
print(f"Mean ROUGE-2 F1: {mean_rouge_2}")
print(f"Mean ROUGE-L F1: {mean_rouge_l}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 7.91 seconds, 12.64 sentences/sec
Mean BERTScore F1: 0.9100011587142944
Variance of BERTScore F1: 0.003703262796907575
Standard Deviation of BERTScore F1: 0.06085443941823451
Mean ROUGE-1 F1: 0.5407024092395128
Mean ROUGE-2 F1: 0.3396871363256173
Mean ROUGE-L F1: 0.5084143227730317
