In [23]:
import os
import numpy as np
import evaluate
#from string2string.similarity import BARTScore

In [24]:

def extract_text_from_files(folder1, folder2):
    """Read same-named files from two folders and pair their contents.

    Only regular files are considered. Sub-directories (for example the
    ``.ipynb_checkpoints`` folder Jupyter creates) are skipped, because
    passing a directory path to ``open`` raises instead of returning text.

    Args:
        folder1: Path of the folder whose file contents are stored under the
            ``'ground_truth'`` key.
        folder2: Path of the folder whose file contents are stored under the
            ``'generated'`` key.

    Returns:
        A list with one ``{'ground_truth': str, 'generated': str}`` dict per
        file, ordered by sorted filename, or ``None`` (after printing an
        error) when the two folders do not hold the same file names.
    """
    def _list_regular_files(folder):
        # Sorted names of the entries in `folder` that are regular files.
        return sorted(
            name for name in os.listdir(folder)
            if os.path.isfile(os.path.join(folder, name))
        )

    files1 = _list_regular_files(folder1)
    files2 = _list_regular_files(folder2)

    # Pairing is done by filename, so both listings must be identical.
    if files1 != files2:
        print("Error: Folder file names don't match.")
        return None

    data = []
    for filename in files1:
        file1_path = os.path.join(folder1, filename)
        file2_path = os.path.join(folder2, filename)

        # Guard against a file disappearing between listing and reading.
        if not (os.path.isfile(file1_path) and os.path.isfile(file2_path)):
            print(f"Error: File {filename} is missing in one of the folders.")
            continue

        # Read content from folder 1 (ground truth)
        with open(file1_path, 'r', encoding='utf-8') as file1:
            folder1_text = file1.read()

        # Read content from folder 2 (generated)
        with open(file2_path, 'r', encoding='utf-8') as file2:
            folder2_text = file2.read()

        data.append({
            'ground_truth': folder1_text,
            'generated': folder2_text
        })

    return data

In [25]:
# Instantiate each evaluation metric once via `evaluate.load`; these objects
# are the ones passed to the calculate_* helpers in the scoring cells.
# NOTE(review): the NLTK download messages in this cell's output presumably
# come from loading the meteor metric — confirm.
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
bertscore = evaluate.load("bertscore")
meteor =evaluate.load("meteor")

def calculate_rouge(results, rouge):
    """Feed every generated/ground-truth pair to `rouge` and compute the score.

    Args:
        results: Iterable of dicts with 'generated' and 'ground_truth' entries.
        rouge: Metric object exposing ``add(prediction=..., reference=...)``
            and ``compute()``.

    Returns:
        Whatever ``rouge.compute()`` returns once all pairs have been added.
    """
    for pair in results:
        hypothesis, target = pair['generated'], pair['ground_truth']
        rouge.add(prediction=hypothesis, reference=target)
    scores = rouge.compute()
    return scores

def calculate_bleu(results, bleu):
    """Feed every generated/ground-truth pair to `bleu` and compute the score.

    Args:
        results: Iterable of dicts with 'generated' and 'ground_truth' entries.
        bleu: Metric object exposing ``add(prediction=..., reference=...)``
            and ``compute()``.

    Returns:
        Whatever ``bleu.compute()`` returns once all pairs have been added.
    """
    for pair in results:
        hypothesis, target = pair['generated'], pair['ground_truth']
        bleu.add(prediction=hypothesis, reference=target)
    scores = bleu.compute()
    return scores

def calculate_meteor(results, meteor):
    """Feed every generated/ground-truth pair to `meteor` and compute the score.

    Args:
        results: Iterable of dicts with 'generated' and 'ground_truth' entries.
        meteor: Metric object exposing ``add(prediction=..., reference=...)``
            and ``compute()``.

    Returns:
        Whatever ``meteor.compute()`` returns once all pairs have been added.
    """
    for pair in results:
        hypothesis, target = pair['generated'], pair['ground_truth']
        meteor.add(prediction=hypothesis, reference=target)
    scores = meteor.compute()
    return scores

def calculate_bertscore(results, bertscore):
    """Feed every generated/ground-truth pair to `bertscore` and compute it.

    Args:
        results: Iterable of dicts with 'generated' and 'ground_truth' entries.
        bertscore: Metric object exposing ``add(prediction=..., reference=...)``
            and ``compute(lang=...)``.

    Returns:
        Whatever ``bertscore.compute(lang='en')`` returns once all pairs have
        been added.
    """
    for pair in results:
        hypothesis, target = pair['generated'], pair['ground_truth']
        bertscore.add(prediction=hypothesis, reference=target)
    scores = bertscore.compute(lang='en')
    return scores




[nltk_data] Downloading package wordnet to
[nltk_data]     /data/nlp/spandan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /data/nlp/spandan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /data/nlp/spandan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [26]:

# Define folder paths (relative to the directory the notebook is run from).
# folder1 is read as the ground truth, folder2 as the generated text.
folder1 = './code/legalLLM/chatbot/evaluator/chats_txt_truth'
folder2 = './code/legalLLM/chatbot/evaluator/chats_txt_65'

folder1 = os.path.join(os.getcwd(), folder1)
folder2 = os.path.join(os.getcwd(), folder2)

# Call the function and get the data
data = extract_text_from_files(folder1, folder2)

# extract_text_from_files signals mismatched folders by returning None; stop
# here so the metric cells do not fail later with an unrelated TypeError.
if data is None:
    raise ValueError(
        f"Could not pair files between {folder1!r} and {folder2!r}: "
        "the folders do not contain the same file names."
    )

In [27]:
# ROUGE over every ground-truth/generated pair held in `data`.
rouge_results = calculate_rouge(results=data, rouge=rouge)
print("rouge_results ", rouge_results)

rouge_results  {'rouge1': 0.6690425144050275, 'rouge2': 0.41078427028279957, 'rougeL': 0.3322386011985272, 'rougeLsum': 0.6046693994342135}


In [28]:
# BLEU over every ground-truth/generated pair held in `data`.
bleu_results = calculate_bleu(results=data, bleu=bleu)
print("bleu_results ", bleu_results)

bleu_results  {'bleu': 0.37478123326559953, 'precisions': [0.6357315420423074, 0.40417939952302795, 0.3017374701237681, 0.25446857930934363], 'brevity_penalty': 1.0, 'length_ratio': 1.0720366573047357, 'translation_length': 220623, 'reference_length': 205798}


In [29]:
# METEOR has its own helper; call it rather than the BLEU one so the call
# matches the metric being computed.
meteor_results = calculate_meteor(data, meteor)
print("meteor_results ", meteor_results)

meteor_results  {'meteor': 0.4192907460694575}


In [30]:
bertscore_results = calculate_bertscore(results=data, bertscore=bertscore)

# Each of these entries is converted to an array and averaged before printing.
for label, key in (
    ("bertscore_results - precision:", 'precision'),
    ("bertscore_results - recall:", 'recall'),
    ("bertscore_results - f1:", 'f1'),
):
    print(label, np.array(bertscore_results[key]).mean())

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


bertscore_results - precision: 0.9093403761203472
bertscore_results - recall: 0.9062752072627728
bertscore_results - f1: 0.9077879502223088


In [31]:
# Recap: print every metric result together in one place.
print("rouge_results ", rouge_results)
print("bleu_results ", bleu_results)
print("meteor_results ", meteor_results)

# The BERTScore entries are averaged before printing.
for label, key in (
    ("bertscore_results - precision:", 'precision'),
    ("bertscore_results - recall:", 'recall'),
    ("bertscore_results - f1:", 'f1'),
):
    print(label, np.array(bertscore_results[key]).mean())

rouge_results  {'rouge1': 0.6690425144050275, 'rouge2': 0.41078427028279957, 'rougeL': 0.3322386011985272, 'rougeLsum': 0.6046693994342135}
bleu_results  {'bleu': 0.37478123326559953, 'precisions': [0.6357315420423074, 0.40417939952302795, 0.3017374701237681, 0.25446857930934363], 'brevity_penalty': 1.0, 'length_ratio': 1.0720366573047357, 'translation_length': 220623, 'reference_length': 205798}
meteor_results  {'meteor': 0.4192907460694575}
bertscore_results - precision: 0.9093403761203472
bertscore_results - recall: 0.9062752072627728
bertscore_results - f1: 0.9077879502223088
