In [2]:
import csv
import os
import pandas as pd
# from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# from metrics.ExactMatch import ExactMatch

In [3]:
# function to load a CSV file into a dictionary
# the dictionary keys are the column headers and the values are lists of column data
def load_csv_to_dict(file_path, name):
    """
    Load a CSV file into a column-oriented dictionary.

    Args:
    file_path (str): Path to the CSV file. It is decoded as UTF-8 and a
        leading byte-order mark, if present, is ignored.
    name (str): Experiment name, stored under the 'name' key.

    Returns:
    dict: One key per CSV header, each mapping to the list of that column's
        values in row order, plus the key 'name' mapping to `name`. A file
        without a header row yields only the 'name' key.
    """
    # newline='' leaves line-ending handling to the csv module, which it needs
    # to parse quoted fields containing line breaks correctly.
    # 'utf-8-sig' strips a BOM so it does not leak into the first header name.
    with open(file_path, mode='r', encoding='utf-8-sig', newline='') as csv_file:
        csv_reader = csv.DictReader(csv_file)

        # fieldnames is None when the file is completely empty.
        fieldnames = csv_reader.fieldnames or []

        data_dict = {field: [] for field in fieldnames}
        # The experiment name shares the dictionary with the columns, so a CSV
        # column that is itself called 'name' collides with this key.
        data_dict['name'] = name

        for row in csv_reader:
            for field in fieldnames:
                data_dict[field].append(row[field])

    return data_dict

# function to find all results.csv files
# returns a list of tuples containing the parent directory name and the path to the results.csv file
def load_results_csv_paths(root_dir):
    """
    Find every `<dir>/csv/results.csv` file below `root_dir`.

    Args:
    root_dir (str): Directory at which the recursive search starts.

    Returns:
    list: Tuples of (name, results_path). `results_path` is the path to a
        results.csv file, built from `root_dir`. `name` is the name of the
        directory that contains `<dir>`. Entries are ordered by a walk that
        visits sub-directories in sorted order.
    """
    results_list = []

    for root, dirs, _files in os.walk(root_dir):
        # Sorting in place fixes the order in which os.walk descends, so the
        # returned list does not depend on filesystem enumeration order.
        dirs.sort()

        # Resolve the directory first: basename() of '.' or of a path with a
        # trailing separator would otherwise give '.' or '' as the name.
        parent_dir = os.path.basename(os.path.abspath(root))

        for dir_name in dirs:
            results_path = os.path.join(root, dir_name, 'csv', 'results.csv')

            if os.path.isfile(results_path):
                results_list.append((parent_dir, results_path))

    return results_list

In [4]:
def compute_cosine_similarity(sentence1, sentence2):
    """
    Compute the cosine similarity between two sentences using TF-IDF.

    The vectorizer is fitted on just these two sentences, so the score reflects
    the terms the pair shares relative to each other.

    Args:
    sentence1 (str): The first sentence.
    sentence2 (str): The second sentence.

    Returns:
    float: The cosine similarity score between the two sentences. 0.0 is
        returned when the vectorizer finds no usable term in either sentence
        (for example when both are empty).

    Raises:
    ValueError: Any vectorizer error other than an empty vocabulary.
    """
    vectorizer = TfidfVectorizer()

    # Transform the sentences into TF-IDF vectors
    try:
        tfidf_matrix = vectorizer.fit_transform([sentence1, sentence2])
    except ValueError as exc:
        # scikit-learn reports "empty vocabulary; ..." when no token survives
        # tokenisation. With no terms there is nothing in common, so score 0
        # instead of aborting the caller's scoring loop.
        if 'empty vocabulary' not in str(exc):
            raise
        return 0.0

    # Compare row 0 (sentence1) with row 1 (sentence2); the result is 1x1
    similarity_matrix = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

    # Unwrap the single entry as a plain Python float
    return float(similarity_matrix[0][0])


def exact_match(prediction, reference) -> float:
    """
    Score a prediction against a reference with the ExactMatch metric.

    Args:
    prediction: The predicted answer, passed first to `ExactMatch.evaluate`.
    reference: The reference answer, passed second to `ExactMatch.evaluate`.

    Returns:
    Whatever `ExactMatch().evaluate(prediction, reference)` returns
    (presumably a float, per the annotation; confirm against ExactMatch).
    """
    # NOTE(review): the `from metrics.ExactMatch import ExactMatch` line in the
    # import cell is commented out; confirm ExactMatch is in scope before use.
    return ExactMatch().evaluate(prediction, reference)



def scores(responde_file, K):
    """
    Score every predicted answer against its ground-truth answer.

    Three scores are computed per row: the cosine similarity of the
    Sentence-BERT embeddings (`bert_score`), the TF-IDF cosine similarity
    (`cosine_score`) and the exact-match result (`exact_match`).

    :param responde_file: Mapping with the keys 'Prompt', 'GPT Response' and
        'Ground Truth Answer', each holding a sequence of the same length
        (for example a dictionary produced by load_csv_to_dict).
    :param K: Currently unused; kept so existing callers keep working.
    :return: pandas DataFrame with one row per answer pair and the columns
        index, question, predicted_answer, actual_answer, bert_score,
        cosine_score and exact_match. Nothing is written to disk here.
    :raises ValueError: If the three input sequences differ in length.
    """
    questions = responde_file['Prompt']
    predicted_answers = responde_file['GPT Response']
    correct_answers = responde_file['Ground Truth Answer']

    # Validate before loading the model so bad input fails fast
    if len(correct_answers) != len(predicted_answers):
        raise ValueError("The lists of correct and predicted answers must have the same length")
    # zip() below would otherwise silently drop rows if questions were shorter
    if len(questions) != len(correct_answers):
        raise ValueError("The list of questions must have the same length as the lists of answers")

    # Load the pre-trained Sentence-BERT model
    # NOTE(review): the `sentence_transformers` import in the import cell is
    # commented out; confirm SentenceTransformer and util are in scope.
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    columns = ['index', 'question', 'predicted_answer', 'actual_answer',
               'bert_score', 'cosine_score', 'exact_match']
    data_list = []

    # Compute the scores for each pair of answers
    for index, (correct, predicted, question) in enumerate(
            zip(correct_answers, predicted_answers, questions)):
        # Encode the sentences to get their embeddings
        embedding1 = model.encode(correct, convert_to_tensor=True)
        embedding2 = model.encode(predicted, convert_to_tensor=True)

        # Compute the cosine similarity between the embeddings
        bert_score = util.pytorch_cos_sim(embedding1, embedding2).item()
        cosine_score = compute_cosine_similarity(correct, predicted)
        # Keywords keep the roles aligned with exact_match(prediction, reference)
        exact_match_score = exact_match(prediction=predicted, reference=correct)

        data_list.append({'index': index,
                          'question': question,
                          'predicted_answer': predicted,
                          'actual_answer': correct,
                          'bert_score': bert_score,
                          'cosine_score': cosine_score,
                          'exact_match': exact_match_score
                          })

    # Explicit columns keep the header stable even when there are no rows
    return pd.DataFrame(data_list, columns=columns)


def calculate_bert_scores(file_path1, file_path2):
    """
    Calculate the BERT scores between corresponding predicted answers in two result files.

    Row i of the first file is compared with row i of the second file. The
    result is written to 'data/results_cross_scores.csv' (the 'data' directory
    is created if needed) and also returned.

    :param file_path1: Path to the first CSV file; must have a 'predicted_answer' column.
    :param file_path2: Path to the second CSV file; must have a 'predicted_answer' column.
    :return: DataFrame with the columns Index, Predicted_Answer_File1,
        Predicted_Answer_File2 and BERT_Score, one row per compared pair.
    :raises ValueError: If the two files do not have the same number of rows.
    """
    # Load the CSV files
    df1 = pd.read_csv(file_path1)
    df2 = pd.read_csv(file_path2)

    # Check the row counts before loading the model so bad input fails fast
    if len(df1) != len(df2):
        raise ValueError("Both files must have the same number of rows for 1-to-1 comparison.")

    # Load the pre-trained Sentence-BERT model
    # NOTE(review): the `sentence_transformers` import in the import cell is
    # commented out; confirm SentenceTransformer and util are in scope.
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    # List to hold the score data
    scores_list = []

    # Iterate over the rows in both DataFrames by position
    for index, (predicted1, predicted2) in enumerate(zip(df1['predicted_answer'], df2['predicted_answer'])):
        # Encode the predicted answers to get their embeddings
        embedding1 = model.encode(predicted1, convert_to_tensor=True)
        embedding2 = model.encode(predicted2, convert_to_tensor=True)

        # Compute the cosine similarity between the embeddings
        bert_score = util.pytorch_cos_sim(embedding1, embedding2).item()

        scores_list.append({
            'Index': index,
            'Predicted_Answer_File1': predicted1,
            'Predicted_Answer_File2': predicted2,
            'BERT_Score': bert_score
        })

    # Create a DataFrame from collected data
    result_df = pd.DataFrame(scores_list)

    # to_csv does not create missing directories, so make sure 'data' exists
    os.makedirs('data', exist_ok=True)
    result_df.to_csv('data/results_cross_scores.csv', index=False)

    return result_df



In [5]:
# Directory in which the search for results.csv files starts; it should be the
# folder containing the results folder.
root_directory = '.'
results_list = load_results_csv_paths(root_directory)

# Load every results.csv found above into its own dictionary.
# csv_list then holds one dictionary per CSV file, whose fields are 'name'
# (experiment description) and the CSV headers.
csv_list = [load_csv_to_dict(path, name) for name, path in results_list]

# example print statements to show some data from the first csv dictionary in csv_list
# print("Experiment name:", csv_list[0]['name'])
# print("Number of Ground Truth Answers:", len(csv_list[0]['Ground Truth Answer']))
# print("First GPT Response:", csv_list[0]['GPT Response'][0])
# print("First Question ID:", csv_list[0]['Question ID'][0])
# print("First Prompt:", csv_list[0]['Prompt'][0])

In [None]:
# Ensure the output directory for the score files exists
os.makedirs('scores', exist_ok=True)

# Score each loaded experiment and write one CSV per experiment name
for csv_file in csv_list:
    data = scores(csv_file, 0)
    output_path = f'scores/{csv_file["name"]}_scores.csv'
    data.to_csv(output_path, index=False)
    print(f'scores for {csv_file["name"]} saved to {output_path}')
