In [31]:
import os
import re
import json
import pickle
from dotenv import load_dotenv
import regex
import string
import torch
import pandas as pd
from typing import List, Dict

In [32]:
# Load environment variables
load_dotenv(override=True)

# General variables
seed = int(os.getenv("SEED"))
batch_size = int(os.getenv("BATCH_SIZE"))
save_every = int(os.getenv("SAVE_EVERY"))

# Retrieval variables
queries_path = os.getenv("QUERIES_PATH")

# LLM variables
llm_id = os.getenv("LLM_ID")
num_docs = int(os.getenv("TOP_K"))
llm_response_dir = os.getenv("LLM_RESPONSE_DIR")

# Evaluation variables
evaluation_dir = os.getenv("EVALUATION_DIR")

K = 1
HARD_NEG = 2
RANDOM = 0
directory = f"../data/CS4360-NLP-result-files/contriever_docs_k_is_{K}_random_{RANDOM}_hard_neg_{HARD_NEG}"
filename_prefix = f"contriever_k_{K}_doc_{HARD_NEG}_neg_{RANDOM}_ran_info_"


In [33]:
llm_folder = llm_id.split("/")[1] if '/' in llm_id else llm_id
# directory = f"{llm_response_dir}{llm_folder}/oracle_2_negative/docs_k=1_random_0_hard_neg_2"

evaluation_directory = os.path.join(evaluation_dir, f"{llm_folder}")
print("Directory: ", directory)

Directory:  ../data/CS4360-NLP-result-files/contriever_docs_k_is_3_random_0_hard_neg_2


In [34]:
# Utility functions
def extract_number_from_filename(filename: str, pattern: re.Pattern) -> int:
    match = pattern.search(filename)
    return int(match.group(1)) if match else 0

def convert_tensors(cell):
    if isinstance(cell, list):
        return [[t.tolist() if torch.is_tensor(t) else t for t in inner_list] for inner_list in cell]
    return cell

def read_json(file_path: str):
    with open(file_path, "rb") as reader:
        data = json.load(reader)
    return data


def write_json(data, file_path: str):
    with open(file_path, "w") as writer:
        json.dump(data, writer)

In [35]:
def load_pickle_files(directory: str, filename_prefix: str) -> pd.DataFrame:
    """ Loads and concatenates data from all pickle files in the directory with the given prefix. """
    pattern = re.compile(r'(\d+).pkl')
    files = [f for f in os.listdir(directory) if f.endswith('.pkl') and filename_prefix in f]
    files.sort(key=lambda f: extract_number_from_filename(f, pattern))

    print(files)

    data_list = []
    for file in files:
        with open(os.path.join(directory, file), 'rb') as f:
            data = pickle.load(f)
            data_list.extend(data)
    
    data_df = pd.DataFrame(data_list)
    # data_df['document_indices'] = data_df['document_indices'].apply(convert_tensors)

    if 'prompt_tokens_len' in data_df.columns:
        data_df['prompt_tokens_len'] = data_df['prompt_tokens_len'].apply(lambda x: x.tolist())
    return data_df

In [36]:
def save_data_to_json(data_df: pd.DataFrame, directory: str, filename_prefix: str):
    """ Saves the given DataFrame to a JSON file. """
    data_path = os.path.join(directory, f'{filename_prefix}all.json')
    # Check if the file already exists
    if os.path.exists(data_path):
        overwrite = input(f"File {data_path} already exists. Overwrite? (y/n): ")
        if overwrite.lower() != 'y':
            print("No overwrite.")

            results_df = pd.read_json(f'{evaluation_directory}/{filename_prefix}all_extended.json')
            accuracy = round(results_df['ans_match_after_norm'].sum() / len(results_df), 4)
            print("ACCURACY: ", accuracy)
            return None
        
    data_df.to_json(data_path, orient='records', default_handler=str)
    return data_path

In [37]:
data_df = load_pickle_files(directory, filename_prefix)
data_path = save_data_to_json(data_df, directory, filename_prefix)

print(data_path)

if data_path:
    print(f"Data saved to {data_path}.")
    print("Data shape: ", data_df.shape)
    print("Data columns: ", data_df.columns)
    print("Data sample: ", data_df.head())

['contriever_k_3_doc_2_neg_0_ran_info_10.pkl', 'contriever_k_3_doc_2_neg_0_ran_info_20.pkl', 'contriever_k_3_doc_2_neg_0_ran_info_30.pkl', 'contriever_k_3_doc_2_neg_0_ran_info_40.pkl', 'contriever_k_3_doc_2_neg_0_ran_info_50.pkl', 'contriever_k_3_doc_2_neg_0_ran_info_60.pkl', 'contriever_k_3_doc_2_neg_0_ran_info_70.pkl', 'contriever_k_3_doc_2_neg_0_ran_info_80.pkl', 'contriever_k_3_doc_2_neg_0_ran_info_90.pkl', 'contriever_k_3_doc_2_neg_0_ran_info_100.pkl', 'contriever_k_3_doc_2_neg_0_ran_info_110.pkl', 'contriever_k_3_doc_2_neg_0_ran_info_120.pkl', 'contriever_k_3_doc_2_neg_0_ran_info_130.pkl', 'contriever_k_3_doc_2_neg_0_ran_info_140.pkl', 'contriever_k_3_doc_2_neg_0_ran_info_150.pkl', 'contriever_k_3_doc_2_neg_0_ran_info_160.pkl', 'contriever_k_3_doc_2_neg_0_ran_info_170.pkl', 'contriever_k_3_doc_2_neg_0_ran_info_180.pkl', 'contriever_k_3_doc_2_neg_0_ran_info_190.pkl', 'contriever_k_3_doc_2_neg_0_ran_info_200.pkl', 'contriever_k_3_doc_2_neg_0_ran_info_210.pkl', 'contriever_k_3_doc_2

In [38]:
df = pd.read_json(queries_path)

In [39]:
"""
adapted from chemdataextractor.text.normalize
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tools for normalizing text.
https://github.com/mcs07/ChemDataExtractor
:copyright: Copyright 2016 by Matt Swain.
:license: MIT

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
'Software'), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""

#: Control characters.
CONTROLS = {
    '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008', '\u000e', '\u000f', '\u0011',
    '\u0012', '\u0013', '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', '\u001a', '\u001b',
}
# There are further control characters, but they are instead replaced with a space by unicode normalization
# '\u0009', '\u000a', '\u000b', '\u000c', '\u000d', '\u001c',  '\u001d', '\u001e', '\u001f'


#: Hyphen and dash characters.
HYPHENS = {
    '-',  # \u002d Hyphen-minus
    '‐',  # \u2010 Hyphen
    '‑',  # \u2011 Non-breaking hyphen
    '⁃',  # \u2043 Hyphen bullet
    '‒',  # \u2012 figure dash
    '–',  # \u2013 en dash
    '—',  # \u2014 em dash
    '―',  # \u2015 horizontal bar
}

#: Minus characters.
MINUSES = {
    '-',  # \u002d Hyphen-minus
    '−',  # \u2212 Minus
    '－',  # \uff0d Full-width Hyphen-minus
    '⁻',  # \u207b Superscript minus
}

#: Plus characters.
PLUSES = {
    '+',  # \u002b Plus
    '＋',  # \uff0b Full-width Plus
    '⁺',  # \u207a Superscript plus
}

#: Slash characters.
SLASHES = {
    '/',  # \u002f Solidus
    '⁄',  # \u2044 Fraction slash
    '∕',  # \u2215 Division slash
}

#: Tilde characters.
TILDES = {
    '~',  # \u007e Tilde
    '˜',  # \u02dc Small tilde
    '⁓',  # \u2053 Swung dash
    '∼',  # \u223c Tilde operator #in mbert vocab
    '∽',  # \u223d Reversed tilde
    '∿',  # \u223f Sine wave
    '〜',  # \u301c Wave dash #in mbert vocab
    '～',  # \uff5e Full-width tilde #in mbert vocab
}

#: Apostrophe characters.
APOSTROPHES = {
    "'",  # \u0027
    '’',  # \u2019
    '՚',  # \u055a
    'Ꞌ',  # \ua78b
    'ꞌ',  # \ua78c
    '＇',  # \uff07
}

#: Single quote characters.
SINGLE_QUOTES = {
    "'",  # \u0027
    '‘',  # \u2018
    '’',  # \u2019
    '‚',  # \u201a
    '‛',  # \u201b

}

#: Double quote characters.
DOUBLE_QUOTES = {
    '"',  # \u0022
    '“',  # \u201c
    '”',  # \u201d
    '„',  # \u201e
    '‟',  # \u201f
}

#: Accent characters.
ACCENTS = {
    '`',  # \u0060
    '´',  # \u00b4
}

#: Prime characters.
PRIMES = {
    '′',  # \u2032
    '″',  # \u2033
    '‴',  # \u2034
    '‵',  # \u2035
    '‶',  # \u2036
    '‷',  # \u2037
    '⁗',  # \u2057
}

#: Quote characters, including apostrophes, single quotes, double quotes, accents and primes.
QUOTES = APOSTROPHES | SINGLE_QUOTES | DOUBLE_QUOTES | ACCENTS | PRIMES

def normalize(text):
    for control in CONTROLS:
        text = text.replace(control, '')
    text = text.replace('\u000b', ' ').replace('\u000c', ' ').replace(u'\u0085', ' ')

    for hyphen in HYPHENS | MINUSES:
        text = text.replace(hyphen, '-')
    text = text.replace('\u00ad', '')

    for double_quote in DOUBLE_QUOTES:
        text = text.replace(double_quote, '"')  # \u0022
    for single_quote in (SINGLE_QUOTES | APOSTROPHES | ACCENTS):
        text = text.replace(single_quote, "'")  # \u0027
    text = text.replace('′', "'")     # \u2032 prime
    text = text.replace('‵', "'")     # \u2035 reversed prime
    text = text.replace('″', "''")    # \u2033 double prime
    text = text.replace('‶', "''")    # \u2036 reversed double prime
    text = text.replace('‴', "'''")   # \u2034 triple prime
    text = text.replace('‷', "'''")   # \u2037 reversed triple prime
    text = text.replace('⁗', "''''")  # \u2057 quadruple prime

    text = text.replace('…', '...').replace(' . . . ', ' ... ')  # \u2026

    for slash in SLASHES:
        text = text.replace(slash, '/')

    for tilde in TILDES:
       text = text.replace(tilde, '~')

    return text


In [40]:
# Normalization adapted from SQuAD evaluation script https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/
def remove_articles(text: str) -> str:
    """
    Removes articles ('a', 'an', 'the') from the text.
    """
    return regex.sub(r'\b(a|an|the)\b', ' ', text)

def white_space_fix(text: str) -> str:
    """
    Fixes extra whitespace in the text by collapsing multiple spaces into one.
    """
    return ' '.join(text.split())

def remove_punc(text: str) -> str:
    """
    Removes punctuation from the text and replaces it with a space.
    """
    for punct in string.punctuation:
        text = text.replace(punct, ' ')
    return text

def lower(text: str) -> str:
    """
    Converts all characters in the text to lowercase.
    """
    return text.lower()

def normalize_answer(s: str, lowercase: bool = True) -> str:
    """
    Normalizes answers by removing articles, punctuation, fixing whitespace, and optionally converting to lowercase.
    """
    if lowercase:
        s = lower(s)
    s = normalize(s)
    return white_space_fix(remove_articles(remove_punc(s)))

In [41]:
def are_answers_matching(prediction: str, ground_truth: str) -> float:
    normalized_prediction = normalize_answer(prediction)

    normalized_ground_truth = normalize_answer(ground_truth)
    if normalized_ground_truth in normalized_prediction:
        return True
    
    return False

In [42]:
def is_answer_in_text(text: str, answer: str) -> bool:
    """
    Checks if any of the provided answers are present in the given text after normalization.
    """
    normalized_answer_lower = normalize_answer(answer, lowercase=True)
    normalized_answer = normalize_answer(answer, lowercase=False)
    normalized_text = white_space_fix(remove_punc(text))

    if (answer in text or 
        normalized_answer_lower in normalized_text or 
        normalized_answer in normalized_text):
        return True
    
    return False

In [43]:
def read_generation_results(file_path: str, dataframe: pd.DataFrame) -> List[Dict]:
    data = []
    with open(file_path, "r") as fin:
        file_data = json.load(fin)

        for _, sample in enumerate(file_data):
            id = sample['id'][0] 
            query = sample['query'][0]
            prompt = sample['prompt'][0]
            # document_indices = list(list(zip(*sample['document_indices']))[0])
            prompt_tokens_len = sample['prompt_tokens_len'][0]
            generated_answer = sample['generated_answer'][0]
            
            # print(id)
            answer = dataframe[dataframe['_id'] == str(id)]["answer"].values[0]

            ans_match_after_norm: bool = are_answers_matching(generated_answer, answer)
            ans_in_documents: bool = is_answer_in_text(prompt, answer)
            
            data.append({
                'example_id': str(id),
                'query': query,
                'prompt': prompt,
                # 'document_indices': document_indices,
                'generated_answer': generated_answer,
                'answer': answer,
                'ans_match_after_norm': ans_match_after_norm,
                'ans_in_documents': ans_in_documents,
                "prompt_tokens_len": prompt_tokens_len,
                })       

    return data


In [44]:
results = read_generation_results(data_path, df) # id, query, prompt, document_indices, prompt_tokens_len, generated_answer

In [45]:
print(f"EVALUATED EXPERIMENT CONTRIEVER WITH K={K}, HARD_NEG={HARD_NEG}, RANDOM={RANDOM}")
print(f"READ DIRECTORY {directory}")
results_df = pd.DataFrame(results)
matches = results_df['ans_match_after_norm'].sum()
accuracy = round(matches / len(results_df), 4)
print("ACCURACY: ", accuracy)

num_samples_with_answer = results_df[results_df['ans_in_documents'] == True]
relative_accuracy = round(matches / len(num_samples_with_answer), 4)
print("RELATIVE ACCURACY: ", relative_accuracy)

os.makedirs(evaluation_directory, exist_ok=True)
results_df.to_json(os.path.join(evaluation_directory, f'{filename_prefix}all_extended.json'), orient='records')

EVALUATED EXPERIMENT CONTRIEVER WITH K=3, HARD_NEG=2, RANDOM=0
READ DIRECTORY ../data/CS4360-NLP-result-files/contriever_docs_k_is_3_random_0_hard_neg_2
ACCURACY:  0.0125
RELATIVE ACCURACY:  0.163
