In [1]:
import json
import openai
import random
from typing import Any
from tqdm import tqdm

def load_json(path: str) -> Any:
    """
    read a utf-8 encoded json file and return the decoded content.

    args:
        path (str): the file path of the json file to read.

    returns:
        Any: the python object decoded from the file.
    """
    with open(path, mode='r', encoding='utf-8') as handle:
        content = json.load(handle)
    return content
    
def save_json(path: str, data: Any) -> None:
    """
    serialize `data` as json (4 space indentation) into a utf-8 encoded file.

    an existing file at `path` is overwritten.

    args:
        path (str): the file path to write to.
        data (Any): the json serializable object to store.
    """
    with open(path, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, indent=4)

the `DevQuestions` class loads and manages questions from a json file. it contains a nested `Question` class for individual questions with an id, question text, and answer. methods are provided to get the number of questions and access questions by index or id.

In [2]:
class DevQuestions:
    class Question:
        def __init__(self, identifier: str, question: str, answer: str):
            """
            initialize a Question instance.

            Args:
                identifier (str): the identifier for the question.
                question (str): the question text.
                answer (str): the answer text.
            """
            self.id = identifier
            self.question = question
            self.answer = answer

    def __init__(self, path: str):
        """
        initialize the DevQuestions instance by loading questions from a JSON file.

        Args:
            path (str): the file path to the JSON file containing the questions.
        """
        # load json file
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        self.questions = {}
        self.lut = []

        for question in data:
            self.lut.append(question['_id'])
            self.questions[question['_id']] = self.Question(question['_id'], question['question'], question['answer'])

    def __len__(self):
        """
        return the number of questions.

        Returns:
            int: the number of questions.
        """
        return len(self.questions)

    def __getitem__(self, key: int | str):
        """
        retrieve a question by index or id.

        Args:
            key (int | str): the idx or id of the question.

        Returns:
            DevQuestions.Question: the question corresponding to the given index or id.
        """
        if isinstance(key, int):
            return self.questions.get(self.lut[key])
        elif isinstance(key, str):
            return self.questions[key]

the `QA` class manages and prompts questions based on given contexts. it loads questions from a specified path and allows setting a system prompt. methods include asking questions by context, prompting the model, setting the system prompt, getting a context by name, and adding new contexts.

In [3]:
class QA:
    def __init__(self, path_to_dev: str, model:str='gpt-3.5-turbo', system_prompt:str='', actually_prompt:bool= True):
        """
        initialize a QA instance

        args:
            path_to_dev (str): the file path to the development question.
            system_prompt (str, optional): the system prompt to use. defaults to ''.
            actually_prompt (bool, optional): whether to actually prompt the model. defaults to True
        """
        self.client = openai.Client(api_key='sk-proj-OD9UpwZjabyMJ0bPtPnMT3BlbkFJKDkmftX3AjUy6zp6di0P')
        self.system_prompt = system_prompt
        self.actually_prompt = actually_prompt
        self.dev = DevQuestions(path_to_dev)
        self.contexts = {}
        self.model = model

    def ask_by_context(self, context_name:str, num_questions:int=0, k:int=0, write_to_file:tuple[bool, str]=(False, None)) -> dict:
        """
        ask questions by context.

        args:
            context_name (str): the name of the context
            num_questions (int, optional): the number of questions to ask. defaults to 0 (all questions)
            k (int, optional): the number of contexts to include in the prompt. defaults to 0, which includes (all available contexts)
            write_to_file (tuple[bool, str], optional): write the prompts, responses, and ground truth to a file. defaults to (False, None)

        returns:
            dict: a dictionary containing prompts, answers, and responses.
        """
        if context_name not in self.contexts:
            raise ValueError('context not found!')
        if num_questions <= 0 or num_questions > len(self.contexts[context_name]):
            num_questions = len(self.contexts[context_name])
            print(f"WARNING: num_questions must be between 1 and {num_questions}! Setting num_questions to {num_questions}.")

        prompts, answers, responses = [], [], []
        k_adjustments = 0

        for c, (q_id, contexts) in tqdm(enumerate(self.contexts[context_name].items()), total=num_questions, desc="Processing Questions"):
            if c == num_questions:
                break

            k_ = k

            question = self.dev[q_id]

            if k <= 0 or k > len(contexts):
                k_ = len(contexts)
                if k_ != 0: k_adjustments += 1

            contexts = contexts[:k_]

            prompt = self._create_prompt(contexts, question.question)
            prompts.append(prompt)
            answers.append(question.answer)
            responses.append(self.ask(prompt) if self.actually_prompt else 'dummy response! ;)')

        if write_to_file[0]: self._write_to_file(write_to_file[1], prompts, responses, answers)

        if k_adjustments:
            print(f"WARNING: total adjustments of 'k': {k_adjustments} of {len(prompts)}")
            print("this warning is because the specified k is outside the valid range or exceeds available contexts.")

        return {'prompts': prompts, 'answers': answers, 'responses': responses}

    def ask_by_mixed_context(self, context_name1:str, context_name2:str, num_questions:int=0, k1:int= 0, k2:int=0, write_to_file:tuple[bool, str]=(False, None)) -> dict:
        """
        ask questions by using the ids from context_name1 and randomly add contexts from context_name2.

        args:
            context_name1 (str): the name of the first context.
            context_name2 (str): the name of the second context.
            num_questions (int, optional): the number of questions to ask. defaults to 0 (all questions).
            k1 (int, optional): the number of contexts to include from the first context dictionary. defaults to 0 (all available contexts)
            k2 (int, optional): the number of contexts to include from the second context dictionary. defaults to 0 (all available contexts)
            write_to_file (tuple[bool, str], optional): write the prompts, responses, and ground truth to a file. defaults to (false, none).

        returns:
            dict: a dictionary containing prompts, answers, and responses.
        """
        if set(list(self.contexts[context_name1].keys())) != set(list(self.contexts[context_name2].keys())):
            raise ValueError('context keys do not match!')

        if context_name1 not in self.contexts or context_name2 not in self.contexts:
            raise ValueError('context not found!')
        
        if num_questions <= 0 or num_questions > len(self.contexts[context_name1]):
            num_questions = len(self.contexts[context_name1])
            print(f"WARNING: num_questions must be between 1 and {num_questions}! Setting num_questions to {num_questions}.")

        prompts, answers, responses = [], [], []
        k1_adjustments = 0
        k2_adjustments = 0
        k1_ = k1
        k2_ = k2

        for c, (q_id, contexts1) in tqdm(enumerate(self.contexts[context_name1].items()), total=num_questions, desc="Processing Questions"):
            if c == num_questions:
                break

            question = self.dev[q_id]
            contexts2 = self.contexts[context_name2].get(q_id, [])

            if k1 <= 0 or k1 > len(contexts1):
                k1_ = len(contexts1)
                if k1 != 0: k1_adjustments += 1
            
            if k2 <= 0 or k2 > len(contexts2):
                k2_ = len(contexts2)
                if k2 != 0: k2_adjustments += 1

            combined_contexts = contexts1[:k1_] + contexts2[:k2_]
            random.shuffle(combined_contexts)

            prompt = self._create_prompt(combined_contexts, question.question)
            
            prompts.append(prompt)
            answers.append(question.answer)
            responses.append(self.ask(prompt) if self.actually_prompt else 'dummy response! ;)')

        if write_to_file[0]: self._write_to_file(write_to_file[1], prompts, responses, answers)

        if k1_adjustments:
            print(f"WARNING: total adjustments of 'k1': {k1_adjustments} of {len(prompts)}")
            print("this warning is because the specified k1 is outside the valid range or exceeds available contexts.")
        if k2_adjustments:
            print(f"WARNING: total adjustments of 'k2': {k2_adjustments} of {len(prompts)}")
            print("this warning is because the specified k2 is outside the valid range or exceeds available contexts.")

        return {'prompts': prompts, 'answers': answers, 'responses': responses}

    def ask_by_context_without_context(self, context_name: str, num_questions: int = 0, write_to_file: tuple[bool, str] = (False, None)) -> dict:
        """
        ask questions by context without using the context.

        args:
            context_name (str): the name of the context.
            num_questions (int, optional): the number of questions to ask. defaults to 0 (all questions)
            write_to_file (tuple[bool, str], optional): write the prompts, responses, and ground truth to a file. defaults to (False, None)

        returns:
            dict: a dictionary containing prompts, answers, and responses.
        """
        if context_name not in self.contexts:
            raise ValueError('context not found!')
        
        if num_questions <= 0 or num_questions > len(self.contexts[context_name]):
            num_questions = len(self.contexts[context_name])
            print(f"WARNING: num_questions must be between 1 and {num_questions}! Setting num_questions to {num_questions}.")

        prompts, answers, responses = [], [], []

        for c, (q_id, _) in enumerate(self.contexts[context_name].items()):
            if c == num_questions:
                break

            question = self.dev[q_id]
            prompt = question.question

            prompts.append(prompt)
            answers.append(question.answer)
            responses.append(self.ask(prompt) if self.actually_prompt else 'dummy response! ;)')

        if write_to_file[0]:
            self._write_to_file(write_to_file[1], prompts, responses, answers)

        return {'prompts': prompts, 'answers': answers, 'responses': responses}

    def ask(self, prompt: str) -> str:
        """
        ask a question using the given prompt.

        args:
            prompt (str): the prompt to ask.

        returns:
            str: the response from the model.
        """
        if not isinstance(prompt, str):
            raise TypeError('prompt must be a string!')

        completion = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": prompt}
            ]
        )
        return completion.choices[0].message.content

    def set_system_prompt(self, prompt: str):
        """
        sets the system prompt for gpt to use

        args:
            prompt (str): the prompt to set.
        """
        if not isinstance(prompt, str):
            raise TypeError('prompt must be a string!')
        self.system_prompt = prompt

    def get_context(self, name: str) -> dict:
        """
        get a context by name

        args:
            name (str): the name of the context.

        returns:
            dict: the context dictionary.
        """
        if not isinstance(name, str) or name not in self.contexts:
            raise TypeError('context not found!')
        return self.contexts.get(name)

    def print_context_names(self):
        """
        print the names of all contexts available
        """
        for name in self.contexts.keys():
            print(name)

    def add_context(self, context:dict, name:str):
        """
        add a context. it should be a dictionary with the following structure:
        {
            'question_idA': ['contextA1', 'contextA2', ...],
            'question_idB': ['contextB1', 'contextB2', ...],
            ...
        }

        args:
            context (dict): the context to add.
            name (str): the name of the context.
        """
        if not isinstance(context, dict):
            raise TypeError('context must be a dictionary!')
        if not isinstance(name, str):
            raise TypeError('name must be a string!')
        if name in self.contexts:
            raise ValueError('context already exists!')
        if not all(isinstance(k, str) and isinstance(v, list) for k, v in context.items()):
            raise TypeError('invalid format!')

        self.contexts[name] = context

    def _create_prompt(self, contexts:list[str], question:str) -> str:
        """
        create a prompt from the contexts and question.

        args:
            contexts (list[str]): the contexts
            question (str): the questionn
        """
        prompt = ''
        for context in contexts:
            prompt += context + '\n\n'
        prompt += question
        return prompt

    def _write_to_file(self, file_path:str, prompts:list[str], responses:list[str], answers:list[str]):
        """
        write the prompts, responses, and ground truth to a file for debuging

        args:
            file_path (str): the file path to write to.
            prompts (list[str]): the prompts
            responses (list[str]): the responses
            answers (list[str]): the groundtruths
        """
        with open(file_path, 'w', encoding='utf-8') as file:
            for i in range(len(prompts)):
                file.write(f"Prompt {i+1}:\n{prompts[i]}\n\n")
                file.write(f"Response {i+1}:\n{responses[i]}\n\n")
                file.write(f"Ground Truth {i+1}:\n{answers[i]}\n\n")
                file.write("----------------------------------\n\n")


top10 contains the top 10 contexts for 1199 questions retrieved by matteo.

**in general** contexts should now have the following format:

some_context = 
{
  'question_idA': ['contextA1', 'contextA2', ...],
  'question_idB': ['contextB1', 'contextB2', ...],
  ...
}

to be compatible with the `QA` class.

In [4]:
# load the retrieved contexts and the full dev set (paths are relative to the
# working directory of the notebook)

top10 = load_json('data/results_10.json')
# `keys` is a list of the top-level keys of results_10.json; the cells below
# compare each question's '_id' against it
keys = list(top10.keys())
_dev = load_json('data/_dev.json')

the following two cells build the gold contexts and the hard negative contexts by using the supporting facts from the `dev.json` file **for all questions used in top10 by matteo**.

It looks up the supporting facts / the title of the gold context for each question and adds the sentences with the same title under the context field to the gold context dictionary.

The hard negative context is built by iterating over all questions and checking if the title of the contexts retrieved by the contriever show up in the supporting facts of the question. If not, the context is added to the hard negative context dictionary.

**NOTE: I DID NOT THOROUGHLY CHECK IF ITS ALWAYS CORRECT, BUT I THINK BOTH CELLS WORK! ONE THING IS THAT THERE ARE OFTEN TWO IDENTICAL CONTEXTS RETRIEVED BY THE CONTRIEVER. THEY HAVE SOME DIFFERENT FORMATTING (SOME HAVE A SPACE AT THE END, SOME NOT ETC.). THIS COULD BE A PROBLEM SINCE THE HARD NEGATIVE CELL COMPARES THE TITLES OF THE CONTEXTS (EG. THE STRING FROM START TO THE FIRST \n) IN TOP10 WITH THE SUPPORTING FACTS OF THE QUESTIONS.**

In [34]:
# build the gold context: for every question in top10, keep those dev contexts
# whose title is named in the supporting facts of the question
gold_context = {}
top10_ids = set(keys)  # set membership instead of scanning the `keys` list for every question

for question in _dev:  # iterate over each question in the full dataset
    question_id = question['_id']
    if question_id not in top10_ids: continue  # skip if the question is not part of the ones from top10.json

    # the first entry of every supporting fact is the title of a gold context;
    # collect the titles once per question instead of once per context
    supporting_titles = {sf[0] for sf in question['supporting_facts']}

    # every context is a pair of title and list of sentences; the context string
    # is the title, a newline and the sentences without separators
    gold_context[question_id] = [
        context[0] + '\n' + ''.join(context[1])
        for context in question['context']
        if context[0] in supporting_titles
    ]

save_json('data/gold_context.json', gold_context)

In [35]:
# build the hard negatives: for every question in top10, keep those retrieved
# contexts whose title is NOT named in the supporting facts of the question
hard_negatives = {}
top10_ids = set(keys)  # set membership instead of scanning the `keys` list for every question

for question in _dev: # iterate over each question in the _dev dataset
    question_id = question['_id']
    if question_id not in top10_ids: continue # skip if the question is not part of the ones from top10.json

    # collect the gold titles once per question; surrounding whitespace is ignored
    # on both sides so that a stray space does not turn a gold context into a
    # hard negative
    supporting_titles = {sf[0].strip() for sf in question['supporting_facts']}

    # the title of a retrieved context is the text before its first newline.
    # the retrieved context itself is stored unchanged.
    hard_negatives[question_id] = [
        retrieved
        for retrieved in top10[question_id]
        if retrieved.split('\n')[0].strip() not in supporting_titles
    ]

save_json('data/hard_negatives.json', hard_negatives)

the following cell loads the different contexts from the `.json` file and adds them to the `QA` class.
I also generated a "gibberish" context which is just random characters to test the `QA.ask_by_mixed_context()` method.

**IMPORTANT:** The structure of the final prompt is as follows:
- Title of the context 1
- linebreak
- context 1 sentences without linebreaks
- linebreak
- linebreak
- title of the context 2
- linebreak
- context 2 sentences without linebreaks
- ...
- linebreak
- linebreak
- question

A short example of the functionality...

First load the contexts and questions from the JSON files...

In [4]:
# load the context dictionaries that are registered with the QA object below
hard_negatives = load_json('data/hard_negatives.json')
gold_context = load_json('data/gold_context.json')
top10 = load_json('data/results_10.json')
# gibberish = load_json('project/json_thingy/gibberish.json') # load gibberish dict from json

# collapse the double newlines that matteo added to the retrieved contexts;
# the lists are updated in place
for contexts in top10.values():
    contexts[:] = [text.replace('\n\n', '\n') for text in contexts]

create a `QA` object. this will load **ALL** questions in `dev.json`. The `model` argument specifies which openai model to use. The `system_prompt` argument is used to set the system prompt, which can also be done later with the `set_system_prompt()` method. `actually_prompt` is a boolean that determines if the model should be prompted or not. If set to `False`, the model just returns some dummy responses.

the class internally works with the contexts and questions in the following way:

when a `QA` instance is initialized, it loads the `dev.json` questions from the JSON file into a `DevQuestions` object. this object stores each question as an instance of the nested `Question` class, which includes the question's identifier, text and answer. contexts for these questions are stored in a dictionary within the `QA` instance, where each key is the name of a context and each value is a dictionary that maps question ids to lists of context strings.

the questions actually getting asked are always based on the contexts. The keys in the context dictionary tell the QA class which question to look up in the `DevQuestions` object. the corresponding contexts are then used to prompt the model.

Use `add_context()` to add a new context to the QA object. The context should be a dictionary with question ids as keys and a list of context strings as values. it also takes a name for the context. for example, using

```python
qa.add_context(name='example_context', context={'question_idA': ['contextA1', 'contextA2'], 'question_idB': ['contextB1', 'contextB2']})
```


the `ask_by_context` asks questions based on a specific context. provide the name of the context, the number of questions to ask and the number of contexts to include in the prompt. If the number of questions or contexts `k` is not specified or is out of range the method uses all available questions / contexts. the prompts, responses and answers can be written to a .txt file for debugging and to check if everything works. for example, using

```python
results = qa.ask_by_context(context_name='example_context', num_questions=5, k=3, write_to_file=(True, 'output.txt'))
```

will ask 5 questions using the `example_context`, including up to 3 contexts for each question (if a question has fewer than 3 contexts, all of its contexts are included), and save the results to `output.txt`.

the `ask_by_mixed_context` method combines contexts from two different context dictionaries. It uses the question ids from `context_name1` and, for each id, adds the contexts stored under the same id in `context_name2`. pass the number of questions to ask and the number of contexts to include from each dictionary (`k1`, `k2`); the first `k1` / `k2` contexts of each list are taken. the contexts from both dictionaries are combined and shuffled before the prompt is constructed, so only their order is random. if needed, the prompts, responses, and answers can be saved to a file. again, if `k1` or `k2` are not specified or out of range, the method uses all available contexts. for example, using

```python
results = qa.ask_by_mixed_context(context_name1='example_context', context_name2='noisy_context', num_questions=5, k1=2, k2=2, write_to_file=(True, 'mixed_output.txt'))
```

will ask 5 questions specified by the ids in the `example_context` and add 2 contexts from it. additionally it adds 2 contexts for the same question from the `noisy_context` dictionary and shuffles the order of all 4 contexts. both dictionaries must contain the same question ids as keys.

The `ask_by_context_without_context` just asks the questions without any context. Pretty lame.



In [5]:
# create the QA object in dry-run mode (actually_prompt=False): no requests are
# sent to the model until `qa.actually_prompt` is set to True
qa = QA('data/_dev.json', model='gpt-3.5-turbo', system_prompt='Q: ', actually_prompt=False)

# replace the initial system prompt. adjacent string literals are concatenated
# without a separator, so every part that continues a sentence has to end with
# a space of its own
qa.set_system_prompt(
    "For this task, you should provide a factoid answer. This means that you are limited to returning the exact answer to the question "
    "(which might be a person's name, a date, a place and so on), without any additional word and without putting the factoid answer in "
    "a sentence. For instance, if the question is \"Who was the lead singer of the rock band Queen?\", you should reply \"Freddy Mercury\", "
    "and not \"The lead singer of the band Queen was Freddy Mercury\"."
)

# register the context dictionaries under the names used by the ask_by_* calls
qa.add_context(gold_context, 'gold_context')
qa.add_context(hard_negatives, 'hard_negatives')
qa.add_context(top10, 'top10')
# qa.add_context(gibberish, 'gibberish')

In [19]:
# dry run: with actually_prompt=False every response is a dummy string and no
# request is sent to the model
qa.actually_prompt = False
# the first element of write_to_file is False, so no file is written
a = qa.ask_by_context('gold_context', num_questions=10, k=0, write_to_file=(False, 'qa_output.txt'))
# NOTE: this overwrites `a`, the result of the previous call is discarded
a = qa.ask_by_context_without_context('gold_context', num_questions=10, write_to_file=(False, 'qa_output_nc.txt'))
# a = qa.ask_by_mixed_context('top10', 'gibberish', num_questions=10, k1=3, k2=2, write_to_file=(False, 'qa_output_mixed.txt'))

Processing Questions: 100%|██████████| 10/10 [00:00<00:00, 9988.82it/s]






In [20]:
# print the names of all contexts that were registered with add_context()
qa.print_context_names()

gold_context
hard_negatives
top10


## METRICS

In [27]:
# %pip install pandas # might be needed for next import

Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/69/a6/81d5dc9a612cf0c1810c2ebc4f2afddb900382276522b18d128213faeae3/pandas-2.2.2-cp310-cp310-win_amd64.whl.metadata
  Using cached pandas-2.2.2-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Obtaining dependency information for pytz>=2020.1 from https://files.pythonhosted.org/packages/9c/3d/a121f284241f08268b21359bd425f7d4825cffc5ac5cd0e1b3d82ffd2b10/pytz-2024.1-py2.py3-none-any.whl.metadata
  Using cached pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Obtaining dependency information for tzdata>=2022.7 from https://files.pythonhosted.org/packages/65/58/f9c9e6be752e9fcb8b6a0ee9fb87e6e7a1f6bcab2cdc73f02bb7ba91ada0/tzdata-2024.1-py2.py3-none-any.whl.metadata
  Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.2-cp310-cp310-win_amd64.whl (11.6 MB)
Using cached pyt


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
from project_work_group_12.evaluation import exact_match, bertscore 

## TOP K

In [13]:
# values of k (number of contexts per prompt) to evaluate
K = [1, 3, 5]

# WARNING: actually_prompt=True sends real requests to the model; with
# num_questions=0 all questions of the context are asked once per value of k
qa.actually_prompt = True
context_to_use = 'top10'

for k in K:
    # the first element of write_to_file is False, so the .txt path is unused here
    result = qa.ask_by_context(context_to_use, num_questions=0, k=k, write_to_file=(False, f'test/qa_{context_to_use}_k{k}.txt'))
    # NOTE: save_json opens the file for writing, the `test/` directory has to exist
    save_json(f'test/qa_{context_to_use}_k{k}.json', result)



Processing Questions:   3%|▎         | 38/1199 [00:20<10:11,  1.90it/s]


KeyboardInterrupt: 

In [12]:
# evaluate the result files written by the TOP K cell above
for k in [1,3,5]:    
    r = load_json(f'test/qa_top10_k{k}.json')
    
    # NOTE(review): the argument order differs between the two metric calls
    # (responses, answers) vs. (answers, responses) - confirm against the
    # signatures in project_work_group_12.evaluation
    print(f"Exact match with k={k}:", exact_match(r['responses'], r['answers']))
    print(f"Bertscore with k={k}:", bertscore(r['answers'], r['responses']))

Exact match with k=1: 0.6
Bertscore with k=1: 0.7213021427392959
Exact match with k=3: 0.3
Bertscore with k=3: 0.5079012331552804
Exact match with k=5: 0.3
Bertscore with k=5: 0.49595814319327475


## GOLD CONTEXT

In [48]:
# values of k (number of contexts per prompt) to evaluate
K = [1, 3, 5]

# dry run: no requests are sent to the model, responses are dummy strings
qa.actually_prompt = False
context_to_use = 'gold_context'

for k in K:
    # writes both a readable .txt dump and a .json result file per value of k;
    # the `test/` directory has to exist
    result = qa.ask_by_context(context_to_use, num_questions=10, k=k, write_to_file=(True, f'test/qa_{context_to_use}_k{k}.txt'))
    save_json(f'test/qa_{context_to_use}_k{k}.json', result)

Processing Questions: 100%|██████████| 10/10 [00:00<00:00, 9508.74it/s]


UnicodeEncodeError: 'charmap' codec can't encode characters in position 118-119: character maps to <undefined>

In [71]:
# NOTE: this call needs a context named 'gibberish' to be registered with qa;
# the loading and add_context lines for it are commented out in the cells above,
# so on a fresh kernel this cell fails until they are enabled
result = qa.ask_by_mixed_context('top10', 'gibberish', num_questions=10, k1=3, k2=12, write_to_file=(True, 'test/qa_mixed.txt'))

Processing Questions: 100%|██████████| 10/10 [00:00<00:00, 53498.78it/s]




