# Two step approach for QA-generation and evaluation 

This notebook is used to generate QA-pairs and get their evaluations according to the two-step approach described in the thesis. Three models are utlizied: Llama2 7b, Llama2 70b, and GPT-3.5. Thus, the code is run iteratively, by first using the get_gpt-function to generate questions and answers, and then get_llama where the model was firstly Llama2 70b and lastly Llama2 7b. This resulted in three outputted CSV-files: '2s_gpt.csv', '2s_llama2_70b.csv', and '2s_llama2_7b.csv' which are all attatched to the submission. 

In [1]:
from getpass import getpass
import os
import replicate
import pandas as pd

REPLICATE_API_TOKEN = getpass()
os.environ["REPLICATE_API_TOKEN"] = 'insert_api_key'

In [66]:
import time

def get_llama(prompt):
    max_retries = 3  # Maximum number of retries
    retry_delay = 20  # Delay between retries in seconds
    retries = 0

    while retries < max_retries:
        try:
            output = replicate.run(
                "meta/llama-2-7b-chat",
                input={
                    "prompt": prompt,
                    "max_new_tokens": 500,
                    "system_prompt": "",
                    "Temperature": 1
                }
            )

            output = ''.join(output)
            return output
        except Exception as e:
            print(f"An error occurred: {e}")
            print(f"Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
            retries += 1

    print("Max retries reached. Unable to get response.")
    return None

In [3]:
df = pd.read_csv(r'blogposts.csv')

### Question generation

In [67]:
questions = []

for index, row in df.iterrows():
    prompt = f""" 
              This is an example of a blogpost from Bang and Olufsen's forum in JSON format:

              {row['text']}
    
              Read the post, act as a Bang&Olufsen product expert and form a single question mentioning the products or problems, if any, to capture the most important problem of the post: 
    
            There are some rules:
           - Do not mention user names
           - Answer only with a single question
           """

    question = get_llama(prompt)
    questions.append(question)
    if index % 10 or index < 5:
        #print(question)
        print(round(index / len(df), 4))

df['questions'] = questions 

0.0
0.0007
0.0013
0.002
0.0026
0.0033
0.004
0.0046
0.0053
0.006
0.0073
0.0079
0.0086
0.0093
0.0099
0.0106
0.0113
0.0119
0.0126
0.0139
0.0146
0.0152
0.0159
0.0165
0.0172
0.0179
0.0185
0.0192
0.0205
0.0212
0.0218
0.0225
0.0232
0.0238
0.0245
0.0251
0.0258
0.0271
0.0278
0.0285
0.0291
0.0298
0.0304
0.0311
0.0318
0.0324
0.0338
0.0344
0.0351
0.0357
0.0364
0.0371
0.0377
0.0384
0.039
0.0404
0.041
0.0417
0.0424
0.043
0.0437
0.0443
0.045
0.0457
0.047
0.0477
0.0483
0.049
0.0496
0.0503
0.051
0.0516
0.0523
0.0536
0.0543
0.0549
0.0556
0.0563
0.0569
0.0576
0.0582
0.0589
0.0602
0.0609
0.0615
0.0622
0.0629
0.0635
0.0642
0.0649
0.0655
0.0668
0.0675
0.0682
0.0688
0.0695
0.0702
0.0708
0.0715
0.0721
0.0735
0.0741
0.0748
0.0754
0.0761
0.0768
0.0774
0.0781
0.0788
0.0801
0.0807
0.0814
0.0821
0.0827
0.0834
0.0841
0.0847
0.0854
0.0867
0.0874
0.088
0.0887
0.0893
0.09
0.0907
0.0913
0.092
0.0933
0.094
0.0946
0.0953
0.096
0.0966
0.0973
0.0979
0.0986
0.0999
0.1006
0.1013
0.1019
0.1026
0.1032
0.1039
0.1046
0.1052
0.10

In [144]:
# For the last model, llama2 7b, the some outputs included unnecessary citation signs. This code removes them 
import re
cleaned_questions = [] 
for index, row in df.iterrows():
    string = row['questions']
    match = re.search('\n+(.*)', string)
    if match:
        cleaned_questions.append(match.group(1).strip('"'))
    else:
        cleaned_questions.append(string)

df['cleaned_questions'] = cleaned_questions

In [141]:
# Define get_gpt
def get_gpt(prompt):
    openai.api_key = 'insert_api_key'

    attempts = 0
    max_attempts = 5  # Max attempts before giving up
    backoff_time = 1  # Initial backoff time in seconds

    while attempts < max_attempts:
        try:
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=2000,
                temperature=1)
            resp = completion.choices[0].message["content"]
            return resp

        except openai.error.APIError as e:
            # Handle API-specific errors here
            print(f"OpenAI API returned an API Error: {e}")
            break  # Stop after an API error
        except Exception as e:
            # This catches any other exceptions
            print(f"An error occurred: {e}")
            attempts += 1
            time.sleep(backoff_time)
            backoff_time *= 2  # Exponential backoff

    # Return a default response or raise an error after max attempts
    raise Exception("Failed to get a response from OpenAI API after multiple attempts.")

### Question evaluation 

In [148]:
import ast 
import openai


def evaluate_question(row):
    post, question = row['text'], row['cleaned_questions']
    example_dict = {'Relevance': 1, 'Coverage': 4, 'Details': 2, 'Fluency': 3}
    

    prompt = f""" 

You will be given one question based on a blogpost.

Your task is to rate the question on three metrics.

Please make sure you read and understand these instructions carefully. 

Instructions:
1. Read the question the blogpost carefully.
2. Read the blogpost and identify the issue the article.
3. Read the evaluation criterions and ensure you understand them well.
4. Assess for each evaluation criteria the question given blogpost
5. Assign a score from 1 to 5 (5 being best) for each evaluation criteria.

Evaluation Criterias:
Fluency (1-5): the quality of the question in terms of grammar, spelling, punctuation, word choice, and sentence structure.
Relevance (1-5) - selection of important content from the source. The question should include only important issues or problems from the source document. 
Coherence (1-5) - the collective quality of the question. The question should be well-structured and well-organized


Example:

Question:
{row['cleaned_questions']}

Blogpost (in JSON format):
{row['text']}

Evaluation Form (scores ONLY):
- Fluency (1-5):
- Relevance (1-5):
- Coherence (1-5):
                
Your answer must be in the format of a Python dictionary (and nothing else), e.g: 
{example_dict}


"""
        
    question_metrics = ast.literal_eval(get_gpt(prompt))

    return question_metrics

In [151]:
# Add evaluations to df 
evaluations = df.apply(evaluate_question, axis=1)
df['evaluations_llama7b_q'] = evaluations

An error occurred: The server is overloaded or not ready yet.
An error occurred: The server is overloaded or not ready yet.
An error occurred: Rate limit reached for gpt-3.5-turbo in organization org-QqrrdoR3kZPjpGbc33YokvRb on tokens per min (TPM): Limit 60000, Used 58728, Requested 2465. Please try again in 1.193s. Visit https://platform.openai.com/account/rate-limits to learn more.
An error occurred: Rate limit reached for gpt-3.5-turbo in organization org-QqrrdoR3kZPjpGbc33YokvRb on tokens per min (TPM): Limit 60000, Used 57609, Requested 2465. Please try again in 74ms. Visit https://platform.openai.com/account/rate-limits to learn more.
An error occurred: Rate limit reached for gpt-3.5-turbo in organization org-QqrrdoR3kZPjpGbc33YokvRb on tokens per min (TPM): Limit 60000, Used 56861, Requested 4096. Please try again in 956ms. Visit https://platform.openai.com/account/rate-limits to learn more.
An error occurred: Rate limit reached for gpt-3.5-turbo in organization org-QqrrdoR3kZP

### Answer generation 

In [167]:
answers = []
for index, row in df.iterrows():
    prompt = f""" 
            This is an example of a blogpost from Bang and Olufsen's forum in JSON format:

            {row['text']}

            Your task is the answer this question:
            {row['questions']}

            Read the post and answer the question concisely.

            There are some rules:
            - Only reply with the answer to the question above
            - If the thread does not consists of a solution, please state that the answer cannot be answered
            - Do not mention usernames 
            """
    answers.append(get_llama(prompt))
    print(round(index/len(df),4))

df['answers'] = answers

0.0
0.0007
0.0013
0.002
0.0026
0.0033
0.004
0.0046
0.0053
0.006
0.0066
0.0073
0.0079
0.0086
0.0093
0.0099
0.0106
0.0113
0.0119
0.0126
0.0132
0.0139
0.0146
0.0152
0.0159
0.0165
0.0172
0.0179
0.0185
0.0192
0.0199
0.0205
0.0212
0.0218
0.0225
0.0232
0.0238
0.0245
0.0251
0.0258
0.0265
0.0271
0.0278
0.0285
0.0291
0.0298
0.0304
0.0311
0.0318
0.0324
0.0331
0.0338
0.0344
0.0351
0.0357
0.0364
0.0371
0.0377
0.0384
0.039
0.0397
0.0404
0.041
0.0417
0.0424
0.043
0.0437
0.0443
0.045
0.0457
0.0463
0.047
0.0477
0.0483
0.049
0.0496
0.0503
0.051
0.0516
0.0523
0.0529
0.0536
0.0543
0.0549
0.0556
0.0563
0.0569
0.0576
0.0582
0.0589
0.0596
0.0602
0.0609
0.0615
0.0622
0.0629
0.0635
0.0642
0.0649
0.0655
0.0662
0.0668
0.0675
0.0682
0.0688
0.0695
0.0702
0.0708
0.0715
0.0721
0.0728
0.0735
0.0741
0.0748
0.0754
0.0761
0.0768
0.0774
0.0781
0.0788
0.0794
0.0801
0.0807
0.0814
0.0821
0.0827
0.0834
0.0841
0.0847
0.0854
0.086
0.0867
0.0874
0.088
0.0887
0.0893
0.09
0.0907
0.0913
0.092
0.0927
0.0933
0.094
0.0946
0.0953
0.09

### Answer evaluation 

In [174]:
example_dict = {'Fluency': 2, 'Relevance': 3, 'Consistency': 3, 'Coherence': 1}
answers_evals = []
    
for index, row in df.iterrows():
    if index < len(answers_evals):
        pass
    else:
        
        prompt = f""" 
                You will be given one answer written for a question based on a blogpost.

                Your task is to rate the answer four metrics.

                Please make sure you read and understand these instructions carefully. 

                Instructions:
                    1. Read the question, answers and the blogposts carefully.
                    2. Read blog post and identify the issue and solution of the article.
                    3. Read the evaluation criterias and ensure you understand them well.
                    3. Assess for each evaluation criteria the answer given the question and the blogpost
                    4. Assign a score from 1 to 5 for each evaluation criteria.

                Evaluation Criterias:
                Fluency (1-5): the quality of the answer in terms of grammar, spelling, punctuation, word choice, and sentence structure.
                Relevance (1-5) - selection of important content from the source. The answer should include only important information from the source document. Penalize answers which contained redundancies and excess information.
                Consistency (1-5) - the factual alignment between the answer and the blogpost. A factually consistent answer contains only statements that are entailed by the blogpost. Penalize summaries that contained hallucinated facts. 
                Coherence (1-5) - the collective quality of all sentences. The answer should be well-structured and well-organized. The answer should not just be a heap of related information, but should build from sentence to a coherent body of information about a topic.

                Example:

                Question:
                {row['cleaned_questions']}

                Answer:
                {row['answers']}

                Blogpost (in JSON format):
                {row['text']}

                Evaluation Form (scores ONLY):
                - Fluency (1-5):
                - Relevance (1-5):
                - Consistency (1-5):
                - Coherence (1-5):
                
                Your answer must be in the format of a Python dictionary (and nothing else), e.g: 
                {example_dict}
                """
        try:
            ans = get_gpt(prompt)  
            # Initialize dict with metrics from GPT
            answer_metrics = ast.literal_eval(ans)
            if type(answer_metrics) == dict:
                answers_evals.append(answer_metrics)
            else:
                answers_evals.append(None)
        except IndentationError:
            answers_evals.append(answer_metrics)

        print(round(index / len(df),4))

df['answers_evaluations'] = answers_evals

0.9332
0.9338
0.9345
0.9351
0.9358
0.9365
0.9371
0.9378
0.9385
0.9391
0.9398
0.9404
0.9411
0.9418
0.9424
0.9431
0.9437
0.9444
0.9451
0.9457
0.9464
0.9471
0.9477
0.9484
0.949
0.9497
0.9504
0.951
0.9517
0.9523
0.953
0.9537
0.9543
0.955
0.9557
0.9563
0.957
0.9576
0.9583
0.959
0.9596
0.9603
0.961
0.9616
0.9623
0.9629
0.9636
0.9643
0.9649
0.9656
0.9662
0.9669
0.9676
0.9682
0.9689
0.9696
0.9702
0.9709
0.9715
0.9722
0.9729
0.9735
0.9742
0.9749
0.9755
0.9762
0.9768
0.9775
0.9782
0.9788
0.9795
0.9801
0.9808
0.9815
0.9821
0.9828
0.9835
0.9841
0.9848
0.9854
0.9861
0.9868
0.9874
0.9881
0.9887
0.9894
0.9901
0.9907
0.9914
0.9921
0.9927
0.9934
0.994
0.9947
0.9954
0.996
0.9967
0.9974
0.998
0.9987
0.9993


In [176]:
df.to_csv('2s_llama2_7b.csv')