# 1. Import QG

In [32]:
import sys
import os

# Get parent directory
notebook_path = os.path.abspath("evaluation.ipynb")
parent_dir = os.path.dirname(os.path.dirname(notebook_path))

# Add the parent directory to sys.path
sys.path.insert(0, parent_dir)

In [33]:
%load_ext autoreload
%autoreload 2

from models.qg import QG

context = "Python is an interpreted, high-level, general-purpose programming language. Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant whitespace."

qg = QG(model="valhalla/t5-base-e2e-qg", tokenizer="t5-base")

contexts_questions = []

result = qg(context)

print(result)

{'context': "Python is an interpreted, high-level, general-purpose programming language. Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant whitespace.", 'questions': ['Who created Python?', 'When was Python first released?', "What is Python's design philosophy?"]}


# 2. Load Squad V2 and compare the result

In [38]:
%pip install datasets -q
%pip install pandas -q
%pip install evaluate -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need 

In [59]:
# It needs 
from datasets import load_dataset
import pandas as pd
import json

dataset = load_dataset("squad_v2")["train"].select(range(10))

df = pd.DataFrame(dataset)


df = df[["context", "question"]]

df = df.groupby("context")['question'].apply(list).reset_index()

# remove '\n' and spaces from the context column
# df['context'] = df['context'].apply(lambda x: x.replace('\n', ' ').strip())
df['context'] = df['context'].apply(lambda x: x.replace('\n', ' ').strip())

# Change column name from question to questions
df.rename(columns={'question': 'references'}, inplace=True)

# Split questions in validation questions column
df['references'] = df['references'].apply(lambda x: [question for question in x])

# Run QG on each context and insert into seperate column
# df['generated_questions'] = df['context'].apply(lambda x: qg(x)['questions'])
df['predictions'] = df['context'].apply(lambda x: [question for question in qg(x)['questions']])

# Drop context column
df = df.drop(columns=["context"])

result = df.to_dict(orient="records")

with open("output.json", "w") as outfile:
    json.dump(result, outfile, indent=4)

Found cached dataset squad_v2 (/Users/philiphyltoft/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d)
100%|██████████| 2/2 [00:00<00:00, 226.03it/s]


1. Run QGAR through each context and make it output questions.
2. Compare the questions QGAR generates to the actually created questions

# 3. Run BLEU on generated output

In [21]:
%pip install nltk -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


Sources of Inspiration:
- [BLEU score in Python - Beginners Overview](https://www.askpython.com/python/bleu-score)

In [55]:
from nltk.translate.bleu_score import sentence_bleu
import warnings
warnings.filterwarnings('ignore')

ref = [
    'this is a dog'.split(),
    'it is dog'.split(),
    'dog it is'.split(),
    'a dog, it is'.split() 
]

print(ref)

candidate = 'it is dog'.split()
print('BLEU score for test -> {}'.format(sentence_bleu(ref, candidate)))
test2 = 'why are you gay?'.split()
print('BLUE score for test2 -> {}'.format(sentence_bleu(ref, test2)))

[['this', 'is', 'a', 'dog'], ['it', 'is', 'dog'], ['dog', 'it', 'is'], ['a', 'dog,', 'it', 'is']]
BLEU score for test -> 1.2213386697554703e-77
BLUE score for test2 -> 0


## BLEU implementation on QG's output
It must for each context loop through each question generated and compare it to the expected output. But it has to split each sentence!

In [37]:
# Loop through each context
with open("output.json", "r") as f:
    data = json.load(f)

for context in data:
    for validation_question in context['references']:
        print('BLUE score is: {}'.format(sentence_bleu(context['predictions'], validation_question)))

BLUE score is: 1.384292958842266e-231
BLUE score is: 1.3659076482413118e-231
BLUE score is: 1.2917956969975423e-231
BLUE score is: 7.657404561915943e-155
BLUE score is: 8.286571670851008e-155
BLUE score is: 9.53091075863908e-155
BLUE score is: 1.258141043412406e-231
BLUE score is: 1.384292958842266e-231
BLUE score is: 9.013778876140909e-155
BLUE score is: 7.1958300848837144e-155


In [56]:
import evaluate
import json

predictions = []
references = []

with open("output.json", "r") as outputs:
    outputs = json.load(outputs)
    for output in outputs:
        for item in output['predictions']:
            predictions.append(item)
        references.append(output['references'])

print(predictions)
print(references[0])

# bleu = evaluate.load("bleu")
# results = bleu.compute(predictions=predictions, references=references[0])
# print(results)

[['Beyonce', 'was', 'born', 'on', 'what', 'date?'], ['What', 'is', 'the', 'name', 'of', 'the', 'singer,', 'songwriter,', 'record', 'producer,', 'and', 'actress?'], ['Where', 'was', 'Beyoncé', 'born', 'and', 'raised?'], ['In', 'what', 'decade', 'did', 'she', 'rise', 'to', 'fame', 'as', 'the', 'lead', 'singer', 'of', "Destiny's", 'Child?'], ['How', 'many', 'Grammy', 'Awards', 'did', 'her', 'debut', 'album', 'earn?']]
[['When', 'did', 'Beyonce', 'start', 'becoming', 'popular?'], ['What', 'areas', 'did', 'Beyonce', 'compete', 'in', 'when', 'she', 'was', 'growing', 'up?'], ['When', 'did', 'Beyonce', 'leave', "Destiny's", 'Child', 'and', 'become', 'a', 'solo', 'singer?'], ['In', 'what', 'city', 'and', 'state', 'did', 'Beyonce', 'grow', 'up?'], ['In', 'which', 'decade', 'did', 'Beyonce', 'become', 'famous?'], ['In', 'what', 'R&B', 'group', 'was', 'she', 'the', 'lead', 'singer?'], ['What', 'album', 'made', 'her', 'a', 'worldwide', 'known', 'artist?'], ['Who', 'managed', 'the', "Destiny's", 'Ch

# 5. HuggingFace Evaluate library

Use example with 2 metrics: [example link](https://huggingface.co/spaces/evaluate-metric/bleu)

In [25]:
import evaluate

predictions = ["hello there general slut", "foo bar foobar"]
references = [["hello there general kenobi", "hello there !"],["foo bar foobar"]]

bleu = evaluate.load("bleu")
results = bleu.compute(predictions=predictions, references=references)
print(results)


{'bleu': 0.0, 'precisions': [0.8571428571428571, 0.8, 0.6666666666666666, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 1.1666666666666667, 'translation_length': 7, 'reference_length': 6}


In [30]:
import json

with open("output.json", "r") as output:
    output1 = json.load(output)[0]
    print(output1['validation_questions'])

[['When', 'did', 'Beyonce', 'start', 'becoming', 'popular?'], ['What', 'areas', 'did', 'Beyonce', 'compete', 'in', 'when', 'she', 'was', 'growing', 'up?'], ['When', 'did', 'Beyonce', 'leave', "Destiny's", 'Child', 'and', 'become', 'a', 'solo', 'singer?'], ['In', 'what', 'city', 'and', 'state', 'did', 'Beyonce', 'grow', 'up?'], ['In', 'which', 'decade', 'did', 'Beyonce', 'become', 'famous?'], ['In', 'what', 'R&B', 'group', 'was', 'she', 'the', 'lead', 'singer?'], ['What', 'album', 'made', 'her', 'a', 'worldwide', 'known', 'artist?'], ['Who', 'managed', 'the', "Destiny's", 'Child', 'group?'], ['When', 'did', 'Beyoncé', 'rise', 'to', 'fame?'], ['What', 'role', 'did', 'Beyoncé', 'have', 'in', "Destiny's", 'Child?']]
