# 1. Import QG

In [6]:
import sys
import os

# Get parent directory
notebook_path = os.path.abspath("evaluation.ipynb")
parent_dir = os.path.dirname(os.path.dirname(notebook_path))

# Add the parent directory to sys.path
sys.path.insert(0, parent_dir)

In [7]:
%load_ext autoreload
%autoreload 2

from models.qg import QG

context = "Python is an interpreted, high-level, general-purpose programming language. Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant whitespace."

qg = QG(model="valhalla/t5-base-e2e-qg", tokenizer="t5-base")

contexts_questions = []

result = qg(context)

print(result)

{'context': "Python is an interpreted, high-level, general-purpose programming language. Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant whitespace.", 'questions': ['Who created Python?', 'When was Python first released?', "What is Python's design philosophy?"]}


# 2. Load Squad V2 and compare the result

In [8]:
%pip install datasets -q
%pip install pandas -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [33]:
# It needs 
from datasets import load_dataset
import pandas as pd
import json

dataset = load_dataset("squad_v2")["train"].select(range(10))

df = pd.DataFrame(dataset)


df = df[["context", "question"]]

df = df.groupby("context")['question'].apply(list).reset_index()

# remove '\n' and spaces from the context column
# df['context'] = df['context'].apply(lambda x: x.replace('\n', ' ').strip())
df['context'] = df['context'].apply(lambda x: x.replace('\n', ' ').strip())

# Change column name from question to questions
df.rename(columns={'question': 'validation_questions'}, inplace=True)

# Split questions in validation questions column
df['validation_questions'] = df['validation_questions'].apply(lambda x: [question.split() for question in x])

# Run QG on each context and insert into seperate column
# df['generated_questions'] = df['context'].apply(lambda x: qg(x)['questions'])
df['generated_questions'] = df['context'].apply(lambda x: [question.split() for question in qg(x)['questions']])

result = df.to_dict(orient="records")

with open("output.json", "w") as outfile:
    json.dump(result, outfile, indent=4)

Found cached dataset squad_v2 (/Users/philiphyltoft/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d)
100%|██████████| 2/2 [00:00<00:00, 210.41it/s]


1. Run QGAR through each context and make it output questions.
2. Compare the questions QGAR generates to the actually created questions

# 3. Run BLEU on generated output

In [21]:
%pip install nltk -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


Sources of Inspiration:
- [BLEU score in Python - Beginners Overview](https://www.askpython.com/python/bleu-score)

In [27]:
from nltk.translate.bleu_score import sentence_bleu
import warnings
warnings.filterwarnings('ignore')

ref = [
    'this is moonlight'.split(),
    'Look, this is moonlight'.split(),
    'moonlight it is'.split()
]

test = 'it is moonlight'.split()
print('BLEU score for test -> {}'.format(sentence_bleu(ref, test)))
test2 = 'why are you gay?'.split()
print('BLUE score for test2 -> {}'.format(sentence_bleu(ref, test2)))

BLEU score for test -> 1.491668146240062e-154
BLUE score for test2 -> 0


## BLEU implementation on QG's output
It must for each context loop through each question generated and compare it to the expected output. But it has to split each sentence!

In [39]:
# Loop through each context
with open("output.json", "r") as f:
    data = json.load(f)

for context in data:
    for validation_question in context['validation_questions']:
        print('BLUE score is: {}'.format(sentence_bleu(context['generated_questions'], validation_question)))

BLUE score is: 1.384292958842266e-231
BLUE score is: 1.3659076482413118e-231
BLUE score is: 1.2917956969975423e-231
BLUE score is: 7.657404561915943e-155
BLUE score is: 8.286571670851008e-155
BLUE score is: 9.53091075863908e-155
BLUE score is: 1.258141043412406e-231
BLUE score is: 1.384292958842266e-231
BLUE score is: 9.013778876140909e-155
BLUE score is: 7.1958300848837144e-155
