<a href="https://colab.research.google.com/github/Sakuni-Weerasinghe/Automatic-Question-and-Answer-Generation-based-on-Large-Language-Models/blob/master/ResearchDistractorGeneration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install tokenizers
!pip install pytorch-lightning

In [None]:
# Import packages
from typing import List, Dict
import tqdm.notebook as tq
from tqdm.notebook import tqdm
import json
import pandas as pd
import numpy as np

import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer
    )
import os

In [None]:
def parse_json(filepath):
    """Load and return the parsed contents of a JSON file.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The deserialized document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; stating it explicitly avoids depending on the
    # platform's default encoding (which differs between operating systems).
    with open(filepath, encoding="utf-8") as file:
        return json.load(file)

In [None]:
def extract_from_my_dataset(data):
    """Flatten the nested topic/question structure into nine parallel lists.

    Every question found under a topic entry contributes one element to each
    returned list; topic-level fields are repeated for each of its questions.
    Missing keys fall back to an empty string (or to no questions at all).

    Args:
        data: Iterable of dicts with the keys "topic", "sub-topic", "context"
            and "questions" (a list of question dicts).

    Returns:
        Tuple of nine lists, in this order: topics, sub-topics, contexts,
        questions, correct answers, option1, option2, option3, option4.
    """
    question_keys = ("question", "correct_answer", "option1", "option2", "option3", "option4")

    # One list per output column, filled row by row below
    columns = [[] for _ in range(3 + len(question_keys))]

    for entry in data:
        shared_fields = (
            entry.get("topic", ""),
            entry.get("sub-topic", ""),
            entry.get("context", ""),
        )

        for qna in entry.get("questions", []):
            row = shared_fields + tuple(qna.get(key, "") for key in question_keys)
            for column, value in zip(columns, row):
                column.append(value)

    return tuple(columns)

In [None]:
# Load the raw dataset and flatten it into parallel column lists
data = parse_json("data.json")  # Replace with your actual dataset

(
    topics, sub_topics, contexts, questions, correct_answers,
    option1, option2, option3, option4,
) = extract_from_my_dataset(data)

# Record how many entries each extracted list holds
num_topics, num_sub_topics, num_contexts = len(topics), len(sub_topics), len(contexts)
num_questions, num_correct_answers = len(questions), len(correct_answers)
num_option1, num_option2 = len(option1), len(option2)
num_option3, num_option4 = len(option3), len(option4)


In [None]:
# Assemble the extracted column lists into a single DataFrame
my_dataset_df = pd.DataFrame(dict(
    topic=topics,
    sub_topic=sub_topics,
    context=contexts,
    question=questions,
    correct_answer=correct_answers,
    option1=option1,
    option2=option2,
    option3=option3,
    option4=option4,
))

In [None]:
# Hold out 20% of the rows for testing, then 20% of the remainder for validation
train, test = train_test_split(my_dataset_df, random_state=42, test_size=0.2)
train, val = train_test_split(train, random_state=42, test_size=0.2)

# Report the shape of each split, in the order train, test, val
for split_frame in (train, test, val):
    print(split_frame.shape)
# The train, test and val DataFrames are now ready to use

(606, 9)
(190, 9)
(152, 9)


In [None]:

# Persist each split as a CSV file, leaving out the DataFrame index
for csv_name, split_frame in (("train.csv", train), ("test.csv", test), ("val.csv", val)):
    split_frame.to_csv(csv_name, index=False)

#!mv train.csv test.csv val.csv drive/MyDrive/Research/QG/DataSet

Extract data from JSON

Export as *.csv and upload to GDrive

In [None]:
# Mount Google Drive at /content/drive (Colab only); force_remount=True asks for a fresh mount
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

In [None]:
#https://drive.google.com/file/d/18la5JXgyjAN2ww3_BP3NdBnXodU5X_li/view?usp=sharing
#https://drive.google.com/file/d/1Jobp--lSRr1mn0me4YOUg0o3B58YksPI/view?usp=sharing
#https://drive.google.com/file/d/1CCpBCvLCRusytxrA9hUdew0KaIwUptE_/view?usp=sharing

!gdown --id 1ggZqb_jCmq12gwmfBBQhOWUV4dN00S-o #train.csv
!gdown --id 1VrBghMWob_-mTz0sB5B8KLkq0MgdRrC9 #test.csv
!gdown --id 192ibBUshejWnZm3vepWNMi-9kn5GypiD #val.csv

In [None]:
# Read the three dataset splits from CSV files in the working directory
train_dataset, test_dataset, val_dataset = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv', 'val.csv')
)

In [None]:
# Show the shape of every loaded split next to its name
for frame, label in (
    (train_dataset, 'train_dataset'),
    (test_dataset, 'test_dataset'),
    (val_dataset, 'val_dataset'),
):
    print(frame.shape, label)

(606, 9) train_dataset
(190, 9) test_dataset
(152, 9) val_dataset


In [None]:
# NOTE(review): this cell re-reads the same three CSV files that an earlier cell
# already loaded into the same variable names; it looks redundant — confirm before removing.
train_dataset = pd.read_csv('train.csv')
test_dataset = pd.read_csv('test.csv')
val_dataset = pd.read_csv('val.csv')

In [None]:
# Tokenizer used by the token-length statistics cell that follows.
# NOTE(review): this loads "t5-base" while MODEL_NAME is set to 't5-small' further down and the
# tokenizer is re-created from MODEL_NAME before training — confirm the difference is intended.
tokenizer = T5Tokenizer.from_pretrained("t5-base")

In [None]:
# Token counts per row, accumulated over the train, test and val splits (in that order)
context_token_lens = []
question_token_lens = []
answer_token_lens = []
incorrect_token_lens = []

_OPTION_COLUMNS = ('option1', 'option2', 'option3', 'option4')


def _count_tokens(text) -> int:
    """Return how many token ids the tokenizer produces for ``text``."""
    # str() guards against non-string cells (e.g. numeric answers or NaN read
    # back from CSV), which the tokenizer would otherwise reject.
    return len(tokenizer(str(text))['input_ids'])


# One progress bar per split, replacing three copy-pasted loops
for split_df in (train_dataset, test_dataset, val_dataset):
    for i in tq.tqdm(range(len(split_df))):
        row = split_df.iloc[i]

        context_token_lens.append(_count_tokens(row['context']))
        question_token_lens.append(_count_tokens(row['question']))
        answer_token_lens.append(_count_tokens(row['correct_answer']))

        # Options are separated by a space so the last word of one option is
        # not fused with the first word of the next, which would skew the count.
        incorrect_token_lens.append(
            _count_tokens(' '.join(str(row[column]) for column in _OPTION_COLUMNS))
        )

## PyTorch Lightning modules

In [None]:
# Separator token string: it is added to the tokenizer vocabulary before training and is the
# marker the post-processing cells split generated text on.
SEP_TOKEN = '<sep>'

In [None]:
class QGDataset(Dataset):
    """Torch dataset producing T5 inputs and labels for distractor generation.

    Text layout (``<sep>`` is ``SEP_TOKEN``):

    * source: ``"<correct_answer> <sep> <question> <sep> <context>"`` — the same
      layout the ``generate`` helper feeds to the model at inference time, so
      training and inference see identically formatted inputs.
    * target: ``"<option1> <sep> <option2> <sep> <option3> <sep> <option4>"`` —
      the options are separated by ``<sep>`` because the post-processing step
      splits generated text on ``<sep>`` to recover the individual distractors.
    """

    OPTION_COLUMNS = ('option1', 'option2', 'option3', 'option4')

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: T5Tokenizer,
        source_max_token_len: int,
        target_max_token_len: int
    ):
        """Store the tokenizer, length limits and a string-typed copy of ``data``.

        Args:
            data: Frame with the columns 'correct_answer', 'question', 'context'
                and 'option1'..'option4'.
            tokenizer: Tokenizer used for both source and target text.
            source_max_token_len: Padded/truncated length of the model input.
            target_max_token_len: Padded/truncated length of the labels.
        """
        self.tokenizer = tokenizer
        # Cast every text column to str so numeric or missing cells cannot
        # reach the tokenizer as non-string values.
        self.data = data.astype({
            'correct_answer': str,
            'question': str,
            'context': str,
            'option1': str,
            'option2': str,
            'option3': str,
            'option4': str,
        })
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.data)

    def _encode(self, text: str, max_length: int):
        """Tokenize ``text`` to exactly ``max_length`` ids (padded or truncated)."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

    def __getitem__(self, index: int):
        """Return the raw texts plus ``input_ids``, ``attention_mask`` and ``labels`` for one row."""
        data_row = self.data.iloc[index]
        separator = ' {} '.format(SEP_TOKEN)

        # A single sequence (instead of three separately padded encodings glued
        # together) keeps padding and end-of-sequence tokens out of the middle
        # of the input and matches the inference-time format.
        source_text = separator.join([
            str(data_row['correct_answer']),
            str(data_row['question']),
            str(data_row['context']),
        ])
        source_encoding = self._encode(source_text, self.source_max_token_len)

        target_text = separator.join(str(data_row[column]) for column in self.OPTION_COLUMNS)
        target_encoding = self._encode(target_text, self.target_max_token_len)

        labels = target_encoding['input_ids']
        # -100 is ignored by the loss, so padded positions do not contribute
        labels[labels == self.tokenizer.pad_token_id] = -100

        return dict(
            answer_text=data_row['correct_answer'],
            context=data_row['context'],
            question=data_row['question'],
            incorrect1=data_row['option1'],
            incorrect2=data_row['option2'],
            incorrect3=data_row['option3'],
            incorrect4=data_row['option4'],
            input_ids=source_encoding['input_ids'].flatten(),
            attention_mask=source_encoding['attention_mask'].flatten(),
            labels=labels.flatten().to(torch.long)
        )



In [None]:
class QGDataModule(pl.LightningDataModule):
    """Lightning data module wrapping the train/val/test frames in ``QGDataset`` loaders."""

    def __init__(
        self,
        train_dataset: pd.DataFrame,
        val_dataset: pd.DataFrame,
        test_dataset: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size,
        source_max_token_len: int,
        target_max_token_len: int
        ):
        """Keep the raw frames, tokenizer, training batch size and token-length limits."""
        super().__init__()
        self.batch_size = batch_size
        self.train_dataset, self.val_dataset, self.test_dataset = (
            train_dataset, val_dataset, test_dataset
        )
        self.tokenizer = tokenizer
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def _wrap(self, frame):
        """Build a ``QGDataset`` over ``frame`` with this module's tokenizer and limits."""
        return QGDataset(frame, self.tokenizer, self.source_max_token_len, self.target_max_token_len)

    def setup(self,stage=None):
        """Create the three datasets; ``stage`` is accepted but not used."""
        self.train_dataset_1 = self._wrap(self.train_dataset)
        self.val_dataset_1 = self._wrap(self.val_dataset)
        self.test_dataset_1 = self._wrap(self.test_dataset)

    def train_dataloader(self):
        """Shuffled loader over the training set using the configured batch size."""
        return DataLoader(self.train_dataset_1, shuffle=True, batch_size=self.batch_size, num_workers=2)

    def val_dataloader(self):
        """Loader over the validation set, one sample per batch."""
        return DataLoader(self.val_dataset_1, num_workers=2, batch_size=1)

    def test_dataloader(self):
        """Loader over the test set, one sample per batch."""
        return DataLoader(self.test_dataset_1, num_workers=2, batch_size=1)

## Hyperparameters

In [None]:
# Pretrained checkpoint to fine-tune.
# NOTE(review): this comment used to say "use t5-base models" but the value is 't5-small' —
# confirm which model is intended.
MODEL_NAME = 't5-small'
SOURCE_MAX_TOKEN_LEN = 512  # token length the model input is padded/truncated to
TARGET_MAX_TOKEN_LEN = 64  # token length of the labels and cap on generated length

N_EPOCHS = 15  # passed to the trainer as max_epochs
BATCH_SIZE = 16  # training batch size
LEARNING_RATE = 0.0001  # optimizer learning rate

# NOTE(review): not referenced by any other cell visible in this notebook
MODEL_SAVE_NAME = '100200'

In [None]:
# Share of each split to use. Despite the name this is a fraction, not a percentage:
# it is multiplied directly with the row count, so 1 keeps every row.
DF_TAKE_PERCENTAGE = 1

# Number of leading rows taken from each split
TAKE_TRAIN = int(len(train_dataset) * DF_TAKE_PERCENTAGE)
TAKE_VAL = int(len(val_dataset) * DF_TAKE_PERCENTAGE)
TAKE_TEST = int(len(test_dataset) * DF_TAKE_PERCENTAGE)


### Initializing training module

#### Setting DataModule

In [None]:
# Take the leading rows of each split according to the TAKE_* limits
train_subset = train_dataset[:TAKE_TRAIN]
val_subset = val_dataset[:TAKE_VAL]
test_subset = test_dataset[:TAKE_TEST]
print(train_subset.shape, val_subset.shape, test_subset.shape)

# Load the tokenizer matching the model and register the separator token
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
print('tokenizer len before: ', len(tokenizer))
tokenizer.add_tokens(SEP_TOKEN)
TOKENIZER_LEN = len(tokenizer)
print('tokenizer len after: ', TOKENIZER_LEN)

data_module = QGDataModule(
    train_dataset=train_subset,
    val_dataset=val_subset,
    test_dataset=test_subset,
    tokenizer=tokenizer,
    batch_size=BATCH_SIZE,
    source_max_token_len=SOURCE_MAX_TOKEN_LEN,
    target_max_token_len=TARGET_MAX_TOKEN_LEN,
)

data_module.setup()

# Loaders are created in the order train, val, test
train_dataloader = data_module.train_dataloader()
val_dataloader, test_dataloader = data_module.val_dataloader(), data_module.test_dataloader()

(606, 9) (152, 9) (190, 9)
tokenizer len before:  32100
tokenizer len after:  32101


#### Setting Model

In [None]:
class QGModel(pl.LightningModule):
    """Lightning wrapper that fine-tunes a T5 conditional-generation model.

    The pretrained weights named by ``MODEL_NAME`` are loaded and the token
    embedding matrix is resized to ``TOKENIZER_LEN`` so the extra separator
    token added to the tokenizer has an embedding.
    """

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)
        self.model.resize_token_embeddings(TOKENIZER_LEN) #resizing after adding new tokens to the tokenizer

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``.

        ``loss`` is whatever the wrapped model reports for the given ``labels``.
        """
        output = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return output.loss, output.logits

    def _shared_step(self, batch, log_name):
        """Compute the loss for ``batch``, log it under ``log_name`` and return it."""
        loss, _ = self(batch['input_ids'], batch['attention_mask'], batch['labels'])
        self.log(log_name, loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss for one batch (logged as 'train_loss')."""
        return self._shared_step(batch, 'train_loss')

    def validation_step(self, batch, batch_idx):
        """Return the validation loss for one batch (logged as 'val_loss')."""
        return self._shared_step(batch, 'val_loss')

    def test_step(self, batch, batch_idx):
        """Return the test loss for one batch (logged as 'test_loss')."""
        return self._shared_step(batch, 'test_loss')

    def configure_optimizers(self):
        """Return the AdamW optimizer over all parameters.

        Uses ``torch.optim.AdamW`` because the ``transformers`` implementation
        is deprecated in favour of it. ``eps`` and ``weight_decay`` are set to
        the values the ``transformers`` version used by default, so the
        optimisation behaviour stays the same.
        """
        return torch.optim.AdamW(
            self.parameters(),
            lr=LEARNING_RATE,
            eps=1e-6,
            weight_decay=0.0,
        )

In [None]:
# Checkpointing: files go to ./checkpoints and 'val_loss' is monitored with lower = better.
# NOTE(review): save_top_k=-1 keeps every checkpoint rather than only the best one, so a file
# named 'best-checkpoint*' is not necessarily the best by val_loss — confirm this is intended.
checkpoint_callback = ModelCheckpoint(
    dirpath='checkpoints',
    filename='best-checkpoint',
    save_top_k=-1,
    verbose=True,
    monitor='val_loss',
    mode='min'
    )

# Trainer limited to N_EPOCHS epochs on a single device, with the checkpoint callback attached
trainer = pl.Trainer(
    callbacks=[checkpoint_callback],
    max_epochs=N_EPOCHS,
    devices=1,
    )

## Training

In [None]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [None]:
%tensorboard --logdir ./lightning_logs

In [None]:
# Start from the pretrained weights; swap in the commented line to continue from a saved checkpoint
model = QGModel()
# model = QGModel.load_from_checkpoint('checkpoints/best-checkpoint-v42.ckpt')

# Fit on the data module, then run the test loop on the same model
trainer.fit(model, data_module)
trainer.test(model, data_module)

### Load model

In [None]:
# Checkpoint used for generation and evaluation.
# NOTE(review): the "-v14" suffix is hard-coded and must match a file written by the training run.
checkpoint_path = 'checkpoints/best-checkpoint-v14.ckpt'

best_model = QGModel.load_from_checkpoint(checkpoint_path)
# Inference only: freeze the weights and switch to evaluation mode
best_model.freeze()
best_model.eval()

# Prints an empty line only
print()




### Common functions

### View results manually

### Common functions

In [None]:
def generate(qgmodel: QGModel, correct: str, question: str, context: str) -> str:
    """Generate distractor text for one (answer, question, context) triple.

    The input is formatted as ``"<correct> <sep> <question> <sep> <context>"``
    and padded/truncated to ``SOURCE_MAX_TOKEN_LEN`` tokens.

    Args:
        qgmodel: Model wrapper whose ``.model`` attribute performs generation.
        correct: The correct answer text.
        question: The question text.
        context: The passage the question is about.

    Returns:
        The decoded generation. Special tokens are kept in the string
        (``skip_special_tokens=False``), so callers must strip ``<pad>`` and
        ``</s>`` themselves.
    """
    source_encoding = tokenizer(
        '{} {} {} {} {}'.format(correct, SEP_TOKEN, question, SEP_TOKEN, context),
        max_length= SOURCE_MAX_TOKEN_LEN,
        padding='max_length',
        truncation= True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
        )

    # Send the inputs to whatever device the model is on instead of assuming
    # CUDA, so the function also works on a CPU-only runtime.
    device = next(qgmodel.model.parameters()).device

    generated_ids = qgmodel.model.generate(
        input_ids=source_encoding['input_ids'].to(device),
        attention_mask=source_encoding['attention_mask'].to(device),
        num_beams=1,
        max_length=TARGET_MAX_TOKEN_LEN,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
        use_cache=True
    )

    # A list (not a set) keeps the decoded sequences in order and does not
    # silently drop duplicates.
    preds = [
        tokenizer.decode(generated_id, skip_special_tokens=False, clean_up_tokenization_spaces=True)
        for generated_id in generated_ids
    ]

    return ''.join(preds)

In [None]:
from typing import List
import nltk
from nltk.tokenize import word_tokenize
#nltk.download('punkt')
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

def calculate_nltk_bleu_single(references: List[str], hypothesis: str):
    """Compute smoothed sentence-level BLEU-1 to BLEU-4 for one hypothesis.

    Args:
        references: Reference sentences for the hypothesis. A bare string is
            treated as a single reference.
        hypothesis: The generated sentence to score.

    Returns:
        Tuple ``(bleu_1, bleu_2, bleu_3, bleu_4)``; all zeros when the
        hypothesis is empty or whitespace only.
    """
    # A bare string would otherwise be iterated character by character and
    # every character tokenized as its own "reference".
    if isinstance(references, str):
        references = [references]

    if hypothesis.strip() == '':
        return 0, 0, 0, 0

    # Word tokenize
    refs_tokenized = [word_tokenize(str(reference)) for reference in references]
    hyp_tokenized = word_tokenize(hypothesis)

    # Smoothing keeps the higher-order scores meaningful when a short sentence
    # has no 2-gram (or longer) overlap with any reference.
    smoothing = SmoothingFunction().method2

    bleu_1 = sentence_bleu(refs_tokenized, hyp_tokenized, weights=(1, 0, 0, 0), smoothing_function=smoothing)
    bleu_2 = sentence_bleu(refs_tokenized, hyp_tokenized, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing)
    # Exact thirds: the n-gram weights are uniform and must sum to 1
    bleu_3 = sentence_bleu(refs_tokenized, hyp_tokenized, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=smoothing)
    bleu_4 = sentence_bleu(refs_tokenized, hyp_tokenized, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing)

    return bleu_1, bleu_2, bleu_3, bleu_4

In [None]:
# Download the 'punkt' tokenizer data used by word_tokenize.
# NOTE(review): newer NLTK releases may require the 'punkt_tab' resource instead — confirm
# against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# from typing import List
# from nltk.translate.bleu_score import sentence_bleu
# from tqdm import tqdm

def calculate_nltk_bleu(references: List[List[str]], hypothesis: List[str]):
    """Average sentence-level BLEU-1 to BLEU-4 over aligned samples, as percentages.

    Args:
        references: One entry per sample; each entry holds the reference texts
            for that sample (a bare string counts as a single reference).
        hypothesis: One generated text per sample. An entry that is a sequence
            of tokens rather than a string is joined with single spaces.

    Returns:
        Tuple of four floats (BLEU-1..4), each the mean score multiplied by
        100 and rounded to two decimals. All zeros when there are no samples.

    Raises:
        AssertionError: If ``references`` and ``hypothesis`` differ in length.
    """
    assert len(references) == len(hypothesis)

    sample_count = len(references)
    if sample_count == 0:
        # Nothing to average; avoids a division by zero below
        return 0.0, 0.0, 0.0, 0.0

    bleu_totals = [0, 0, 0, 0]

    for i in tqdm(range(sample_count)):
        sample_references = references[i]
        if isinstance(sample_references, str):
            sample_references = [sample_references]
        else:
            # Each reference stays a separate sentence so BLEU compares whole
            # word sequences rather than one concatenated string.
            sample_references = [str(item) for item in sample_references]

        curr_hypothesis = hypothesis[i]
        if not isinstance(curr_hypothesis, str):
            curr_hypothesis = " ".join(str(token) for token in curr_hypothesis)
        # A string hypothesis is used as is: joining it with spaces would
        # insert a space between every character.

        curr_bleu = calculate_nltk_bleu_single(sample_references, curr_hypothesis)

        for order in range(4):
            bleu_totals[order] += curr_bleu[order]

    return (round(bleu_totals[0] / sample_count * 100, 2),
            round(bleu_totals[1] / sample_count * 100, 2),
            round(bleu_totals[2] / sample_count * 100, 2),
            round(bleu_totals[3] / sample_count * 100, 2))



### Generate results

In [None]:
# Generate one prediction string per test row, in row order, with a progress bar
results = [
    generate(best_model, sample['correct_answer'], sample['question'], sample['context'])
    for sample in (test_dataset.iloc[i] for i in tqdm(range(len(test_dataset))))
]

  0%|          | 0/190 [00:00<?, ?it/s]

In [None]:
# Destination of the prediction dump on the mounted Drive
path = '/content/drive/MyDrive/Research/' + 'Distractors/' + 'results-epoch4' + '.txt'

# One prediction per line, UTF-8 encoded
with open(path, "w",encoding="utf-8") as output:
    output.writelines(str(row) + '\n' for row in results)

### Load predictions

In [None]:
from typing import List

def load_lines_from_txt(file_path: str) -> List[str]:
    """Read a UTF-8 text file and return its lines with surrounding whitespace removed.

    Args:
        file_path: Path of the text file to read.

    Returns:
        One string per line of the file, in file order, each stripped of
        leading and trailing whitespace (including the newline).
    """
    with open(file_path,encoding="utf-8") as f:
        return [raw_line.strip() for raw_line in f]

In [None]:
# Reload the saved predictions from Drive; this replaces any in-memory `results` list
results = load_lines_from_txt('/content/drive/MyDrive/Research/Distractors/results-epoch4.txt')

### Split distractors

In [None]:
def correct_index_of(text:str, substring: str, start_index: int = 0):
    """Return the position of ``substring`` in ``text`` at or after ``start_index``.

    Returns:
        The index of the first occurrence, or -1 when there is none.
    """
    # str.find already reports a missing substring as -1
    return text.find(substring, start_index)

def replace_all_extra_id(text: str):
    """Replace every ``<extra_id_...>`` sentinel token in ``text`` with ``<sep>``.

    A sentinel runs from ``<extra_id_`` up to the next ``>``. An unterminated
    ``<extra_id_`` (for example at the end of a generation that was cut off at
    the length limit) is left untouched instead of being searched for
    repeatedly, which previously never terminated.

    Args:
        text: The decoded model output.

    Returns:
        ``text`` with each complete sentinel token replaced by ``<sep>``.
    """
    import re  # local import: the notebook's import cell does not bring in `re`

    return re.sub(r'<extra_id_[^>]*>', '<sep>', text)

In [None]:
# Every prediction is expected to contain this many <sep>-separated distractors
EXPECTED_DISTRACTORS = 4

incorrect1s = []
incorrect2s = []
incorrect3s = []
incorrect4s = []

for result in results:
    cleaned_result = result.replace('<pad>', '').replace('</s>', '')
    cleaned_result = replace_all_extra_id(cleaned_result)

    # Trim each part and drop empty ones (e.g. after a trailing '<sep>') so the
    # count reflects real distractors and blank slots are exactly ''.
    distractors = [part.strip() for part in cleaned_result.split('<sep>')]
    distractors = [part for part in distractors if part]

    if len(distractors) != EXPECTED_DISTRACTORS:
        # Report the actual count instead of a fixed "1 distractor" message
        print(len(distractors), 'distractors instead of', EXPECTED_DISTRACTORS, 'at', result)
        # Pad with empty strings or drop the extras so four slots always exist
        distractors = (distractors + [''] * EXPECTED_DISTRACTORS)[:EXPECTED_DISTRACTORS]

    incorrect1s.append(distractors[0])
    incorrect2s.append(distractors[1])
    incorrect3s.append(distractors[2])
    incorrect4s.append(distractors[3])

1 distractor <pad> prolactin aplicatiiPresence of a hormone in the thyroid gland Promotes only non-tropic eff ects. Growth hormone promotes only non-tropic eff ectes. ingrijire imunitar</s> not enough distractors??
1 distractor <pad> It is a highly regulated and efficient food processor. it can be produced from wild sources of fish, crustaceans and other aquatic plants. Produced from extensive aquaculture Oreochromis niloticus contributes to fresh water fish production ingrijire imunitar</s> not enough distractors??
1 distractor <pad> Oxidation of pyruvate is the link reaction to glycolysis. It is formed by co-enzyme A and CO2. At the end of glycolyse, pyrula is converted to two NADH molecules. Acetyl Co-A will feed its  not enough distractors??
1 distractor <pad> telophase is the longer phase of cell division. it covers only about 10% of cell cycle. It is called nuclear division which gives rise to two genetically identical daughter nuclei from a mother nuleus, and nodeoli are visible

### Load references

### Evaluate

### NLTK BLEU EVAL

In [None]:
from typing import List
import nltk
from nltk.tokenize import word_tokenize
#nltk.download('punkt')
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

def calculate_nltk_bleu_single(references: List[str], hypothesis: str):
    """Compute smoothed sentence-level BLEU-1 to BLEU-4 for one hypothesis.

    Args:
        references: Reference sentences for the hypothesis. A bare string is
            treated as a single reference.
        hypothesis: The generated sentence to score.

    Returns:
        Tuple ``(bleu_1, bleu_2, bleu_3, bleu_4)``; all zeros when the
        hypothesis is empty or whitespace only.
    """
    # A bare string would otherwise be iterated character by character and
    # every character tokenized as its own "reference".
    if isinstance(references, str):
        references = [references]

    if hypothesis.strip() == '':
        return 0, 0, 0, 0

    # Word tokenize
    refs_tokenized = [word_tokenize(str(reference)) for reference in references]
    hyp_tokenized = word_tokenize(hypothesis)

    # Smoothing keeps the higher-order scores meaningful when a short sentence
    # has no 2-gram (or longer) overlap with any reference.
    smoothing = SmoothingFunction().method2

    bleu_1 = sentence_bleu(refs_tokenized, hyp_tokenized, weights=(1, 0, 0, 0), smoothing_function=smoothing)
    bleu_2 = sentence_bleu(refs_tokenized, hyp_tokenized, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing)
    # Exact thirds: the n-gram weights are uniform and must sum to 1
    bleu_3 = sentence_bleu(refs_tokenized, hyp_tokenized, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=smoothing)
    bleu_4 = sentence_bleu(refs_tokenized, hyp_tokenized, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing)

    return bleu_1, bleu_2, bleu_3, bleu_4

In [None]:
# Download the 'punkt' tokenizer data used by word_tokenize.
# NOTE(review): newer NLTK releases may require the 'punkt_tab' resource instead — confirm
# against the installed version.
nltk.download('punkt')

In [None]:
# from typing import List
# from nltk.translate.bleu_score import sentence_bleu
# from tqdm import tqdm

def calculate_nltk_bleu(references: List[List[str]], hypothesis: List[str]):
    """Average sentence-level BLEU-1 to BLEU-4 over aligned samples, as percentages.

    Args:
        references: One entry per sample; each entry holds the reference texts
            for that sample (a bare string counts as a single reference).
        hypothesis: One generated text per sample. An entry that is a sequence
            of tokens rather than a string is joined with single spaces.

    Returns:
        Tuple of four floats (BLEU-1..4), each the mean score multiplied by
        100 and rounded to two decimals. All zeros when there are no samples.

    Raises:
        AssertionError: If ``references`` and ``hypothesis`` differ in length.
    """
    assert len(references) == len(hypothesis)

    sample_count = len(references)
    if sample_count == 0:
        # Nothing to average; avoids a division by zero below
        return 0.0, 0.0, 0.0, 0.0

    bleu_totals = [0, 0, 0, 0]

    for i in tqdm(range(sample_count)):
        sample_references = references[i]
        if isinstance(sample_references, str):
            sample_references = [sample_references]
        else:
            # Each reference stays a separate sentence so BLEU compares whole
            # word sequences rather than one concatenated string.
            sample_references = [str(item) for item in sample_references]

        curr_hypothesis = hypothesis[i]
        if not isinstance(curr_hypothesis, str):
            curr_hypothesis = " ".join(str(token) for token in curr_hypothesis)
        # A string hypothesis is used as is: joining it with spaces would
        # insert a space between every character.

        curr_bleu = calculate_nltk_bleu_single(sample_references, curr_hypothesis)

        for order in range(4):
            bleu_totals[order] += curr_bleu[order]

    return (round(bleu_totals[0] / sample_count * 100, 2),
            round(bleu_totals[1] / sample_count * 100, 2),
            round(bleu_totals[2] / sample_count * 100, 2),
            round(bleu_totals[3] / sample_count * 100, 2))



In [None]:
# Score each generated distractor slot first against the reference distractors and then
# against the reference correct answers.
# Assuming reference_incorrects and reference_correct are lists of lists of strings
hypothesis_slots = (incorrect1s, incorrect2s, incorrect3s, incorrect4s)

bleu_scores = [calculate_nltk_bleu(reference_incorrects, slot) for slot in hypothesis_slots]
bleu_scores.extend(calculate_nltk_bleu([reference_correct], slot) for slot in hypothesis_slots)

print('###', 'bleu_1', 'bleu_2', 'bleu_3', 'bleu_4')
labels = ['d1i', 'd2i', 'd3i', 'd4i', 'd1c', 'd2c', 'd3c', 'd4c']

# One row per score tuple: label followed by the four left-aligned BLEU values
for label, scores in zip(labels, bleu_scores):
    print(label, *("{:<7}".format(score) for score in scores[:4]))
