In [1]:
import argparse
import glob
import os

import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap
from keras.callbacks import ModelCheckpoint


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

2024-02-18 11:28:42.824836: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-18 11:28:42.824987: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-18 11:28:42.972599: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
!pip install transformers



In [3]:
!pip install tokenizers



In [4]:
!pip install sentencepiece



In [5]:
!pip install pytorch-lightning



In [6]:
# Fix the global seed to 42 through Lightning's seed_everything helper so
# repeated runs start from the same random state.
pl.seed_everything(42)

42

In [7]:
def extract_questions_and_answers(factoid_path: Path):
    """
    Extracts questions and their corresponding answers from a factoid JSON file.

    The file is expected to have the nested layout
    ``{"data": [{"paragraphs": [{"context": ..., "qas": [{"question": ...,
    "answers": [{"text": ..., "answer_start": ...}]}]}]}]}``.
    Every entry of the top-level ``data`` list is read (not only the first
    one), and one row is produced per (question, context, answer) triple.

    Parameters:
        factoid_path (Path): Path to the factoid JSON file.

    Returns:
        DataFrame: Pandas DataFrame containing extracted data with columns:
                   question, context, answer_text, answer_start, answer_end.
                   The columns are present even when no answer was found.

    Raises:
        KeyError: If one of the expected keys is missing from the file.
    """
    columns = ["question", "context", "answer_text", "answer_start", "answer_end"]

    # Read as UTF-8 explicitly: the texts contain non-ASCII characters and the
    # platform default encoding is not guaranteed to be UTF-8.
    with factoid_path.open(encoding="utf-8") as json_file:
        data = json.load(json_file)

    data_rows = []

    # Walk every top-level entry so files with more than one entry are not
    # silently truncated to the first.
    for entry in data["data"]:
        for paragraph in entry["paragraphs"]:
            context = paragraph["context"]
            for question_and_answers in paragraph["qas"]:
                question_text = question_and_answers["question"]
                for answer in question_and_answers["answers"]:
                    answer_text = answer["text"]
                    answer_start = answer["answer_start"]
                    data_rows.append({
                        "question": question_text,
                        "context": context,
                        "answer_text": answer_text,
                        "answer_start": answer_start,
                        # End offset is exclusive: start + answer length in characters
                        "answer_end": answer_start + len(answer_text),
                    })

    # Passing columns keeps the schema stable for an empty result as well
    return pd.DataFrame(data_rows, columns=columns)


In [8]:
from pathlib import Path

# Sorted BioASQ training files found in the Kaggle input directory.
# The result is deliberately wrapped in a one-element outer list: the path
# list itself is read back as factoid_paths[0] further down.
factoid_paths = [
    sorted(Path("/kaggle/input/bioasq/BioASQ/").glob("BioASQ-train-*"))
]

def filter_paths(paths):
    """
    Keep only the paths whose string form mentions one of the wanted tags.

    Parameters:
        paths (list of Path): Candidate paths; each is converted with str()
            before the substring check, so the whole path is searched.

    Returns:
        list of Path: The input paths, in their original order, whose string
        form contains '4b', '5b', or '6b'.
    """
    wanted_tags = ("4b", "5b", "6b")
    kept = []
    for candidate in paths:
        as_text = str(candidate)
        if any(tag in as_text for tag in wanted_tags):
            kept.append(candidate)
    return kept

# Keep only the 4b/5b/6b files. factoid_paths is a one-element list whose
# single item is the actual list of paths, hence the [0].
filtered_paths = filter_paths(factoid_paths[0])

In [9]:
import pandas as pd

# One DataFrame per selected factoid JSON file
dfs = [extract_questions_and_answers(factoid_path) for factoid_path in filtered_paths]

# pd.concat rejects an empty list with a generic message; fail early with one
# that points at the actual cause (no input file was selected).
if not dfs:
    raise ValueError(
        "No BioASQ factoid files were selected; check the dataset directory "
        "and the 4b/5b/6b filter."
    )

# Concatenate all DataFrames into a single DataFrame.
# ignore_index=True yields a unique 0..n-1 index instead of repeating each
# file's own row numbers.
df = pd.concat(dfs, ignore_index=True)

In [10]:
# Preview the first rows of the combined question/context/answer frame
df.head()

Unnamed: 0,question,context,answer_text,answer_start,answer_end
0,What is the inheritance pattern of Li–Fraumeni...,Balanced t(11;15)(q23;q15) in a TP53+/+ breast...,autosomal dominant,213,231
1,What is the inheritance pattern of Li–Fraumeni...,Genetic modeling of Li-Fraumeni syndrome in ze...,autosomal dominant,105,123
2,Which type of lung cancer is afatinib used for?,Clinical perspective of afatinib in non-small ...,EGFR-mutant NSCLC,1203,1220
3,Which hormone abnormalities are characteristic...,"DOCA sensitive pendrin expression in kidney, h...",thyroid,419,426
4,Which hormone abnormalities are characteristic...,Clinical and molecular characteristics of Pend...,thyroid,705,712


In [11]:
# Directory of the pre-trained FLAN-T5 checkpoint under the Kaggle input
# mount; change this path when running outside Kaggle.
MODEL_NAME = "/kaggle/input/flan-t5/pytorch/base/4"

In [12]:
# Import necessary modules
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Initialize the T5 tokenizer from the same checkpoint directory the model is
# loaded from (MODEL_NAME), instead of repeating the path literal.
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

# NOTE: no stand-alone T5ForConditionalGeneration is loaded here any more.
# The previous `model = ...from_pretrained(..., device_map="auto")` was never
# used: BioQAModel loads its own copy of the weights from MODEL_NAME and the
# name `model` is rebound to a BioQAModel instance before training.


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
class BioQADataset(Dataset):
    """
    Dataset class for BioASQ data to be used with T5 model.

    Each item is built on the fly from one DataFrame row: the question and
    context are tokenized together as the source, and the answer text is
    tokenized as the target.

    Parameters:
        data (pd.DataFrame): DataFrame containing the data; the columns
            "question", "context" and "answer_text" are read.
        tokenizer (T5Tokenizer): Tokenizer for encoding text data.
        source_max_token_len (int): Maximum length for source encoding.
        target_max_token_len (int): Maximum length for target encoding.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: T5Tokenizer,
        source_max_token_len: int = 396,
        target_max_token_len: int = 32
    ):
        """
        Initializes the BioQADataset.

        Args:
            data (pd.DataFrame): DataFrame containing the data.
            tokenizer (T5Tokenizer): Tokenizer for encoding text data.
            source_max_token_len (int, optional): Maximum length for source encoding. Defaults to 396.
            target_max_token_len (int, optional): Maximum length for target encoding. Defaults to 32.
        """
        self.tokenizer = tokenizer
        self.data = data
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def __len__(self):
        """
        Returns the number of samples in the dataset.
        """
        return len(self.data)

    def __getitem__(self, index: int):
        """
        Gets a sample from the dataset at the specified index.

        Args:
            index (int): Positional index of the sample to retrieve.

        Returns:
            dict: The raw "question", "context" and "answer_text" strings plus
            the flattened "input_ids", "attention_mask" and "labels" tensors.
        """
        # Positional lookup, so the DataFrame index itself does not matter
        data_row = self.data.iloc[index]

        # Encode source (question and context) as a pair; with
        # truncation="only_second" only the context is shortened to fit.
        source_encoding = self.tokenizer(
            data_row["question"],
            data_row["context"],
            max_length=self.source_max_token_len,
            padding="max_length",
            truncation="only_second",
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )

        # Encode target (answer_text)
        target_encoding = self.tokenizer(
            data_row["answer_text"],
            max_length=self.target_max_token_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )

        # Prepare labels: padding positions are replaced by -100 so they are
        # excluded from the loss. The pad id is taken from the tokenizer
        # rather than assumed to be 0, and the ids are copied so the
        # tokenizer output is not modified in place.
        labels = target_encoding["input_ids"].clone()
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "question": data_row["question"],
            "context": data_row["context"],
            "answer_text": data_row["answer_text"],
            "input_ids": source_encoding["input_ids"].flatten(),
            "attention_mask": source_encoding["attention_mask"].flatten(),
            "labels": labels.flatten()
        }

In [14]:
# Create a sample dataset over the full frame using the BioQADataset defaults.
# NOTE(review): sample_dataset is not referenced by any later cell visible
# here — confirm whether it is still needed.
sample_dataset = BioQADataset(df, tokenizer)

In [15]:
from sklearn.model_selection import train_test_split

# Split the dataset into training (80%) and validation (20%) sets.
# random_state pins the split so it no longer depends on how much global
# random state was consumed earlier in the session.
# NOTE(review): the frame holds several rows per question (same question,
# different context), so a purely random split can place one question on both
# sides; consider a group-aware split on "question".
train_df, val_df = train_test_split(df, test_size=0.20, random_state=42)

In [16]:
class BioQADataModule(pl.LightningDataModule):
    """
    LightningDataModule for preparing data for training and testing a T5 model on BioASQ data.

    Parameters:
        train_df (pd.DataFrame): DataFrame containing training data.
        test_df (pd.DataFrame): DataFrame containing test/validation data.
        tokenizer (T5Tokenizer): Tokenizer for encoding text data.
        batch_size (int): Batch size for the training loader. Defaults to 8.
        source_max_token_len (int): Maximum length for source encoding. Defaults to 396.
        target_max_token_len (int): Maximum length for target encoding. Defaults to 32.
    """

    def __init__(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size: int = 8,
        source_max_token_len: int = 396,
        target_max_token_len: int = 32
    ):
        """
        Initializes the BioQADataModule.

        Args:
            train_df (pd.DataFrame): DataFrame containing training data.
            test_df (pd.DataFrame): DataFrame containing test/validation data.
            tokenizer (T5Tokenizer): Tokenizer for encoding text data.
            batch_size (int, optional): Batch size for the training loader. Defaults to 8.
            source_max_token_len (int, optional): Maximum length for source encoding. Defaults to 396.
            target_max_token_len (int, optional): Maximum length for target encoding. Defaults to 32.
        """
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def setup(self, stage=None):
        """
        Setup datasets for training and testing.

        Args:
            stage (str): Stage of training, either 'fit' or 'test'. Defaults to None.
                It is not inspected; both datasets are always built.
        """
        self.train_dataset = BioQADataset(
            self.train_df,
            self.tokenizer,
            self.source_max_token_len,
            self.target_max_token_len
        )

        # Built from the held-out frame. This previously wrapped train_df, so
        # validation and test metrics were computed on the training data.
        self.test_dataset = BioQADataset(
            self.test_df,
            self.tokenizer,
            self.source_max_token_len,
            self.target_max_token_len
        )

    def train_dataloader(self):
        """
        Returns a shuffled DataLoader for the training dataset.
        """
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=4
        )

    def val_dataloader(self):
        """
        Returns a DataLoader for the validation dataset.

        Validation reuses the held-out dataset that also serves as test set,
        one sample per batch.
        """
        return DataLoader(
            self.test_dataset,
            batch_size=1,
            num_workers=4
        )

    def test_dataloader(self):
        """
        Returns a DataLoader for the test dataset, one sample per batch.
        """
        return DataLoader(
            self.test_dataset,
            batch_size=1,
            num_workers=4
        )


In [17]:
# Training hyper-parameters: number of epochs and mini-batch size
N_EPOCHS = 3
BATCH_SIZE = 8

# Wire the train/validation frames and the tokenizer into the data module
data_module = BioQADataModule(
    train_df=train_df,
    test_df=val_df,
    tokenizer=tokenizer,
    batch_size=BATCH_SIZE,
)

# Build the training and validation datasets up front
data_module.setup()

In [18]:
class BioQAModel(pl.LightningModule):
    """
    LightningModule for fine-tuning a T5 model on BioASQ data.

    Attributes:
        model (T5ForConditionalGeneration): Pre-trained T5 model loaded from MODEL_NAME.
        validation_step_outputs (list): Per-batch validation losses collected
            during the current validation epoch.
    """

    def __init__(self):
        """
        Initializes the BioQAModel.
        """
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)
        self.validation_step_outputs = []

    def forward(self, input_ids, attention_mask, labels=None):
        """
        Forward pass of the model.

        Args:
            input_ids (tensor): Input tensor representing tokenized inputs.
            attention_mask (tensor): Tensor representing attention mask.
            labels (tensor, optional): Tensor representing labels. Defaults to None.

        Returns:
            tuple: Tuple containing loss and logits, taken from the wrapped
            model's output.
        """
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        return output.loss, output.logits

    def _shared_step(self, batch, metric_name):
        """
        Runs one forward pass on a batch and logs its loss under metric_name.

        Args:
            batch (dict): Batch with "input_ids", "attention_mask" and "labels".
            metric_name (str): Name the loss is logged under.

        Returns:
            tensor: Loss tensor.
        """
        loss, _ = self(batch["input_ids"], batch["attention_mask"], batch["labels"])
        self.log(metric_name, loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        """
        Training step for the model.

        Args:
            batch (dict): Batch of training data.
            batch_idx (int): Index of the batch.

        Returns:
            tensor: Loss tensor, logged as "train_loss".
        """
        return self._shared_step(batch, "train_loss")

    def validation_step(self, batch, batch_idx):
        """
        Validation step for the model.

        Args:
            batch (dict): Batch of validation data.
            batch_idx (int): Index of the batch.

        Returns:
            tensor: Loss tensor, logged as "val_loss".
        """
        loss = self._shared_step(batch, "val_loss")
        # Keep only the value for the epoch average, not any autograd history
        self.validation_step_outputs.append(loss.detach())
        return loss

    def test_step(self, batch, batch_idx):
        """
        Test step for the model.

        Args:
            batch (dict): Batch of test data.
            batch_idx (int): Index of the batch.

        Returns:
            tensor: Loss tensor, logged as "test_loss".
        """
        return self._shared_step(batch, "test_loss")

    def configure_optimizers(self):
        """
        Configure optimizer for the model.

        Uses torch.optim.AdamW instead of the deprecated transformers.AdamW.
        eps and weight_decay are set explicitly to the values the transformers
        implementation used by default, so the optimisation is unchanged.

        Returns:
            torch.optim.Optimizer: Optimizer.
        """
        return torch.optim.AdamW(
            self.parameters(),
            lr=0.0001,
            eps=1e-6,
            weight_decay=0.0
        )

    def on_validation_epoch_end(self):
        """
        Logs the mean validation loss of the epoch and resets the buffer.
        """
        # torch.stack fails on an empty list (e.g. a validation run with no batches)
        if not self.validation_step_outputs:
            return
        epoch_average = torch.stack(self.validation_step_outputs).mean()
        self.log("validation_epoch_average", epoch_average)
        self.validation_step_outputs.clear()  # free memory


In [19]:
# Create an instance of the BioQAModel; its constructor loads the T5 weights
# from MODEL_NAME.
model = BioQAModel()

In [20]:
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

# Define a ModelCheckpoint callback that keeps only the checkpoint with the
# lowest "val_loss". ModelCheckpoint here is the pytorch_lightning.callbacks
# class imported in this cell; it shadows the keras import of the same name
# from the first cell.
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)

# Define a TensorBoardLogger for logging training metrics
logger = TensorBoardLogger("training-logs", name="bio-qa")

# Initialize the Trainer
trainer = Trainer(
    callbacks=[checkpoint_callback],  # List of callbacks
    max_epochs=N_EPOCHS,              # Maximum number of epochs
    # "auto" selects a GPU when one is available and otherwise falls back to
    # CPU instead of raising. Distributed training is requested through the
    # separate `strategy` argument (e.g. strategy="ddp"), not through
    # `accelerator`.
    accelerator="auto",
    logger=logger                     # Logger for logging training metrics
)

In [21]:
# Load the TensorBoard notebook extension and point it at "training-logs",
# the directory the TensorBoardLogger above writes to.
%load_ext tensorboard
%tensorboard --logdir ./training-logs

In [22]:
# Size of the combined frame as (rows, columns)
df.shape

(12988, 5)

In [23]:
# Inline installation from within a notebook
# Uncomment these two lines if needed

!pip install ipywidgets
!jupyter nbextension enable --py --sys-prefix widgetsnbextension

Config option `kernel_spec_manager_class` not recognized by `EnableNBExtensionApp`.
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [24]:
import ipywidgets as widgets

# For explicitly displaying widgets
from IPython.display import display

# Just need these for the demo purposes here
from datetime import datetime
import matplotlib.pyplot as plt

In [None]:
# Run training and validation; the data module supplies the data loaders
trainer.fit(model, datamodule=data_module)



Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]