# Medical Chatbot

In [None]:
!nvidia-smi

In [None]:
!pip install transformers tokenizers sentencepiece torchtext pytorch_lightning numpy>=1.16.5 datasets --quiet

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

from datasets import load_dataset

from transformers import AutoTokenizer, AutoModelForQuestionAnswering

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Kaggle input paths of the training and validation CSV files.
train_data_url = '/kaggle/input/medical-chatbot-dataset/train_data_chatbot.csv'
val_data_url = '/kaggle/input/medical-chatbot-dataset/validation_data_chatbot.csv'

In [None]:
# Hugging Face checkpoint name; used below to load both the tokenizer and the model.
model_checkpoint = 'distilbert-base-uncased'

In [None]:
# Load the training CSV; the bare expression displays the frame in the notebook.
train_df = pd.read_csv(train_data_url)
train_df

In [None]:
# Load the validation CSV (bound to the name test_df here) and preview 4 rows.
test_df = pd.read_csv(val_data_url)
test_df.head(4)

In [None]:
# Load the tokenizer published with model_checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
tokenizer

In [None]:
# Tokenize one sample question (row label 15) to inspect what the tokenizer returns.
sample_encodings = tokenizer(train_df['short_question'][15])

In [None]:
sample_encodings.keys()

In [None]:
sample_encodings['input_ids']

In [None]:
sample_encodings['attention_mask']

In [None]:
len(sample_encodings['input_ids']), len(sample_encodings['attention_mask'])

In [None]:
# Decode every token id on its own and join with spaces, so the individual
# tokens the question was split into are visible.
" ".join([tokenizer.decode(input_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
         for input_id in sample_encodings['input_ids']])

In [None]:
# Repeat the same inspection for the answer text of the same row.
sample_answer_encodings = tokenizer(train_df['short_answer'][15])

In [None]:
len(sample_answer_encodings['input_ids']), len(sample_answer_encodings['attention_mask'])

In [None]:
" ".join([tokenizer.decode(input_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
         for input_id in sample_answer_encodings['input_ids']])

In [None]:
train_df.shape

In [None]:
def clean_df(df):
    """Return a cleaned copy of *df*.

    The 'tags' and 'label' columns are removed and fully duplicated rows are
    dropped. The input frame is not modified.

    Args:
        df: pandas DataFrame to clean. Either column may be absent; a missing
            column is skipped rather than raising.

    Returns:
        A new DataFrame without the 'tags'/'label' columns and without
        duplicate rows.
    """
    # `columns=` is required: a bare list passed positionally is interpreted
    # as *row* labels (axis 0) and raises KeyError on a normal integer index.
    trimmed = df.drop(columns=['tags', 'label'], errors='ignore')
    return trimmed.drop_duplicates()

In [None]:
# Hold out 20% of the training frame; random_state pins the shuffle so the
# split is reproducible.
train_df, val_df = train_test_split(train_df, test_size = 0.2, random_state = 4)

train_df.shape, val_df.shape

In [None]:
# Swap the two names: afterwards test_df is the 20% hold-out produced by the
# split above and val_df is the frame that was read from val_data_url.
test_df, val_df = val_df, test_df

In [None]:
test_df.shape

In [None]:
class QADataset(Dataset):
    """Map-style dataset that tokenizes one question/answer row per item.

    Args:
        data: pandas DataFrame with 'short_question' and 'short_answer'
            columns. Rows are addressed positionally (``.iloc``), so any
            index left over from a train/test split is fine.
        tokenizer: callable following the Hugging Face tokenizer call
            convention; it must return a mapping holding 'input_ids' and
            'attention_mask' tensors of shape (1, max_length).
        source_max_token_len: padded/truncated length of the question.
        target_max_token_len: padded/truncated length of the answer.
    """

    def __init__(
        self,
        data,
        tokenizer,
        source_max_token_len=128,
        target_max_token_len=32,
    ):
        self.data = data
        self.tokenizer = tokenizer
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def _encode(self, text, max_length):
        """Tokenize a single text, padded and truncated to *max_length*."""
        # Use the tokenizer handed to this instance, not a notebook global.
        # truncation=True because only one sequence is passed; the
        # "only_second" strategy is defined for sequence pairs only.
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

    def __getitem__(self, index: int):
        """Return the raw texts plus flattened model tensors for row *index*.

        Returns:
            dict with keys 'question', 'answer_text' (original strings),
            'input_ids', 'attention_mask' (encoded question) and 'labels'
            (encoded answer with padding positions set to -100).
        """
        data_row = self.data.iloc[index]
        question = data_row['short_question']
        answer = data_row['short_answer']

        source_encoding = self._encode(question, self.source_max_token_len)
        target_encoding = self._encode(answer, self.target_max_token_len)

        # Mask padding via the attention mask instead of assuming the pad
        # token id is 0. -100 is the default ignore_index of PyTorch's
        # cross-entropy loss, so padded positions do not contribute.
        labels = target_encoding['input_ids']
        labels[target_encoding['attention_mask'] == 0] = -100

        return dict(
            question=question,
            answer_text=answer,
            input_ids=source_encoding["input_ids"].flatten(),
            attention_mask=source_encoding['attention_mask'].flatten(),
            labels=labels.flatten(),
        )

In [None]:
class DataModule(pl.LightningDataModule):
    """Lightning data module that serves QADataset loaders for three splits.

    Args:
        train_df: frame used for the training split.
        val_df: frame used for the validation split.
        test_df: frame used for the test split.
        tokenizer: tokenizer forwarded to every QADataset.
        batch_size: batch size of the train and validation loaders (the test
            loader always uses 1).
        source_max_token_len: question length forwarded to QADataset.
        target_max_token_len: answer length forwarded to QADataset.
        num_workers: worker processes per DataLoader (default 4, the value
            that was previously hard-coded).
    """

    def __init__(
        self,
        train_df,
        val_df,
        test_df,
        tokenizer,
        batch_size=8,
        source_max_token_len=128,
        target_max_token_len=32,
        num_workers=4,
    ):
        super().__init__()
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len
        self.num_workers = num_workers

    def _make_dataset(self, frame):
        """Wrap *frame* in a QADataset using this module's tokenizer settings."""
        return QADataset(
            frame,
            self.tokenizer,
            self.source_max_token_len,
            self.target_max_token_len,
        )

    def setup(self, stage=None):
        """Create the train, validation and test datasets.

        *stage* is accepted for Lightning compatibility and is not used; all
        three datasets are always built.
        """
        self.train_dataset = self._make_dataset(self.train_df)
        self.val_dataset = self._make_dataset(self.val_df)
        self.test_dataset = self._make_dataset(self.test_df)

    def train_dataloader(self):
        """Return the shuffled training loader."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
        )

    def val_dataloader(self):
        """Return the validation loader."""
        # Serve the validation split. This loader previously returned the
        # test dataset, so "val_loss" (and anything monitoring it) was
        # computed on test data and val_df was never used.
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
        )

    def test_dataloader(self):
        """Return the test loader, one example per batch."""
        return DataLoader(
            self.test_dataset,
            batch_size=1,
            num_workers=self.num_workers,
        )

In [None]:
# Training hyperparameters for this run.
BATCH_SIZE = 2
N_EPOCHS = 2

# Build the data module and create its datasets right away by calling setup().
data_module = DataModule(train_df, val_df, test_df, tokenizer, batch_size=BATCH_SIZE)
data_module.setup()

In [None]:
class QAModel(pl.LightningModule):
    """LightningModule wrapping the Hugging Face model named by model_checkpoint.

    NOTE(review): `AutoModelForQuestionAnswering` loads an extractive
    (span-prediction) head. Per the transformers documentation such heads
    take `start_positions`/`end_positions` and return `start_logits`/
    `end_logits`; they do not accept a `labels` argument or expose
    `.logits`. The batches used here carry token-id `labels`, which looks
    like a sequence-to-sequence setup. Confirm the intended model class
    before training, since `forward` is expected to fail as written.

    Args:
        learning_rate: AdamW learning rate (default 0.0001, the value that
            was previously hard-coded).
    """

    def __init__(self, learning_rate=0.0001):
        super().__init__()
        self.learning_rate = learning_rate
        self.model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint, return_dict = True)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)`` from its output."""
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels)

        return output.loss, output.logits

    def _shared_step(self, batch, log_name):
        """Forward one batch and log its loss under *log_name*.

        Shared by the train/validation/test steps, which differ only in the
        metric name and in what they return.
        """
        labels = batch['labels']
        loss, outputs = self(batch['input_ids'], batch['attention_mask'], labels)
        self.log(log_name, loss, prog_bar=True, logger=True)
        return loss, outputs, labels

    def training_step(self, batch, batch_idx):
        """Log "train_loss" and return the loss with predictions and labels."""
        loss, outputs, labels = self._shared_step(batch, "train_loss")
        return {"loss": loss, "predictions": outputs, "labels": labels}

    def validation_step(self, batch, batch_idx):
        """Log "val_loss" and return the loss."""
        loss, _, _ = self._shared_step(batch, "val_loss")
        return loss

    def test_step(self, batch, batch_idx):
        """Log "test_loss" and return the loss."""
        loss, _, _ = self._shared_step(batch, "test_loss")
        return loss

    def configure_optimizers(self):
        """Return an AdamW optimizer over all module parameters."""
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)

In [None]:
model = QAModel()

In [None]:
# Keep a single checkpoint: the one with the lowest logged "val_loss".
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath="/kaggle/working/checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)

In [None]:
# Train on one GPU for N_EPOCHS epochs with the checkpoint callback attached.
trainer = pl.Trainer(
    callbacks=checkpoint_callback,
    max_epochs=N_EPOCHS,
    accelerator='gpu',
    devices = 1
)

In [None]:
trainer.fit(model, data_module)