# Medical Chatbot

In [1]:
!nvidia-smi

Tue Oct 24 03:19:32 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   42C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|       

In [5]:
!pip install transformers tokenizers sentencepiece torchtext pytorch_lightning numpy>=1.16.5 datasets --quiet

In [6]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

from datasets import load_dataset

from transformers import AutoTokenizer, AutoModelForQuestionAnswering

import warnings
warnings.filterwarnings('ignore')

In [23]:
data_url = '/kaggle/input/medical-data-folder/finalPreprocessed.csv'

In [7]:
model_checkpoint = 'distilbert-base-uncased'

In [24]:
data = pd.read_csv(data_url)

In [28]:
data[data.duplicated()]

Unnamed: 0,Questions,Answers
11139,Are type D personality and depressive symptoms...,Type D personality and depressive symptoms wer...
11589,Are inflammatory markers unrelated to physical...,Although dietary intake and inflammation may i...
25561,What causes Causes of Diabetes ?,Other types of diabetes have a variety of poss...
30970,Does astragaloside IV attenuate inflammatory c...,The results of these studies indicate that Ast...
31612,Does sodium Intake During an Ultramarathon Pre...,"Exercise-associated muscle cramping, dehydrati..."
...,...,...
225870,What causes Causes of Diabetes ?,Type 1 diabetes is caused by a lack of insulin...
225884,Does mOLECULAR CHARACTERISATION AND ANTIMICROB...,This study demonstrated that there is a signif...
226805,Does physical activity overcome the effects of...,It is recommended that programs to combat sede...
227575,What are the treatments for Acromegaly ?,"Currently, treatment options include surgical ..."


In [29]:
data = data.drop_duplicates()
data[data.duplicated()]

Unnamed: 0,Questions,Answers


In [30]:
train_df, temp_df = train_test_split(data, test_size=0.3, random_state=42)

val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [31]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [32]:
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [34]:
sample_encodings = tokenizer(train_df['Questions'][15])

In [35]:
sample_encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [36]:
sample_encodings['input_ids']

[101,
 2040,
 2003,
 2012,
 3891,
 2005,
 1051,
 10755,
 2937,
 1010,
 2991,
 7361,
 2937,
 7270,
 1010,
 1998,
 3078,
 2566,
 9956,
 22084,
 2140,
 4456,
 1029,
 1029,
 102]

In [38]:
sample_encodings['attention_mask']

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [39]:
len(sample_encodings['input_ids']), len(sample_encodings['attention_mask'])

(25, 25)

In [40]:
" ".join([tokenizer.decode(input_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
         for input_id in sample_encodings['input_ids']])

' who is at risk for o ##var ##ian , fall ##op ##ian tube , and primary per ##ito ##nea ##l cancer ? ? '

In [42]:
sample_answer_encodings = tokenizer(train_df['Answers'][15])

Token indices sequence length is longer than the specified maximum sequence length for this model (939 > 512). Running this sequence through the model will result in indexing errors


In [43]:
len(sample_answer_encodings['input_ids']), len(sample_answer_encodings['attention_mask'])

(939, 939)

In [44]:
" ".join([tokenizer.decode(input_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
         for input_id in sample_answer_encodings['input_ids']])

' key points - avoiding risk factors and increasing protective factors may help prevent cancer . - the following are risk factors for o ##var ##ian , fall ##op ##ian tube , and primary per ##ito ##nea ##l cancer : - family history of o ##var ##ian , fall ##op ##ian tube , and primary per ##ito ##nea ##l cancer - inherited risk - hormone replacement therapy - weight and height - the following are protective factors for o ##var ##ian , fall ##op ##ian tube , and primary per ##ito ##nea ##l cancer : - oral contra ##ceptive ##s - tuba ##l liga ##tion - breast ##fe ##eding - risk - reducing sal ##ping ##o - o ##op ##hore ##ct ##omy - it is not clear whether the following affect the risk of o ##var ##ian , fall ##op ##ian tube , and primary per ##ito ##nea ##l cancer : - diet - alcohol - as ##pi ##rin and non - ste ##roid ##al anti - inflammatory drugs - smoking - tal ##c - in ##fer ##tility treatment - cancer prevention clinical trials are used to study ways to prevent cancer . - new ways t

In [45]:
train_df.shape

(159282, 2)

In [56]:
class QADataset(Dataset):
  def __init__(
      self,
      data,
      tokenizer,
      source_max_token_len = 128,
      target_max_token_len = 32,
      ):
    
    self.data =  data
    self.tokenizer =  tokenizer
    self.source_max_token_len =  source_max_token_len
    self.target_max_token_len =  target_max_token_len


  def __len__(self):
    return len(self.data)

  def __getitem__(self, index: int):
    data_row = self.data.iloc[index]

    source_encoding = tokenizer(
      data_row['Questions'],
      max_length=self.source_max_token_len,
      padding='max_length',
      truncation="only_second",
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
      )
    
    target_encoding = tokenizer(
      data_row['Answers'],
      max_length=self.target_max_token_len,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
      )
    
    labels = target_encoding['input_ids']
    labels[labels==0] = -100

    return dict(
        question=data_row['Questions'],
        answer_text=data_row['Answers'],
        input_ids=source_encoding["input_ids"].flatten(),
        attention_mask=source_encoding['attention_mask'].flatten(),
        labels=labels.flatten()
    )

In [57]:
class DataModule(pl.LightningDataModule):
    def __init__(
          self,
          train_df,
          val_df,
          test_df,
          tokenizer,
          batch_size = 8,
          source_max_token_len = 128,
          target_max_token_len = 32,
          ):
        super().__init__()
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def setup(self, stage=None):
        self.train_dataset = QADataset(
            self.train_df,
            self.tokenizer,
            self.source_max_token_len,
            self.target_max_token_len
            )

        self.val_dataset = QADataset(
            self.val_df,
            self.tokenizer,
            self.source_max_token_len,
            self.target_max_token_len
        )
        self.test_dataset = QADataset(
            self.test_df,
            self.tokenizer,
            self.source_max_token_len,
            self.target_max_token_len
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=4
            )

    def val_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            num_workers=4
            )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=1,
            num_workers=4
            )

In [58]:
BATCH_SIZE = 2
N_EPOCHS = 2

data_module = DataModule(train_df, val_df, test_df, tokenizer, batch_size=BATCH_SIZE)
data_module.setup()

In [64]:
class QAModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint, return_dict = True)


    def forward(self, input_ids, attention_mask, labels=None):
        output = self.model(
            input_ids, 
            attention_mask=attention_mask
        )

        return output.loss, output

    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask=batch['attention_mask']
        labels = batch['labels']
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions":outputs, "labels": labels}

    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask=batch['attention_mask']
        labels = batch['labels']
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask=batch['attention_mask']
        labels = batch['labels']
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=0.0001)
        return optimizer

In [65]:
model = QAModel()

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [66]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath="/kaggle/working/checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)

In [67]:
trainer = pl.Trainer(
    callbacks=checkpoint_callback,
    max_epochs=N_EPOCHS,
    accelerator='gpu',
    devices = 1
)

In [68]:
trainer.fit(model, data_module)

Sanity Checking: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

ValueError: `self.log(val_loss, None)` was called, but `NoneType` values cannot be logged