<a href="https://colab.research.google.com/github/Priyansh-S-K/Intel_GenAI_Project/blob/main/Disease_Symptom_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Training-side dependencies: PyTorch, Hugging Face transformers/datasets, pandas and tqdm.
# NOTE(review): versions are unpinned, so a fresh install may resolve to different releases;
# torchtext and sentencepiece are not imported by any cell visible in this notebook.
!pip install torch torchtext transformers sentencepiece pandas tqdm datasets

In [None]:
# Inference/demo dependencies: optimum-intel (OpenVINO model class) and the Gradio UI used at the end.
!pip install openvino optimum optimum-intel gradio

In [None]:
from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd
import ast
import datasets
from tqdm import tqdm
import time

In [None]:
# Load the medical question/answer dataset by its Hugging Face dataset id.
# (An earlier experiment used "QuyenAnhDE/Diseases_Symptoms" instead.)
data_sample = load_dataset("keivalya/MedQuad-MedicalQnADataset")

In [None]:
# Colab only: mount Google Drive, because the fine-tuned model is later saved to
# and reloaded from a folder under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Display the loaded dataset object to inspect its contents before building the DataFrame.
data_sample

In [None]:
# Keep only the question/answer fields of the 'train' split, one dict per example,
# and load them into a two-column DataFrame (column order: Question, Answer).
qa_fields = ('Question', 'Answer')
updated_data = [
    {field: item[field] for field in qa_fields}
    for item in data_sample['train']
]
df = pd.DataFrame(updated_data)

# (An earlier variant of this cell used the 'Name' / 'Treatments' fields instead.)

In [None]:
# Work on a random 40% subset of the rows.
# random_state pins the subset so that Restart & Run All reproduces the same
# training data (and therefore comparable losses) on every run.
df = df.sample(frac=0.4, random_state=42)

In [None]:
import re

def split_and_join(text):
    """Normalise a string to single-space-separated words.

    Every run of the delimiter characters space , : ? " ( ) . - is replaced by
    one space, then all remaining whitespace runs (including newlines and tabs)
    are collapsed and leading/trailing whitespace is dropped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; an empty string if nothing but delimiters remains.
    """
    without_delimiters = re.sub(r'[ ,:?"().-]+', ' ', text)
    # str.split() with no argument already discards empty pieces and
    # surrounding whitespace, so the words can be re-joined directly.
    return ' '.join(without_delimiters.split())

In [None]:
# Run both text columns through split_and_join so that punctuation runs and
# irregular whitespace become single spaces before tokenization.
for text_column in ('Question', 'Answer'):
    df[text_column] = df[text_column].apply(split_and_join)

In [None]:

from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

In [None]:
# Training device: fixed to the CPU. Batches are moved to this device inside the training loop.
device=torch.device('cpu')

In [None]:
# Number of examples per batch, shared by the training and validation DataLoaders.
BATCH_SIZE = 8
# Summary statistics of the sampled, cleaned frame (displayed as the cell output).
df.describe()

In [None]:
class LanguageDataset(Dataset):
    """
    Torch ``Dataset`` over a two-column pandas DataFrame of text pairs.

    Each item is the tokenizer encoding of ``"<first column> | <second column>"``,
    padded/truncated to ``seq_length`` tokens and returned as PyTorch tensors,
    which keeps the training loop and the ingestion from pandas simple.

    Args:
        df: DataFrame whose first two columns hold the two text fields
            (here: question and answer).
        tokenizer: Object exposing ``encode_plus``. Items are encoded with
            ``padding='max_length'``, so the tokenizer needs a padding token
            configured before the first item is fetched.
        seq_length: Number of tokens every encoded item is padded/truncated
            to. Defaults to 128, the value previously hard-coded.

    Attributes:
        labels: The DataFrame's column index.
        data: The rows as a list of ``{column: value}`` dicts.
        seq_length: Token length used by ``__getitem__``.
        max_length: Character-based power-of-two bound computed by
            ``fittest_max_length``. It is informational only and is NOT used
            as the tokenization length (it counts characters, not tokens).
    """
    def __init__(self, df, tokenizer, seq_length=128):
        self.labels = df.columns
        self.data = df.to_dict(orient='records')
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        self.max_length = self.fittest_max_length(df)

    def __len__(self):
        """Return the number of rows (text pairs) in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode row ``idx`` as ``"<col0> | <col1>"``.

        Returns:
            Whatever ``tokenizer.encode_plus`` returns for the joined text with
            ``return_tensors='pt'``; tensors are padded/truncated to
            ``seq_length`` tokens.
        """
        record = self.data[idx]
        x = record[self.labels[0]]
        y = record[self.labels[1]]
        text = f"{x} | {y}"
        tokens = self.tokenizer.encode_plus(
            text,
            return_tensors='pt',
            max_length=self.seq_length,
            padding='max_length',
            truncation=True,
        )
        return tokens

    def fittest_max_length(self, df):
        """
        Smallest power of two (minimum 2) that is at least as large as the
        longest string, measured in characters, in the first two columns.

        An empty DataFrame yields 2 instead of raising.
        """
        text_columns = (self.labels[0], self.labels[1])
        # default=0 covers a frame with no rows, where max() would otherwise raise.
        longest = max(
            (len(text) for column in text_columns for text in df[column]),
            default=0,
        )
        x = 2
        while x < longest:
            x *= 2
        return x

# Create the GPT-2 tokenizer before its first use; without this the cell fails
# with a NameError on a fresh kernel. 'gpt2' is the same checkpoint name that
# is assigned to `model_name` further down.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2 ships without a padding token; reuse EOS so that the dataset's
# padding='max_length' encoding works.
tokenizer.pad_token = tokenizer.eos_token

# Wrap the cleaned DataFrame in the LanguageDataset defined above.
# Note: this rebinds `data_sample`, which previously held the raw Hugging Face dataset.
data_sample = LanguageDataset(df, tokenizer)



In [None]:
# 80/20 train/validation split.
train_size = int(0.8 * len(data_sample))
valid_size = len(data_sample) - train_size
# A seeded generator makes the split identical on every run, so validation
# losses from different runs are measured on the same examples.
train_data, valid_data = random_split(
    data_sample,
    [train_size, valid_size],
    generator=torch.Generator().manual_seed(42),
)

In [None]:

# Make the batch iterators: the training loader is created with shuffle=True,
# the validation loader keeps the default (unshuffled) order.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE)

In [None]:
# Number of full passes over the training split performed by the training loop.
num_epochs = 1

In [None]:
# Run metadata: shown in the progress-bar description and written to the `results` frame.
batch_size = BATCH_SIZE
model_name = 'gpt2'
# NOTE(review): stored verbatim in the 'gpu' results column while `device` is the CPU;
# presumably means "no GPU used" — confirm the intended meaning.
gpu = 0

In [None]:

# Load the pretrained weights named by `model_name` and place them on the
# training device. The model was previously never created before being used
# by the optimizer and the training loop.
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)

# The padding token must be assigned BEFORE pad_token_id is read below;
# otherwise pad_token_id is still None. GPT-2 has no padding token of its
# own, so EOS doubles as padding.
tokenizer.pad_token = tokenizer.eos_token

# Set the learning rate and loss function
## CrossEntropyLoss measures how close the predictions are to the truth and is
## more punishing for high-confidence wrong answers.
## NOTE: the training loop uses the loss returned by the model itself;
## `criterion` is only kept for callers that want to compute the loss manually.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=5e-4)

In [None]:
# Init a results dataframe: the training loop appends one row per epoch with
# the run metadata, the average training/validation loss and the epoch duration.
results = pd.DataFrame(columns=['epoch', 'transformer', 'batch_size', 'gpu',
                                'training_loss', 'validation_loss', 'epoch_duration_sec'])

In [None]:
def _build_lm_batch(batch, device):
    """Convert a collated tokenizer batch into (input_ids, attention_mask, labels).

    The dataset returns tensors with a leading dimension of 1 per item, so the
    collated batch has a singleton dimension at index 1, which is squeezed away.

    Labels are a copy of the inputs in which padding positions are set to -100
    so they are ignored by the model's loss. The first padding position of each
    row is kept as a target: padding is the EOS token, so this still teaches
    the model to end its answer, while the remaining padding no longer dilutes
    the loss.
    """
    input_ids = batch['input_ids'].squeeze(1).to(device)
    attention_mask = batch['attention_mask'].squeeze(1).to(device)
    labels = input_ids.clone()
    is_padding = attention_mask == 0
    # Running count of padding tokens per row equals 1 exactly at the first one.
    first_padding = is_padding & (is_padding.long().cumsum(dim=1) == 1)
    labels[is_padding & ~first_padding] = -100
    return input_ids, attention_mask, labels


# The training loop
for epoch in range(num_epochs):
    start_time = time.time()  # Start the timer for the epoch

    # Training
    ## This line tells the model we're in 'learning mode'
    model.train()
    epoch_training_loss = 0.0
    train_iterator = tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs} Batch Size: {batch_size}, Transformer: {model_name}")
    for batch in train_iterator:
        optimizer.zero_grad()
        inputs, attention_mask, targets = _build_lm_batch(batch, device)
        # The model shifts the labels internally and returns the LM loss.
        outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_iterator.set_postfix({'Training Loss': loss.item()})
        epoch_training_loss += loss.item()
    # max(..., 1) avoids a ZeroDivisionError when the loader is empty.
    avg_epoch_training_loss = epoch_training_loss / max(len(train_loader), 1)

    # Validation
    ## This line below tells the model to 'stop learning'
    model.eval()
    epoch_validation_loss = 0.0
    valid_iterator = tqdm(valid_loader, desc=f"Validation Epoch {epoch+1}/{num_epochs}")
    with torch.no_grad():
        for batch in valid_iterator:
            inputs, attention_mask, targets = _build_lm_batch(batch, device)
            outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=targets)
            loss = outputs.loss
            valid_iterator.set_postfix({'Validation Loss': loss.item()})
            # Accumulate plain floats; summing the loss tensors themselves is unnecessary.
            epoch_validation_loss += loss.item()

    avg_epoch_validation_loss = epoch_validation_loss / max(len(valid_loader), 1)

    end_time = time.time()  # End the timer for the epoch
    epoch_duration_sec = end_time - start_time  # Calculate the duration in seconds

    new_row = {'transformer': model_name,
               'batch_size': batch_size,
               'gpu': gpu,
               'epoch': epoch+1,
               'training_loss': avg_epoch_training_loss,
               'validation_loss': avg_epoch_validation_loss,
               'epoch_duration_sec': epoch_duration_sec}  # Add epoch_duration to the dataframe

    results.loc[len(results)] = new_row
    print(f"Epoch: {epoch+1}, Validation Loss: {avg_epoch_validation_loss}")


In [None]:
# Persist the fine-tuned model and the tokenizer files to the mounted Google Drive.
# The export cell below reloads both from this same directory.
model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/finalmodel")
tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/finalmodel")

In [None]:
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer,pipeline
# Reload the checkpoint saved above, this time through optimum-intel's OpenVINO model class.
# Note: this rebinds `tokenizer` and `model`; from here on `model` is the OVModelForCausalLM
# instance, not the PyTorch model that was trained.
model_id="/content/drive/MyDrive/Colab Notebooks/finalmodel"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# NOTE(review): export=True presumably converts the saved checkpoint to OpenVINO format
# while loading — confirm against the installed optimum-intel version.
model = OVModelForCausalLM.from_pretrained(model_id, export=True)

In [None]:

import random
import time

from transformers import pipeline
import gradio as gr

# Initialize the text-generation pipeline with the model and tokenizer loaded in the previous cell
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

def gpt2_generate_text(prompt, max_length=100):
    """Generate a single piece of text for ``prompt`` with the module-level pipeline.

    Args:
        prompt: Text handed to the ``generator`` pipeline.
        max_length: Forwarded unchanged as the pipeline's ``max_length`` argument.

    Returns:
        The ``'generated_text'`` field of the first (and only requested) result.
    """
    candidates = generator(prompt, num_return_sequences=1, max_length=max_length)
    first_candidate = candidates[0]
    return first_candidate['generated_text']


# Chat UI: a chatbot pane, a text box, and Submit / Clear buttons.
with gr.Blocks() as demo:
    gr.Markdown("# Medical Chatbot")
    # Fixed the example list: the last entry was missing its closing quote and was misspelled.
    gr.Markdown("### Example questions to ask: 'Panic Disorder', 'Turner Syndrome', 'Vocal cord polyp', 'Cryptorchidism'")
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    submit = gr.Button("Submit")
    clear = gr.Button("Clear")

    def user(user_message, history):
        """Append the user's message to the chat history and empty the text box.

        Returns:
            A pair ``("", new_history)`` where the new last entry is
            ``[user_message, None]``; the ``None`` reply is filled in by ``bot``.
        """
        # Treat a missing history (None) as an empty conversation.
        history = history or []
        return "", history + [[user_message, None]]

    def bot(history):
        """Generate the reply to the latest user message and stream it out.

        The reply is revealed one character at a time, yielding the updated
        history after each character, which gives a typewriter effect.
        """
        bot_message = gpt2_generate_text(history[-1][0])
        history[-1][1] = ""
        for character in bot_message:
            history[-1][1] += character
            time.sleep(0.05)
            yield history

    # First record the user's message (unqueued, so the box clears at once),
    # then run the streaming bot reply.
    submit.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )
    # Clear wipes the conversation shown in the chatbot pane.
    clear.click(lambda: None, None, chatbot, queue=False)

# Turn on request queuing (the bot callback is a generator that yields partial replies),
# then start the Gradio app.
demo.queue()
demo.launch()
