In [None]:
!pip install -q transformers datasets accelerate

In [None]:
!pip install evaluate sentencepiece

In [None]:
!pip install --upgrade accelerate


In [None]:
!pip install faiss-gpu


In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
from datasets import load_dataset, DatasetDict
import torch
import numpy as np
import evaluate
import pandas as pd
import os
import faiss
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [3]:
import pandas as pd

data = pd.read_csv('/kaggle/input/medicaldata/finaldata.csv')

# Handle missing values and ensure columns are strings
data['question'] = data['question'].fillna("").astype(str)
data['answer'] = data['answer'].fillna("").astype(str)

questions = data['question'].tolist() 
answers = data['answer'].tolist()     

# RAG

In [25]:
model = SentenceTransformer('all-MiniLM-L6-v2')

question_embeddings = model.encode(questions, convert_to_numpy=True)

dimension = question_embeddings.shape[1]  
index = faiss.IndexFlatL2(dimension)      
index.add(question_embeddings)           

Batches:   0%|          | 0/319 [00:00<?, ?it/s]

In [5]:
def interactive_query():
    while True:
        user_query = input("Enter your question (or 'exit' to quit): ")
        if user_query.lower() == 'exit':
            print("Exiting interactive session. Goodbye!")
            break
        
        query_embedding = model.encode([user_query], convert_to_numpy=True)
        k = 1 
        distances, indices = index.search(query_embedding, k)
        
        retrieved_answer = answers[indices[0][0]]
        similarity_score = distances[0][0]
        
        print("\n Answer: ", retrieved_answer)
        print(f"**Similarity Score:** {similarity_score:.4f}\n")

interactive_query()

Enter your question (or 'exit' to quit):  A 62-year-old woman presents to her physician with a painless breast mass on her left breast for the past 4 months. She mentions that she noticed the swelling suddenly one day and thought it would resolve by itself. Instead, it has been slowly increasing in size. On physical examination of the breasts, the physician notes a single non-tender, hard, and fixed nodule over left breast. An ultrasonogram of the breast shows a solid mass, and a fine-needle aspiration biopsy confirms the mass to be lobular carcinoma of the breast. When the patient asks about her prognosis, the physician says that the prognosis can be best determined after both grading and staging of the tumor. Based on the current diagnostic information, the physician says that they can only grade, but no stage, the neoplasm. Which of the following facts about the neoplasm is currently available to the physician?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


 Answer:  The tumor cells exhibit marked nuclear atypia.
**Similarity Score:** 0.0000



Enter your question (or 'exit' to quit):  A 47-year-old female with a history of hypertension presents to your outpatient clinic for numbness, tingling in her right hand that has been slowly worsening over the last several months. She has tried using a splint but receives minimal relief. She is an analyst for a large consulting firm and spends most of her workday in front of a computer. Upon examination, you noticed that the patient has a prominent jaw and her hands appear disproportionately large. Her temperature is 99 deg F (37.2 deg C), blood pressure is 154/72 mmHg, pulse is 87/min, respirations are 12/min. A fasting basic metabolic panel shows: Na: 138 mEq/L, K: 4.1 mEq/L, Cl: 103 mEq/L, CO2: 24 mEq/L, BUN: 12 mg/dL, Cr: 0.8 mg/dL, Glucose: 163 mg/dL. Which of the following tests would be most helpful in identifying the underlying diagnosis?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


 Answer:  Measurement of insulin-like growth factor 1 alone and growth hormone levels after oral glucose
**Similarity Score:** 0.0000



Enter your question (or 'exit' to quit):  exit


Exiting interactive session. Goodbye!


# Fine Tune LLM

In [6]:
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=128):
        self.questions = data['question']
        self.answers = data['answer']
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        answer = self.answers[idx]
        inputs = self.tokenizer(
            question, max_length=self.max_len, truncation=True, padding="max_length", return_tensors="pt"
        )
        targets = self.tokenizer(
            answer, max_length=self.max_len, truncation=True, padding="max_length", return_tensors="pt"
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": targets["input_ids"].squeeze(),
        }

In [7]:
data = pd.read_csv('/kaggle/input/medicaldata/finaldata.csv')
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

train_data.to_csv("train_data.csv", index=False)
test_data.to_csv("test_data.csv", index=False)
print("Train/Test split done!")

Train/Test split done!


In [8]:
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [9]:
train_data_path = "train_data.csv"
test_data_path = "test_data.csv"
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

In [10]:
train_data = train_data[train_data['answer'].map(type) == str]
test_data = test_data[test_data['answer'].map(type) == str]

In [11]:
train_data = train_data.reset_index(drop=True)

test_data = test_data.reset_index(drop=True)

print("Index reset successfully!")

Index reset successfully!


In [12]:
print(train_data.index) 
print(test_data.index)  


RangeIndex(start=0, stop=8141, step=1)
RangeIndex(start=0, stop=2036, step=1)


In [14]:
train_dataset = CustomDataset(train_data, tokenizer)

In [15]:
training_args = TrainingArguments(
    output_dir="./flan_t5_base_results",
    run_name="flan_t5_medical_query_finetuning",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_steps=500,
    logging_steps=100,
    save_total_limit=1,
    fp16=True, 
    eval_strategy="no",
    no_cuda=False,
    report_to="none"
)

In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

In [None]:
# torch.cuda.empty_cache()
# torch.cuda.ipc_collect()

In [17]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
100,12.9854
200,1.2673
300,0.2938
400,0.2558
500,0.2513
600,0.24
700,0.2398
800,0.2356
900,0.2263
1000,0.2378




TrainOutput(global_step=1527, training_loss=1.141709942870774, metrics={'train_runtime': 1253.3486, 'train_samples_per_second': 19.486, 'train_steps_per_second': 1.218, 'total_flos': 4180957204709376.0, 'train_loss': 1.141709942870774, 'epoch': 3.0})

In [18]:
test_dataset = CustomDataset(test_data, tokenizer)

results = trainer.evaluate(eval_dataset=test_dataset)
print("Evaluation Results:", results)



Evaluation Results: {'eval_loss': 0.20866546034812927, 'eval_runtime': 39.6938, 'eval_samples_per_second': 51.293, 'eval_steps_per_second': 3.225, 'epoch': 3.0}


In [19]:
model.save_pretrained("./fine_tuned_flan_t5_base")
tokenizer.save_pretrained("./fine_tuned_flan_t5_base")

('./fine_tuned_flan_t5_base/tokenizer_config.json',
 './fine_tuned_flan_t5_base/special_tokens_map.json',
 './fine_tuned_flan_t5_base/spiece.model',
 './fine_tuned_flan_t5_base/added_tokens.json')

In [20]:
import shutil

model_folder = "./fine_tuned_flan_t5_base"

zip_file = "flan_t5_fine_tuned.zip"
shutil.make_archive(base_name="flan_t5_fine_tuned", format="zip", root_dir=model_folder)

print(f"Model saved successfully as {zip_file}.")


Model saved successfully as flan_t5_fine_tuned.zip.


In [21]:
# from IPython.display import FileLink

# # Provide a download link
# FileLink(zip_file)

# **Fallback**

In [None]:
from sentence_transformers import SentenceTransformer
from transformers import T5Tokenizer, T5ForConditionalGeneration
import faiss

retrieval_model = SentenceTransformer('all-MiniLM-L6-v2')
fallback_model = T5ForConditionalGeneration.from_pretrained("./fine_tuned_flan_t5_base")
fallback_tokenizer = T5Tokenizer.from_pretrained("./fine_tuned_flan_t5_base")

questions = data['question'].tolist()  # Use unmodified data for retrieval
answers = data['answer'].tolist()

question_embeddings = retrieval_model.encode(questions, convert_to_numpy=True)
dimension = question_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(question_embeddings)

In [54]:
def generate_fallback_answer(query):
    inputs = fallback_tokenizer(
        query, return_tensors="pt", max_length=128, truncation=True, padding="max_length"
    )
    outputs = fallback_model.generate(
        inputs["input_ids"], max_length=50, num_beams=5, early_stopping=True
    )
    return fallback_tokenizer.decode(outputs[0], skip_special_tokens=True)

threshold = 0.5  # Define threshold based on testing

def query_pipeline(user_query):
    # Step 1: Retrieve using FAISS
    query_embedding = retrieval_model.encode([user_query], convert_to_numpy=True)
    k = 1
    distances, indices = index.search(query_embedding, k)
    distance = distances[0][0]  # L2 distance
    retrieved_answer = answers[indices[0][0]]

    # Calculate query length
    query_length = len(user_query.split())
    print(f"Query Length: {query_length}")

    # Step 2: Apply enhanced fallback logic
    threshold = 0.8  # Distance threshold
    if (distance > threshold ):
        print("Fallback triggered! Generating answer...")
        return generate_fallback_answer(user_query)
    else:
        return retrieved_answer

# Inference

In [53]:
def interactive_test():
    print("Interactive Doctor GPT Chatbot")
    print("Type 'exit' to quit.\n")
    while True:
        user_query = input("Enter your question: ")
        if user_query.lower() == 'exit':
            print("Exiting interactive session. Goodbye!")
            break
        response = query_pipeline(user_query)
        print("\n**Response:**", response, "\n")

# Run the interactive session
interactive_test()


Interactive Doctor GPT Chatbot
Type 'exit' to quit.



Enter your question:  A 67-year-old man comes to the physician because of a 3-day history of fever, chills, headache, and fatigue. He appears ill. His temperature is 39°C (102.2°F). Analysis of nasal secretions shows infection with an enveloped, single-stranded segmented RNA virus. In response to infection with this pathogen, certain cells present antigens from the pathogen to CD8+ T-lymphocytes. Which of the following statements about the molecules used for the presentation of these antigens is most accurate?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query Length: 74
Average Question Length: 116.18

**Response:** The molecule consists of a heavy chain associated with β2 microglobulin 



Enter your question:  A 7-year-old boy comes to the physician because of a generalized rash for 3 days. Over the past 5 days, he has also had a high fever and a sore throat. His 16-year-old sister was treated for infectious mononucleosis 2 weeks ago. He returned from a summer camp a week ago. His immunizations are up-to-date. Three years ago, he required intubation after an allergic reaction to dicloxacillin. The patient appears ill. His temperature is 38.2°C (100.8°F). Examination shows circumferential oral pallor. Cervical lymphadenopathy is present. There is tonsillar erythema and exudate. A confluent, blanching, punctate erythematous rash with a rough texture is spread over his trunk and extremities. His hemoglobin concentration is 13.3 g/dL, leukocyte count is 12,000/mm3, and erythrocyte sedimentation rate is 43 mm/h. Which of the following is the most appropriate next step in management?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query Length: 137
Average Question Length: 116.18

**Response:** Azithromycin therapy 



Enter your question:  A 45-year-old man presents with epigastric pain that improves after eating but returns a few hours later. He has a history of regular nonsteroidal anti-inflammatory drug (NSAID) use. Is it the most likely the ulcer disease?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query Length: 36
Average Question Length: 116.18

**Response:** Gastroduodenal artery 



Enter your question:  A 45-year-old man presents with epigastric pain that improves after eating but returns a few hours later. He has a history of regular nonsteroidal anti-inflammatory drug (NSAID) use. Is it peptic ulcer disease?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query Length: 33
Average Question Length: 116.18

**Response:** Epithelium, lamina propria, muscularis mucosa, and submucosa 



Enter your question:  exit


Exiting interactive session. Goodbye!


# Streamlit app code

In [56]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.44.1-py3-none-any.whl.metadata (8.9 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.44.1-py3-none-any.whl (9.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.8/9.8 MB[0m [31m69.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m52.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.44.1


In [62]:
import subprocess

In [72]:
streamlit_code = """
import streamlit as st
from sentence_transformers import SentenceTransformer
from transformers import T5Tokenizer, T5ForConditionalGeneration
import faiss
import pandas as pd

# Load pre-trained models and data
retrieval_model = SentenceTransformer('all-MiniLM-L6-v2')
fallback_model = T5ForConditionalGeneration.from_pretrained("./fine_tuned_flan_t5_base")
fallback_tokenizer = T5Tokenizer.from_pretrained("./fine_tuned_flan_t5_base")

# Load FAISS index
index = faiss.read_index("/kaggle/working/faiss_index.index")

# Load dataset questions and answers
data = pd.read_csv('/kaggle/input/medicaldata/finaldata.csv')
questions = data['question'].tolist()
answers = data['answer'].tolist()

# Fallback function
def generate_fallback_answer(query):
    inputs = fallback_tokenizer(
        query, return_tensors="pt", max_length=128, truncation=True, padding="max_length"
    )
    outputs = fallback_model.generate(
        inputs["input_ids"], max_length=50, num_beams=5, early_stopping=True
    )
    return fallback_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Query pipeline
threshold = 0.8
def query_pipeline(user_query):
    query_embedding = retrieval_model.encode([user_query], convert_to_numpy=True)
    k = 1
    distances, indices = index.search(query_embedding, k)
    distance = distances[0][0]
    retrieved_answer = answers[indices[0][0]]

    if distance > threshold:
        return generate_fallback_answer(user_query)
    else:
        return retrieved_answer

# Streamlit app
def main():
    st.title("Doctor GPT 🩺")
    st.write("Ask any medical query and let the chatbot assist you!")

    # User input
    user_query = st.text_input("Enter your medical question:")

    if st.button("Get Answer"):
        if user_query:
            response = query_pipeline(user_query)
            st.success(f"**Response:** {response}")
        else:
            st.error("Please enter a question!")

if __name__ == "__main__":
    main()
"""

# Save code to a Python script
with open("chatbot_app.py", "w") as f:
    f.write(streamlit_code)
print("Streamlit code saved as chatbot_app.py!")

Streamlit code saved as chatbot_app.py!


In [68]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3


In [70]:
!ngrok config add-authtoken 2vXPJtqghyvoldzhiCLevVZd9rF_5zG4YqqH4mqvgaXjj1qXt

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [75]:
from pyngrok import ngrok
import subprocess

# Start Streamlit in the background
streamlit_process = subprocess.Popen(["streamlit", "run", "chatbot_app.py"])

# Open an Ngrok tunnel for port 8502
public_url = ngrok.connect(8502)
print("Public URL:", public_url)


PyngrokNgrokHTTPError: ngrok client exception, API returned 502: {"error_code":103,"status_code":502,"msg":"failed to start tunnel","details":{"err":"failed to start tunnel: Your account may not run more than 3 tunnels over a single ngrok agent session.\nThe tunnels already running on this session are:\ntn_2vXPfme1ewckPRcaA7fhMShpmhe, tn_2vXR24tInYajrP9vHidDAo8NP94, tn_2vXXkxERzs2cseBNewIDFpoFCXj\n\r\n\r\nERR_NGROK_324\r\n"}}
