In [None]:
# Install the Hugging Face `datasets` library that the training cell below imports.
!pip install datasets



In [None]:
import pandas as pd
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import os
from datasets import Dataset

# Installation
# !pip install -q transformers datasets evaluate peft trl bitsandbytes accelerate

# Mount Google Drive (Colab only) so the CSV splits and checkpoints under
# /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Input data directory, checkpoint root, and the two CSV splits used by this run.
cleaned_data_path = "/content/drive/My Drive/ai_medical_assistant/cleaned_data/"
model_path = "/content/drive/My Drive/ai_medical_assistant/models/"
train_file = os.path.join(cleaned_data_path, "chatbot_train.csv")
val_file = os.path.join(cleaned_data_path, "chatbot_val.csv")

# Model selection: the BioBART v2 base checkpoint from the Hugging Face Hub
# (not Flan-T5, as an earlier version of this comment claimed).
model_name = "GanjinZero/biobart-v2-base"  # Can try "facebook/bart-base" as alternative

# Load the tokenizer that matches the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the seq2seq model; device_map="auto" delegates device placement to the library
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    device_map="auto"
)

# Read the full training / validation CSVs into DataFrames
train_df = pd.read_csv(train_file)
val_df = pd.read_csv(val_file)

# For initial experimentation with T4, keep a reproducible 5% sample of each split
# (random_state fixes the sample). Increase `frac` for final training.
train_df = train_df.sample(frac=0.05, random_state=42)
val_df = val_df.sample(frac=0.05, random_state=42)

print(f"Training data size: {len(train_df)}")
print(f"Validation data size: {len(val_df)}")

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Maximum sequence lengths (in tokens) applied by preprocess_function:
# questions are truncated to 512 tokens, answers to 256.
max_input_length = 512
max_target_length = 256

# Preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of question/answer pairs for seq2seq fine-tuning.

    Relies on the module-level ``tokenizer``, ``max_input_length`` and
    ``max_target_length``.

    Args:
        examples: Batch mapping with a ``"question"`` list (model inputs) and
            an ``"answer"`` list (targets), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenized inputs (padded/truncated to ``max_input_length``) with an
        added ``"labels"`` entry holding the target token ids padded/truncated
        to ``max_target_length``. Padding positions in the labels are set to
        -100 so the loss ignores them.
    """
    model_inputs = tokenizer(
        examples["question"],
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
    )

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["answer"],
        max_length=max_target_length,
        padding="max_length",
        truncation=True,
    )

    # Labels are padded to a fixed length here, so the pad ids must be masked
    # explicitly; otherwise the model is trained (and evaluated) on predicting
    # padding, which distorts the reported loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]

    return model_inputs

# Tokenize both splits in batches. The original columns are dropped so only the
# tokenizer outputs and labels produced by preprocess_function remain.
train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names
)

val_dataset = val_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=val_dataset.column_names
)

# Data collator: batches the tokenized examples for the seq2seq model
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True
)

# Define training arguments.
# Effective train batch size per device is 4 * 4 = 16 (gradient accumulation).
# `eval_strategy` is the current name of the deprecated `evaluation_strategy`
# argument and matches the later training cell in this notebook.
training_args = Seq2SeqTrainingArguments(
    output_dir=os.path.join(model_path, "biobart-v2-medical-chatbot"),
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    save_steps=500,
    save_total_limit=3,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    gradient_accumulation_steps=4,
)

# Create Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training
trainer.train()

# Save the final model to the directory the chatbot cell loads from
trainer.save_model(os.path.join(model_path, "biobart-v2-medical-chatbot-final"))
print("Fine-tuning complete!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training data size: 11561
Validation data size: 1285


Map:   0%|          | 0/11561 [00:00<?, ? examples/s]



Map:   0%|          | 0/1285 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msakshi3maurya[0m ([33msakshi3maurya-thakur-college-of-engineering-and-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
100,1.7269,1.47933
200,1.5434,1.393505
300,1.4649,1.333728
400,1.4224,1.293921
500,1.395,1.264641
600,1.3832,1.241913
700,1.3725,1.229943
800,1.2856,1.217425
900,1.2753,1.209503
1000,1.2336,1.200323




Fine-tuning complete!


In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

class MedicalChatbot:
    """Inference wrapper around a fine-tuned seq2seq checkpoint."""

    def __init__(self, model_path):
        """Load tokenizer and model from ``model_path`` and pick the device.

        The model is switched to evaluation mode and moved to CUDA when
        ``torch.cuda.is_available()`` reports a GPU, otherwise it stays on CPU.
        """
        print("Loading model and tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
        # Inference only
        self.model.eval()

        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)
        self.model.to(self.device)
        print(f"Model loaded on {self.device}")

    def generate_response(self, user_input, max_length=150, temperature=0.7):
        """Sample one reply for ``user_input`` and return it as text.

        Args:
            user_input: Text to encode (truncated to 512 tokens).
            max_length: Forwarded to ``model.generate`` as ``max_length``.
            temperature: Sampling temperature forwarded to ``model.generate``.

        Returns:
            The first generated sequence, decoded with special tokens skipped.
        """
        encoded = self.tokenizer(
            user_input,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        on_device = {name: tensor.to(self.device) for name, tensor in encoded.items()}

        # Nucleus sampling settings; a single sequence is requested.
        sampling_options = dict(
            max_length=max_length,
            num_return_sequences=1,
            temperature=temperature,
            top_p=0.9,
            do_sample=True,
        )

        # No gradients are needed at inference time.
        with torch.no_grad():
            generated = self.model.generate(**on_device, **sampling_options)

        return self.tokenizer.decode(generated[0], skip_special_tokens=True)

# Example usage - run this in a notebook or script
if __name__ == "__main__":
    # Update with your model path (directory written by trainer.save_model)
    model_path = "/content/drive/My Drive/ai_medical_assistant/models/biobart-v2-medical-chatbot-final"

    chatbot = MedicalChatbot(model_path)

    print("Medical Assistant Chatbot is ready. Type 'exit' to quit.")

    while True:
        try:
            # Strip surrounding whitespace so "exit " still matches below.
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted cell ends the chat without a traceback.
            break

        if user_input.lower() in ('exit', 'quit', 'bye'):
            break

        if not user_input:
            # Nothing to answer; do not send an empty prompt to the model.
            continue

        response = chatbot.generate_response(user_input)
        print(f"MedBot: {response}")

Loading model and tokenizer...
Model loaded on cuda
Medical Assistant Chatbot is ready. Type 'exit' to quit.
You: hello
MedBot: hello and welcome to ask a doctor service i have reviewed your query and here is my advice for more information consult a general practitioner online 
You: i have headache fever and cold
MedBot: hello and welcome to ask a doctor service i have reviewed your query and here is my advice headache fever and cold are the symptoms of viral infection and can be due to viral or bacterial infection i would suggest you to consult a physician for proper diagnosis and treatment hope i have answered your query let me know if i can assist you further
You: suggest me home remedies
MedBot: hello and welcome to ask a doctor service i have reviewed your query and here is my advice i suggest you not to worry much i would suggest you to visit a physician for proper diagnosis and treatment hope i have answered your query let me know if i can assist you further
You: what is bronchi

In [None]:
# Install the Hugging Face `datasets` library that the training cell below imports.
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
import pandas as pd
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import os
from datasets import Dataset

# Installation
# !pip install -q transformers datasets evaluate peft trl bitsandbytes accelerate

# Mount Google Drive (Colab only) so the CSV splits and checkpoints under
# /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Input data directory, checkpoint root, and the two CSV splits used by this run.
cleaned_data_path = "/content/drive/My Drive/ai_medical_assistant/cleaned_data/"
model_path = "/content/drive/My Drive/ai_medical_assistant/models/"
train_file = os.path.join(cleaned_data_path, "chatbot_train.csv")
val_file = os.path.join(cleaned_data_path, "chatbot_val.csv")

# Model selection: the BioBART v2 large checkpoint from the Hugging Face Hub
# (not Flan-T5, as an earlier version of this comment claimed).
model_name = "GanjinZero/biobart-v2-large"  # Can try "facebook/bart-base" as alternative

# Load the tokenizer that matches the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the seq2seq model; device_map="auto" delegates device placement to the library
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    device_map="auto"
)

# Read the full training / validation CSVs into DataFrames
train_df = pd.read_csv(train_file)
val_df = pd.read_csv(val_file)

# For initial experimentation with T4, keep a reproducible 5% sample of each split
# (random_state fixes the sample). Increase `frac` for final training.
train_df = train_df.sample(frac=0.05, random_state=42)
val_df = val_df.sample(frac=0.05, random_state=42)

print(f"Training data size: {len(train_df)}")
print(f"Validation data size: {len(val_df)}")

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Maximum sequence lengths (in tokens) applied by preprocess_function:
# questions are truncated to 512 tokens, answers to 256.
max_input_length = 512
max_target_length = 256

# Preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of question/answer pairs for seq2seq fine-tuning.

    Relies on the module-level ``tokenizer``, ``max_input_length`` and
    ``max_target_length``.

    Args:
        examples: Batch mapping with a ``"question"`` list (model inputs) and
            an ``"answer"`` list (targets), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenized inputs (padded/truncated to ``max_input_length``) with an
        added ``"labels"`` entry holding the target token ids padded/truncated
        to ``max_target_length``. Padding positions in the labels are set to
        -100 so the loss ignores them.
    """
    model_inputs = tokenizer(
        examples["question"],
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
    )

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["answer"],
        max_length=max_target_length,
        padding="max_length",
        truncation=True,
    )

    # Labels are padded to a fixed length here, so the pad ids must be masked
    # explicitly; otherwise the model is trained (and evaluated) on predicting
    # padding, which distorts the reported loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]

    return model_inputs

# Tokenize both splits in batches. The original columns are dropped so only the
# tokenizer outputs and labels produced by preprocess_function remain.
train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names
)

val_dataset = val_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=val_dataset.column_names
)

# Data collator: batches the tokenized examples for the seq2seq model
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True
)

# Define training arguments.
# Effective train batch size per device is 4 * 4 = 16 (gradient accumulation).
# `eval_strategy` is the current name of the deprecated `evaluation_strategy`
# argument and matches the later training cell in this notebook.
training_args = Seq2SeqTrainingArguments(
    output_dir=os.path.join(model_path, "biobart-v2-large-medical-chatbot"),
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    save_steps=500,
    save_total_limit=3,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    gradient_accumulation_steps=4,
)

# Create Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training
trainer.train()

# Save the final model to the directory the chatbot cell loads from
trainer.save_model(os.path.join(model_path, "biobart-v2-large-medical-chatbot-final"))
print("Fine-tuning complete!")

Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/892k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.68k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.77G [00:00<?, ?B/s]

Training data size: 11561
Validation data size: 1285


Map:   0%|          | 0/11561 [00:00<?, ? examples/s]



Map:   0%|          | 0/1285 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msakshi3maurya[0m ([33msakshi3maurya-thakur-college-of-engineering-and-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
100,1.6391,1.338476
200,1.4014,1.254328
300,1.3196,1.198969
400,1.2835,1.165825
500,1.2529,1.14551
600,1.2497,1.126802
700,1.2408,1.115256
800,1.1372,1.111217
900,1.1187,1.102607
1000,1.0857,1.093356




Fine-tuning complete!


In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

class MedicalChatbot:
    """Inference wrapper around a fine-tuned seq2seq checkpoint."""

    def __init__(self, model_path):
        """Load tokenizer and model from ``model_path`` and pick the device.

        The model is switched to evaluation mode and moved to CUDA when
        ``torch.cuda.is_available()`` reports a GPU, otherwise it stays on CPU.
        """
        print("Loading model and tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
        # Inference only
        self.model.eval()

        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)
        self.model.to(self.device)
        print(f"Model loaded on {self.device}")

    def generate_response(self, user_input, max_length=150, temperature=0.7):
        """Sample one reply for ``user_input`` and return it as text.

        Args:
            user_input: Text to encode (truncated to 512 tokens).
            max_length: Forwarded to ``model.generate`` as ``max_length``.
            temperature: Sampling temperature forwarded to ``model.generate``.

        Returns:
            The first generated sequence, decoded with special tokens skipped.
        """
        encoded = self.tokenizer(
            user_input,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        on_device = {name: tensor.to(self.device) for name, tensor in encoded.items()}

        # Nucleus sampling settings; a single sequence is requested.
        sampling_options = dict(
            max_length=max_length,
            num_return_sequences=1,
            temperature=temperature,
            top_p=0.9,
            do_sample=True,
        )

        # No gradients are needed at inference time.
        with torch.no_grad():
            generated = self.model.generate(**on_device, **sampling_options)

        return self.tokenizer.decode(generated[0], skip_special_tokens=True)

# Example usage - run this in a notebook or script
if __name__ == "__main__":
    # Update with your model path (directory written by trainer.save_model)
    model_path = "/content/drive/My Drive/ai_medical_assistant/models/biobart-v2-large-medical-chatbot-final"

    chatbot = MedicalChatbot(model_path)

    print("Medical Assistant Chatbot is ready. Type 'exit' to quit.")

    while True:
        try:
            # Strip surrounding whitespace so "exit " still matches below.
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted cell ends the chat without a traceback.
            break

        if user_input.lower() in ('exit', 'quit', 'bye'):
            break

        if not user_input:
            # Nothing to answer; do not send an empty prompt to the model.
            continue

        response = chatbot.generate_response(user_input)
        print(f"MedBot: {response}")

Loading model and tokenizer...
Model loaded on cuda
Medical Assistant Chatbot is ready. Type 'exit' to quit.
You: What are the symptoms of diabetes?
MedBot: hello and welcome to ask a doctor service i have reviewed your query and here is my advice the symptoms that you describe are not related to diabetes but rather to the insulin resistance that is the cause of the diabetes i would suggest you to consult a diabetologist for the diagnosis and treatment hope i have answered your query let me know if i can assist you further regards dr shinas hussain general  family physician
You: Can you tell me about high blood pressure?
MedBot: hello and welcome to ask a doctor service i have reviewed your query and here is my advice high blood pressure can be due to a number of reasons like1 uncontrolled diabetes2 high cholesterol levels3 high sodium levels4 high uric acid levels5 high blood sugar levels6 obesity7 sedentary life style7 low physical activity levels8 low magnesium levels9 low vitamin d

In [None]:
# Install the packages imported by the evaluation cells below
# (nltk for BLEU, rouge-score for ROUGE).
!pip install transformers datasets torch nltk rouge-score

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-many

## **Evaluation using BLEU and ROUGE Scores**

In [None]:
# Mount Google Drive, forcing a fresh mount even if /content/drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import gc
import torch  # needed for the CUDA cache call below

# Run the Python garbage collector and, when a GPU is present, release
# PyTorch's cached CUDA memory. The same snippet can be reused inside
# memory-heavy functions.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
import pandas as pd
import numpy as np
from rouge_score import rouge_scorer
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# Download the tokenizer data required by nltk.word_tokenize (used in calculate_bleu).
# NLTK >= 3.9 looks up the 'punkt_tab' resource instead of the legacy 'punkt'
# models, so both are fetched; on older releases the unknown name is only
# reported as a failed download and 'punkt' alone is sufficient.
nltk.download('punkt')
nltk.download('punkt_tab')

def load_model_and_tokenizer(model_path):
    """Load the fine-tuned BioBart checkpoint stored at ``model_path``.

    Returns:
        A ``(model, tokenizer)`` tuple, in that order.
    """
    # Tokenizer first, then the seq2seq model, both from the same directory.
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
    loaded_model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    return loaded_model, loaded_tokenizer

def generate_predictions(model, tokenizer, input_texts, batch_size=4, max_length=512):
    """Generate one decoded output string per input text using beam search.

    Args:
        model: Seq2seq model exposing ``to``, ``eval`` and ``generate``.
        tokenizer: Matching tokenizer; called on each batch and used for
            ``batch_decode``.
        input_texts: Sequence of input strings.
        batch_size: Number of texts processed per ``generate`` call.
        max_length: Used both as the tokenizer truncation length and as the
            ``max_length`` passed to ``generate``.

    Returns:
        List of decoded predictions (special tokens skipped), in input order.
        An empty input yields an empty list.
    """
    predictions = []
    if len(input_texts) == 0:
        return predictions

    # Device selection and model placement do not change between batches,
    # so do them once instead of on every iteration.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()  # make sure dropout is disabled during evaluation

    for start in range(0, len(input_texts), batch_size):
        batch = input_texts[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=max_length)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Inference only: without no_grad every batch keeps autograd state
        # alive, wasting memory for nothing.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                num_beams=4,
                no_repeat_ngram_size=3,
                early_stopping=True
            )

        # Decode outputs
        predictions.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

        # Clear CUDA cache to free up memory between batches
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return predictions

def calculate_bleu(predictions, references):
    """Compute corpus-level BLEU between predictions and single references.

    Both sides are lower-cased and tokenized with ``nltk.word_tokenize``;
    ``SmoothingFunction().method1`` is used as the smoothing function.

    Args:
        predictions: List of predicted strings.
        references: List of reference strings, aligned with ``predictions``.

    Returns:
        Dict with ``'bleu'`` (uniform weights up to 4-grams) and cumulative
        ``'bleu_1'`` .. ``'bleu_4'`` scores. ``'bleu'`` and ``'bleu_4'`` are
        the same quantity.
    """
    # Tokenize predictions and references (one reference per prediction)
    tokenized_preds = [nltk.word_tokenize(pred.lower()) for pred in predictions]
    tokenized_refs = [[nltk.word_tokenize(ref.lower())] for ref in references]

    smoothing = SmoothingFunction().method1

    def _bleu(weights):
        """Corpus BLEU for the given n-gram weights."""
        return corpus_bleu(tokenized_refs, tokenized_preds, weights=weights, smoothing_function=smoothing)

    # The default corpus_bleu weights are (0.25, 0.25, 0.25, 0.25), so the
    # overall score and BLEU-4 are computed once and shared.
    bleu_4 = _bleu((0.25, 0.25, 0.25, 0.25))

    return {
        'bleu': bleu_4,
        'bleu_1': _bleu((1, 0, 0, 0)),
        'bleu_2': _bleu((0.5, 0.5, 0, 0)),
        # Exact thirds: (0.33, 0.33, 0.33) sums to 0.99 and inflates the score.
        'bleu_3': _bleu((1 / 3, 1 / 3, 1 / 3, 0)),
        'bleu_4': bleu_4
    }

def calculate_rouge(predictions, references):
    """Average ROUGE-1 / ROUGE-2 / ROUGE-L over prediction-reference pairs.

    Each pair is scored with a stemming ``RougeScorer`` (reference as target,
    prediction as candidate) and the per-pair values are averaged with
    ``np.mean``.

    Returns:
        Dict keyed ``'<rouge type>_<precision|recall|fmeasure>'`` holding the
        mean of that statistic across all pairs.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    stat_names = ['precision', 'recall', 'fmeasure']

    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)

    # One list of per-pair values for every (rouge type, statistic) combination.
    collected = {
        f"{rouge_type}_{stat}": []
        for rouge_type in rouge_types
        for stat in stat_names
    }

    for pred, ref in zip(predictions, references):
        pair_scores = scorer.score(ref, pred)
        for rouge_type in rouge_types:
            for stat in stat_names:
                collected[f"{rouge_type}_{stat}"].append(getattr(pair_scores[rouge_type], stat))

    # Collapse each list to its mean
    return {key: np.mean(values) for key, values in collected.items()}

def evaluate_biobart_model(model_path, test_dataset):
    """Score the fine-tuned BioBart checkpoint on ``test_dataset``.

    Args:
        model_path: Location of the saved model and tokenizer.
        test_dataset: Iterable of records with ``'question'`` and ``'answer'``
            entries.

    Returns:
        ``(results, predictions)`` where ``results`` merges the BLEU and ROUGE
        score dicts and ``predictions`` is the list of generated answers.
    """
    model, tokenizer = load_model_and_tokenizer(model_path)

    # Questions feed the model; answers are the references for the metrics.
    questions = [record['question'] for record in test_dataset]
    gold_answers = [record['answer'] for record in test_dataset]

    predictions = generate_predictions(model, tokenizer, questions)

    # BLEU entries first, then ROUGE entries layered on top.
    results = dict(calculate_bleu(predictions, gold_answers))
    results.update(calculate_rouge(predictions, gold_answers))

    return results, predictions

# Example usage
if __name__ == "__main__":
    # Replace with your model path and dataset
    model_path = "/content/drive/My Drive/ai_medical_assistant/models/biobart-v2-medical-chatbot-final"
    test_dataset_path = "/content/drive/My Drive/ai_medical_assistant/cleaned_data/test_dataset.csv"

    # Load the test dataset using pandas; each record is a dict keyed by column name
    test_dataset = pd.read_csv(test_dataset_path).to_dict('records')

    # Evaluate the model
    results, predictions = evaluate_biobart_model(model_path, test_dataset)

    # Print results
    print("\nEvaluation Results:")
    print("-" * 40)
    print(f"BLEU Score: {results['bleu']:.4f}")
    print(f"BLEU-1: {results['bleu_1']:.4f}")
    print(f"BLEU-2: {results['bleu_2']:.4f}")
    print(f"BLEU-3: {results['bleu_3']:.4f}")
    print(f"BLEU-4: {results['bleu_4']:.4f}")
    print("-" * 40)
    print(f"ROUGE-1 F1: {results['rouge1_fmeasure']:.4f}")
    print(f"ROUGE-2 F1: {results['rouge2_fmeasure']:.4f}")
    print(f"ROUGE-L F1: {results['rougeL_fmeasure']:.4f}")
    print("-" * 40)

    # Print the predictions next to their references.
    # The reference column is 'answer' (the same key evaluate_biobart_model
    # reads); the previous 'target_text' key raised KeyError after the whole
    # evaluation had already run.
    print("\nSample Predictions:")
    for i, (pred, ref) in enumerate(zip(predictions, [sample['answer'] for sample in test_dataset])):
        print(f"\nExample {i+1}:")
        print(f"Reference: {ref}")
        print(f"Prediction: {pred}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Mount Google Drive (Colab only) so the dataset under /content/drive is reachable
from google.colab import drive
drive.mount('/content/drive')

# Load the full cleaned dataset from Drive
df = pd.read_csv('/content/drive/My Drive/ai_medical_assistant/cleaned_data/ai-medical-chatbot.csv')

# Define training and validation split (80% training, 20% validation);
# random_state makes the split reproducible.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Check the sizes
print(f"Original dataset size: {len(df)}")
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")

# Save the splits next to the source file; index=False keeps the pandas
# row index out of the CSVs.
train_df.to_csv('/content/drive/My Drive/ai_medical_assistant/cleaned_data/training_set.csv', index=False)
val_df.to_csv('/content/drive/My Drive/ai_medical_assistant/cleaned_data/validation_set.csv', index=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Original dataset size: 256916
Training set size: 205532
Validation set size: 51384


In [None]:
# Install the Hugging Face `datasets` library that the training cell below imports.
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
import pandas as pd
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import os
from datasets import Dataset

# Installation
# !pip install -q transformers datasets evaluate peft trl bitsandbytes accelerate

# Mount Google Drive (Colab only) so the CSV splits and checkpoints under
# /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Input data directory, checkpoint root, and the two CSV splits written by the
# train/validation split cell.
cleaned_data_path = "/content/drive/My Drive/ai_medical_assistant/cleaned_data/"
model_path = "/content/drive/My Drive/ai_medical_assistant/models/"
train_file = os.path.join(cleaned_data_path, "training_set.csv")
val_file = os.path.join(cleaned_data_path, "validation_set.csv")

# Model selection: the BioBART v2 base checkpoint from the Hugging Face Hub
# (not Flan-T5, as an earlier version of this comment claimed).
model_name = "GanjinZero/biobart-v2-base"  # Can try "facebook/bart-base" as alternative

# Load the tokenizer that matches the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the seq2seq model; device_map="auto" delegates device placement to the library
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    device_map="auto"
)

# Read the full training / validation CSVs into DataFrames
train_df = pd.read_csv(train_file)
val_df = pd.read_csv(val_file)

# For initial experimentation with T4, keep a reproducible 5% sample of each split
# (random_state fixes the sample). Increase `frac` for final training.
train_df = train_df.sample(frac=0.05, random_state=42)
val_df = val_df.sample(frac=0.05, random_state=42)

print(f"Training data size: {len(train_df)}")
print(f"Validation data size: {len(val_df)}")

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Maximum sequence lengths (in tokens) applied by preprocess_function:
# patient messages are truncated to 512 tokens, doctor replies to 256.
max_input_length = 512
max_target_length = 256

# Preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of patient/doctor pairs for seq2seq fine-tuning.

    Relies on the module-level ``tokenizer``, ``max_input_length`` and
    ``max_target_length``.

    Args:
        examples: Batch mapping with a ``"Patient"`` list (model inputs) and a
            ``"Doctor"`` list (targets), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenized inputs (padded/truncated to ``max_input_length``) with an
        added ``"labels"`` entry holding the target token ids padded/truncated
        to ``max_target_length``. Padding positions in the labels are set to
        -100 so the loss ignores them.
    """
    model_inputs = tokenizer(
        examples["Patient"],
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
    )

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["Doctor"],
        max_length=max_target_length,
        padding="max_length",
        truncation=True,
    )

    # Labels are padded to a fixed length here, so the pad ids must be masked
    # explicitly; otherwise the model is trained (and evaluated) on predicting
    # padding, which distorts the reported loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]

    return model_inputs

# Tokenize both splits in batches. The original columns are dropped so only the
# tokenizer outputs and labels produced by preprocess_function remain.
train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names
)

val_dataset = val_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=val_dataset.column_names
)

# Data collator: batches the tokenized examples for the seq2seq model
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True
)

# Define training arguments.
# Evaluation and logging run every 100 steps, checkpoints are written every
# 500 steps (at most 3 kept). Effective train batch size per device is
# 4 * 4 = 16 because of gradient accumulation.
training_args = Seq2SeqTrainingArguments(
    output_dir=os.path.join(model_path, "biobart-v2-base-medical-chatbot"),
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    save_steps=500,
    save_total_limit=3,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    gradient_accumulation_steps=4,
)

# Create Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training
trainer.train()

# Save the final model to the directory the chatbot cell loads from
trainer.save_model(os.path.join(model_path, "biobart-v2-base-medical-chatbot-final1"))
print("Fine-tuning complete!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training data size: 10277
Validation data size: 2569


Map:   0%|          | 0/10277 [00:00<?, ? examples/s]



Map:   0%|          | 0/2569 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msakshi3maurya[0m ([33msakshi3maurya-thakur-college-of-engineering-and-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
100,1.7761,1.478433
200,1.561,1.401212
300,1.4507,1.360373
400,1.4267,1.327816
500,1.4067,1.298827
600,1.3782,1.282363
700,1.3527,1.27684
800,1.2599,1.264109
900,1.2552,1.255891
1000,1.2572,1.247762




Fine-tuning complete!


In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

class MedicalChatbot:
    """Thin inference wrapper around a fine-tuned seq2seq checkpoint.

    On construction the tokenizer and model are loaded from ``model_path``,
    the model is switched to evaluation mode and moved to CUDA when it is
    available (CPU otherwise).
    """

    def __init__(self, model_path):
        """Load tokenizer and model from ``model_path`` and place the model on a device.

        Args:
            model_path: Location passed unchanged to both ``from_pretrained`` calls.
        """
        print("Loading model and tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
        self.model.eval()  # Set to evaluation mode

        # Move to GPU if available
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        print(f"Model loaded on {self.device}")

    def generate_response(self, user_input, max_length=150, temperature=0.7):
        """Generate a single reply for ``user_input``.

        Args:
            user_input: Text to answer; tokenized with truncation at 512 tokens.
            max_length: Forwarded to ``generate`` as ``max_length``.
            temperature: Sampling temperature. A positive value (or ``None``)
                samples with ``top_p=0.9``; zero or a negative value turns
                sampling off (``do_sample=False``) so the call does not fail
                on a temperature that sampling cannot use.

        Returns:
            The first generated sequence decoded to a string with special
            tokens skipped.
        """
        # Tokenize the input
        inputs = self.tokenizer(user_input, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        if temperature is not None and temperature <= 0:
            # Sampling needs a strictly positive temperature; fall back to
            # non-sampling decoding instead of forwarding an invalid value.
            decoding_kwargs = {"do_sample": False}
        else:
            decoding_kwargs = {
                "do_sample": True,
                # Integers such as 2 are normalised to float before being forwarded
                "temperature": temperature if temperature is None else float(temperature),
                "top_p": 0.9,
            }

        # Generate response without tracking gradients
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_length=max_length,
                num_return_sequences=1,
                **decoding_kwargs,
            )

        # Decode the response
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response

# Example usage - run this in a notebook or script
if __name__ == "__main__":
    # Update with your model path
    model_path = "/content/drive/My Drive/ai_medical_assistant/models/biobart-v2-base-medical-chatbot-final1"

    chatbot = MedicalChatbot(model_path)

    print("Medical Assistant Chatbot is ready. Type 'exit' to quit.")

    while True:
        try:
            # Strip so that "exit " or a line with stray spaces is handled as typed text
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted cell ends the chat instead of raising
            break

        # Nothing typed: ask again rather than sending an empty prompt to the model
        if not user_input:
            continue

        if user_input.lower() in ['exit', 'quit', 'bye']:
            break

        response = chatbot.generate_response(user_input)
        print(f"MedBot: {response}")

Loading model and tokenizer...
Model loaded on cuda
Medical Assistant Chatbot is ready. Type 'exit' to quit.
You: hi
MedBot: Hello and Welcome to ‘Ask A Doctor’ service. I have reviewed your query and here is my advice. Hope I have answered your query. Let me know if I can assist you further.
You: What are the symptoms of diabetes?
MedBot: Hello and Welcome to ‘Ask A Doctor’ service. I have reviewed your query and here is my advice. I will try to help you in discussing with your family physician about the symptoms of diabetes. Hope I have answered your query. Let me know if I can assist you further. Regards, Dr. Shinas Hussain, General & Family Physician
You: Can you tell me about high blood pressure?
MedBot: Hello and Welcome to ‘Ask A Doctor’ service. I have reviewed your query and here is my advice. High blood pressure can be due to many reasons. 1. Hypertension 2. Diabetes. 3. Smoking. 4. Obesity. All these factors can lead to high blood pressure. So better to consult a cardiologis

In [None]:
!pip install transformers datasets torch nltk rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
import pandas as pd
import numpy as np
from rouge_score import rouge_scorer
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# Download the tokenizer data needed by nltk.word_tokenize.
# Recent NLTK releases look up the 'punkt_tab' resource instead of 'punkt',
# so both are fetched to keep tokenization working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

def load_model_and_tokenizer(model_path):
    """Restore the seq2seq model and its tokenizer from ``model_path``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Tokenizer is read first, then the model, both from the same location
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
    loaded_model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    return loaded_model, loaded_tokenizer

def generate_predictions(model, tokenizer, input_texts, batch_size=4, max_length=512):
    """Generate one decoded prediction per input text, batch by batch.

    Args:
        model: Model exposing ``to(device)`` and ``generate(**kwargs)``.
        tokenizer: Callable tokenizer that also provides ``batch_decode``.
        input_texts: Sequence of input strings; sliced into batches.
        batch_size: Number of texts tokenized and generated together.
        max_length: Used twice: as the truncation length when tokenizing the
            inputs and as ``max_length`` for generation.

    Returns:
        List of decoded strings (special tokens skipped), in input order.
    """
    predictions = []
    if len(input_texts) == 0:
        # Nothing to generate; leave the model where it is
        return predictions

    # Device choice and model placement do not depend on the batch, so they
    # are done once up front instead of on every iteration.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    for i in range(0, len(input_texts), batch_size):
        batch = input_texts[i:i+batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=max_length)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Generate outputs with beam search
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=4,
            no_repeat_ngram_size=3,
            early_stopping=True
        )

        # Decode outputs
        batch_predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        predictions.extend(batch_predictions)

        # Clear CUDA cache to free up memory
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return predictions

def calculate_bleu(predictions, references):
    """Calculate corpus-level BLEU scores with one reference per prediction.

    Both sides are lower-cased and tokenized with ``nltk.word_tokenize``
    before scoring; ``SmoothingFunction().method1`` is used as smoothing.

    Args:
        predictions: Iterable of predicted strings.
        references: Iterable of reference strings, aligned with ``predictions``.

    Returns:
        Dict with keys ``'bleu'``, ``'bleu_1'``, ``'bleu_2'``, ``'bleu_3'`` and
        ``'bleu_4'``. Each ``bleu_n`` is cumulative (uniform weights over
        1..n-grams), not an individual n-gram precision. ``'bleu'`` is the
        standard 4-gram score and therefore equals ``'bleu_4'``.
    """
    # Tokenize predictions and references
    tokenized_preds = [nltk.word_tokenize(pred.lower()) for pred in predictions]
    tokenized_refs = [[nltk.word_tokenize(ref.lower())] for ref in references]

    smoothing = SmoothingFunction().method1

    # Uniform weights over the first n orders. Exact thirds are used for
    # BLEU-3 so the weights sum to 1 (0.33 * 3 only reaches 0.99).
    cumulative_weights = {
        'bleu_1': (1, 0, 0, 0),
        'bleu_2': (0.5, 0.5, 0, 0),
        'bleu_3': (1 / 3, 1 / 3, 1 / 3, 0),
        'bleu_4': (0.25, 0.25, 0.25, 0.25),
    }
    by_order = {
        name: corpus_bleu(tokenized_refs, tokenized_preds, weights=weights, smoothing_function=smoothing)
        for name, weights in cumulative_weights.items()
    }

    # The overall score uses the same uniform 4-gram weights as bleu_4, so it
    # is reused rather than computed a second time.
    return {'bleu': by_order['bleu_4'], **by_order}

def calculate_rouge(predictions, references):
    """Average ROUGE-1, ROUGE-2 and ROUGE-L over aligned prediction/reference pairs.

    Each pair is scored with a stemming ``RougeScorer`` (reference passed
    first, prediction second); precision, recall and F-measure are collected
    per ROUGE type and reduced with ``np.mean``.

    Returns:
        Dict keyed ``'<type>_<field>'`` for type in rouge1/rouge2/rougeL and
        field in precision/recall/fmeasure, holding the mean of each.
    """
    rouge_types = ('rouge1', 'rouge2', 'rougeL')
    fields = ('precision', 'recall', 'fmeasure')
    scorer = rouge_scorer.RougeScorer(list(rouge_types), use_stemmer=True)

    # One list per "<type>_<field>" combination, in type-major order
    collected = {
        f"{rouge_type}_{field}": []
        for rouge_type in rouge_types
        for field in fields
    }

    for pred, ref in zip(predictions, references):
        pair_scores = scorer.score(ref, pred)
        for rouge_type in rouge_types:
            type_score = pair_scores[rouge_type]
            for field in fields:
                collected[f"{rouge_type}_{field}"].append(getattr(type_score, field))

    # Reduce every list to its average
    return {key: np.mean(values) for key, values in collected.items()}

def evaluate_biobart_model(model_path, test_dataset):
    """Score the fine-tuned model stored at ``model_path`` on ``test_dataset``.

    Args:
        model_path: Location handed to ``load_model_and_tokenizer``.
        test_dataset: Iterable of records indexable by ``'question'`` and
            ``'answer'``.

    Returns:
        ``(results, predictions)`` where ``results`` merges the BLEU and ROUGE
        score dicts and ``predictions`` is the list of generated answers.
    """
    model, tokenizer = load_model_and_tokenizer(model_path)

    # Questions feed the model; answers are the references for scoring
    questions = [record['question'] for record in test_dataset]
    gold_answers = [record['answer'] for record in test_dataset]

    predictions = generate_predictions(model, tokenizer, questions)

    # BLEU entries first, then ROUGE entries, in one flat mapping
    results = dict(calculate_bleu(predictions, gold_answers))
    results.update(calculate_rouge(predictions, gold_answers))

    return results, predictions

# Example usage
if __name__ == "__main__":
    # Replace with your model path and dataset
    model_path = "/content/drive/My Drive/ai_medical_assistant/models/biobart-v2-base-medical-chatbot-final1"
    test_dataset_path = "/content/drive/My Drive/ai_medical_assistant/cleaned_data/test_dataset.csv"

    # Load the test dataset using pandas (one dict per CSV row)
    test_dataset = pd.read_csv(test_dataset_path).to_dict('records')

    # Evaluate the model
    results, predictions = evaluate_biobart_model(model_path, test_dataset)

    # Print results
    print("\nEvaluation Results:")
    print("-" * 40)
    print(f"BLEU Score: {results['bleu']:.4f}")
    print(f"BLEU-1: {results['bleu_1']:.4f}")
    print(f"BLEU-2: {results['bleu_2']:.4f}")
    print(f"BLEU-3: {results['bleu_3']:.4f}")
    print(f"BLEU-4: {results['bleu_4']:.4f}")
    print("-" * 40)
    print(f"ROUGE-1 F1: {results['rouge1_fmeasure']:.4f}")
    print(f"ROUGE-2 F1: {results['rouge2_fmeasure']:.4f}")
    print(f"ROUGE-L F1: {results['rougeL_fmeasure']:.4f}")
    print("-" * 40)

    # Print a few prediction examples rather than the whole test set.
    # References are read from the 'answer' column, the same column the
    # evaluation scores against.
    num_examples = 5
    print("\nSample Predictions:")
    for i, (pred, sample) in enumerate(zip(predictions[:num_examples], test_dataset)):
        print(f"\nExample {i+1}:")
        print(f"Reference: {sample['answer']}")
        print(f"Prediction: {pred}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
