In [None]:
pip install datasets peft trl transformers pandas torch spacy nltk rouge_score bert_score sentence_transformers bitsandbytes accelerate

In [None]:
import os
import json
import pandas as pd
import torch
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, PeftModelForCausalLM
from trl import SFTTrainer

from peft import LoraConfig, get_peft_model, PeftModel
from trl import SFTTrainer
from transformers import TrainingArguments, TrainerCallback
import json
import os

In [None]:
def create_prompt(instruction, input_text):
    """Combine an instruction and its optional input into a single prompt.

    Args:
        instruction: The task or question text.
        input_text: Optional supporting context. Falsy values, ``None``, a NaN
            float (what pandas yields for an empty CSV cell) and
            whitespace-only strings are all treated as "no input".

    Returns:
        The instruction followed by a blank line and the input when an input
        is present, otherwise the instruction unchanged.
    """
    if not input_text:
        return instruction
    # NaN is truthy, so it slips past the check above; it is also the only
    # float that is not equal to itself. Without this guard the prompt would
    # end with the literal text "nan".
    if isinstance(input_text, float) and input_text != input_text:
        return instruction
    if isinstance(input_text, str) and not input_text.strip():
        return instruction
    return f"{instruction}\n\n{input_text}"

def load_and_format_dataset(file_path, train_split=0.8, output_dir="data"):
    """Convert an instruction CSV into train/validation JSONL files and load them.

    Each kept row becomes one record with two views of the same example:
    a ``messages`` list (user/assistant turns) and a flat ``text`` string.

    Args:
        file_path: CSV file with ``instruction``, ``input`` and ``output`` columns.
        train_split: Fraction of the kept rows written to the training file;
            the remainder goes to the validation file.
        output_dir: Directory that receives ``train.jsonl`` and
            ``validation.jsonl`` (created if missing).

    Returns:
        The result of ``load_dataset("json", ...)`` over the two written files,
        with ``train`` and ``validation`` splits.

    Raises:
        ValueError: If any of the required columns is missing from the CSV.
    """
    os.makedirs(output_dir, exist_ok=True)
    df = pd.read_csv(file_path)

    # Validate and filter
    required_columns = ["instruction", "input", "output"]
    if not all(col in df.columns for col in required_columns):
        raise ValueError(f"Dataset must contain {required_columns} columns")
    df = df[df["input"] != "No structured clinical data available."]

    # A row without an instruction or an answer cannot be trained on, and a NaN
    # answer would be serialized as the non-standard JSON token `NaN`.
    df = df.dropna(subset=["instruction", "output"])
    # Empty input cells are read as NaN (truthy); normalize them to "" so the
    # prompt builder sees "no input" instead of appending the text "nan".
    df = df.assign(input=df["input"].fillna(""))

    formatted_data = []
    for record in df[required_columns].to_dict("records"):
        # str() keeps every content field a string even when pandas inferred a
        # numeric dtype for the column, so the JSONL has one consistent type.
        user_msg = create_prompt(str(record["instruction"]), record["input"])
        assistant_msg = str(record["output"])

        formatted_data.append({
            "messages": [
                {"role": "user", "content": user_msg},
                {"role": "assistant", "content": assistant_msg}
            ],
            "text": f"### User: {user_msg} ###\n### Assistant: {assistant_msg} ###"
        })

    # Split and save. Rows are split in file order (no shuffling).
    train_size = int(len(formatted_data) * train_split)
    splits = {
        "train": formatted_data[:train_size],
        "validation": formatted_data[train_size:],
    }
    data_files = {}
    for split, data in splits.items():
        path = os.path.join(output_dir, f"{split}.jsonl")
        # Explicit encoding so the files do not depend on the platform default.
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(item) + "\n" for item in data)
        data_files[split] = path

    print(f"Saved {train_size} training and {len(formatted_data)-train_size} validation examples")
    return load_dataset("json", data_files=data_files)

In [None]:
def preprocess_and_save_dataset(dataset, tokenizer, output_dir="preprocessed_data", max_length=512):
    """Tokenize the ``text`` column of a dataset and persist the result to disk.

    Args:
        dataset: Object exposing ``map`` and ``save_to_disk`` whose examples
            carry ``messages`` and ``text`` columns.
        tokenizer: Callable applied to batches of ``text`` with truncation on.
        output_dir: Directory the tokenized dataset is saved to (created if
            missing).
        max_length: Maximum number of tokens kept per example; longer texts
            are truncated. Defaults to 512, the previously hard-coded value.

    Returns:
        The tokenized dataset, with the ``messages`` and ``text`` columns
        removed.
    """
    os.makedirs(output_dir, exist_ok=True)

    def tokenize_function(example):
        # `example` is a batch here because map() is called with batched=True.
        return tokenizer(example["text"], truncation=True, max_length=max_length)

    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["messages", "text"],
    )
    tokenized_dataset.save_to_disk(output_dir)
    return tokenized_dataset

def configure_qlora_model(model_name="BioMistral/BioMistral-7B"):
    """Load a 4-bit quantized causal LM and wrap it with LoRA adapters.

    Args:
        model_name: Hub id or local path of the base model and tokenizer.

    Returns:
        Tuple ``(peft_model, tokenizer)``.

    Raises:
        ValueError: If ``get_peft_model`` does not return a PEFT model.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        # No dedicated pad token: reuse EOS so padding is possible at all.
        tokenizer.pad_token = tokenizer.eos_token

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=False,
        llm_int8_enable_fp32_cpu_offload=True,
        llm_int8_skip_modules=["lm_head"]
    )

    # NOTE(review): trust_remote_code=True executes code shipped with the model
    # repository; confirm it is actually required for the models used here.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.float16,
        trust_remote_code=True,
        offload_folder="offload",
        low_cpu_mem_usage=True,
        offload_state_dict=True
    )
    # Mirror the tokenizer's actual pad token. When the tokenizer had none this
    # equals the EOS id (set above); when it ships its own pad token, the model
    # config now agrees with it instead of silently pointing at EOS.
    model.config.pad_token_id = tokenizer.pad_token_id
    model = prepare_model_for_kbit_training(model)

    lora_config = LoraConfig(
        r=4,  # Reduced from 8
        lora_alpha=8,  # Reduced from 16
        lora_dropout=0.05,
        target_modules=["q_proj", "v_proj"],
        bias="none",
        task_type="CAUSAL_LM",
        inference_mode=False,
        fan_in_fan_out=False,
        modules_to_save=["embed_tokens", "lm_head"]
    )

    print("Applying PEFT adapters to the model...")
    peft_model = get_peft_model(model, lora_config)
    print(f"[DEBUG] Type after get_peft_model: {type(peft_model)}")

    if not isinstance(peft_model, (PeftModel, PeftModelForCausalLM)):
        raise ValueError("Model is not a PEFT model instance!")
    print("[OK] Model wrapped with PEFT successfully.")

    # print_trainable_parameters() prints its own summary and returns None, so
    # wrapping it in print() only added a stray "None" line to the output.
    peft_model.print_trainable_parameters()

    # Make sure every LoRA weight is trainable.
    for name, param in peft_model.named_parameters():
        if 'lora' in name:
            param.requires_grad = True

    return peft_model, tokenizer

In [None]:
def setup_trainer(model, dataset, output_dir="biomistral_finetuned"):
    """Build an ``SFTTrainer`` for a PEFT-wrapped model.

    Args:
        model: A PEFT-wrapped causal LM.
        dataset: Mapping with ``"train"`` and ``"validation"`` splits. The
            splits may hold raw chat examples (a ``messages`` column) or
            already-tokenized examples.
        output_dir: Directory for checkpoints and the final model.

    Returns:
        The configured ``SFTTrainer``.

    Raises:
        ValueError: If ``model`` is not a PEFT model instance.
    """
    if not isinstance(model, (PeftModel, PeftModelForCausalLM)):
        raise ValueError("Model is not a PEFT-wrapped instance! Cannot continue with training.")

    # The guard above already established this, so the message is always True.
    print("Model is a PEFT model: True")

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=2,  # Increased from 1
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=4,  # Reduced from 8
        num_train_epochs=3,
        learning_rate=2e-5,
        bf16=True,  # Changed from fp16=True
        bf16_full_eval=True,
        save_strategy="epoch",
        eval_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        logging_steps=500,
        save_total_limit=2,
        push_to_hub=False,
        gradient_checkpointing=True,
        optim="adamw_torch_fused",
        max_grad_norm=0.3,
        warmup_ratio=0.03,
        lr_scheduler_type="cosine",
    )

    def formatting_func(example):
        """Render one chat example as '### Role: content ###' lines."""
        return "\n".join([
            f"### {msg['role'].capitalize()}: {msg['content']} ###"
            for msg in example["messages"]
        ])

    # The formatter reads example["messages"]. A dataset that went through
    # tokenization no longer has that column, so handing the formatter to the
    # trainer would at best be ignored and at worst raise KeyError. Only pass
    # it when the column exists; when the columns are unknown, keep passing it.
    columns = getattr(dataset["train"], "column_names", None)
    needs_formatting = columns is None or "messages" in columns

    print("Creating SFTTrainer...")
    return SFTTrainer(
        model=model,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
        args=training_args,
        formatting_func=formatting_func if needs_formatting else None,
        peft_config=None,
    )

def main():
    """Run the full fine-tuning pipeline: data prep, model setup, training, save.

    Returns:
        Tuple ``(model, tokenizer, trainer)`` after training has finished and
        the model has been saved to the trainer's output directory.

    Raises:
        ValueError: If the configured model is not a PEFT model instance.
    """
    torch.cuda.empty_cache()  # Clear GPU memory
    print("Loading and preparing dataset...")
    dataset = load_and_format_dataset("bio_mistral_qa_combined.csv")
    print(f"Dataset loaded: {dataset}")

    # Load the model and tokenizer exactly once. Loading the model a first time
    # only to obtain the tokenizer and then loading it again doubled the load
    # time and could keep two copies of the weights in memory at once.
    print("Configuring QLoRA model...")
    model, tokenizer = configure_qlora_model()
    print("QLoRA model configured successfully!")

    if not isinstance(model, (PeftModel, PeftModelForCausalLM)):
        raise ValueError("Model is not properly wrapped as a PEFT model!")

    print("Preprocessing and tokenizing dataset...")
    dataset = preprocess_and_save_dataset(dataset, tokenizer)
    print(f"Preprocessed dataset: {dataset}")

    print("Setting up trainer...")
    trainer = setup_trainer(model, dataset)
    print("Trainer configured successfully!")

    print("Starting training...")
    trainer.train()

    trainer.save_model()
    print(f"Model trained and saved to {trainer.args.output_dir}")

    return model, tokenizer, trainer

# Run the pipeline when this code executes as the main module and keep the
# returned objects in module-level names.
if __name__ == "__main__":
    model, tokenizer, trainer = main()

## Evaluation

In [None]:
!pip install transformers datasets torch pandas numpy scikit-learn rouge-score nltk scispacy

!pip install sentence-transformers

In [None]:
pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz

In [None]:
!pip uninstall -y numpy thinc spacy scispacy
!pip install numpy==1.26.4
!pip install spacy==3.7.2
!pip install scispacy==0.5.1


In [None]:
import nltk
nltk.download("punkt")

In [None]:
!pip install transformers torch sentence-transformers spacy nltk pandas rouge-score

In [None]:
pip install accelerate bitsandbytes transformers bert-score peft

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import logging
import re
import spacy
from nltk.tokenize import word_tokenize
import nltk
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from sentence_transformers import SentenceTransformer, util
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import os

# Suppress transformers warnings
logging.getLogger("transformers").setLevel(logging.ERROR)

# Download NLTK data
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("wordnet")

In [None]:
# Load spaCy and SentenceTransformer models
def load_spacy_model():
    """Return the ``en_ner_bc5cdr_md`` spaCy pipeline, or ``None`` if loading fails.

    Any failure is reported on stdout rather than raised.
    """
    pipeline = None
    try:
        pipeline = spacy.load("en_ner_bc5cdr_md")
    except Exception as load_error:
        print(f"Error loading spaCy model: {load_error}")
    return pipeline

def load_sentence_transformer():
    """Return the ``all-MiniLM-L6-v2`` SentenceTransformer, or ``None`` on failure.

    The model is placed on CUDA when available, otherwise on CPU. Any failure
    is reported on stdout rather than raised.
    """
    try:
        encoder = SentenceTransformer(
            "all-MiniLM-L6-v2",
            device="cuda" if torch.cuda.is_available() else "cpu",
        )
    except Exception as load_error:
        print(f"Error loading SentenceTransformer: {load_error}")
        return None
    return encoder

# Module-level handles to the loaded models; each is None if its loader failed.
nlp = load_spacy_model()
embedder = load_sentence_transformer()

In [None]:
# Load fine-tuned model and tokenizer
def load_fine_tuned_model(model_name="BioMistral/BioMistral-7B", checkpoint_dir="biomistral_finetuned"):
    """Load the fine-tuned QLoRA model and tokenizer."""
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=False,
            llm_int8_enable_fp32_cpu_offload=True,
            llm_int8_skip_modules=["lm_head"]
        )

        base_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto",
            torch_dtype=torch.float16,
            trust_remote_code=True,
            offload_folder="offload",
            low_cpu_mem_usage=True,
            offload_state_dict=True
        )
        base_model.config.pad_token_id = tokenizer.eos_token_id

        if os.path.exists(checkpoint_dir):
            if os.path.exists(os.path.join(checkpoint_dir, "adapter_model.bin")):
                print(f"Loading fine-tuned model from {checkpoint_dir}")
                model = PeftModel.from_pretrained(base_model, checkpoint_dir, is_trainable=False)
            else:
                checkpoints = [d for d in os.listdir(checkpoint_dir) if d.startswith("checkpoint-")]
                if checkpoints:
                    latest_checkpoint = max(checkpoints, key=lambda x: int(x.split("-")[1]))
                    checkpoint_path = os.path.join(checkpoint_dir, latest_checkpoint)
                    print(f"Loading fine-tuned model from checkpoint: {checkpoint_path}")
                    model = PeftModel.from_pretrained(base_model, checkpoint_path, is_trainable=False)
                else:
                    raise ValueError(f"No checkpoints or final model found in {checkpoint_dir}")
        else:
            raise ValueError(f"Checkpoint directory {checkpoint_dir} does not exist")

        return model, tokenizer
    except Exception as e:
        print(f"Error loading fine-tuned model: {e}")
        return None, None

# The loader reports failure by returning (None, None) instead of raising, so
# stop here rather than letting later cells fail on a missing model.
model, tokenizer = load_fine_tuned_model()
if model is None or tokenizer is None:
    raise ValueError("Failed to load fine-tuned model or tokenizer")

In [None]:

# Evaluation samples. The three lists are index-aligned: questions[i] is asked
# about inputs[i], and references[i] is the expected answer.
questions = [
    "Are there any further procedures planned for the patient?",
    "Does the patient require long term monitoring?",
    "What precautions does the patient need to take post-discharge?",
    "What medications is the patient currently taking?",
    "What is the patient's primary diagnosis?"
]

# Structured clinical context supplied with each question.
inputs = [
    "Gender: F\nChief Complaint: Abdominal distention, nausea, and vomiting\nHistory: Cirrhosis, multiple paracenteses for ascites\nPlan: Schedule regular paracentesis every 2 weeks",
    "Gender: F\nChief Complaint: Abdominal distention, nausea, and vomiting\nPlan: Monitor weight and abdominal girth daily; assess for signs of fluid overload",
    "Gender: M\nChief Complaint: Abd pain, Hypotension\nDischarge Plan: Follow low sodium diet, take prescribed meds, and avoid strenuous activity",
    "Gender: F\nCurrent Medications: Lisinopril 10mg daily, Furosemide 40mg daily\nAllergies: None known\nAssessment: Hypertension, fluid retention",
    "Gender: M\nChief Complaint: Fever, Cough\nFindings: CXR shows consolidation in the right lower lobe\nAssessment: Community-acquired pneumonia"
]

# Ground-truth answers the generated responses are compared against.
references = [
    "Yes, the patient requires regular paracentesis due to fluid accumulation.",
    "Yes, the patient requires close monitoring for fluid accumulation and symptoms.",
    "Follow up with the doctor or nurse practitioner. Avoid heavy lifting and follow dietary guidelines.",
    "The patient is currently taking Lisinopril and Furosemide.",
    "The patient's primary diagnosis is community-acquired pneumonia."
]

In [None]:
# Prompt and validation functions
def create_prompt(question, context):
    """Build the inference prompt for one question and its clinical context.

    NOTE(review): a function with the same name but a different prompt format
    is defined in the training section of this notebook; in a shared kernel
    this later definition replaces it.

    Args:
        question: The question to answer.
        context: Clinical context text. A blank string, ``None`` or any
            non-string value (for example NaN from pandas) is replaced by a
            fixed "no data" placeholder instead of raising.

    Returns:
        The full prompt string, ending with the answer cue.
    """
    # Only call .strip() on real strings: None or NaN context used to raise
    # AttributeError here.
    has_context = isinstance(context, str) and bool(context.strip())
    context_text = context if has_context else "No specific clinical data provided"
    return (
        "You are a clinical assistant. Provide concise, factual answers based ONLY on the available information.\n"
        "\n"
        f"Question: {question}\n"
        f"Available Context: {context_text}\n"
        "\n"
        "Answer (just the factual medical response, no references to tables/figures):"
    )

def validate_answer(answer):
    """Reject answers that point at tables or figures; otherwise trim them.

    Matching is case-insensitive and done on whole words, so clinical
    vocabulary that merely contains a banned word ("stable", "comfortable",
    "vegetables", "disfigured") is no longer rejected.

    Args:
        answer: The generated answer text.

    Returns:
        A fixed fallback message if the answer contains "table(s)",
        "figure(s)", "as shown in" or "refer to" as whole words/phrases;
        otherwise the answer with surrounding whitespace removed.
    """
    invalid_pattern = r"\b(?:tables?|figures?|as\s+shown\s+in|refer\s+to)\b"
    if re.search(invalid_pattern, answer, flags=re.IGNORECASE):
        return "Unable to generate proper response from available data"
    return answer.strip()

# Dataset class for generation
class QADataset(Dataset):
    """Map-style dataset yielding one tokenized prompt per question.

    Item ``i`` is built from ``questions[i]``, ``inputs[i]`` and
    ``references[i]``; the dataset length is the number of questions.
    """

    def __init__(self, questions, inputs, references, tokenizer, max_length=256):
        self.questions, self.inputs, self.references = questions, inputs, references
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        """Tokenize the prompt for sample ``idx`` and return it with its metadata."""
        question, input_text = self.questions[idx], self.inputs[idx]
        # Pad/truncate to exactly max_length so every item has the same shape.
        encoding = self.tokenizer(
            create_prompt(question, input_text),
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        token_ids = encoding['input_ids']
        item = {}
        # squeeze(0) drops the leading dimension added by return_tensors="pt".
        item['input_ids'] = token_ids.squeeze(0)
        item['attention_mask'] = encoding['attention_mask'].squeeze(0)
        # Length of the encoded prompt, padding included.
        item['prompt_length'] = token_ids.shape[1]
        item['question'] = question
        item['input_text'] = input_text
        item['reference'] = self.references[idx]
        return item

# Generate responses
def generate_responses(model, tokenizer, questions, inputs, references):
    """Generate and print one answer per question, returning the answers.

    Args:
        model: Causal LM exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        questions: Question strings.
        inputs: Clinical context strings, index-aligned with ``questions``.
        references: Ground-truth answers, index-aligned with ``questions``
            (only printed here, for side-by-side inspection).

    Returns:
        List of validated generated answers in sample order. If generation
        fails midway the error is printed and the answers produced so far are
        returned.
    """
    bad_words = ["Table", "Figure"]
    # Encode each word once and keep only non-empty encodings.
    encoded_bad_words = (tokenizer.encode(word, add_special_tokens=False) for word in bad_words)
    bad_words_ids = [token_ids for token_ids in encoded_bad_words if token_ids]

    generation_kwargs = {
        'max_new_tokens': 150,
        'do_sample': True,
        'temperature': 0.3,
        'repetition_penalty': 1.5,
        'no_repeat_ngram_size': 4,
        'bad_words_ids': bad_words_ids if bad_words_ids else None,
        'eos_token_id': tokenizer.eos_token_id,
        'pad_token_id': tokenizer.pad_token_id
    }

    device = "cuda" if torch.cuda.is_available() else "cpu"
    qa_dataset = QADataset(questions, inputs, references, tokenizer)
    dataloader = DataLoader(qa_dataset, batch_size=1, shuffle=False)
    generated_outputs = []
    sample_number = 0
    # Pre-bind the names the error handler prints, so a failure before the
    # first sample is unpacked does not raise UnboundLocalError and hide the
    # original exception.
    question = input_text = None

    # Prompts are padded to a fixed length. A decoder-only model continues from
    # the last position, so the padding must sit on the left, not between the
    # prompt and the generated tokens. Restored in `finally`.
    original_padding_side = getattr(tokenizer, "padding_side", None)
    tokenizer.padding_side = "left"

    try:
        for batch in dataloader:
            sample_number += 1
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            question = batch['question'][0]
            input_text = batch['input_text'][0]
            reference = batch['reference'][0]
            # The collated value is a tensor; use a plain int for slicing.
            prompt_length = int(batch['prompt_length'][0])

            print(f"\n=== Sample {sample_number} ===")
            print(f"Instruction: {question}")
            print(f"Input: {input_text}")

            with torch.amp.autocast(device_type=device, dtype=torch.float16):
                outputs = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    **generation_kwargs
                )

            # generate() returns prompt + continuation; keep the continuation.
            if outputs.shape[1] > prompt_length:
                new_tokens = outputs[0, prompt_length:]
            else:
                print(f"Warning: No new tokens generated for sample {sample_number}")
                new_tokens = outputs[0]

            generated_answer = tokenizer.decode(new_tokens, skip_special_tokens=True)
            generated_answer = validate_answer(generated_answer)

            print(f"Generated Answer: {generated_answer}")
            print(f"Ground Truth Answer: {reference}")

            generated_outputs.append(generated_answer)

        print(f"\nProcessed {sample_number} samples")
        return generated_outputs

    except Exception as e:
        print(f"Error during generation: {str(e)}")
        print(f"Stopped at sample {sample_number}")
        print(f"Problematic sample details: {question}, {input_text}")
        return generated_outputs

    finally:
        if original_padding_side is not None:
            tokenizer.padding_side = original_padding_side

# Run generation over the sample questions. If generation fails midway, the
# answers produced so far are returned (possibly an empty list).
generated_outputs = generate_responses(model, tokenizer, questions, inputs, references)

In [None]:

# Evaluate generated outputs
# NOTE(review): compute_metrics_per_query is not defined in the cells visible
# here — confirm the cell that defines it runs before this one on a fresh kernel.
if generated_outputs:
    compute_metrics_per_query(generated_outputs, references, questions, nlp, embedder)
else:
    # Reached when generation returned an empty list.
    print("No outputs generated due to error.")

# Clear GPU memory
torch.cuda.empty_cache()