# Configuration

In [1]:
# Install or upgrade (-U) the packages listed below into the notebook runtime.
# NOTE(review): no versions are pinned, so a fresh run may resolve to different
# releases than the ones shown in the captured log.
!pip install -U transformers datasets peft accelerate bitsandbytes sentencepiece

Collecting transformers
  Downloading transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-4.1.1-py3-none-any.whl.metadata (18 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading transformers-4.56.2-py3-none-any.whl (11.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m140.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-4.1.1-py3-none-any.whl (503 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m503.6/503.6 kB[0m [31m41.4 MB/s[0m eta [36m0:00:0

In [2]:
# NOTE(review): the first install cell already upgrades these three packages
# with -U, so this cell looks redundant — confirm before removing.
!pip install --upgrade transformers accelerate bitsandbytes



In [None]:
!pip install bert-score sacrebleu rouge-score scispacy spacy
!python -m spacy download en_core_web_sm >/dev/null


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting scispacy
  Downloading scispacy-0.6.2-py3-none-any.whl.metadata (20 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting conllu (from scispacy)
  Downloading conllu-6.0.0-py3-none-any.whl.metadata (21 kB)
Collecting numpy (from bert-score)
  Downloading numpy-1.26.4-cp312-cp312-m

# Libraries

In [None]:
import pandas as pd
from datasets import Dataset, concatenate_datasets

# Load Model

In [None]:
# Authenticate against the Hugging Face Hub before downloading the model.
# NOTE(review): presumably required because the meta-llama checkpoint loaded
# below is gated — confirm. Never paste a token into the notebook source.
from huggingface_hub import login
login(new_session=True)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer,BitsAndBytesConfig

model_name = "meta-llama/Llama-3.1-8B"

# 4-bit NF4 quantization with double quantization enabled; the compute dtype
# is requested as bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="bfloat16",
)

# The EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load the quantized base model; device placement is left to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=bnb_config, device_map="auto"
)


# Dataset

### Training Dataset

In [None]:
import pandas as pd
from datasets import Dataset, concatenate_datasets

def load_csv_as_dataset(path, instruction):
    """Load a two-column CSV as an instruction-tuning dataset.

    The first CSV column is treated as the model input and the second as the
    target output, whatever their header names are. Rows missing either value
    are dropped and both columns are cast to ``str``.

    Args:
        path: Path of the CSV file to read.
        instruction: Instruction text attached to every row.

    Returns:
        A ``datasets.Dataset`` with exactly the columns
        ``instruction``, ``input`` and ``output``.
    """
    df = pd.read_csv(path)
    input_col, output_col = df.columns[0], df.columns[1]
    df = (
        df.dropna(subset=[input_col, output_col])
        .rename(columns={input_col: "input", output_col: "output"})
        # dropna leaves gaps in the index; without a reset, from_pandas would
        # export it as an extra "__index_level_0__" column for this source only,
        # and concatenating with sources that had no NaNs would then fail.
        .reset_index(drop=True)
    )
    df["input"] = df["input"].astype(str)
    df["output"] = df["output"].astype(str)
    df["instruction"] = instruction
    return Dataset.from_pandas(
        df[["instruction", "input", "output"]], preserve_index=False
    )

def limit_dataset(ds, max_examples=500, seed=42):
    """Cap ``ds`` at ``max_examples`` rows.

    A dataset that already fits is returned untouched. A larger one is
    shuffled with ``seed`` and its first ``max_examples`` rows are kept.
    """
    fits_budget = len(ds) <= max_examples
    if fits_budget:
        return ds
    shuffled = ds.shuffle(seed=seed)
    kept_rows = range(max_examples)
    return shuffled.select(kept_rows)

# One (csv path, instruction, sampling seed) entry per training source, listed
# in the order the sources are concatenated.
_train_sources = [
    ("/content/train/findings.csv", "Summarize clinical findings.", 42),
    ("/content/train/summarization.csv", "Summarize given prompt-response.", 43),
    ("/content/train/dialogues.csv", "Convert dialogue to structured assessment.", 44),
    ("/content/train/notes.csv", "Summarize notes into clinical problems.", 45),
]

# Load each source and cap it at 500 rows with its own seed.
findings_ds, summarization_ds, dialogue_ds, notes_ds = [
    limit_dataset(
        load_csv_as_dataset(csv_path, task_instruction),
        max_examples=500,
        seed=sample_seed,
    )
    for csv_path, task_instruction, sample_seed in _train_sources
]

train_dataset = concatenate_datasets(
    [findings_ds, summarization_ds, dialogue_ds, notes_ds]
)

# Global cap on the combined set; only takes effect when the total exceeds it.
MAX_TOTAL = 2000
if len(train_dataset) > MAX_TOTAL:
    train_dataset = train_dataset.shuffle(seed=123).select(range(MAX_TOTAL))

source_sizes = [len(part) for part in (findings_ds, summarization_ds, dialogue_ds, notes_ds)]
print("Per-source sizes:", *source_sizes)
print("Final train size:", len(train_dataset))


### Validation Dataset

In [None]:
import pandas as pd
from datasets import Dataset, concatenate_datasets

# NOTE(review): this repeats the helper of the same name defined in the
# training-dataset cell; keep the two in sync or define it once.
def load_csv_as_dataset(path, instruction):
    """Load a two-column CSV as an instruction-tuning dataset.

    The first CSV column is treated as the model input and the second as the
    target output, whatever their header names are. Rows missing either value
    are dropped and both columns are cast to ``str``.

    Args:
        path: Path of the CSV file to read.
        instruction: Instruction text attached to every row.

    Returns:
        A ``datasets.Dataset`` with exactly the columns
        ``instruction``, ``input`` and ``output``.
    """
    df = pd.read_csv(path)
    input_col, output_col = df.columns[0], df.columns[1]
    df = (
        df.dropna(subset=[input_col, output_col])
        .rename(columns={input_col: "input", output_col: "output"})
        # dropna leaves gaps in the index; without a reset, from_pandas would
        # export it as an extra "__index_level_0__" column for this source only,
        # and concatenating with sources that had no NaNs would then fail.
        .reset_index(drop=True)
    )
    df["input"] = df["input"].astype(str)
    df["output"] = df["output"].astype(str)
    df["instruction"] = instruction
    return Dataset.from_pandas(
        df[["instruction", "input", "output"]], preserve_index=False
    )

def limit_dataset(ds, max_examples=200, seed=99):
    """Return ``ds`` unchanged when it has at most ``max_examples`` rows;
    otherwise shuffle it with ``seed`` and keep the first ``max_examples``."""
    over_budget = len(ds) > max_examples
    return ds.shuffle(seed=seed).select(range(max_examples)) if over_budget else ds

# One (csv path, instruction, sampling seed) entry per validation source,
# listed in the order the sources are concatenated.
_validate_sources = [
    ("/content/validate/findings.csv", "Summarize clinical findings.", 101),
    ("/content/validate/summarization.csv", "Summarize given prompt-response.", 102),
    ("/content/validate/dialogues.csv", "Convert dialogue to structured assessment.", 103),
    ("/content/validate/notes.csv", "Summarize notes into clinical problems.", 104),
]

# Load each source and cap it at 200 rows with its own seed.
# Note: these names rebind the per-source variables of the training cell.
findings_ds, summarization_ds, dialogue_ds, notes_ds = [
    limit_dataset(
        load_csv_as_dataset(csv_path, task_instruction),
        max_examples=200,
        seed=sample_seed,
    )
    for csv_path, task_instruction, sample_seed in _validate_sources
]

validate_dataset = concatenate_datasets(
    [findings_ds, summarization_ds, dialogue_ds, notes_ds]
)

# Global cap on the combined set; only takes effect when the total exceeds it.
MAX_TOTAL = 1000
if len(validate_dataset) > MAX_TOTAL:
    validate_dataset = validate_dataset.shuffle(seed=202).select(range(MAX_TOTAL))

source_sizes = [len(part) for part in (findings_ds, summarization_ds, dialogue_ds, notes_ds)]
print("Per-source sizes:", *source_sizes)
print("Final validate size:", len(validate_dataset))


# Prompt Engineering

In [None]:
def format_example(example):
    """Render one row as an Instruction / Input / Response prompt.

    Each section is a ``###`` header line followed by the matching field of
    ``example``; sections are separated by a blank line and the result has no
    trailing newline.
    """
    sections = (
        ("### Instruction:", example['instruction']),
        ("### Input:", example['input']),
        ("### Response:", example['output']),
    )
    return "\n\n".join(f"{header}\n{body}" for header, body in sections)

def _add_prompt_text(row):
    """Return the new ``text`` column value for one row: its rendered prompt."""
    return {"text": format_example(row)}

# Attach the rendered prompt to every training and validation row.
train_dataset = train_dataset.map(_add_prompt_text)
eval_dataset = validate_dataset.map(_add_prompt_text)


# Tokenize

In [None]:
def tokenize(batch):
    """Tokenize a batch of prompts and build causal-LM labels.

    Every text in ``batch["text"]`` is truncated or padded to exactly 1024
    tokens. Labels are a copy of ``input_ids`` in which padding positions are
    set to -100 so the loss ignores them.

    Padding is detected through ``attention_mask`` rather than by comparing
    token ids with ``pad_token_id``: the pad token is aliased to EOS in this
    notebook, so an id comparison would also mask any genuine EOS token.

    Args:
        batch: Mapping with a ``"text"`` list, as passed by a batched ``map``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    tokenized = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=1024
    )
    tokenized["labels"] = [
        [tok if is_real else -100 for tok, is_real in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

# Tokenize both splits in batches; every pre-existing column is dropped so only
# the tokenizer outputs (plus labels) remain.
train_tokenized = train_dataset.map(
    tokenize,
    batched=True,
    remove_columns=train_dataset.column_names,
)
eval_tokenized = eval_dataset.map(
    tokenize,
    batched=True,
    remove_columns=eval_dataset.column_names,
)


# QLoRA and PEFT

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Prepare the 4-bit base model for training before attaching adapters, as the
# PEFT quantization guide prescribes for k-bit (QLoRA) fine-tuning.
model = prepare_model_for_kbit_training(model)

# LoRA adapters on the attention query/value projections only.
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=["q_proj","v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# NOTE(review): this rebinds `model`, so re-running the cell would wrap an
# already-wrapped model — re-run the model-loading cell first.
model = get_peft_model(model, peft_config)


# Train Model

In [None]:
from transformers import TrainingArguments, Trainer

# Effective batch size per device is 2 * 8 = 16 sequences per optimizer step.
# NOTE(review): fp16 autocast is enabled here while the quantization config
# requests a bfloat16 compute dtype — confirm the mismatch is intended.
training_args = TrainingArguments(
    output_dir="./llama-qlora-checkpoints",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=50,
    # eval_steps is only honoured when the evaluation strategy is "steps";
    # with the default strategy the eval_dataset was never evaluated.
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=eval_tokenized,
    tokenizer=tokenizer
)

trainer.train()




In [None]:
# Write the trained model and the tokenizer into the same directory.
# NOTE(review): `model` is PEFT-wrapped, so this presumably stores only the
# LoRA adapter rather than full weights — confirm.
adapter_dir = "./llama3.1-qlora-clinical"
trainer.save_model(adapter_dir)
tokenizer.save_pretrained(adapter_dir)


In [None]:
from peft import PeftModel

# Reload the un-quantized base model in the checkpoint's own dtype. Without an
# explicit dtype, from_pretrained materialises the weights in float32, which
# needlessly doubles memory for an 8B-parameter model.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",
)
# Attach the saved LoRA adapter, then fold it into the base weights so the
# result is a plain model with no PEFT wrapper.
ft_model = PeftModel.from_pretrained(base_model, "./llama3.1-qlora-clinical")

merged_model = ft_model.merge_and_unload()



# Validate Model

In [None]:
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset

# Reload the tokenizer that was saved next to the adapter.
tokenizer = AutoTokenizer.from_pretrained("./llama3.1-qlora-clinical")


def _logits_to_token_ids(logits, labels):
    """Reduce logits to arg-max token ids before the Trainer accumulates them.

    Keeping full-vocabulary logits for every evaluated position is far larger
    than the ids, so only the ids are retained for later decoding.
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


# Evaluation-only arguments, matching the training cell's eval batch size and
# disabled reporting instead of relying on library defaults.
eval_args = TrainingArguments(
    output_dir="./llama-qlora-eval",
    per_device_eval_batch_size=2,
    report_to="none",
)

# The Trainer needs tokenized features (input_ids / attention_mask / labels),
# so it is given `eval_tokenized`; the raw `validate_dataset` only has the
# text columns instruction / input / output and cannot be fed to the model.
trainer = Trainer(
    model=merged_model,
    args=eval_args,
    tokenizer=tokenizer,
    eval_dataset=eval_tokenized,
    preprocess_logits_for_metrics=_logits_to_token_ids,
)

# Run evaluation
eval_results = trainer.evaluate()
print(eval_results)


In [None]:
import math
from typing import List, Optional, Dict, Any

import numpy as np
import torch

import bert_score
import evaluate
import sacrebleu
import spacy
from tqdm import tqdm

# sacrebleu 2.x does not appear to export a top-level `bleu_score`; keep the
# legacy name importable when it exists instead of failing the whole cell.
try:
    from sacrebleu import bleu_score
except ImportError:
    bleu_score = None

# Shared spaCy pipeline; medcon_batch_score reads its entities and noun chunks.
nlp = spacy.load("en_core_web_sm")

def _decode_labels(label_ids, tokenizer):
    """Decode label ids to strings. Replaces -100 with tokenizer.pad_token_id before decoding."""
    if isinstance(label_ids, np.ndarray):
        label_ids = label_ids.tolist()
    decoded = []
    for lbl in label_ids:
        if isinstance(lbl, int):
            lbl = [lbl]
        # replace -100
        lbl = [tokenizer.pad_token_id if token == -100 else token for token in lbl]
        decoded.append(tokenizer.decode(lbl, skip_special_tokens=True, clean_up_tokenization_spaces=True).strip())
    return decoded

def _decode_preds(preds, tokenizer):
    """Decode predictions (could be logits or ids). Accepts numpy arrays or torch tensors."""
    if isinstance(preds, np.ndarray):
        arr = preds
    else:
        try:
            arr = preds.detach().cpu().numpy()
        except Exception:
            arr = np.array(preds)
    if arr.ndim == 3:
        arr = np.argmax(arr, axis=-1)
    decoded = []
    for p in arr:
        if isinstance(p, (np.int64, np.int32, int)):
            p = [int(p)]
        decoded.append(tokenizer.decode([int(x) for x in p], skip_special_tokens=True, clean_up_tokenization_spaces=True).strip())
    return decoded

def medcon_batch_score(preds: List[str], refs: List[str]) -> Dict[str,float]:
    """
    Concept-overlap proxy for a MedCon-style metric.

    For each (prediction, reference) pair the lower-cased, stripped texts of
    spaCy named entities and noun chunks form a concept set. Precision, recall
    and F1 are computed from the overlap of the two sets and then
    macro-averaged over all pairs.

    Special cases per pair:
    - both sets empty          -> precision = recall = F1 = 1.0
    - prediction set empty     -> precision = recall = F1 = 0.0

    NOTE: this is a simple proxy for clinical-concept overlap; a
    scispaCy / UMLS-based matcher would be stricter.
    """
    def _concepts(text):
        doc = nlp(text or "")
        spans = list(doc.ents) + list(doc.noun_chunks)
        return {span.text.strip().lower() for span in spans if span.text.strip()}

    per_pair = []  # (precision, recall, f1) for every evaluated pair
    for pred_text, ref_text in zip(preds, refs):
        pred_concepts = _concepts(pred_text)
        ref_concepts = _concepts(ref_text)

        if not pred_concepts:
            score = 0.0 if ref_concepts else 1.0
            per_pair.append((score, score, score))
            continue

        overlap = len(pred_concepts & ref_concepts)
        precision = overlap / len(pred_concepts)
        recall = overlap / len(ref_concepts) if ref_concepts else 0.0
        total = precision + recall
        f1 = 2 * precision * recall / total if total != 0 else 0.0
        per_pair.append((precision, recall, f1))

    return {
        "medcon_precision": float(np.mean([row[0] for row in per_pair])),
        "medcon_recall": float(np.mean([row[1] for row in per_pair])),
        "medcon_f1": float(np.mean([row[2] for row in per_pair]))
    }

def validate_all_metrics(
    trainer,
    tokenizer,
    eval_dataset: Optional[Any] = None,
    max_eval_samples: Optional[int] = None,
    batch_size: Optional[int] = None,
    verbose: bool = True
) -> Dict[str, float]:
    """
    Run ``trainer.predict`` and compute BERTScore, ROUGE-L, BLEU and MedCon.

    Predictions come from a single forward pass over the labelled sequences
    (``trainer.predict``), not from free-running generation, so the scores
    measure next-token agreement with the reference text.

    Args:
        trainer: Trainer whose ``predict`` is called; also supplies the
            default dataset via ``trainer.eval_dataset``.
        tokenizer: Tokenizer used to decode prediction and label ids.
        eval_dataset: Dataset to evaluate; falls back to
            ``trainer.eval_dataset`` when ``None``.
        max_eval_samples: If set, only the first N examples are evaluated.
        batch_size: Unused; kept for API compatibility.
        verbose: Print progress messages while computing each metric.

    Returns:
        Dict with ``bert_f1``, ``rougeL``, ``bleu``, ``medcon_precision``,
        ``medcon_recall``, ``medcon_f1`` and ``n_examples``.

    Raises:
        ValueError: If no dataset is available, if the predictions carry no
            label ids, or if the tokenizer has no pad token id.
    """
    ds = eval_dataset if eval_dataset is not None else getattr(trainer, "eval_dataset", None)
    if ds is None:
        raise ValueError("No eval_dataset provided and trainer.eval_dataset is None.")

    if max_eval_samples is not None:
        ds = ds.select(range(min(max_eval_samples, len(ds))))

    if verbose:
        print(f"Running prediction on {len(ds)} examples...")
    pred_output = trainer.predict(ds, metric_key_prefix="eval")
    raw_preds = pred_output.predictions
    label_ids = pred_output.label_ids
    if label_ids is None:
        raise ValueError("trainer.predict returned no label_ids; the dataset needs a 'labels' column.")
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        raise ValueError("tokenizer.pad_token_id is None; set a pad token before decoding.")

    # Reduce logits to token ids (a no-op when the Trainer already returned ids).
    pred_ids = np.asarray(raw_preds)
    if pred_ids.ndim == 3:
        pred_ids = pred_ids.argmax(axis=-1)

    # A causal LM's output at position t is its guess for token t+1, so align
    # predictions with the labels shifted by one and blank out every position
    # whose target is ignored (-100, i.e. padding). Otherwise arbitrary tokens
    # predicted over padding would be decoded into the candidate text.
    labels_arr = np.asarray(label_ids)
    if pred_ids.ndim == 2 and pred_ids.shape == labels_arr.shape and labels_arr.shape[1] > 1:
        targets = labels_arr[:, 1:]
        pred_ids = np.where(targets == -100, pad_id, pred_ids[:, :-1])

    preds = [p.strip() for p in _decode_preds(pred_ids, tokenizer)]
    refs = [r.strip() for r in _decode_labels(label_ids, tokenizer)]

    assert len(preds) == len(refs), "Predictions and references must be same length."

    if verbose:
        print("Computing BERTScore...")
    P, R, F1 = bert_score.score(cands=preds, refs=refs, lang="en", rescale_with_baseline=True)
    bert_f1 = float(F1.mean().cpu().numpy())

    if verbose:
        print("Computing ROUGE-L...")
    rouge = evaluate.load("rouge")
    rouge_res = rouge.compute(predictions=preds, references=refs)
    rougeL = rouge_res.get("rougeL", None)
    if rougeL is None:
        rougeL = rouge_res.get("rougeLsum", 0.0)

    if verbose:
        print("Computing BLEU (sacrebleu)...")
    # corpus_bleu takes a list of reference streams; there is one stream here.
    bleu = sacrebleu.corpus_bleu(preds, [refs])

    if verbose:
        print("Computing MedCon (spaCy noun-chunk+entities overlap)...")
    medcon = medcon_batch_score(preds, refs)

    results = {
        "bert_f1": bert_f1,
        "rougeL": float(rougeL),
        # Store the numeric corpus score so it can be formatted and compared.
        "bleu": float(bleu.score),
        "medcon_precision": medcon["medcon_precision"],
        "medcon_recall": medcon["medcon_recall"],
        "medcon_f1": medcon["medcon_f1"],
        "n_examples": len(preds)
    }

    # Print nicely
    print("\n===== Validation results =====")
    print(f"Examples evaluated: {results['n_examples']}")
    print(f"BERTScore F1 (rescaled): {results['bert_f1']:.4f}")
    print(f"ROUGE-L: {results['rougeL']:.4f}")
    print(f"BLEU (sacrebleu): {results['bleu']:.2f}")
    print(f"MedCon P/R/F1: {results['medcon_precision']:.4f} / {results['medcon_recall']:.4f} / {results['medcon_f1']:.4f}")
    print("==============================\n")

    return results



In [None]:
# Score the current `trainer`: eval_dataset=None falls back to
# trainer.eval_dataset, and max_eval_samples=None evaluates every example.
results = validate_all_metrics(trainer, tokenizer, eval_dataset=None, max_eval_samples=None)

In [None]:
# Mount Google Drive at /content/drive (uses the Colab-only google.colab module).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# !cp -r llama-qlora-checkpoints /content/drive/MyDrive
# !cp -r llama3.1-qlora-clinical /content/drive/MyDrive

In [None]:
# Copy the checkpoint and adapter directories from Drive back into /content.
# NOTE(review): earlier cells read ./llama3.1-qlora-clinical, so in a fresh
# session this restore presumably has to run before them — confirm ordering.
!cp -r /content/drive/MyDrive/llama-qlora-checkpoints /content/
!cp -r /content/drive/MyDrive/llama3.1-qlora-clinical /content/