In [None]:
# Core libraries and HF datasets
import os, json, shutil, zipfile
import numpy as np
from datasets import Dataset

from google.colab import drive
drive.mount("/content/drive")


Mounted at /content/drive


### Load M3 pairs from Drive (already cleaned/aligned)

In [None]:
# Load cleaned M3 pairs from Drive and add task/level control tokens
def load_m3_from_drive(path, task="M3"):
    out = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            rec  = json.loads(line)
            gran = rec["granularity"]       # "DOC", "SENT", "CLAIM"
            src  = rec["input_text"]
            tgt  = rec["target_text"]

            inp  = f"<TASK:{task}> <LEVEL:{gran}> " + src
            out.append({
                "input_text":  inp,
                "target_text": tgt,
                "granularity": gran
            })
    return out

m3_pairs = load_m3_from_drive("/content/drive/MyDrive/capstone/m3_all_pairs.jsonl")
print("Loaded M3 pairs:", len(m3_pairs))


Loaded M3 pairs: 1147


### Load OSE pairs from Drive and harmonize fields

In [None]:
# Load OSE cleaned file and convert to same schema (input/target/granularity)
def load_ose_pairs(path):
    rows = []
    print(f"Loading OSE pairs from: {path}")
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            rec = json.loads(line)

            src  = rec["source"]            # original OSE article
            tgt  = rec["target"]            # simplified text
            gran = rec.get("level", "EASY") # EASY / MED / HARD

            ctl = f"<TASK:OSE> <LEVEL:{gran}> "
            processed = {
                "input_text":  ctl + src,
                "target_text": tgt,
                "granularity": gran
            }
            if i == 0:
                print("Sample raw record:", rec)
                print("\nSample processed record:", processed)
                print("-" * 60)
            rows.append(processed)
    print(f"Loaded {len(rows)} OSE pairs.")
    return rows

ose_pairs = load_ose_pairs("/content/drive/MyDrive/capstone/ose_clean.jsonl")


Loading OSE pairs from: /content/drive/MyDrive/capstone/ose_clean.jsonl
Sample raw record: {'source': 'When you see the word Amazon, what\'s the first thing that springs to mind - the world\'s biggest forest, the longest river or the largest internet retailer - and which do you consider most important?\nThese questions have risen to the fore in an arcane, but hugely important, debate about how to redraw the boundaries of the internet. Brazil and Peru have lodged objections to a bid made by the §§PROT0§§ e-commerce giant for a prime new piece of cyberspace: ".amazon".\nThe Seattle-based company has applied for its brand to be a top-level domain name (currently ".com"), but the South American governments argue this would prevent the use of this internet address for environmental protection, the promotion of indigenous rights and other public interest uses.\nAlong with dozens of other disputed claims to names, including ".patagonia" and ".shangrila", the issue cuts to the heart of debates

### Combine OSE + M3 and build an HF Dataset with a stratified split

In [None]:
# Combine OSE + M3, then convert to HuggingFace Dataset and split
all_pairs = m3_pairs + ose_pairs
print(f"\nTotal combined dataset size: {len(all_pairs)}")
print(f"M3 pairs: {len(m3_pairs)} | OSE pairs: {len(ose_pairs)}")

full_ds = Dataset.from_list(all_pairs)
print("Full dataset features:", full_ds.features)

# Encode 'granularity' as a ClassLabel so we can stratify
full_ds = full_ds.class_encode_column("granularity")
label_names = full_ds.features["granularity"].names
print("Granularity label names:", label_names)

# Stratified train/val split by granularity
train_test = full_ds.train_test_split(
    test_size=0.15,
    seed=42,
    stratify_by_column="granularity"
)
train_ds = train_test["train"]
val_ds   = train_test["test"]

print("Train samples:", len(train_ds))
print("Val samples:", len(val_ds))



Total combined dataset size: 1714
M3 pairs: 1147 | OSE pairs: 567
Full dataset features: {'input_text': Value('string'), 'target_text': Value('string'), 'granularity': Value('string')}


Casting to class labels:   0%|          | 0/1714 [00:00<?, ? examples/s]

Granularity label names: ['CLAIM', 'DOC', 'EASY', 'MED', 'SENT']
Train samples: 1456
Val samples: 258


### Tokenizer with all task/level control tokens

In [None]:
from transformers import AutoTokenizer

# BART tokenizer with all TASK + LEVEL control tokens
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base", use_fast=True)
special_tokens = {
    "additional_special_tokens": [
        "<TASK:OSE>", "<TASK:M3>",
        "<LEVEL:EASY>", "<LEVEL:MED>", "<LEVEL:HARD>",
        "<LEVEL:DOC>", "<LEVEL:SENT>", "<LEVEL:CLAIM>",
    ]
}
tokenizer.add_special_tokens(special_tokens)
print("Tokenizer size (with extra tokens):", len(tokenizer))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Tokenizer size (with extra tokens): 50273


### Preprocess (dynamic max length per granularity) and tokenize

In [None]:
# Max lengths for different granularities
max_src_len_doc   = 1024   # DOC + long OSE articles
max_src_len_small = 256    # SENT / CLAIM (shorter)
max_tgt_len       = 256    # summaries

def preprocess(batch):
    # Decide max source length per example, using integer labels -> names
    max_src_per_example = []
    for g_id in batch["granularity"]:
        gran = label_names[g_id]  # "DOC", "SENT", "CLAIM", "EASY", "MED", "HARD"
        if gran in ["DOC", "EASY", "MED", "HARD"]:
            max_src_per_example.append(max_src_len_doc)
        else:  # "SENT", "CLAIM"
            max_src_per_example.append(max_src_len_small)

    input_ids = []
    attention_masks = []
    for text, max_len in zip(batch["input_text"], max_src_per_example):
        enc = tokenizer(
            text,
            max_length=max_len,
            truncation=True,
            padding="max_length",
        )
        input_ids.append(enc["input_ids"])
        attention_masks.append(enc["attention_mask"])

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["target_text"],
            max_length=max_tgt_len,
            truncation=True,
            padding="max_length",
        )

    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "labels": labels["input_ids"],
    }

# Apply preprocessing to train/val sets
train_tokenized = train_ds.map(
    preprocess,
    batched=True,
    remove_columns=train_ds.column_names,
)
val_tokenized = val_ds.map(
    preprocess,
    batched=True,
    remove_columns=val_ds.column_names,
)

print("Tokenized train size:", len(train_tokenized))
print("Tokenized val size:", len(val_tokenized))
print("Example input length:", len(train_tokenized[0]["input_ids"]))
print("Example label length:", len(train_tokenized[0]["labels"]))


Map:   0%|          | 0/1456 [00:00<?, ? examples/s]



Map:   0%|          | 0/258 [00:00<?, ? examples/s]

Tokenized train size: 1456
Tokenized val size: 258
Example input length: 1024
Example label length: 256


### Copy + unzip Phase-1 model from Drive (your base joint model)

In [None]:
# Copy base Phase-1 model zip from Drive and unzip to ./p1_model
zip_file_path  = "/content/drive/MyDrive/capstone/base_modell.zip"  # note double 'l' in Drive
destination_path = "./base_model.zip"

if os.path.exists(zip_file_path):
    shutil.copy(zip_file_path, destination_path)
    print(f"Copied {zip_file_path} to {destination_path}")
else:
    print(f"File not found: {zip_file_path}")

zipfile_path = "base_model.zip"
with zipfile.ZipFile(zipfile_path, "r") as zip_ref:
    zip_ref.extractall(".")
print("Extracted to ./p1_model:", os.path.exists("./p1_model"))


Copied /content/drive/MyDrive/capstone/base_modell.zip to ./base_model.zip
Extracted to ./p1_model: True


### Load model, resize embeddings, and create data collator

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

# Load Phase 1 checkpoint and align embeddings with tokenizer
model_dir = "./p1_model"
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
model.resize_token_embeddings(len(tokenizer))

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
print("Model and data collator ready.")


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Model and data collator ready.


### ROUGE metric + compute_metrics function

In [None]:
# Install evaluation libs once
!pip install -q evaluate rouge_score

from evaluate import load
rouge = load("rouge")

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds  = np.where(preds  != -100, preds,  tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds  = tokenizer.batch_decode(preds,  skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )
    # Normalize different rouge implementations
    first_val = list(result.values())[0]
    if hasattr(first_val, "mid"):
        return {k: v.mid.fmeasure for k, v in result.items()}
    else:
        return result


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


Downloading builder script: 0.00B [00:00, ?B/s]

### Training helper run_experiment and running your chosen LR

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

os.environ["WANDB_DISABLED"] = "true"  # keep wandb off

def run_experiment(lr, output_name):
    print(f"\n===== Running LR = {lr}, output = {output_name} =====\n")

    args = Seq2SeqTrainingArguments(
        output_dir=output_name,
        predict_with_generate=True,
        generation_max_length=128,
        learning_rate=lr,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=16,
        num_train_epochs=10,
        fp16=True,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="rougeL",
        label_smoothing_factor=0.1,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=args,
        train_dataset=train_tokenized,
        eval_dataset=val_tokenized,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    metrics = trainer.evaluate()
    print("\n===== Final Evaluation Metrics =====")
    print(metrics)
    return metrics

# Example: only run the best LR you already selected, e.g. 2e-5
results_lr2e5 = run_experiment(5e-5, "./joint_lr2e5")



Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



===== Running LR = 5e-05, output = ./joint_lr2e5 =====



  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,2.105377,0.337118,0.169869,0.276901,0.293766
2,No log,2.022522,0.342241,0.174053,0.281914,0.299128
3,No log,1.992805,0.350849,0.183987,0.292044,0.308744
4,No log,1.95933,0.355104,0.184663,0.294425,0.311683
5,No log,1.940768,0.366169,0.190112,0.303805,0.32081
6,No log,1.929272,0.375558,0.197572,0.311777,0.328889
7,No log,1.921053,0.373985,0.198542,0.311301,0.328761
8,No log,1.915667,0.376684,0.201625,0.313532,0.329838
9,No log,1.910687,0.376946,0.203825,0.315921,0.332489
10,No log,1.910458,0.38024,0.205272,0.319088,0.335978


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



===== Final Evaluation Metrics =====
{'eval_loss': 1.9104578495025635, 'eval_rouge1': 0.3802398408145955, 'eval_rouge2': 0.20527154667809241, 'eval_rougeL': 0.31908818007524065, 'eval_rougeLsum': 0.3359782036345098, 'eval_runtime': 209.2357, 'eval_samples_per_second': 1.233, 'eval_steps_per_second': 1.233, 'epoch': 10.0}


In [None]:
results_lr2e5 = run_experiment(2e-5, "./joint_lr2e5")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



===== Running LR = 2e-05, output = ./joint_lr2e5 =====



  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,1.901279,0.378684,0.202855,0.316299,0.333035
2,No log,1.895283,0.379386,0.205935,0.318898,0.33614
3,No log,1.896107,0.385588,0.208759,0.322447,0.340625
4,No log,1.894798,0.386798,0.207278,0.322589,0.339781
5,No log,1.886383,0.388123,0.210422,0.324688,0.34087
6,No log,1.885346,0.38886,0.210252,0.323643,0.340852
7,No log,1.881209,0.385027,0.205701,0.319183,0.33654
8,No log,1.880196,0.392503,0.212156,0.328102,0.345422
9,No log,1.876329,0.389914,0.21025,0.325547,0.342455
10,No log,1.876983,0.392807,0.212907,0.328859,0.34644


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



===== Final Evaluation Metrics =====
{'eval_loss': 1.876983404159546, 'eval_rouge1': 0.3928071099201247, 'eval_rouge2': 0.21290727034799187, 'eval_rougeL': 0.3288587008046291, 'eval_rougeLsum': 0.34643956361576356, 'eval_runtime': 217.0064, 'eval_samples_per_second': 1.189, 'eval_steps_per_second': 1.189, 'epoch': 10.0}


### Final evaluation-only Trainer (optional, for a clean “joint” metric)

### Qualitative examples for a few validation indices

In [None]:
# Find one example index for each granularity among M3-like labels
doc_idx = sent_idx = claim_idx = None

for i, ex in enumerate(val_ds):
    g_id = ex["granularity"]
    gran = label_names[g_id]  # e.g. "DOC", "SENT", "CLAIM", "EASY", ...

    if gran == "DOC" and doc_idx is None:
        doc_idx = i
    elif gran == "SENT" and sent_idx is None:
        sent_idx = i
    elif gran == "CLAIM" and claim_idx is None:
        claim_idx = i

    if doc_idx is not None and sent_idx is not None and claim_idx is not None:
        break

print("DOC idx:", doc_idx)
print("SENT idx:", sent_idx)
print("CLAIM idx:", claim_idx)

def show_example(idx):
    ex = val_ds[idx]
    g_id = ex["granularity"]
    gran = label_names[g_id]

    src = ex["input_text"]
    tgt = ex["target_text"]

    max_src = 1024 if gran in ["DOC", "EASY", "MED", "HARD"] else 256
    max_tgt = 256

    print(f"\n=== {gran} example (idx={idx}) ===")
    print("INPUT (truncated):\n", src[:600], "...\n")
    print("REFERENCE SUMMARY:\n", tgt[:600], "...\n")

    inputs = tokenizer(
        src,
        return_tensors="pt",
        truncation=True,
        max_length=max_src,
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    model.eval()
    with torch.no_grad():
        gen_ids = model.generate(
            **inputs,
            max_length=max_tgt,
            num_beams=4,
        )
    pred = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)[0]
    print("MODEL SUMMARY:\n", pred[:600], "...\n")

# Call on a few indices
show_example(doc_idx)
show_example(sent_idx)
show_example(claim_idx)


DOC idx: 3
SENT idx: 2
CLAIM idx: 0

=== DOC example (idx=3) ===
INPUT (truncated):
 <TASK:M3> <LEVEL:DOC> PURPOSE: To evaluate the association of myocilin (MYOC), optineurin (OPTN), and apolipoprotein E (APOE) genes and their interactions in primary open angle glaucoma (POAG). METHODS: A cohort of 400 unrelated POAG patients (294 high tension glaucoma, HTG, and 106 normal tension glaucoma, NTG) and 281 unrelated control subjects were recruited. All coding exons and splicing junctions in MYOC and OPTN were screened for sequence alterations. Common polymorphisms in APOE were genotyped. Single genes were investigated by univariate and haplotype analysis, and gene-gene interactio ...

REFERENCE SUMMARY:
 Our meta-analysis provides strong evidence that the APOE ε2/ε3/ε4 polymorphism is not associated with POAG susceptibility in any populations. ...

MODEL SUMMARY:
 CONCLUSIONS: This meta-analysis suggests that myocilin ε4/ε3/ε4 polymorphisms are associated with increased risk of POAG. ...


# **Lr = 5e-5 Examples:**


*   DOC idx: 3
*   SENT idx: 2
*   CLAIM idx: 0




### **=== DOC example (idx=3) ===**
INPUT (truncated):
 <TASK:M3> <LEVEL:DOC> PURPOSE: To evaluate the association of myocilin (MYOC), optineurin (OPTN), and apolipoprotein E (APOE) genes and their interactions in primary open angle glaucoma (POAG). METHODS: A cohort of 400 unrelated POAG patients (294 high tension glaucoma, HTG, and 106 normal tension glaucoma, NTG) and 281 unrelated control subjects were recruited. All coding exons and splicing junctions in MYOC and OPTN were screened for sequence alterations. Common polymorphisms in APOE were genotyped. Single genes were investigated by univariate and haplotype analysis, and gene-gene interactio ...

REFERENCE SUMMARY:
 Our meta-analysis provides strong evidence that the APOE ε2/ε3/ε4 polymorphism is not associated with POAG susceptibility in any populations. ...

MODEL SUMMARY:
 AUTHORS' CONCLUSIONS: This meta-analysis suggests that the ε4/ε4 polymorphism is associated with a significantly increased risk of POAG. However, the evidence for a relationship between ε2/ε3 polymorphisms and POAG is inconclusive. ...


### **=== SENT example (idx=2) ===**
INPUT (truncated):
 <TASK:M3> <LEVEL:SENT> Monthly visits and a morphology-driven PRN regimen with 3 injections in case of recurrence plus quarterly injections in case of inactive CNV resulted in an average VA gain of 7-9 letters that could be maintained over 3 years. 50% of patients had 6/18 or better over 4 years. No new safety signals were identified in patients who received ranibizumab for a total of 3 years. The results of this 3-year cohort showed that the initial average acuity could be maintained over 36 months, which was comparable to those of many other clinical cohorts. Approxim ...

REFERENCE SUMMARY:
 The results of this meta-analysis clearly indicate that intravitreal anti-VEGF injection therapy is capable of maintaining visual acuity on a long-term basis of at least 4-5 years. ...

MODEL SUMMARY:
 The results of this meta-analysis showed that anti-VEGF agents were effective in maintaining visual acuity in patients with exudative AMD. ...


### **=== CLAIM example (idx=0) ===**
INPUT (truncated):
 <TASK:M3> <LEVEL:CLAIM> We did not find any statistically significant difference between the groups with respect to the levels of vitamin B6 and B12 (p>0.05), but the level of folate was lowest in Group 3 (p<0.001). Folate, vitamin B12 and B6 levels were significantly decreased and associated with elevated Hcy levels in PEXG. ...

REFERENCE SUMMARY:
 PEXG is associated with elevated plasma tHcy and low serum folic acid levels, but not serum vitamin B12, vitamin B6 levels, and MTHFR C677T genotype. ...

MODEL SUMMARY:
 PEXG is associated with elevated plasma tHcy, serum folic acid, serum vitamin B6, and MTHFR C677T genotype. ...


# **Discussion**
The joint OSE+M3 fine-tuned model demonstrates clear improvements in summarization quality, reaching ROUGE-1 ≈ 0.38 and ROUGE-L ≈ 0.32, substantially higher than both the Phase-1 baseline and the M3-only model. This confirms that combining OSE (which teaches simplification and clarity) with M3 (which provides dense biomedical content) leads to summaries that are more fluent, structured, and aligned with expert-written references. The model consistently identifies the central topic and produces coherent scientific statements across DOC, SENT, and CLAIM levels.

A qualitative comparison of checkpoints shows that lr = 2e-5 generally produces cleaner and more faithful summaries than lr = 5e-5. For example, the 5e-5 model occasionally shifted to unrelated diseases or gene families, whereas the 2e-5 model remained more on-topic and reduced the most severe hallucinations. In SENT and CLAIM examples, the 2e-5 model preserved the correct clinical context more accurately, although subtle factual inversions (e.g., flipping whether an association exists or not) still occurred.

Overall, the joint model performs strongly in linguistic quality and captures the intended meaning better than earlier stages, but it still exhibits biomedical hallucinations, typically small but sometimes clinically significant. This behavior is expected for abstractive models like BART, which generalize aggressively. The model is therefore well-suited for producing readable scientific summaries, but it should not be relied upon for tasks requiring strict factual correctness without human verification.

In [1]:
import nbformat

# 1) Path to the notebook you want to clean
path = "/content/Capstone_phase2_joint.ipynb"

# 2) Read the notebook
nb = nbformat.read(path, as_version=4)

removed_any = False

# 3) Remove notebook-level widgets metadata
if "widgets" in nb.metadata:
    print("Removing notebook-level metadata.widgets")
    nb.metadata.pop("widgets")
    removed_any = True

# 4) Remove widgets from each cell's metadata and outputs
for ci, cell in enumerate(nb.cells):
    # cell.metadata.widgets
    md = cell.get("metadata", {})
    if "widgets" in md:
        print(f"Removing cell {ci} metadata.widgets")
        md.pop("widgets")
        removed_any = True

    # output.metadata.widgets for each output in the cell
    for oi, out in enumerate(cell.get("outputs", [])):
        omd = out.get("metadata", {})
        if "widgets" in omd:
            print(f"Removing cell {ci}, output {oi} metadata.widgets")
            omd.pop("widgets")
            removed_any = True

if not removed_any:
    print("No metadata.widgets found at notebook, cell, or output level.")

# 5) Save the cleaned notebook back to the same file
nbformat.write(nb, path)
print("Cleaned notebook saved to:", path)


Removing notebook-level metadata.widgets
Cleaned notebook saved to: /content/Capstone_phase2_joint.ipynb
