In [2]:
! pip install pubmed_parser peft datasets
! pip install --upgrade transformers peft

Collecting pubmed_parser
  Downloading pubmed_parser-0.5.1-py3-none-any.whl.metadata (17 kB)
Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting unidecode (from pubmed_parser)
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu

In [3]:
import pubmed_parser as pp
xml_file_path = "/pubmed25n0001.xml.gz"

In [5]:
dicts_out_gen = pp.parse_medline_xml(xml_file_path, year_info_only=False, nlm_category=False, author_list=True, reference_list=False)

# Convert generator to list
dicts_out = list(dicts_out_gen)

# Example: print first article's title and abstract
print("Title:", dicts_out[0]['title'])
print("Abstract:", dicts_out[0]['abstract'])

Title: Formate assay in body fluids: application in methanol poisoning.
Abstract: 


In [6]:
filtered_articles = [
    article for article in dicts_out
    if article.get('abstract') and len(article['abstract'].strip()) > 50  # example threshold
]
print(f"Filtered dataset size: {len(filtered_articles)}")

Filtered dataset size: 15377


In [7]:
instruction = "Summarize the following biomedical abstract."

train_samples = []
for article in filtered_articles:
    sample = {
        "instruction": instruction,
        "input": article['abstract'],
        "output": article['abstract']  # for simple LM fine-tuning, output can be the same text
    }
    train_samples.append(sample)

In [8]:
import json

with open("pubmed_instruction_data.jsonl", "w") as f:
    for sample in train_samples:
        f.write(json.dumps(sample) + "\n")


In [9]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="pubmed_instruction_data.jsonl", split="train")

# Split into 90% train and 10% validation
split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]


Generating train split: 0 examples [00:00, ? examples/s]

In [10]:
import os
os.environ["HF_TOKEN"] = "hf_myToken"

In [11]:
from transformers import AutoTokenizer

# Load tokenizer (replace with your model's tokenizer)
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

def preprocess(example):
    prompt = f"Instruction: {example['instruction']}\nInput: {example['input']}\nOutput:"
    inputs = tokenizer(prompt, truncation=True, max_length=512, padding="max_length")
    outputs = tokenizer(example['output'], truncation=True, max_length=128, padding="max_length")

    inputs["labels"] = outputs["input_ids"]
    return inputs

dataset = dataset.map(preprocess, remove_columns=dataset.column_names)

train_dataset = train_dataset.map(preprocess, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(preprocess, remove_columns=eval_dataset.column_names)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/15377 [00:00<?, ? examples/s]

Map:   0%|          | 0/13839 [00:00<?, ? examples/s]

Map:   0%|          | 0/1538 [00:00<?, ? examples/s]

In [12]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")


from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
! pip install peft



In [13]:
from peft import LoraConfig, get_peft_model, TaskType

# Define LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # since flan-t5 is seq2seq
    inference_mode=False,              # training mode
    r=8,                             # LoRA rank, adjust as needed (4,8,16)
    lora_alpha=32,                   # scaling factor
    lora_dropout=0.1,                # dropout on LoRA layers
    target_modules=["q", "v"],       # attention query and value projection layers
    bias="none"
)


In [14]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # to verify only LoRA params are trainable


trainable params: 344,064 || all params: 77,305,216 || trainable%: 0.4451


In [15]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-lora-finetuned",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=3e-4,
    fp16=True,
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    eval_strategy="no",
    predict_with_generate=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,  # optional, can be omitted
    # label_names=["labels"],  # explicitly specify label names
)


  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [16]:
prompt = "Instruction: Summarize the following biomedical abstract.\nInput: The study investigates gene X...\nOutput:"

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
generated_ids = model.generate(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask)



# ######## or ######### #

prompt = "Instruction: Summarize the following biomedical abstract.\nInput: The study investigates gene X...\nOutput:"

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

generated_ids = model.generate(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, max_new_tokens=128)

output_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

print(output_text)

The study demonstrates that a gene X is a molecule that is a molecule that is a molecule that is a molecule that is a molecule that is a molecule that is a molecule that is a molecule that is a molecule that is a molecule that is a molecule that is a molecule that is a molecule that is a molecule that is a molecule that is a molecule that is a molecule that is a molecule that is a molecule that is a


In [None]:
pip install --upgrade transformers peft



In [17]:
# Evaluate the model on the test dataset

os.environ["WANDB_API_KEY"] = "744_my_token"
trainer.evaluate()


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


[34m[1mwandb[0m: Currently logged in as: [33mhanab86635[0m ([33mhanab86635-exitings[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


{'eval_loss': nan,
 'eval_model_preparation_time': 0.0076,
 'eval_runtime': 16.3065,
 'eval_samples_per_second': 94.318,
 'eval_steps_per_second': 11.836}

In [18]:
model.save_pretrained("./flan-t5-lora-finetuned")
tokenizer.save_pretrained("./flan-t5-lora-finetuned")


('./flan-t5-lora-finetuned/tokenizer_config.json',
 './flan-t5-lora-finetuned/special_tokens_map.json',
 './flan-t5-lora-finetuned/spiece.model',
 './flan-t5-lora-finetuned/added_tokens.json')

In [19]:
model = model.merge_and_unload()