In [None]:
import os
# MUST set this BEFORE importing torch/transformers
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'

import sklearn
import numpy as np
import pandas as pd
import time, re, json
from sklearn.model_selection import train_test_split

In [None]:
# Text file with the article links/PMCIDs, one entry per line
input_articles_pmcids = "scripts/exp_input/REV.txt"

# Ground-truth spreadsheets for the snippet-extraction task; both live in the
# same folder, so the paths are built from one shared prefix.
_GROUND_TRUTH_DIR = "scripts/Local_model_finetuning/ground_truth"
ground_truth_complete = f"{_GROUND_TRUTH_DIR}/gt_dataset_info_extraction_from_snippet.xlsx"
ground_truth_no_dspage = f"{_GROUND_TRUTH_DIR}/gt_dataset_info_no_dspage_extraction_from_snippet.xlsx"

In [None]:
# Load the article links, one per line. Each line is stripped, blank lines are
# skipped and duplicates are removed (first occurrence is kept): an empty or
# padded entry would never match a `url` value, and a duplicated entry could be
# assigned to both the train and the test split.
with open(input_articles_pmcids, 'r', encoding='utf-8') as f:
    pmc_links = list(dict.fromkeys(
        line.strip() for line in f.read().splitlines() if line.strip()
    ))

print("Total number of PMCIDs:", len(pmc_links))

# Split at article level (80% train / 20% test) with a fixed seed so the
# partition is reproducible across runs.
train_pmc_links, test_pmc_links = train_test_split(pmc_links, test_size=0.2, random_state=42)

print(f"Training set: {len(train_pmc_links)}")
print(f"Test set: {len(test_pmc_links)}")

In [None]:
# Load the `no_dspage` ground-truth spreadsheet; it is divided into the train
# and test frames in the next cell.
train_test_df = pd.read_excel(ground_truth_no_dspage)

In [None]:
# Route every ground-truth row to the split that contains its article URL
url_column = train_test_df['url']
train_df = train_test_df[url_column.isin(train_pmc_links)]
test_df = train_test_df[url_column.isin(test_pmc_links)]

# Row counts per split; "Total matched" can be compared with the original size
# to spot rows whose URL is in neither list.
n_train_rows, n_test_rows = len(train_df), len(test_df)
print(f"Original DataFrame: {len(train_test_df)} rows")
print(f"Train DataFrame: {n_train_rows} rows")
print(f"Test DataFrame: {n_test_rows} rows")
print(f"Total matched: {n_train_rows + n_test_rows} rows")

In [None]:
# Examine the input and output text to understand the task.
# Rows with a missing input or output are skipped for the preview (they are
# also dropped before training), so a NaN in the first row or an empty frame
# no longer raises here. Averages are computed over these complete rows.
preview_df = train_df.dropna(subset=['input_text', 'output_text'])
separator = "\n" + "="*80 + "\n"

if preview_df.empty:
    print("No rows with both input_text and output_text available to preview.")
else:
    # Cast to str so slicing and .str.len() also work for non-string cells
    preview_inputs = preview_df['input_text'].astype(str)
    preview_outputs = preview_df['output_text'].astype(str)

    print("Sample input_text:")
    print(preview_inputs.iloc[0][:500])
    print(separator)
    print("Sample output_text:")
    print(preview_outputs.iloc[0][:500])
    print(separator)
    print(f"Average input length: {preview_inputs.str.len().mean():.0f} chars")
    print(f"Average output length: {preview_outputs.str.len().mean():.0f} chars")

# Model Training Strategy

## Task: Dataset Information Extraction
Extract structured dataset identifiers and repository references from scientific text.

#### Approach: **Fine-tuned T5/Flan-T5** (Recommended)
- Designed for text-to-text generation
- Good at structured output
- Fast inference
- Model: `google/flan-t5-base` (250M params)

In [None]:
# Prepare data in the format needed for training
from datasets import Dataset

def prepare_dataset(df):
    """Build a HuggingFace ``Dataset`` with ``input``/``output`` text columns.

    Rows where ``input_text`` or ``output_text`` is missing are discarded, the
    remaining values are cast to ``str``, and the number of discarded rows is
    printed.

    Args:
        df: DataFrame with ``input_text`` and ``output_text`` columns.

    Returns:
        Dataset with one ``input`` and one ``output`` string per kept row.
    """
    text_columns = ['input_text', 'output_text']

    # Keep only rows where both text fields are present
    complete_rows = df.dropna(subset=text_columns)
    n_dropped = len(df) - len(complete_rows)

    # Force plain strings so every example has a uniform type
    records = {
        'input': complete_rows['input_text'].astype(str).tolist(),
        'output': complete_rows['output_text'].astype(str).tolist(),
    }

    print(f"  Filtered out {n_dropped} rows with missing values")
    return Dataset.from_dict(records)

# Convert both splits to HuggingFace datasets
print("Preparing train dataset...")
train_dataset = prepare_dataset(train_df)

print("\nPreparing test dataset...")
test_dataset = prepare_dataset(test_df)

# Report split sizes, then show the first training example
print(f"\nTrain dataset: {len(train_dataset)} examples\nTest dataset: {len(test_dataset)} examples")
print("\nSample:")
first_example = train_dataset[0]
print(f"Input: {first_example['input'][:200]}...")
print(f"Output: {first_example['output']}")

# Fine-tune Flan-T5

This approach uses:
- **Model**: `google/flan-t5-base` - instruction-tuned T5 model
- **Training**: Full fine-tuning of all model weights (no adapter/PEFT method is configured in this notebook)
- **Framework**: HuggingFace Transformers + Trainer API

In [None]:
# Install required packages (run once)
!pip install -q transformers datasets accelerate evaluate rouge-score sentencepiece

In [None]:
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import evaluate
import numpy as np

# Checkpoint to fine-tune
model_name = "google/flan-t5-base"  # 250M parameters
print(f"Loading model: {model_name}")

# Tokenizer and seq2seq model both come from the same checkpoint name
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForSeq2SeqLM.from_pretrained(model_name),
)

param_count = model.num_parameters()
print(f"Model loaded: {param_count:,} parameters")

In [None]:
# Tokenize the datasets
def preprocess_function(examples):
    """Tokenize a batch of examples for seq2seq training.

    Every input text is prefixed with a fixed task instruction and truncated to
    512 tokens; every target text is truncated to 256 tokens. No padding is
    applied here.

    Args:
        examples: batch with ``input`` and ``output`` lists of strings.

    Returns:
        The tokenized inputs with the target token ids stored under ``labels``.
    """
    task_prefix = "Extract dataset information: "
    prefixed_inputs = [task_prefix + text for text in examples['input']]

    encoded = tokenizer(prefixed_inputs, max_length=512, truncation=True, padding=False)
    target_encoding = tokenizer(
        text_target=examples['output'],
        max_length=256,
        truncation=True,
        padding=False,
    )

    encoded["labels"] = target_encoding["input_ids"]
    return encoded

# Tokenize both splits in batches; the raw text columns are dropped so only the
# tokenizer outputs remain.
tokenized_train = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names,
)
tokenized_test = test_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=test_dataset.column_names,
)

n_tokenized_train, n_tokenized_test = len(tokenized_train), len(tokenized_test)
print(f"Tokenized train dataset: {n_tokenized_train} examples")
print(f"Tokenized test dataset: {n_tokenized_test} examples")

In [None]:
# Setup evaluation metrics
# The ROUGE metric is loaded once here and reused by compute_metrics below.
rouge = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Compute ROUGE and exact-match scores for generated predictions.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case its first
            element is used.

    Returns:
        dict: the scores returned by ``rouge.compute`` scaled to 0-100 and
        rounded to two decimals, plus ``exact_match`` (percentage of decoded
        predictions equal to their reference after stripping whitespace).
        If decoding fails or there is nothing to score, ``rouge1``, ``rouge2``,
        ``rougeL`` and ``exact_match`` are all returned as 0.0.
    """
    zero_scores = {
        "rouge1": 0.0,
        "rouge2": 0.0,
        "rougeL": 0.0,
        "exact_match": 0.0
    }

    preds, labels = eval_preds

    # Decode predictions
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    max_id = tokenizer.vocab_size - 1

    # -100 marks ignored/padded positions. Map it to the pad token explicitly
    # for both arrays instead of relying on the clip below, which would turn
    # -100 into id 0 regardless of what the pad id actually is.
    labels = np.where(labels != -100, labels, pad_id)
    preds = np.where(preds != -100, preds, pad_id)

    # Clip values to valid range to prevent overflow while decoding
    labels = np.clip(labels, 0, max_id)
    preds = np.clip(preds, 0, max_id)

    try:
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    except (OverflowError, ValueError) as e:
        print(f"Warning: Decoding error: {e}")
        # Return default metrics on error
        return dict(zero_scores)

    # Nothing to score: avoid the division by zero in the exact-match ratio
    if not decoded_preds:
        return dict(zero_scores)

    # Compute ROUGE scores
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Express the scores as percentages
    result = {k: round(v * 100, 2) for k, v in result.items()}

    # Compute exact match (for structured output)
    n_exact = sum(p.strip() == l.strip() for p, l in zip(decoded_preds, decoded_labels))
    result["exact_match"] = round(n_exact / len(decoded_preds) * 100, 2)

    return result

In [None]:
# Directory that receives the checkpoints and, later, the final model
output_dir = "scripts/Local_model_finetuning/flan-t5-models"

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    seed=42,  # reproducibility

    # Optimization schedule
    num_train_epochs=5,
    learning_rate=3e-4,
    weight_decay=0.01,
    warmup_steps=100,

    # Small per-device batches because of memory constraints; 8 accumulation
    # steps give an effective train batch size of 2 * 8 = 16
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,

    # Evaluate and checkpoint once per epoch, log every 50 steps
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,

    # Evaluation generates text, capped at 256 tokens
    predict_with_generate=True,
    generation_max_length=256,

    # Keep at most two checkpoints and reload the one with the highest
    # exact_match when training ends
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="exact_match",
    greater_is_better=True,

    # Mixed precision is off; set to True if you have GPU with fp16 support
    fp16=False,
)

effective_batch_size = (
    training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
)
print("Training configuration:")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Effective batch size: {effective_batch_size}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  Output directory: {output_dir}")

In [None]:
import inspect

# Initialize data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Newer transformers releases take the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Pick whichever keyword the installed Seq2SeqTrainer accepts.
_trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# Initialize trainer
# NOTE(review): the test split is used as eval_dataset, and training_args
# reloads the best checkpoint by exact_match on it, so scores reported on this
# split are optimistic. Consider a separate validation split for selection.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

print("Trainer initialized successfully!")
print(f"Training samples: {len(tokenized_train)}")
print(f"Evaluation samples: {len(tokenized_test)}")

In [None]:
# Clear memory before training
import torch
import gc

# Release cached MPS allocations when the MPS backend is available, then run
# the Python garbage collector.
mps_available = torch.backends.mps.is_available()
if mps_available:
    torch.mps.empty_cache()
gc.collect()

print("Memory cleared. Ready to train.")

In [None]:
# Run the Flan-T5 fine-tuning loop
print("Starting training...")
train_result = trainer.train()

# Summarize the run
print("\nTraining completed!")
print(f"Training loss: {train_result.training_loss:.4f}")
train_runtime_s = train_result.metrics['train_runtime']
print(f"Training time: {train_runtime_s:.2f} seconds")

# Store model weights and tokenizer files in the same folder
final_model_dir = f"{output_dir}/final_model"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)
print(f"\nModel saved to {final_model_dir}")

# Fine-tune a more recent model

In [None]:
# Fine-tune Qwen3-0.6B-Base (causal LM)

%pip install -q "transformers>=4.51.0" accelerate datasets sentencepiece

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)

model_name_qwen = "Qwen/Qwen3-0.6B-Base"
print(f"Loading model: {model_name_qwen}")

# Fixed sequence length used when tokenizing the training prompts; kept short
# to avoid OOM and to simplify padding
MAX_LEN_QWEN = 768

# Tokenizer: make sure a pad token exists (falling back to EOS) and pad on the
# right-hand side
tokenizer_qwen = AutoTokenizer.from_pretrained(model_name_qwen)
if tokenizer_qwen.pad_token is None:
    tokenizer_qwen.pad_token = tokenizer_qwen.eos_token
tokenizer_qwen.padding_side = "right"

# Causal language model loaded from the same checkpoint
model_qwen = AutoModelForCausalLM.from_pretrained(model_name_qwen)

def preprocess_causal(examples):
    """Build fixed-length causal-LM training rows from snippet/answer pairs.

    Each row is the instruction, the snippet, an "Answer: " marker and the
    target answer, tokenized and right-padded to ``MAX_LEN_QWEN`` tokens.

    Prompts that fit within ``MAX_LEN_QWEN`` are tokenized exactly as one
    string. Prompts that are too long are no longer truncated on the right
    (which cut off the answer and left only prompt text to train on): the
    snippet is shortened from its end so that the answer is always kept.

    Args:
        examples: batch with parallel ``"input"`` and ``"output"`` lists of
            strings.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``; every row
        has length ``MAX_LEN_QWEN`` and ``labels`` is a copy of ``input_ids``.
    """
    instruction = (
        "Extract dataset information from the following snippet.\n"
        "Snippet:\n"
    )
    answer_marker = "\n\nAnswer: "
    pad_id = tokenizer_qwen.pad_token_id

    heads = [f"{instruction}{inp}" for inp in examples["input"]]
    tails = [f"{answer_marker}{out}" for out in examples["output"]]
    prompts = [head + tail for head, tail in zip(heads, tails)]
    if not prompts:
        return {"input_ids": [], "attention_mask": [], "labels": []}

    # First pass: tokenize the complete prompts without truncation so rows that
    # already fit keep the same tokenization as a single-string prompt.
    full_ids = tokenizer_qwen(prompts)["input_ids"]

    input_ids, attention_mask = [], []
    for ids, head, tail in zip(full_ids, heads, tails):
        if len(ids) > MAX_LEN_QWEN:
            # Too long: reserve room for the answer first, then give whatever
            # budget is left to the instruction + snippet part.
            tail_ids = tokenizer_qwen(tail, add_special_tokens=False)["input_ids"][:MAX_LEN_QWEN]
            head_ids = tokenizer_qwen(head, add_special_tokens=False)["input_ids"]
            ids = head_ids[: MAX_LEN_QWEN - len(tail_ids)] + tail_ids

        # Right-pad to the fixed length (uniform rows for batching)
        n_pad = MAX_LEN_QWEN - len(ids)
        input_ids.append(list(ids) + [pad_id] * n_pad)
        attention_mask.append([1] * len(ids) + [0] * n_pad)

    # NOTE(review): no EOS token is appended after the answer, and the labels
    # set here still include padded positions; DataCollatorForLanguageModeling
    # (mlm=False) is expected to rebuild labels with padding masked - confirm
    # before switching to a different collator.
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": [row.copy() for row in input_ids],
    }

# Tokenize the train/test datasets built in the earlier cells with the causal
# prompt format; the raw text columns are removed.
tokenized_train_qwen = train_dataset.map(
    preprocess_causal, batched=True, remove_columns=train_dataset.column_names
)
tokenized_test_qwen = test_dataset.map(
    preprocess_causal, batched=True, remove_columns=test_dataset.column_names
)

# Collator for causal language modeling (mlm disabled); batches are padded to a
# multiple of 8 tokens
data_collator_qwen = DataCollatorForLanguageModeling(
    tokenizer=tokenizer_qwen, mlm=False, pad_to_multiple_of=8
)

training_args_qwen = TrainingArguments(
    output_dir="scripts/Local_model_finetuning/qwen3-0.6b-models",

    # Optimization schedule
    num_train_epochs=3,
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_steps=100,

    # Batch of 2 per device with 8 accumulation steps
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,

    # Evaluate and checkpoint once per epoch, log every 50 steps
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,

    # Keep at most two checkpoints; the best one by "loss" is reloaded at the end
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="loss",

    fp16=False,  # set True if using CUDA with fp16 support
)

# Trainer for the causal LM. The tokenizer is passed as `processing_class`:
# this cell requires transformers>=4.51.0, where that keyword is available and
# the older `tokenizer` keyword is deprecated.
trainer_qwen = Trainer(
    model=model_qwen,
    args=training_args_qwen,
    train_dataset=tokenized_train_qwen,
    eval_dataset=tokenized_test_qwen,
    processing_class=tokenizer_qwen,
    data_collator=data_collator_qwen,
)

print("Trainer ready for Qwen3-0.6B-Base. Run trainer_qwen.train() to start fine-tuning.")

In [None]:
# Run the Qwen3-0.6B-Base fine-tuning loop
print("Starting Qwen3 fine-tuning...")
train_result_qwen = trainer_qwen.train()

# Summarize the run
print("\nTraining completed!")
qwen_runtime_s = train_result_qwen.metrics['train_runtime']
print(f"Training loss: {train_result_qwen.training_loss:.4f}")
print(f"Training time: {qwen_runtime_s:.2f} seconds")

# Store the model weights and the tokenizer files in the same folder
final_dir_qwen = "scripts/Local_model_finetuning/qwen3-0.6b-models/final_model"
for save_to_dir in (trainer_qwen.save_model, tokenizer_qwen.save_pretrained):
    save_to_dir(final_dir_qwen)
print(f"Model saved to {final_dir_qwen}")

In [None]:
# Utility: detect latest HuggingFace Trainer checkpoint
import os, re

def get_last_checkpoint(dir_path: str):
    """Return full path of the latest `checkpoint-<step>` folder, or None.

    Only sub-directories whose name is exactly ``checkpoint-<digits>`` are
    considered, so stray files (e.g. ``checkpoint-99.txt``) and folders without
    a step number (e.g. ``checkpoint-tmp``) are never returned.

    Args:
        dir_path: directory that holds the trainer checkpoints.

    Returns:
        Path of the checkpoint folder with the highest step number, or None
        when ``dir_path`` is not a directory or holds no such folder.
    """
    if not os.path.isdir(dir_path):
        return None

    latest_step, latest_name = -1, None
    for name in os.listdir(dir_path):
        match = re.fullmatch(r"checkpoint-(\d+)", name)
        if match is None or not os.path.isdir(os.path.join(dir_path, name)):
            continue
        step = int(match.group(1))
        # Compare numerically so checkpoint-10 ranks above checkpoint-2
        if step > latest_step:
            latest_step, latest_name = step, name

    if latest_name is None:
        return None
    return os.path.join(dir_path, latest_name)

# Report the newest checkpoint found for each of the two training runs
flant5_dir = "scripts/Local_model_finetuning/flan-t5-models"
qwen_dir = "scripts/Local_model_finetuning/qwen3-0.6b-models"

ckpt_flan, ckpt_qwen = get_last_checkpoint(flant5_dir), get_last_checkpoint(qwen_dir)

print(f"Latest Flan-T5 checkpoint: {ckpt_flan}")
print(f"Latest Qwen checkpoint: {ckpt_qwen}")

In [None]:
# Resume training for Flan-T5 from latest checkpoint (if any)
import torch, gc

# Drop cached MPS allocations (macOS) when that backend is available, then
# collect garbage
if torch.backends.mps.is_available():
    torch.mps.empty_cache()
gc.collect()

# Continue from the newest checkpoint in output_dir; train from the start when
# there is none
ckpt_flan = get_last_checkpoint(output_dir)
if not ckpt_flan:
    print("No Flan-T5 checkpoint found. Starting training from scratch.")
    train_result = trainer.train()
else:
    print(f"Resuming Flan-T5 training from checkpoint: {ckpt_flan}")
    train_result = trainer.train(resume_from_checkpoint=ckpt_flan)

# Summarize the run
print("\nTraining completed!")
print(f"Training loss: {train_result.training_loss:.4f}")
flan_runtime_s = train_result.metrics['train_runtime']
print(f"Training time: {flan_runtime_s:.2f} seconds")

# Store model weights and tokenizer files in the same folder
flan_final_dir = f"{output_dir}/final_model"
trainer.save_model(flan_final_dir)
tokenizer.save_pretrained(flan_final_dir)
print(f"Model saved to {flan_final_dir}")

In [12]:
# Resume training for Qwen3-0.6B-Base from latest checkpoint (if any)
import torch, gc

# Drop cached MPS allocations when that backend is available, then collect
# garbage
mps_ready = torch.backends.mps.is_available()
if mps_ready:
    torch.mps.empty_cache()
gc.collect()

# Continue from the newest checkpoint of the Qwen run; train from the start
# when there is none
qwen_output_dir = "scripts/Local_model_finetuning/qwen3-0.6b-models"
ckpt_qwen = get_last_checkpoint(qwen_output_dir)
if not ckpt_qwen:
    print("No Qwen checkpoint found. Starting training from scratch.")
    train_result_qwen = trainer_qwen.train()
else:
    print(f"Resuming Qwen3 training from checkpoint: {ckpt_qwen}")
    train_result_qwen = trainer_qwen.train(resume_from_checkpoint=ckpt_qwen)

# Summarize the run
print("\nQwen3 training completed!")
print(f"Training loss: {train_result_qwen.training_loss:.4f}")
qwen_resume_runtime_s = train_result_qwen.metrics['train_runtime']
print(f"Training time: {qwen_resume_runtime_s:.2f} seconds")

# Store the model weights and the tokenizer files in the same folder
final_dir_qwen = f"{qwen_output_dir}/final_model"
trainer_qwen.save_model(final_dir_qwen)
tokenizer_qwen.save_pretrained(final_dir_qwen)
print(f"Model saved to {final_dir_qwen}")

KeyboardInterrupt: 