In [None]:
!pip install datasets

In [None]:
!pip install bert-score

In [None]:
!pip install rouge_score

In [None]:
!pip install evaluate


In [None]:
# Mount Google Drive at /content/drive; the dataset and model paths used in this notebook
# all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')
# Load IPython's autoreload extension so edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Path of the JSON dataset on the mounted Drive (the file itself is read in a later cell)
output_json = "/content/drive/MyDrive/umich 25 WN/CSE 692 Project/BART/DATASET/5class_Land_larger_dis.json"


In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer
from datasets import Dataset, DatasetDict
import json

import random

# Load pretrained BART model and tokenizer
model_name = "facebook/bart-large"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Add "PFAS" to the tokenizer vocabulary.
# NOTE(review): add_tokens() is called without marking the token as special, so "PFAS" is
# presumably registered as a regular added token — confirm that is the intent.
special_tokens = ["PFAS"]
tokenizer.add_tokens(special_tokens)
# Resize the model's token embeddings to the tokenizer's new length.
model.resize_token_embeddings(len(tokenizer))


# Parse the UTF-8 JSON file at `output_json` into the in-memory `dataset` object.
with open(output_json, mode="r", encoding="utf-8") as json_file:
    dataset = json.load(json_file)



# Seed Python's RNG so both shuffles below (and therefore the selected examples)
# are reproducible across kernel restarts.
SEED = 42
random.seed(SEED)

# Shuffle for randomness
random.shuffle(dataset)

# Original sentences that have already been turned into an example;
# each original is used at most once across all categories.
used_originals = set()

# Selected examples, one list per masking category
discharger_data = []
landcover_data = []
label_data = []

# Balanced quota per category, plus the leftover slots that do not divide evenly by 3
target_per_category = len(dataset) // 3
extra = len(dataset) - target_per_category * 3

# (mask key, destination list) pairs in the priority order of the balanced pass
category_buckets = [
    ("masked_discharger", discharger_data),
    ("masked_landcover", landcover_data),
    ("masked_label", label_data),
]

# Balanced pass: place each unused original into the first category it has a
# masked variant for and whose quota is not yet full.
for item in dataset:
    orig = item["original"]

    if orig in used_originals:
        continue

    for mask_key, bucket in category_buckets:
        if mask_key in item and len(bucket) < target_per_category:
            bucket.append({"input_text": item[mask_key], "target_text": orig})
            used_originals.add(orig)
            break

    # Stop once every quota is full. The bound is derived from the data instead of a
    # fixed example count, so the balanced pass is never cut short on larger datasets.
    if all(len(bucket) >= target_per_category for _, bucket in category_buckets):
        break

# If there's still room (e.g., 200*3 = 600, need 2 more), fill from any category.
# `remaining_data` is created unconditionally so the concatenation below is always valid.
remaining_data = []
remaining = len(dataset) - (len(discharger_data) + len(landcover_data) + len(label_data))
if remaining > 0:
    for item in dataset:
        orig = item["original"]
        if orig in used_originals:
            continue
        # Fallback priority for the fill pass: label, then discharger, then landcover.
        for mask_key in ("masked_label", "masked_discharger", "masked_landcover"):
            if mask_key in item:
                remaining_data.append({"input_text": item[mask_key], "target_text": orig})
                # Record the original so a duplicate sentence cannot be added twice.
                used_originals.add(orig)
                break
        if len(remaining_data) >= remaining:
            break

# Final dataset
hf_data = discharger_data + landcover_data + label_data + remaining_data
random.shuffle(hf_data)  # shuffle so the categories are interleaved


In [None]:
# Convert the list of {"input_text", "target_text"} dicts to Hugging Face Dataset format
hf_dataset = Dataset.from_list(hf_data)

def tokenize_function(examples):
    """Tokenize a batch of masked inputs together with their target sentences.

    Args:
        examples: Batch mapping that provides "input_text" and "target_text".

    Returns:
        A dict with "input_ids", "attention_mask" and "labels". In the labels,
        every occurrence of the tokenizer's pad token id is replaced by -100.
    """
    encoded = tokenizer(
        examples["input_text"],
        text_target=examples["target_text"],
        max_length=256,
        truncation=True,
        padding="max_length",
    )

    pad_id = tokenizer.pad_token_id
    masked_labels = []
    for label_ids in encoded["labels"]:
        # Swap padding positions for -100, keep every other token id as-is.
        masked_labels.append(
            [-100 if token_id == pad_id else token_id for token_id in label_ids]
        )

    return dict(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        labels=masked_labels,
    )


# Apply tokenize_function in batched mode and drop the raw text columns,
# leaving only the fields tokenize_function returns.
tokenized_dataset = hf_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["input_text", "target_text"]
)



In [None]:
# Bare expression: shows the tokenized dataset's repr as the cell output.
tokenized_dataset

In [None]:
# Fixed seed so the train/validation/test partition is identical on every run;
# without it the reported test metrics are computed on a different subset each time.
SPLIT_SEED = 42

# First hold out 20% for testing, then take 12.5% of the remaining 80% for validation
# (0.125 * 0.8 = 0.1), giving a 70/10/20 train/val/test partition overall.
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_val_dataset = split_dataset["train"].train_test_split(test_size=0.125, seed=SPLIT_SEED)

train_dataset = train_val_dataset["train"]
val_dataset = train_val_dataset["test"]
test_dataset = split_dataset["test"]


In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq

import sys

# Generation settings stored on the model's generation_config.
model.generation_config.early_stopping = True
model.generation_config.num_beams = 4
model.generation_config.no_repeat_ngram_size = 3
# Set the forced BOS token id to the tokenizer's BOS token id.
model.generation_config.forced_bos_token_id = tokenizer.bos_token_id

# Seq2seq data collator built from the tokenizer and model; it is passed to the Trainer below.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training configuration: evaluate and checkpoint once per epoch, keep a single checkpoint,
# train for 8 epochs with batch size 8 per device.
# NOTE(review): warmup_steps=500 is a fixed step count. If the run has fewer total optimizer
# steps than that (small dataset, 8 epochs, batch size 8), the learning rate presumably never
# reaches 3e-5 — confirm against the actual number of training steps.
training_args = TrainingArguments(
    output_dir="./pfas_bart_finetuned",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,
    save_strategy="epoch",
    save_total_limit=1,
    logging_dir="./logs",
    logging_steps=10,
    learning_rate=3e-5,
    warmup_steps=500,
    weight_decay=0.01,
    remove_unused_columns=False
)

# Build the Trainer from the model, the training arguments, the train/validation splits
# and the seq2seq collator, then run fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator
)

trainer.train()

# Save the model and the tokenizer into the SAME directory. The reload cell reads both from
# this one path, so writing the tokenizer elsewhere leaves that directory without tokenizer
# files (including the added "PFAS" token).
save_dir = "/content/drive/MyDrive/umich 25 WN/CSE 692 Project/BART/5_classes/land_large_dis_bart/bart_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
# Evaluate on the test dataset
test_results = trainer.evaluate(eval_dataset=test_dataset)

print(test_results)
# Compute and print perplexity
import math
print(f">>> Test Perplexity: {math.exp(test_results['eval_loss']):.2f}")


In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer

# torch is imported here because this cell runs before the notebook's later `import torch`.
import torch

# Reload the fine-tuned model and its tokenizer from the directory they were saved to.
model_path = "/content/drive/MyDrive/umich 25 WN/CSE 692 Project/BART/5_classes/land_large_dis_bart/bart_model"
model = BartForConditionalGeneration.from_pretrained(model_path)
tokenizer = BartTokenizer.from_pretrained(model_path)
# Pick the GPU when one is available and fall back to CPU otherwise, instead of
# crashing on a runtime without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.eval().to(device)


In [None]:


from transformers import DataCollatorForSeq2Seq
from torch.utils.data import DataLoader


# Rebuild the seq2seq collator against the reloaded tokenizer/model (padding enabled).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

# Batches of 8 test examples, collated by the seq2seq collator above.
test_loader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)



In [None]:
import torch
# Decoded model outputs and decoded reference sentences, aligned by position.
predictions = []
references = []

# Run inputs on whatever device the model lives on instead of assuming a GPU.
device = model.device
# Targets were tokenized with a 256-token limit, so allow generation up to the same length;
# a shorter cap would truncate predictions for long reference sentences.
max_gen_length = 256

for batch in test_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels']

    # No gradients are needed for generation.
    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_gen_length)

    decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Labels carry -100 at padding positions, which is not a valid token id;
    # map those positions back to the pad token id before decoding.
    labels[labels == -100] = tokenizer.pad_token_id
    decoded_refs = tokenizer.batch_decode(labels, skip_special_tokens=True)

    predictions.extend(decoded_preds)
    references.extend(decoded_refs)


In [None]:
import evaluate

# Load the ROUGE metric and score the decoded predictions against the decoded references.
rouge = evaluate.load("rouge")

results = rouge.compute(predictions=predictions, references=references)
print(results)


In [None]:
import evaluate

# Load BLEU metric
# BLEU metric loaded through `evaluate`
bleu = evaluate.load("bleu")

# Each prediction has exactly one reference, so wrap every reference in its own list;
# predictions are passed through unchanged.
formatted_refs = [[reference_text] for reference_text in references]
formatted_preds = predictions

# Score and report BLEU to four decimal places
bleu_result = bleu.compute(references=formatted_refs, predictions=formatted_preds)
print(f"BLEU score: {bleu_result['bleu']:.4f}")


In [None]:
# Load the METEOR metric (relies on `evaluate` imported in an earlier cell) and score
# the decoded predictions against the decoded references.
meteor = evaluate.load("meteor")
meteor_result = meteor.compute(predictions=predictions, references=references)
print(meteor_result)


In [None]:
from bert_score import score
# BERTScore over all prediction/reference pairs; P, R and F1 are averaged with .mean()
# before printing.
P, R, F1 = score(predictions, references, lang="en", verbose=True)
print(f"BERTScore - P: {P.mean().item():.4f}, R: {R.mean().item():.4f}, F1: {F1.mean().item():.4f}")
