# Emission Reduction Strategy Generator (T5-base fine-tune)
This notebook fine-tunes a seq2seq model (T5-base) on synthetic climate/emission strategy examples
and demonstrates inference and simple evaluation (ROUGE / BLEU). All steps are local and runnable in Jupyter.


In [4]:
import os
import pandas as pd
import torch
from pprint import pprint

from datasets import Dataset, DatasetDict
from transformers import (
    T5ForConditionalGeneration,
    T5TokenizerFast,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)

# Prefer the GPU when CUDA is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)


Using device: cuda


In [94]:
# Read the three pre-split CSV files (train / validation / test).
split_files = {
    "train": "processed_data_50k/train.csv",
    "val": "processed_data_50k/val.csv",
    "test": "processed_data_50k/test.csv",
}
train_df, val_df, test_df = (pd.read_csv(csv_path) for csv_path in split_files.values())

# Report the size of each split as a quick sanity check.
print("Train:", len(train_df), " Validation:", len(val_df), " Test:", len(test_df))

# Show the first rows of the training split.
train_df.head()


Train: 40000  Validation: 5000  Test: 5000


Unnamed: 0,input,output
0,Facility: Šilalės District Municipality | Country: LTU | Sector: biological-treatment-of-solid-waste-and-biogenic | EmissionReductionPotential(Mt): 0.000 | Difficulty: Long-term,Flare CH4 from anaerobic digestion
1,Facility: Ripiceni Commune | Country: ROU | Sector: manure-left-on-pasture-cattle | EmissionReductionPotential(Mt): 0.000 | Difficulty: Long-term,"Rotating herds between multiple pastures allows pastures to recover between seasons and increase carbon stored in the soil and improves feed efficiency. In addition, growing specific fodder crops like plaintain could also reduce N2O."
2,Facility: Nhommalath District | Country: LAO | Sector: forest-land-fires | EmissionReductionPotential(Mt): 0.309 | Difficulty: Long-term,"Fire risk mitigation includes: Fuel load reduction, fire-resilient forest management, landscape-level fire planning and zoning, restoration of fire-adaptive ecosystems, and post-fire recovery."
3,Facility: ITA_MatureDairyCattle_10230 | Country: ITA | Sector: manure-management-cattle-operation | EmissionReductionPotential(Mt): 0.000 | Difficulty: Long-term,"For dairy cattle high productivity, change manure handling to solid storage: where manure is stored, typically for several months, in unconfined piles or stacks."
4,Facility: POL_MatureDairyCattle_9543 | Country: POL | Sector: manure-management-cattle-operation | EmissionReductionPotential(Mt): 0.000 | Difficulty: Long-term,"For dairy cattle high productivity, modify the existing system and change manure handling to dry lot, where the manure is periodically removed from the paved or unpaved confined area and can be spread onto fields."


In [96]:
# Wrap each pandas split in a Hugging Face Dataset.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

# Group the splits under the keys used by the rest of the notebook.
dataset = DatasetDict(
    {
        "train": train_dataset,
        "validation": val_dataset,
        "test": test_dataset,
    }
)

dataset


DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 40000
    })
    validation: Dataset({
        features: ['input', 'output'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 5000
    })
})

In [98]:

# Load the tokenizer and model for the pretrained T5-base checkpoint.
model_name = "t5-base"

# Token budgets used when encoding inputs and target strategies.
max_input_len, max_output_len = 256, 180

tokenizer = T5TokenizerFast.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)  # place the weights on the selected device


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [100]:
# Tokenization Function
def tokenize_fn(batch):
    """Tokenize a batch of examples for seq2seq fine-tuning.

    Args:
        batch: Mapping with an "input" list of source strings and an
            "output" list of target strategy strings.

    Returns:
        The tokenizer output for the prefixed inputs, with an added
        "labels" entry holding the target token ids. Padding positions in
        the labels are set to -100 so they are ignored by the loss.
    """
    inputs = ["generate strategy: " + x for x in batch["input"]]
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_len,
        padding="max_length",
        truncation=True
    )

    # tokenize strategy descriptions (targets); `text_target` replaces the
    # deprecated `as_target_tokenizer()` context manager
    labels = tokenizer(
        text_target=batch["output"],
        max_length=max_output_len,
        padding="max_length",
        truncation=True
    )

    # Targets are padded to a fixed length here, so the pad ids must be
    # masked explicitly; otherwise the model is also trained to predict padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(tok if tok != pad_id else -100) for tok in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs

# Tokenize every split in batches and drop the raw text columns,
# leaving only the fields produced by tokenize_fn.
raw_text_columns = ["input", "output"]
tokenized_dataset = dataset.map(tokenize_fn, batched=True, remove_columns=raw_text_columns)

tokenized_dataset


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]



Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 40000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
})

In [102]:
# Data Collator & Training Arguments
# The collator assembles tokenized examples into model-ready batches.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_base_climatetrace_finetuned_50k",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    num_train_epochs=5,  # full passes over the training split
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    predict_with_generate=True,
    # Generate up to the same length the targets were tokenized with; without
    # this, evaluation falls back to the model's default generation length and
    # ROUGE/BLEU are computed on cut-off predictions.
    generation_max_length=max_output_len,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    save_total_limit=2  # keep only the two most recent checkpoints on disk
)


In [104]:
# Metrics (ROUGE + BLEU)
import evaluate

# Load both scorers once so they can be reused on every evaluation pass.
rouge, bleu = (evaluate.load(metric_name) for metric_name in ("rouge", "sacrebleu"))

def compute_metrics(eval_pred):
    """Decode predictions and labels, then score them with ROUGE and BLEU.

    Args:
        eval_pred: Pair ``(preds, labels)`` of token-id sequences. ``preds``
            may also be a tuple whose first element holds the token ids.

    Returns:
        Dict with "rouge1", "rougeL" and "bleu" scores.
    """
    preds, labels = eval_pred
    # If predictions arrive wrapped in a tuple, the first element holds the ids.
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    def _replace_ignore_index(sequences):
        # -100 is not a real token id and cannot be decoded; map it to padding,
        # which `skip_special_tokens=True` then drops.
        return [
            [(tok if tok != -100 else pad_id) for tok in seq]
            for seq in sequences
        ]

    # Predictions can carry -100 as well as labels, so both are cleaned.
    decoded_preds = tokenizer.batch_decode(
        _replace_ignore_index(preds), skip_special_tokens=True
    )
    decoded_labels = tokenizer.batch_decode(
        _replace_ignore_index(labels), skip_special_tokens=True
    )

    # Clean
    decoded_preds = [p.strip() for p in decoded_preds]
    decoded_labels = [l.strip() for l in decoded_labels]

    # ---- ROUGE ----
    rouge_scores = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )

    # ---- BLEU (expects a list of reference lists) ----
    bleu_scores = bleu.compute(
        predictions=decoded_preds,
        references=[[ref] for ref in decoded_labels]
    )

    return {
        "rouge1": rouge_scores["rouge1"],
        "rougeL": rouge_scores["rougeL"],
        "bleu": bleu_scores["score"]
    }



In [105]:
# Assemble the trainer from the model, arguments, tokenized splits and metric function.
# The tokenizer is passed as `processing_class`; the `tokenizer=` keyword is
# deprecated and triggers a warning on construction.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


  trainer = Seq2SeqTrainer(


In [None]:
# Launch fine-tuning with the configured trainer.
trainer.train()


Epoch,Training Loss,Validation Loss


In [None]:
# Persist the fine-tuned weights and the tokenizer side by side in one directory.
final_model_dir = "models/t5_climate_strategy_final"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

print("Model saved!")


In [None]:
# Inference Helper
def generate_strategy(text, max_len=180, num_beams=4, max_input_tokens=256):
    """Generate a strategy description for one facility description.

    Args:
        text: Facility description, without the task prefix.
        max_len: Maximum length of the generated sequence.
        num_beams: Beam width used for beam search.
        max_input_tokens: Truncation length for the encoded prompt; the
            default of 256 equals the input length used during tokenization
            for training.

    Returns:
        The decoded generation with special tokens removed.
    """
    # Make inference independent of whatever mode training left the model in.
    model.eval()

    inp = "generate strategy: " + text
    tokens = tokenizer(
        inp,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens
    ).to(device)

    gen_ids = model.generate(
        **tokens,
        max_length=max_len,
        num_beams=num_beams,
        early_stopping=True
    )

    # Only the top-ranked sequence is returned.
    return tokenizer.decode(gen_ids[0], skip_special_tokens=True)


In [None]:
# Run the model on two hand-written facility descriptions.
examples = [
    "Facility: Sunndalsora aluminium plant | Country: NOR | Sector: aluminum | EmissionReductionPotential(Mt): 0.78 | Difficulty: Short-term",
    "Facility: Some Cement Plant | Country: IND | Sector: cement | EmissionReductionPotential(Mt): 1.52 | Difficulty: Mid-term"
]

divider = "-" * 80
for example_text in examples:
    print("INPUT:", example_text)
    print("OUTPUT:", generate_strategy(example_text))
    print(divider)


## Evaluate on Test Set

In [None]:
# Generate predictions for a small slice of the test split.
eval_rows = test_df.to_dict("records")[:50]  # evaluate first 50
preds = [generate_strategy(row["input"]) for row in eval_rows]
refs = [row["output"] for row in eval_rows]

print("Sample Predictions:")
# Guard against a test split with fewer than three rows.
for i in range(min(3, len(preds))):
    print("\nInput:", eval_rows[i]["input"])
    print("Pred:", preds[i])
    print("True:", refs[i])

# Score the collected predictions against the references; without this step
# the cell only prints samples and never evaluates anything.
if preds:
    subset_rouge = rouge.compute(predictions=preds, references=refs)
    subset_bleu = bleu.compute(predictions=preds, references=[[ref] for ref in refs])
    print("\nROUGE-1:", subset_rouge["rouge1"])
    print("ROUGE-L:", subset_rouge["rougeL"])
    print("BLEU:", subset_bleu["score"])


In [None]:
# The desktop notification is a convenience only: skip it when plyer or a
# notification backend is unavailable instead of aborting the notebook run.
try:
    from plyer import notification

    notification.notify(
        title='Jupyter Notebook',
        message='✅ All cells finished running successfully!',
        timeout=10  # seconds
    )
except (ImportError, NotImplementedError) as exc:
    print("Desktop notification skipped:", exc)


In [None]:
import random
import pandas as pd
import torch
import evaluate
from transformers import T5TokenizerFast, T5ForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using:", device)

# -------------------------------
# LOAD MODELS
# -------------------------------

# Fine-tuned model (directory written by the save cell of this notebook)
ft_model_path = "models/t5_climate_strategy_final"
ft_tokenizer = T5TokenizerFast.from_pretrained(ft_model_path)
ft_model = T5ForConditionalGeneration.from_pretrained(ft_model_path).to(device)

# Baseline: the same pretrained checkpoint that fine-tuning started from, so
# the comparison isolates the effect of fine-tuning rather than model size.
base_model_name = "t5-base"
base_tokenizer = T5TokenizerFast.from_pretrained(base_model_name)
base_model = T5ForConditionalGeneration.from_pretrained(base_model_name).to(device)

# -------------------------------
# Load test dataset and sample 2 rows
# -------------------------------
# Use the test split from the same directory the training data came from.
test_df = pd.read_csv("processed_data_50k/test.csv")
# Fixed seed so the same two rows are selected on every run.
examples = test_df.sample(2, random_state=42).to_dict("records")

print("Selected Inputs:\n")
for i, ex in enumerate(examples, 1):
    print(f"{i}. {ex['input']}\n")

# -------------------------------
# Helper function for generation
# -------------------------------
def generate(model, tokenizer, text, max_len=200):
    """Generate a strategy for `text` using the supplied model/tokenizer pair.

    Args:
        model: Object exposing `generate(**kwargs)`.
        tokenizer: Callable that encodes the prompt and exposes `decode`.
        text: Facility description, without the task prefix.
        max_len: Maximum length passed to generation.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    prompt = "generate strategy: " + text
    batch = tokenizer(prompt, return_tensors="pt", truncation=True)
    batch = batch.to(device)

    # Beam-search settings are fixed for this comparison helper.
    generation_kwargs = dict(max_length=max_len, num_beams=4, early_stopping=True)
    sequences = model.generate(**batch, **generation_kwargs)

    top_sequence = sequences[0]
    return tokenizer.decode(top_sequence, skip_special_tokens=True)

# -------------------------------
# Score the baseline and fine-tuned models on each sampled input
# -------------------------------

# Load metrics
rouge = evaluate.load("rouge")
bleu  = evaluate.load("sacrebleu")


def _score_pair(prediction, reference):
    """Return (ROUGE-1, ROUGE-L, BLEU) for one prediction/reference pair."""
    rouge_out = rouge.compute(predictions=[prediction], references=[reference])
    bleu_out = bleu.compute(predictions=[prediction], references=[[reference]])
    return rouge_out["rouge1"], rouge_out["rougeL"], bleu_out["score"]


results = []

for ex in examples:
    inp, true = ex["input"], ex["output"]

    # Same prompt through both models.
    base_pred = generate(base_model, base_tokenizer, inp)
    ft_pred = generate(ft_model, ft_tokenizer, inp)

    base_r1, base_rl, base_bleu = _score_pair(base_pred, true)
    ft_r1, ft_rl, ft_bleu = _score_pair(ft_pred, true)

    # Key order defines the column order of the comparison table.
    results.append({
        "Input": inp,
        "Ground Truth": true,
        "Base Prediction": base_pred,
        "FT Prediction": ft_pred,
        "Base ROUGE-1": base_r1,
        "Base ROUGE-L": base_rl,
        "Base BLEU": base_bleu,
        "FT ROUGE-1": ft_r1,
        "FT ROUGE-L": ft_rl,
        "FT BLEU": ft_bleu
    })

# -------------------------------
# Side-by-side comparison table
# -------------------------------
# Show full cell contents so long predictions are not truncated in the display.
pd.set_option("display.max_colwidth", None)
comparison_df = pd.DataFrame(results)

comparison_df
