In [8]:
"""
Run this script in Google Colab (T4 GPU recommended).

It will:
 1. Install dependencies
 2. Load the ClimateBERT 'environmental_claims' dataset
 3. Evaluate 5 models off-the-shelf (zero-shot classification)
 4. Select the best-performing model
 5. Fine-tune that best model for 3 epochs
 6. Save the fine-tuned version to './best_finetuned_model'

Estimated runtime: ~30‚Äì40 minutes on Colab T4 GPU
"""

# ===============================================================
# 1) Install dependencies
# ===============================================================
!pip install -q transformers datasets evaluate accelerate scikit-learn sentence-transformers

import os
import numpy as np
import time
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    pipeline
)
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import torch

# ===============================================================
# 2) Config
# ===============================================================
MODELS_TO_TEST = [
    "bert-base-uncased",
    "roberta-base",
    "climatebert/distilroberta-base-climate-f",
    "nbroad/ESG-BERT",
    "sentence-transformers/all-MiniLM-L6-v2",
]

EPOCHS = 3
LR = 2e-5
BATCH_SIZE = 16
MAX_LENGTH = 256
RANDOM_SEED = 42
OUTPUT_BASE = "./results_finetuned_models"

os.makedirs(OUTPUT_BASE, exist_ok=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
print("Using device:", device)

# ===============================================================
# 3) Load dataset
# ===============================================================
print("\nüì¶ Loading dataset: climatebert/environmental_claims ...")
dataset = load_dataset("climatebert/environmental_claims")

if "train" in dataset:
    ds_train = dataset["train"]
    ds_val = dataset["validation"] if "validation" in dataset else dataset["test"]
else:
    dataset = dataset["train"].train_test_split(test_size=0.2, seed=RANDOM_SEED)
    ds_train = dataset["train"]
    ds_val = dataset["test"]

print("Train size:", len(ds_train), "Validation size:", len(ds_val))

# ===============================================================
# 4) Evaluate each model off-the-shelf (zero-shot)
# ===============================================================
all_results = []

print("\nüîç Evaluating zero-shot performance of all 5 models...")
for model_name in MODELS_TO_TEST:
    print("\n" + "="*80)
    print(f"üöÄ Testing zero-shot model: {model_name}")
    print("="*80)
    try:
        classifier = pipeline(
            "text-classification",
            model=model_name,
            tokenizer=model_name,
            device=0 if torch.cuda.is_available() else -1,
            truncation=True,
            max_length=MAX_LENGTH,
        )

        texts = ds_val["text"][:200]  # small subset for quick testing
        true_labels = ds_val["label"][:200]

        preds = []
        for t in texts:
            result = classifier(t, truncation=True)[0]
            preds.append(1 if "claim" in result["label"].lower() else 0)

        acc = accuracy_score(true_labels, preds)
        f1 = f1_score(true_labels, preds)
        print(f"‚úÖ Zero-shot F1: {f1:.4f}, Acc: {acc:.4f}")
        all_results.append({"model": model_name, "f1": f1, "accuracy": acc})

    except Exception as e:
        print(f"‚ùå Error testing {model_name}: {e}")

# ===============================================================
# 5) Pick the best model from zero-shot results
# ===============================================================
print("\nüìä Zero-shot Model Comparison:")
for r in all_results:
    print(r)

best_model_name = sorted(all_results, key=lambda x: x["f1"], reverse=True)[0]["model"]
print(f"\nüèÜ Best model (zero-shot): {best_model_name}")

# ===============================================================
# 6) Fine-tune the best model
# ===============================================================
def preprocess_function(examples, tokenizer):
    return tokenizer(examples["text"], truncation=True, padding=False, max_length=MAX_LENGTH)

def compute_metrics(pred):
    preds = np.argmax(pred.predictions, axis=1)
    labels = pred.label_ids
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="binary"),
        "precision": precision_score(labels, preds, zero_division=0),
        "recall": recall_score(labels, preds, zero_division=0),
    }

print(f"\nüéØ Fine-tuning best model: {best_model_name} ...")
tokenizer = AutoTokenizer.from_pretrained(best_model_name)
model = AutoModelForSequenceClassification.from_pretrained(best_model_name, num_labels=2)

tokenized_train = ds_train.map(lambda x: preprocess_function(x, tokenizer), batched=True)
tokenized_val = ds_val.map(lambda x: preprocess_function(x, tokenizer), batched=True)
tokenized_train.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
tokenized_val.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
output_dir = os.path.join(OUTPUT_BASE, best_model_name.replace("/", "_"))
os.makedirs(output_dir, exist_ok=True)

training_args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    logging_dir=f"{output_dir}/logs",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

start_time = time.time()
trainer.train()
end_time = time.time()

eval_results = trainer.evaluate()
print(f"\n‚úÖ Fine-tuned {best_model_name} -> F1: {eval_results['eval_f1']:.4f}, Acc: {eval_results['eval_accuracy']:.4f}, Time: {(end_time-start_time)/60:.1f} min")

# ===============================================================
# 7) Save final fine-tuned model
# ===============================================================
final_model_path = os.path.join(OUTPUT_BASE, "best_finetuned_model")
os.makedirs(final_model_path, exist_ok=True)
model.save_pretrained(final_model_path)
tokenizer.save_pretrained(final_model_path)

print("\n‚úÖ All done! Best fine-tuned model saved at ./results_finetuned_models/best_finetuned_model")


Using device: cuda

üì¶ Loading dataset: climatebert/environmental_claims ...
Train size: 2117 Validation size: 265

üîç Evaluating zero-shot performance of all 5 models...

üöÄ Testing zero-shot model: bert-base-uncased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


‚úÖ Zero-shot F1: 0.0000, Acc: 0.7100

üöÄ Testing zero-shot model: roberta-base


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at climatebert/distilroberta-base-climate-f and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Zero-shot F1: 0.0000, Acc: 0.7100

üöÄ Testing zero-shot model: climatebert/distilroberta-base-climate-f


Device set to use cuda:0


‚úÖ Zero-shot F1: 0.0000, Acc: 0.7100

üöÄ Testing zero-shot model: nbroad/ESG-BERT


Device set to use cuda:0
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Zero-shot F1: 0.0000, Acc: 0.7100

üöÄ Testing zero-shot model: sentence-transformers/all-MiniLM-L6-v2


Device set to use cuda:0


‚úÖ Zero-shot F1: 0.0000, Acc: 0.7100

üìä Zero-shot Model Comparison:
{'model': 'bert-base-uncased', 'f1': 0.0, 'accuracy': 0.71}
{'model': 'roberta-base', 'f1': 0.0, 'accuracy': 0.71}
{'model': 'climatebert/distilroberta-base-climate-f', 'f1': 0.0, 'accuracy': 0.71}
{'model': 'nbroad/ESG-BERT', 'f1': 0.0, 'accuracy': 0.71}
{'model': 'sentence-transformers/all-MiniLM-L6-v2', 'f1': 0.0, 'accuracy': 0.71}

üèÜ Best model (zero-shot): bert-base-uncased

üéØ Fine-tuning best model: bert-base-uncased ...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2117 [00:00<?, ? examples/s]

Map:   0%|          | 0/265 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.406877,0.8,0.713514,0.554622,1.0
2,No log,0.206546,0.924528,0.848485,0.848485,0.848485
3,No log,0.253197,0.909434,0.833333,0.769231,0.909091



‚úÖ Fine-tuned bert-base-uncased -> F1: 0.8485, Acc: 0.9245, Time: 1.4 min

‚úÖ All done! Best fine-tuned model saved at ./results_finetuned_models/best_finetuned_model


In [9]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
import os
import shutil

# Define the path to the saved model
model_path = "./results_finetuned_models/best_finetuned_model" # Assuming the model was saved here based on the previous code

# Define the destination path in your Google Drive
# You can change 'my_models' to any folder name you prefer in your Drive
drive_destination_path = "/content/drive/MyDrive/my_finetuned_climate_model"

# Create the destination directory in Drive if it doesn't exist
os.makedirs(drive_destination_path, exist_ok=True)

# Copy the model directory to Google Drive
try:
    shutil.copytree(model_path, drive_destination_path)
    print(f"Model successfully saved to Google Drive at: {drive_destination_path}")
except FileExistsError:
    print(f"Directory {drive_destination_path} already exists in Google Drive. Model was not copied.")
except FileNotFoundError:
    print(f"Source directory {model_path} not found. Please ensure the model was saved correctly.")
except Exception as e:
    print(f"An error occurred while copying the model to Google Drive: {e}")

Directory /content/drive/MyDrive/my_finetuned_climate_model already exists in Google Drive. Model was not copied.


In [11]:
# Download the saved model to your local machine
from google.colab import files
import shutil

# Zip the model directory for easier download
model_path = "./results_finetuned_models/best_finetuned_model"
zip_filename = "best_finetuned_model.zip"

try:
    shutil.make_archive(zip_filename.replace(".zip", ""), 'zip', model_path)
    print(f"Zipping model directory: {model_path} -> {zip_filename}")

    # Download the zip file
    print("Downloading the model zip file...")
    files.download(zip_filename)
    print("Download initiated. Check your browser's download panel.")

except FileNotFoundError:
    print(f"Model directory {model_path} not found. Cannot create zip for download.")
except Exception as e:
    print(f"An error occurred during zipping or downloading: {e}")

Zipping model directory: ./results_finetuned_models/best_finetuned_model -> best_finetuned_model.zip
Downloading the model zip file...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download initiated. Check your browser's download panel.


In [12]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

model_path = "./results_finetuned_models/best_finetuned_model"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

nlp_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)


Device set to use cuda:0


In [13]:
examples = [
    "We reduced our carbon emissions by 20% last year.",
    "We care about the environment.",
    "Our energy usage is entirely renewable according to government reports.",
]

for text in examples:
    result = nlp_pipeline(text)[0]
    print(f"{text}\n‚Üí {result}\n")


We reduced our carbon emissions by 20% last year.
‚Üí {'label': 'LABEL_1', 'score': 0.9439000487327576}

We care about the environment.
‚Üí {'label': 'LABEL_0', 'score': 0.9363537430763245}

Our energy usage is entirely renewable according to government reports.
‚Üí {'label': 'LABEL_0', 'score': 0.574424147605896}

