# Train Model for Carbon Intensity Prediction 🌱⚡

This script trains a model to predict carbon intensity using the provided dataset.

In [1]:
import os
import pandas as pd
from datasets import Dataset, DatasetDict, ClassLabel
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer, 
    pipeline
)
from sklearn.model_selection import train_test_split
from huggingface_hub import login

### 1. Load and preprocess data

In [2]:
df = pd.read_csv("dataset_carbon_intesity_for_activites.csv")
df = df.dropna()

# Map labels to ints
label_names = ["low", "medium", "high"]
label2id = {name: i for i, name in enumerate(label_names)}
id2label = {i: name for i, name in enumerate(label_names)}
df["label"] = df["label"].map(label2id)

# Split train/test
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

### 2. Tokenization

In [3]:
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def preprocess(example):
    return tokenizer(example["activity"], truncation=True, padding="max_length", max_length=32)

dataset = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/33 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

### 3. Model setup

In [4]:
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, 
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 4. Training setup

In [5]:

training_args = TrainingArguments(
    output_dir="carbon-intensity-classifier",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    learning_rate=2e-5,
    logging_steps=5,
    push_to_hub=True,
    hub_model_id="jessica-ecosia/carbon-intensity-classifier",  # change if you want to push to your own account
    hub_strategy="end",
    report_to="codecarbon",  # to report metrics to CodeCarbon
)

### 5. Trainer and training

In [6]:
def compute_metrics(eval_pred):
    from sklearn.metrics import accuracy_score, f1_score
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="macro")
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


trainer.train()

  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[codecarbon INFO @ 14:02:25] [setup] RAM Tracking...
[codecarbon INFO @ 14:02:25] [setup] CPU Tracking...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: 

Step,Training Loss
5,1.1128
10,1.1016
15,0.9796
20,0.9535
25,0.866
30,0.83
35,0.763
40,0.7355
45,0.7309
50,0.7009


[codecarbon INFO @ 14:02:33] Energy consumed for RAM : 0.000011 kWh. RAM Power : 6.0 W
[codecarbon INFO @ 14:02:33] Delta energy consumed for CPU with constant : 0.000081 kWh, power : 42.5 W
[codecarbon INFO @ 14:02:33] Energy consumed for All CPU : 0.000081 kWh
[codecarbon INFO @ 14:02:33] 0.000093 kWh of electricity used since the beginning.


TrainOutput(global_step=50, training_loss=0.8773746252059936, metrics={'train_runtime': 6.88, 'train_samples_per_second': 47.965, 'train_steps_per_second': 7.267, 'total_flos': 2732188821120.0, 'train_loss': 0.8773746252059936, 'epoch': 10.0})

In [7]:
from getpass import getpass

hugging_face_token = getpass("YOUR_HUGGINGFACE_TOKEN")

login(token=hugging_face_token)

YOUR_HUGGINGFACE_TOKEN ········


### 7. Save final model to Hugging Face Hub

In [8]:
trainer.save_model("carbon-intensity-classifier")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

'(MaxRetryError("HTTPSConnectionPool(host='hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com', port=443): Max retries exceeded with url: /repos/d3/07/d3078206a56a158a9e339b1246e868ae53f7b08346c1377b2da709e87c166515/15068f809a69068c510d55965b7451248f33f977ac12ea3a878b08303915a80d?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQLC2QXPN7%2F20250528%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250528T120318Z&X-Amz-Expires=86400&X-Amz-Signature=8aee037de89e4c617eb129a7c5d417afc54e3e28399944d8c2f7849d75fccd82&X-Amz-SignedHeaders=host&partNumber=1&uploadId=PY32UJOIhw0F9zdboZNnBsFpztYg2A2VPgJRjw9mvpITyCLigNqlnZDLOwL0fa1N0WnkacQSpuITT24xZlsErgINQLyR5Lz4KwrsOmk6fsfh4rUmbalIC59ATJJXzRTU&x-id=UploadPart (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:2393)')))"), '(Request ID: a285da0a-5809-4b2e-875b-819ffe01507f)')' thrown while requesting PUT https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com/repos/d3/07/d

In [9]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="jessica-ecosia/carbon-intensity-classifier")

pipe("flying Berlin to Italy")

Device set to use mps:0


[{'label': 'high', 'score': 0.5594532489776611}]

In [10]:
pipe("train Berlin to Italy")

[{'label': 'medium', 'score': 0.4121767580509186}]