In [1]:
!pip install datasets scikit-learn pandas transformers

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (

In [2]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [3]:
import os

# Export CUDA_LAUNCH_BLOCKING=1 for this process before any CUDA work starts.
# NOTE(review): this looks like a debugging aid (it is documented to make CUDA
# kernel launches synchronous) — confirm it is still wanted for normal runs.
os.environ.update({"CUDA_LAUNCH_BLOCKING": "1"})


In [4]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np
import evaluate

In [5]:
# === Step 1: Load Your Data ===
# Raw spreadsheets, kept untouched under the *_1 names.
train_df_1 = pd.read_excel("/content/incidents_train_clean_excel.xlsx")
test_df_1 = pd.read_excel("/content/incidents_test_clean_excel.xlsx")

# Keep only the model input (`title`) and the target (`hazard`).
# The explicit .copy() makes these independent frames rather than slices of
# the raw data, so adding columns later does not raise pandas'
# SettingWithCopyWarning.
train_df = train_df_1[["title", "hazard"]].copy()
test_df = test_df_1[["title", "hazard"]].copy()

In [8]:
# === Step 2: Encode Labels (if not already integers) ===
label_encoder = LabelEncoder()

# Fit the encoder on the training hazards only. `.assign` returns a new frame
# instead of writing into a slice, which avoids SettingWithCopyWarning and
# keeps this cell safe to re-run.
train_df = train_df.assign(label=label_encoder.fit_transform(train_df['hazard']))

# Drop test rows whose hazard never appears in the training data: the encoder
# cannot transform labels it has not seen.
test_df = test_df[test_df['hazard'].isin(label_encoder.classes_)]
test_df = test_df.assign(label=label_encoder.transform(test_df['hazard']))

# Number of distinct hazard classes; shown as the cell's output.
num_labels = len(label_encoder.classes_)
num_labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['label'] = label_encoder.fit_transform(train_df['hazard'])


130

In [9]:
# Sanity check: the encoded labels should span 0 .. num_labels - 1.
print(f"Label range: {train_df['label'].min()} to {train_df['label'].max()}")
print(f"Number of classes (num_labels): {num_labels}")


Label range: 0 to 129
Number of classes (num_labels): 130


In [11]:
# Wrap the pandas frames as Hugging Face Datasets.
# preserve_index=False keeps the pandas index out of the dataset; without it
# the filtered test frame's non-default index is stored as an extra
# `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [12]:
# === Step 4: Tokenize ===
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(example):
    """Run the tokenizer over the ``title`` field of ``example``.

    Called through ``Dataset.map(..., batched=True)`` below, so ``example``
    is presumably a batch of rows rather than a single row.
    Truncation is enabled and padding uses the ``'max_length'`` strategy.
    """
    titles = example['title']
    return tokenizer(titles, padding='max_length', truncation=True)


# Tokenize the train split first, then the test split.
train_dataset, test_dataset = (
    split.map(tokenize, batched=True) for split in (train_dataset, test_dataset)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/5084 [00:00<?, ? examples/s]

Map:   0%|          | 0/997 [00:00<?, ? examples/s]

In [13]:
# Switch both datasets to the "torch" output format for training.
train_dataset.set_format(type="torch")
test_dataset.set_format(type="torch")

In [14]:
# Size the classification head from the fitted label encoder instead of a
# hard-coded class count, and record the hazard names in the model config so
# saved checkpoints can map predicted ids back to labels.
id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
def compute_metrics(eval_pred):
    """Turn an evaluation prediction into a dictionary of scalar metrics.

    Args:
        eval_pred: A pair ``(logits, labels)``. ``logits`` must support
            ``.argmax(axis=-1)``; the arg-max over the last axis is taken as
            the predicted class id.

    Returns:
        dict with the keys ``accuracy``, ``weighted_precision``,
        ``weighted_recall``, ``weighted_f1`` and ``macro_f1``.
    """
    logits, labels = eval_pred
    predicted = logits.argmax(axis=-1)

    def _prf(average):
        # zero_division=0: undefined precision/recall is scored as 0.
        return precision_recall_fscore_support(
            labels, predicted, average=average, zero_division=0
        )

    weighted_precision, weighted_recall, weighted_f1, _ = _prf('weighted')
    # Index 2 of the returned tuple is the F1 score.
    macro_f1 = _prf('macro')[2]

    return {
        'accuracy': accuracy_score(labels, predicted),
        'weighted_precision': weighted_precision,
        'weighted_recall': weighted_recall,
        'weighted_f1': weighted_f1,
        'macro_f1': macro_f1,
    }


In [16]:
training_args = TrainingArguments(
    # --- Output locations: checkpoints and TensorBoard logs ---
    output_dir="./distilbert-results",
    logging_dir="./logs",
    # --- Optimisation ---
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # --- Logging: every 10 steps, first step included, wandb etc. disabled ---
    logging_steps=10,
    logging_first_step=True,
    report_to="none",
    # --- Evaluate and save once per epoch; afterwards reload the checkpoint
    #     with the best "weighted_f1" (a key returned by compute_metrics) ---
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="weighted_f1",
)

In [17]:
# === Step 8: Initialize Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [18]:
# Run fine-tuning with the Trainer configured above. The call is the cell's
# last expression, so its return value (a TrainOutput) is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Weighted Precision,Weighted Recall,Weighted F1,Macro F1
1,2.6691,2.561976,0.451354,0.342251,0.451354,0.364206,0.049627
2,2.0336,2.143006,0.530592,0.434973,0.530592,0.458205,0.101027
3,2.0374,2.017685,0.561685,0.488275,0.561685,0.50687,0.136272
4,1.9048,1.983202,0.562688,0.48762,0.562688,0.506618,0.137971


TrainOutput(global_step=1272, training_loss=2.4041127097681634, metrics={'train_runtime': 984.174, 'train_samples_per_second': 20.663, 'train_steps_per_second': 1.292, 'total_flos': 2700006281871360.0, 'train_loss': 2.4041127097681634, 'epoch': 4.0})

In [19]:


!zip -r my_model.zip /content/distilbert-results/checkpoint-954


  adding: content/distilbert-results/checkpoint-954/ (stored 0%)
  adding: content/distilbert-results/checkpoint-954/rng_state.pth (deflated 25%)
  adding: content/distilbert-results/checkpoint-954/special_tokens_map.json (deflated 42%)
  adding: content/distilbert-results/checkpoint-954/training_args.bin (deflated 51%)
  adding: content/distilbert-results/checkpoint-954/tokenizer.json (deflated 71%)
  adding: content/distilbert-results/checkpoint-954/optimizer.pt (deflated 34%)
  adding: content/distilbert-results/checkpoint-954/scheduler.pt (deflated 56%)
  adding: content/distilbert-results/checkpoint-954/vocab.txt (deflated 53%)
  adding: content/distilbert-results/checkpoint-954/tokenizer_config.json (deflated 75%)
  adding: content/distilbert-results/checkpoint-954/trainer_state.json (deflated 79%)
  adding: content/distilbert-results/checkpoint-954/model.safetensors (deflated 8%)
  adding: content/distilbert-results/checkpoint-954/config.json (deflated 75%)


In [20]:
# Colab-only helper: `google.colab` is imported here, at the point of use.
# NOTE(review): presumably this module is unavailable outside Colab — confirm
# before moving the import to the top-level import cell.
from google.colab import files
# Hand the zipped checkpoint to the browser as a file download.
files.download("my_model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>