In [1]:
# We need to install the libraries from Hugging Face
!pip install transformers datasets evaluate huggingface_hub

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [6]:
from huggingface_hub import login

# This will show a box to paste your HF access token
# Get a token here: https://huggingface.co/settings/tokens
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
from datasets import load_dataset

# Load your custom CSV files
dataset = load_dataset('csv', data_files={'train': 'train.csv', 'test': 'test.csv'})

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 8
    })
})


In [8]:
import sys
import os
from transformers import AutoTokenizer
from datasets import ClassLabel # <-- Import this!

# 1. Define our model checkpoint and tokenizer
model_checkpoint = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# --- FIX STARTS HERE ---
# 2. We must explicitly tell the dataset that 'label' is a category.
# Get the unique labels from the training data
labels_list = sorted(list(dataset["train"].unique("label")))
print(f"Found unique labels: {labels_list}")

# Create a ClassLabel feature from our list
class_label_feature = ClassLabel(names=labels_list)

# "Cast" (convert) the 'label' column in both train and test splits
# This is the line that fixes the error!
dataset['train'] = dataset['train'].cast_column('label', class_label_feature)
dataset['test'] = dataset['test'].cast_column('label', class_label_feature)
# --- END OF FIX ---

# 3. Create the label-to-ID mappings (this line will work now!)
labels = dataset["train"].features["label"].names
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}

print("Label mappings:", label2id)

# 4. Create the function that tokenizes text and converts labels
def preprocess_function(examples):
    # Tokenize the texts
    tokenized_inputs = tokenizer(examples["text"], truncation=True, padding=True, max_length=128)
    # The .map() function will now automatically use the ClassLabel mapping
    tokenized_inputs["labels"] = examples["label"]
    return tokenized_inputs

# 5. Apply the function to all our data
tokenized_datasets = dataset.map(preprocess_function, batched=True)

print("\nSuccessfully tokenized datasets:")
print(tokenized_datasets)

Found unique labels: ['english', 'french', 'kirundi', 'swahili']


Casting the dataset:   0%|          | 0/25 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/8 [00:00<?, ? examples/s]

Label mappings: {'english': 0, 'french': 1, 'kirundi': 2, 'swahili': 3}


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]


Successfully tokenized datasets:
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8
    })
})


In [12]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate

# 1. Load the pre-trained model
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(labels),  # 4 labels: kirundi, french, swahili, english
    id2label=id2label,
    label2id=label2id
)

# 2. Define a function to compute accuracy
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# 3. Define the training arguments
# THIS IS YOUR REPO ID. Make sure it matches your HF username
repo_name = "burundi-lang-id" # You can change this
hf_repo_id = f"samandari/{repo_name}" # <-- IMPORTANT: Use your username

training_args = TrainingArguments(
    output_dir=repo_name,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
    hub_model_id=hf_repo_id,
    hub_strategy="every_save",
    # --- THIS IS THE FIX ---
    report_to="none",  # This tells the Trainer to not ask for wandb
    # --- END OF FIX ---
)

# 4. Create the Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# 5. START TRAINING!
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.323117,0.25
2,No log,1.284572,0.375
3,No log,1.260675,0.375


TrainOutput(global_step=6, training_loss=1.3261085351308186, metrics={'train_runtime': 117.2008, 'train_samples_per_second': 0.64, 'train_steps_per_second': 0.051, 'total_flos': 847931713200.0, 'train_loss': 1.3261085351308186, 'epoch': 3.0})

In [13]:
# This pushes the final, best model to your Hugging Face profile
trainer.push_to_hub(commit_message="End of training for burundi-lang-id")

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...lang-id/training_args.bin: 100%|##########| 5.84kB / 5.84kB            

  ...02656.90194ef676e1.2978.0: 100%|##########| 5.39kB / 5.39kB            

  ...lang-id/model.safetensors:   5%|4         | 33.5MB /  711MB            

CommitInfo(commit_url='https://huggingface.co/samandari/burundi-lang-id/commit/d4fd43db0fa84c7d2145597a58ed83375da9d4db', commit_message='End of training for burundi-lang-id', commit_description='', oid='d4fd43db0fa84c7d2145597a58ed83375da9d4db', pr_url=None, repo_url=RepoUrl('https://huggingface.co/samandari/burundi-lang-id', endpoint='https://huggingface.co', repo_type='model', repo_id='samandari/burundi-lang-id'), pr_revision=None, pr_num=None)