In [2]:
# 1. Install dependencies
!pip install transformers datasets seqeval --quiet


In [12]:
# Upgrade libraries to potentially fix compatibility issues
!pip install --upgrade transformers datasets seqeval --quiet

In [3]:
# 2. Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# 3. Set data path (update if needed)
conll_path = '/content/drive/MyDrive/NER/ner_sample_conll.txt'


In [5]:
# 4. Load and preprocess data
def read_conll(filepath):
    sentences, labels = [], []
    with open(filepath, encoding='utf-8') as f:
        tokens, tags = [], []
        for line in f:
            line = line.strip()
            if not line:
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    tokens, tags = [], []
            else:
                splits = line.split()
                if len(splits) >= 2:
                    tokens.append(splits[0])
                    tags.append(splits[-1])
        if tokens:
            sentences.append(tokens)
            labels.append(tags)
    return sentences, labels

sentences, ner_tags = read_conll(conll_path)
unique_tags = sorted(set(tag for doc in ner_tags for tag in doc))
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}


In [6]:
from datasets import Dataset, DatasetDict

data = {
    'tokens': sentences,
    'ner_tags': [[tag2id[tag] for tag in tags] for tags in ner_tags]
}
dataset = Dataset.from_dict(data)
dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset = DatasetDict({'train': dataset['train'], 'test': dataset['test']})


In [7]:
# 5. Define models to compare
model_names = {
    'XLM-Roberta': 'xlm-roberta-base',
    'mBERT': 'bert-base-multilingual-cased',
    'DistilBERT': 'distilbert-base-multilingual-cased'
}


In [8]:
# 6. Helper functions
from transformers import AutoTokenizer

def tokenize_and_align_labels(examples, tokenizer):
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx] if label[word_idx] != tag2id['O'] else tag2id['O'])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs['labels'] = labels
    return tokenized_inputs


In [9]:
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

def compute_metrics(p):
    import numpy as np
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_labels = [[id2tag[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2tag[pred] for (pred, lab) in zip(prediction, label) if lab != -100]
        for prediction, label in zip(predictions, labels)
    ]
    # Remove the 'report' from the returned dictionary as it's not JSON serializable
    return {
        'f1': f1_score(true_labels, true_predictions),
        'precision': precision_score(true_labels, true_predictions),
        'recall': recall_score(true_labels, true_predictions),
        # 'report': classification_report(true_labels, true_predictions) # Removed
    }

In [10]:
# 7. Fine-tune and evaluate each model with a custom training loop
from transformers import AutoModelForTokenClassification, AutoTokenizer
from datasets import Dataset, DatasetDict
import torch
from torch.utils.data import DataLoader
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import f1_score, precision_score, recall_score
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

results = {}
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for model_label, model_checkpoint in model_names.items():
    print(f'\n--- Fine-tuning {model_label} with Custom Loop ---')

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    tokenized_datasets = dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer), batched=True)
    tokenized_datasets = tokenized_datasets.remove_columns(['tokens', 'ner_tags'])

    # Data Collator
    data_collator = DataCollatorForTokenClassification(tokenizer)

    # Create PyTorch DataLoaders
    train_dataloader = DataLoader(
        tokenized_datasets['train'],
        shuffle=True,
        batch_size=8,
        collate_fn=data_collator
    )

    eval_dataloader = DataLoader(
        tokenized_datasets['test'],
        batch_size=8,
        collate_fn=data_collator
    )

    model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(tag2id)).to(device)

    # Optimizer and Scheduler (basic example)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    # A more sophisticated scheduler can be added here if needed

    num_train_epochs = 3
    # Basic training loop
    model.train()
    for epoch in range(num_train_epochs):
        print(f"Epoch {epoch+1}/{num_train_epochs}")
        for batch in tqdm(train_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

    # Evaluation loop
    print("Evaluating model...")
    model.eval()
    all_predictions = []
    all_labels = []
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        # Collect predictions and labels, ignoring special tokens (-100)
        for prediction, label in zip(predictions, batch['labels']):
            true_labels = [id2tag[l.item()] for l in label if l.item() != -100]
            true_predictions = [
                id2tag[p.item()] for (p, l) in zip(prediction, label) if l.item() != -100
            ]
            all_predictions.append(true_predictions)
            all_labels.append(true_labels)

    # Compute metrics
    eval_results = {
        'f1': f1_score(all_labels, all_predictions),
        'precision': precision_score(all_labels, all_predictions),
        'recall': recall_score(all_labels, all_predictions),
        # Optionally, you can also print the classification report here if needed
        # 'report': classification_report(all_labels, all_predictions)
    }

    results[model_label] = eval_results
    print(f"Evaluation results for {model_label}:\n{eval_results}")

# The results dictionary is now populated and can be used by the next cell


--- Fine-tuning XLM-Roberta with Custom Loop ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/32 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


  0%|          | 0/4 [00:00<?, ?it/s]

Epoch 2/3


  0%|          | 0/4 [00:00<?, ?it/s]

Epoch 3/3


  0%|          | 0/4 [00:00<?, ?it/s]

Evaluating model...


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Evaluation results for XLM-Roberta:
{'f1': np.float64(0.0), 'precision': np.float64(0.0), 'recall': np.float64(0.0)}

--- Fine-tuning mBERT with Custom Loop ---


Map:   0%|          | 0/32 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


  0%|          | 0/4 [00:00<?, ?it/s]

Epoch 2/3


  0%|          | 0/4 [00:00<?, ?it/s]

Epoch 3/3


  0%|          | 0/4 [00:00<?, ?it/s]

Evaluating model...


  0%|          | 0/1 [00:00<?, ?it/s]

Evaluation results for mBERT:
{'f1': np.float64(0.21238938053097345), 'precision': np.float64(0.42857142857142855), 'recall': np.float64(0.1411764705882353)}

--- Fine-tuning DistilBERT with Custom Loop ---


Map:   0%|          | 0/32 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


  0%|          | 0/4 [00:00<?, ?it/s]

Epoch 2/3


  0%|          | 0/4 [00:00<?, ?it/s]

Epoch 3/3


  0%|          | 0/4 [00:00<?, ?it/s]

Evaluating model...


  0%|          | 0/1 [00:00<?, ?it/s]

Evaluation results for DistilBERT:
{'f1': np.float64(0.0), 'precision': np.float64(0.0), 'recall': np.float64(0.0)}


  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# 8. Compare results
summary = []
for model_label, res in results.items():
    summary.append({
        'Model': model_label,
        'F1': res['f1'],
        'Precision': res['precision'],
        'Recall': res['recall']
    })

df = pd.DataFrame(summary)
df = df.sort_values('F1', ascending=False)
display(df)


Unnamed: 0,Model,F1,Precision,Recall
1,mBERT,0.212389,0.428571,0.141176
0,XLM-Roberta,0.0,0.0,0.0
2,DistilBERT,0.0,0.0,0.0
