In [None]:
# 1. Install dependencies
!pip install transformers datasets seqeval matplotlib seaborn --quiet


In [None]:
# 2. Mount Google Drive
# Needs the `google.colab` package (presumably only available on a Colab runtime).
# The Drive is mounted at /content/drive, which is the root of the dataset path and
# of the output/log directories used later in this notebook.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# 3. Set data path (update if needed)
# Location of the token/tag file that `read_conll` parses. It lives under the
# /content/drive mount point, so the Drive must be mounted before it is read.
conll_path = '/content/drive/MyDrive/NER/ner_sample_conll.txt'


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def read_conll(filepath):
    """Parse a whitespace-separated token/tag file into sentences.

    Each non-blank line contributes one token (its first field) and one tag
    (its last field); lines with fewer than two fields are ignored. Blank
    lines terminate the current sentence.

    Args:
        filepath: Path of the UTF-8 encoded file to read.

    Returns:
        A tuple ``(sentences, labels)`` of two parallel lists: one list of
        tokens and one list of tags per sentence.
    """
    sentences, labels = [], []
    current_tokens, current_tags = [], []

    def _flush():
        # Close the sentence being built, if it has any tokens.
        nonlocal current_tokens, current_tags
        if current_tokens:
            sentences.append(current_tokens)
            labels.append(current_tags)
            current_tokens, current_tags = [], []

    with open(filepath, encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.split()
            if not fields:
                # Blank (or whitespace-only) line: sentence boundary.
                _flush()
                continue
            if len(fields) < 2:
                # A lone field has no tag to pair with; skip it.
                continue
            current_tokens.append(fields[0])
            current_tags.append(fields[-1])

    # The file may not end with a blank line.
    _flush()
    return sentences, labels

sentences, ner_tags = read_conll(conll_path)

# Flatten the per-sentence tags once; the flat list feeds both the tag
# vocabulary and the distribution plot.
all_labels = [tag for tags in ner_tags for tag in tags]

# Tag vocabulary in sorted order, with lookups in both directions.
unique_tags = sorted(set(all_labels))
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = dict(enumerate(unique_tags))

# Visualize entity distribution
label_counts = pd.Series(all_labels).value_counts()
fig, ax = plt.subplots(figsize=(8,4))
sns.barplot(x=label_counts.index, y=label_counts.values, palette='husl', ax=ax)
ax.set_title('Entity Label Distribution')
ax.set_ylabel('Count')
ax.set_xlabel('Entity Label')
plt.show()


In [None]:
from datasets import Dataset, DatasetDict

# Encode every tag string as its integer id, sentence by sentence.
encoded_tags = [[tag2id[tag] for tag in tags] for tags in ner_tags]
data = {'tokens': sentences, 'ner_tags': encoded_tags}

# 80/20 train/test split with a fixed seed, exposed as a DatasetDict
# with exactly the 'train' and 'test' splits.
splits = Dataset.from_dict(data).train_test_split(test_size=0.2, seed=42)
dataset = DatasetDict({split_name: splits[split_name] for split_name in ('train', 'test')})


In [None]:
import time
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

# Models to compare: display label -> checkpoint identifier.
# The label is used in the printed headers, in the output/log directory names and as
# the key of `results`; the identifier is what gets passed to `from_pretrained`.
model_names = {
    'XLM-Roberta': 'xlm-roberta-base',
    'mBERT': 'bert-base-multilingual-cased',
    'DistilBERT': 'distilbert-base-multilingual-cased'
}

def tokenize_and_align_labels(examples, tokenizer):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Only the first sub-token of each word keeps the word's label. Special
    tokens (word id ``None``) and continuation sub-tokens are labelled -100,
    the value the evaluation code in this notebook filters out. Labelling
    continuation pieces with the word's own tag would repeat ``B-*`` tags
    inside a single word and make entity-level metrics count extra entities.

    Args:
        examples: Batch mapping with ``'tokens'`` (list of word lists) and
            ``'ner_tags'`` (list of integer label lists, one label per word).
        tokenizer: Callable invoked as
            ``tokenizer(words, truncation=True, is_split_into_words=True)``;
            its result must expose ``word_ids(batch_index=i)`` and accept
            item assignment.

    Returns:
        The tokenizer output with an added ``'labels'`` entry holding one
        list of label ids per example, aligned with the sub-tokens.
    """
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)
    labels = []
    for i, word_labels in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special token, or a continuation piece of the previous word.
                label_ids.append(-100)
            else:
                # First sub-token of a new word carries the word's label.
                label_ids.append(word_labels[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs['labels'] = labels
    return tokenized_inputs

def _decode_tag_sequences(p):
    """Turn an evaluation prediction object into string tag sequences.

    Reads ``p.predictions`` (arg-maxed over axis 2, so presumably shaped
    (batch, seq_len, num_labels) -- TODO confirm) and ``p.label_ids``.
    Positions whose gold label is -100 are dropped from both sequences.

    Returns:
        ``(true_tags, pred_tags)``: two parallel lists of lists of tag strings.
    """
    predicted_ids = np.argmax(p.predictions, axis=2)
    true_tags = [
        [id2tag[lab] for lab in label_row if lab != -100]
        for label_row in p.label_ids
    ]
    pred_tags = [
        [id2tag[pred] for pred, lab in zip(pred_row, label_row) if lab != -100]
        for pred_row, label_row in zip(predicted_ids, p.label_ids)
    ]
    return true_tags, pred_tags


def compute_metrics(p):
    """Compute seqeval F1 / precision / recall plus a text report.

    The tag sequences are decoded once and shared by all four metrics.
    ``'report'`` is a string, not a number.
    NOTE(review): some logging integrations may warn about or drop
    non-numeric metrics -- confirm against the reporters in use.
    """
    true_tags, pred_tags = _decode_tag_sequences(p)
    return {
        'f1': f1_score(true_tags, pred_tags),
        'precision': precision_score(true_tags, pred_tags),
        'recall': recall_score(true_tags, pred_tags),
        'report': classification_report(true_tags, pred_tags),
    }


# model label -> evaluation metrics (with and without the 'eval_' prefix) + 'train_time'
results = {}

for model_label, model_checkpoint in model_names.items():
    print(f'\n--- Fine-tuning {model_label} ---')
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    # Tokenization is model-specific, so the dataset is re-tokenized per checkpoint.
    tokenized_datasets = dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer), batched=True)
    tokenized_datasets = tokenized_datasets.remove_columns(['tokens', 'ner_tags'])
    tokenized_datasets.set_format('torch')

    # Passing the label maps stores readable tag names in the model config
    # instead of the generic LABEL_<n> placeholders.
    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(tag2id),
        id2label=id2tag,
        label2id=tag2id,
    )
    data_collator = DataCollatorForTokenClassification(tokenizer)

    args = TrainingArguments(
        output_dir=f'/content/drive/MyDrive/NER/results/{model_label}',
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir=f'/content/drive/MyDrive/NER/logs/{model_label}',
        logging_steps=10
    )

    # NOTE(review): newer transformers releases deprecate the `tokenizer` argument in
    # favour of `processing_class` -- confirm against the installed version.
    trainer = Trainer(
        model,
        args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['test'],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    start = time.time()
    trainer.train()
    elapsed = time.time() - start
    eval_results = trainer.evaluate()

    # Trainer.evaluate() prefixes every metric name with 'eval_' (e.g. 'eval_f1',
    # 'eval_report'), so the bare names returned by compute_metrics are not keys
    # of `eval_results`. Expose the metrics under their unprefixed names as well.
    prefix = 'eval_'
    metrics = {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in eval_results.items()
    }
    print(metrics['report'])
    results[model_label] = {**eval_results, **metrics, 'train_time': elapsed}


In [None]:
def _metric(res, name):
    """Fetch metric `name` from an evaluation dict.

    Accepts both the bare name (e.g. 'f1') and the 'eval_'-prefixed name
    (e.g. 'eval_f1') that `Trainer.evaluate()` returns, so the summary does
    not raise KeyError on prefixed results.
    """
    if name in res:
        return res[name]
    return res[f'eval_{name}']


# One row per fine-tuned model.
summary = []
for model_label, res in results.items():
    summary.append({
        'Model': model_label,
        'F1': _metric(res, 'f1'),
        'Precision': _metric(res, 'precision'),
        'Recall': _metric(res, 'recall'),
        'Train Time (s)': res['train_time']
    })

# Best model first.
df = pd.DataFrame(summary)
df = df.sort_values('F1', ascending=False)
display(df)

plt.figure(figsize=(8,4))
sns.barplot(x='Model', y='F1', data=df, palette='husl')
plt.title('F1 Score by Model')
plt.show()
