In [3]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (XLMRobertaTokenizerFast, XLMRobertaForTokenClassification, 
                          DistilBertTokenizerFast, DistilBertForTokenClassification,
                          BertTokenizerFast, BertForTokenClassification,
                          Trainer, TrainingArguments, DataCollatorForTokenClassification)
import evaluate

# Load the labeled dataset in CoNLL format
def load_conll_data(file_path):
    """Loads CoNLL formatted data into a pandas DataFrame."""
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        sentence = []
        labels = []
        for line in f:
            if line.strip():
                token, label = line.strip().split()
                sentence.append(token)
                labels.append(label)
            else:
                data.append((sentence, labels))
                sentence = []
                labels = []
    if sentence:  # For the last sentence if there is no newline
        data.append((sentence, labels))
    return data

conll_file_path = '../output/labeled_telegram_data.conll'
data = load_conll_data(conll_file_path)

# Convert data into a DataFrame
df = pd.DataFrame(data, columns=['tokens', 'labels'])

# Check the size of the dataset
if len(df) > 1:
    # Split into train and validation sets if you have more than one sample
    train_df, val_df = train_test_split(df, test_size=0.2)
else:
    # If the dataset is too small, use the entire dataset for both training and validation
    print("Dataset too small to split. Using the entire dataset for training and evaluation.")
    train_df = df
    val_df = df

# Convert to Hugging Face dataset format
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Tokenization and alignment of labels (same function as before)
def tokenize_and_align_labels(examples, tokenizer):
    """Tokenizes inputs and aligns labels."""
    tokenized_inputs = tokenizer(examples['tokens'], is_split_into_words=True, padding=True, truncation=True)

    labels = []
    for i, label in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # get word ids
        label_ids = [-100] * len(tokenized_inputs['input_ids'][i])  # default to -100 (ignore index)

        # Align labels with tokenized inputs
        for j, label_id in enumerate(label):
            if j < len(word_ids) and word_ids[j] is not None:  # avoid IndexError
                if label_id in label_map:  # Check if label_id exists in label_map
                    label_ids[word_ids[j]] = label_map[label_id]  # map label to its id

        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

# Map label names to IDs
label_list = list(set(label for labels in df['labels'] for label in labels))
label_list = sorted(label_list)
label_map = {label: i for i, label in enumerate(label_list)}

# Metrics function using Hugging Face's `evaluate` library
metric = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)

    # Remove ignored index (-100)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Define model names for comparison
model_names = [
    "xlm-roberta-base",  # XLM-Roberta
    "distilroberta-base",  # DistilRoBERTa
    "bert-base-multilingual-cased",  # mBERT
]

# Store results
results = {}

for model_name in model_names:
    print(f"Training model: {model_name}")
    
    # Load tokenizer and model based on the model name
    try:
        if model_name == "xlm-roberta-base":
            tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name)
            model = XLMRobertaForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))
        elif model_name == "distilroberta-base":
            tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
            model = DistilBertForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))
        elif model_name == "bert-base-multilingual-cased":
            tokenizer = BertTokenizerFast.from_pretrained(model_name)
            model = BertForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))

        # Initialize the data collator (moved inside the loop)
        data_collator = DataCollatorForTokenClassification(tokenizer, padding=True)

    except Exception as e:
        print(f"Error loading model {model_name}: {e}")
        continue

    # Tokenize the datasets
    tokenized_train_dataset = train_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer), 
                                                batched=True, remove_columns=['tokens', 'labels'])
    tokenized_val_dataset = val_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer), 
                                             batched=True, remove_columns=['tokens', 'labels'])

    # Set up training arguments
    training_args = TrainingArguments(
        output_dir=f'../results/{model_name}',  # output directory
        evaluation_strategy="epoch",           # evaluate every epoch
        learning_rate=2e-5,                    # learning rate
        per_device_train_batch_size=16,        # batch size for training
        per_device_eval_batch_size=16,         # batch size for evaluation
        num_train_epochs=3,                    # total number of training epochs
        weight_decay=0.01,                     # strength of weight decay
        logging_dir=f'../logs/{model_name}',   # directory for storing logs
    )

    # Create a Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_val_dataset,   # Use validation set for evaluation
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Start training
    trainer.train()

    # Evaluate the fine-tuned model on the validation set
    eval_results = trainer.evaluate()
    results[model_name] = eval_results
    print(f"Evaluation results for {model_name}: {eval_results}")

# Print summary of results
print("\nModel Comparison Results:")
for model_name, result in results.items():
    print(f"{model_name}: Precision={result['eval_precision']:.4f}, Recall={result['eval_recall']:.4f}, F1={result['eval_f1']:.4f}, Accuracy={result['eval_accuracy']:.4f}")

# Step 7: Save the fine-tuned models
for model_name in model_names:
    model.save_pretrained(f'../models/{model_name}')
    tokenizer.save_pretrained(f'../models/{model_name}')
    print(f"Model and tokenizer for {model_name} saved successfully!")


Dataset too small to split. Using the entire dataset for training and evaluation.
Training model: xlm-roberta-base


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1/1 [00:02<00:00,  2.36s/ examples]
Map: 100%|██████████| 1/1 [00:02<00:00,  2.24s/ examples]
 33%|███▎      | 1/3 [00:06<00:13,  6.50s/it]
 33%|███▎      | 1/3 [00:07<00:13,  6.50s/it]

{'eval_loss': 1.1206722259521484, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9612676056338029, 'eval_runtime': 0.6924, 'eval_samples_per_second': 1.444, 'eval_steps_per_second': 1.444, 'epoch': 1.0}


 67%|██████▋   | 2/3 [00:12<00:06,  6.25s/it]
 67%|██████▋   | 2/3 [00:13<00:06,  6.25s/it]

{'eval_loss': 1.0189059972763062, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9647887323943662, 'eval_runtime': 1.2572, 'eval_samples_per_second': 0.795, 'eval_steps_per_second': 0.795, 'epoch': 2.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 3/3 [00:29<00:00,  9.83s/it]


{'eval_loss': 0.965729832649231, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9647887323943662, 'eval_runtime': 1.1206, 'eval_samples_per_second': 0.892, 'eval_steps_per_second': 0.892, 'epoch': 3.0}
{'train_runtime': 29.4889, 'train_samples_per_second': 0.102, 'train_steps_per_second': 0.102, 'train_loss': 1.1376570065816243, 'epoch': 3.0}


100%|██████████| 1/1 [00:00<00:00, 90.65it/s]


Evaluation results for xlm-roberta-base: {'eval_loss': 0.965729832649231, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9647887323943662, 'eval_runtime': 1.0934, 'eval_samples_per_second': 0.915, 'eval_steps_per_second': 0.915, 'epoch': 3.0}
Training model: distilroberta-base


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'DistilBertTokenizerFast'.


Error loading model distilroberta-base: 
 requires the protobuf library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.

Training model: bert-base-multilingual-cased


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1/1 [00:02<00:00,  2.89s/ examples]
Map: 100%|██████████| 1/1 [00:02<00:00,  2.64s/ examples]
 33%|███▎      | 1/3 [00:07<00:14,  7.41s/it]
 33%|███▎      | 1/3 [00:08<00:14,  7.41s/it]

{'eval_loss': 0.7742921710014343, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9090909090909091, 'eval_runtime': 0.6847, 'eval_samples_per_second': 1.46, 'eval_steps_per_second': 1.46, 'epoch': 1.0}


 67%|██████▋   | 2/3 [00:13<00:06,  6.57s/it]
 67%|██████▋   | 2/3 [00:14<00:06,  6.57s/it]

{'eval_loss': 0.5040915012359619, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9602272727272727, 'eval_runtime': 0.92, 'eval_samples_per_second': 1.087, 'eval_steps_per_second': 1.087, 'epoch': 2.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 3/3 [00:24<00:00,  8.13s/it]


{'eval_loss': 0.42024707794189453, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9602272727272727, 'eval_runtime': 1.0443, 'eval_samples_per_second': 0.958, 'eval_steps_per_second': 0.958, 'epoch': 3.0}
{'train_runtime': 24.3896, 'train_samples_per_second': 0.123, 'train_steps_per_second': 0.123, 'train_loss': 0.8668272495269775, 'epoch': 3.0}


100%|██████████| 1/1 [00:00<00:00, 55.39it/s]


Evaluation results for bert-base-multilingual-cased: {'eval_loss': 0.42024707794189453, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9602272727272727, 'eval_runtime': 1.6265, 'eval_samples_per_second': 0.615, 'eval_steps_per_second': 0.615, 'epoch': 3.0}

Model Comparison Results:
xlm-roberta-base: Precision=0.0000, Recall=0.0000, F1=0.0000, Accuracy=0.9648
bert-base-multilingual-cased: Precision=0.0000, Recall=0.0000, F1=0.0000, Accuracy=0.9602
Model and tokenizer for xlm-roberta-base saved successfully!
Model and tokenizer for distilroberta-base saved successfully!
Model and tokenizer for bert-base-multilingual-cased saved successfully!
