In [None]:
# Install required libraries
!pip install datasets

# Clone the repository
!git clone https://github.com/hausanlp/HERDPhobia

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
# Import required libraries
import pandas as pd
import torch
import os
from datasets import Dataset, DatasetDict
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    default_data_collator,
    DataCollatorWithPadding
)
import numpy as np
from huggingface_hub import notebook_login
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from transformers import TrainingArguments, Trainer
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.preprocessing import LabelEncoder

In [None]:

# Read the three HERDPhobia splits (tab-separated files from the cloned repository)
train_df = pd.read_csv("HERDPhobia/train.tsv", sep="\t")
dev_df = pd.read_csv("HERDPhobia/dev.tsv", sep="\t")
test_df = pd.read_csv("HERDPhobia/test.tsv", sep="\t")

# Replace label values with integer ids; the mapping is learned from the
# training split only and then applied unchanged to every split
le = LabelEncoder().fit(train_df["label"])
for split_df in (train_df, dev_df, test_df):
    split_df["label"] = le.transform(split_df["label"])

# Wrap the DataFrames in a Hugging Face DatasetDict keyed by split name
split_frames = {"train": train_df, "validation": dev_df, "test": test_df}
herdphobia_dataset = DatasetDict(
    {split: Dataset.from_pandas(frame) for split, frame in split_frames.items()}
)

# Checkpoint shared by the tokenizer and the classifier.
# NOTE(review): the checkpoint name suggests it was fine-tuned for NER — confirm
# it is the intended starting point for sequence classification.
model_name = "masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0"
num_classes = len(le.classes_)  # one output per label seen by the encoder

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
)

# Tokenize
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    encoded examples have the same length.
    """
    encoding_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(examples["text"], **encoding_options)

# Tokenize every split in batches, drop the raw text column and rename the
# target column from "label" to "labels"
tokenized_datasets = (
    herdphobia_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
# Return torch tensors when the datasets are indexed
tokenized_datasets.set_format("torch")

# Define metrics function for multi-class classification
def compute_metrics(eval_pred):
    """Compute aggregate and per-class classification metrics.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is a 2-D
            array of scores with one column per class; ``labels`` holds the
            integer class ids.

    Returns:
        dict with ``accuracy`` and the support-weighted ``f1``, ``precision``
        and ``recall`` (same keys as before), plus ``f1_macro`` and one
        ``class_<id>_f1`` entry per class. The extra entries expose
        minority-class performance, which a support-weighted average hides on
        imbalanced data.
    """
    scores, labels = eval_pred
    predictions = np.argmax(scores, axis=1)
    # Fix the class set from the score columns so per-class keys are stable
    # even when a class is absent from a particular evaluation batch/split.
    class_ids = np.arange(np.shape(scores)[1])

    # zero_division=0 scores an undefined metric as 0 without emitting a warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )
    per_class_f1 = precision_recall_fscore_support(
        labels, predictions, labels=class_ids, average=None, zero_division=0
    )[2]

    metrics = {
        'accuracy': accuracy_score(labels, predictions),
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'f1_macro': float(np.mean(per_class_f1)),
    }
    for class_id, class_f1 in zip(class_ids, per_class_f1):
        metrics[f'class_{class_id}_f1'] = float(class_f1)
    return metrics

# Training setup
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`
    # argument (available in transformers >= 4.41); evaluate after every epoch.
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Disable external experiment trackers so training does not stop at an
    # interactive Weights & Biases API-key prompt; set to "wandb" to re-enable.
    report_to="none",
)



In [None]:
# Build the Trainer from the model, arguments, tokenized splits and metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    # Inputs are already padded to a fixed length during tokenization, so the
    # default collator (no dynamic padding) is sufficient.
    data_collator=default_data_collator,
    # `processing_class` replaces the deprecated `tokenizer` argument
    # (transformers >= 4.46; older releases need `tokenizer=tokenizer`).
    processing_class=tokenizer,
)



  trainer = Trainer(


In [None]:
# Fine-tune the model; the returned training summary is shown as the cell result
train_output = trainer.train()
train_output

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmohamed-pagna[0m ([33mmohamed-pagna-aims-cameroon[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.709105,0.804989,0.724457,0.843279,0.804989
2,0.424600,0.496367,0.857143,0.85281,0.850515,0.857143
3,0.257400,0.867683,0.85941,0.858192,0.857157,0.85941
4,0.092600,1.049722,0.861678,0.856756,0.854628,0.861678
5,0.092600,1.101179,0.848073,0.847751,0.847441,0.848073


TrainOutput(global_step=1935, training_loss=0.2064034843937679, metrics={'train_runtime': 2094.0395, 'train_samples_per_second': 7.378, 'train_steps_per_second': 0.924, 'total_flos': 3599584890854400.0, 'train_loss': 0.2064034843937679, 'epoch': 5.0})

In [None]:
# Score the fine-tuned model on the held-out test split
test_split = tokenized_datasets["test"]
test_results = trainer.evaluate(test_split)
print(f"Test set results: {test_results}")

Test set results: {'eval_loss': 1.2534674406051636, 'eval_accuracy': 0.8382352941176471, 'eval_f1': 0.8387349895926451, 'eval_precision': 0.8392580898237008, 'eval_recall': 0.8382352941176471, 'eval_runtime': 18.6519, 'eval_samples_per_second': 47.395, 'eval_steps_per_second': 3.002, 'epoch': 5.0}


In [None]:
# Evaluate on the test split and print the metrics in a readable layout
test_results = trainer.evaluate(tokenized_datasets["test"])

# Print all available metrics first to verify keys
print("All available metrics:")
for key, value in test_results.items():
    # Only numeric entries can take the float format; print anything else as-is
    if isinstance(value, (int, float, np.number)):
        print(f"{key}: {value:.4f}")
    else:
        print(f"{key}: {value}")

# Aggregate scores exactly as returned by compute_metrics
print("\nBinary Classification Report:")
for title, key in (
    ("Accuracy", "eval_accuracy"),
    ("F1 Score", "eval_f1"),
    ("Precision", "eval_precision"),
    ("Recall", "eval_recall"),
):
    print(f"{title}: {test_results[key]:.4f}")

# Per-class F1, when reported under either supported key layout.
# Class names come from the label encoder instead of being hard-coded, and only
# classes whose key is actually present are printed (no KeyError on a missing one).
for key_template in ("eval_class_{}_f1", "eval_f1_class{}"):
    class_ids = [
        i for i in range(len(le.classes_)) if key_template.format(i) in test_results
    ]
    if not class_ids:
        continue
    print("\nPer-class Metrics:")
    for i in class_ids:
        print(f"Class {i} ({le.classes_[i]}) F1: {test_results[key_template.format(i)]:.4f}")
    break

All available metrics:
eval_loss: 1.2535
eval_accuracy: 0.8382
eval_f1: 0.8387
eval_precision: 0.8393
eval_recall: 0.8382
eval_runtime: 18.4397
eval_samples_per_second: 47.9400
eval_steps_per_second: 3.0370
epoch: 5.0000

Binary Classification Report:
Accuracy: 0.8382
F1 Score: 0.8387
Precision: 0.8393
Recall: 0.8382


In [None]:
# Per-class precision / recall / F1 on the test split.
# classification_report is already imported in the imports cell at the top of
# the notebook, so it is not re-imported here.
predictions = trainer.predict(tokenized_datasets["test"])

# Label rows with the original label values known to the encoder, and pass the
# full class list so the report layout does not depend on which classes happen
# to appear in the predictions.
class_names = [str(name) for name in le.classes_]
print(
    classification_report(
        predictions.label_ids,
        predictions.predictions.argmax(-1),
        labels=np.arange(len(class_names)),
        target_names=class_names,
    )
)

              precision    recall  f1-score   support

           0       0.90      0.90      0.90       705
           1       0.60      0.61      0.60       179

    accuracy                           0.84       884
   macro avg       0.75      0.75      0.75       884
weighted avg       0.84      0.84      0.84       884

