## Install Dependencies

In [1]:
# !pip install -q transformers datasets evaluate

ERROR: Could not install packages due to an OSError: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-unpack-vzhwn5g4\\pyarrow-21.0.0-cp313-cp313-win_amd64.whl'
Check the permissions.



## Import Libraries

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import re
import torch


  from .autonotebook import tqdm as notebook_tqdm


## Load and Prepare the Dataset

In [None]:
# Load both annotated corpora and stack them into a single frame.
df1 = pd.read_csv("arwanda_hard_texts.csv")
df2 = pd.read_csv("realistic_kinyarwanda_hate_sarcasm_normal.csv")
df = pd.concat([df1, df2], ignore_index=True)

# Rows without a text or a label cannot be used for supervised training;
# drop them here so later cells never see missing values.
df = df.dropna(subset=["text", "label"]).reset_index(drop=True)

# Normalise labels: trim whitespace, lowercase, and fold the "sarcasm/joke"
# spelling into the single "sarcasm" class.
df["label"] = (
    df["label"]
    .str.strip()
    .str.lower()
    .replace({"sarcasm/joke": "sarcasm"})
)
df.head()


In [None]:
# (rows, columns) of the combined frame.
df.shape

### Label Distribution

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of class frequencies, most frequent class first.
label_order = df["label"].value_counts().index
ax = sns.countplot(data=df, x="label", order=label_order)
ax.set_title("Label Distribution")
ax.set_xlabel("Class")
ax.set_ylabel("Count")
plt.show()

### Label Encoding

In [None]:
# Map each class name to an integer id. The fitted encoder is kept so that
# ids can be translated back into class names later on.
label_encoder = LabelEncoder()
label_encoder.fit(df["label"])
df["label_id"] = label_encoder.transform(df["label"])

num_labels = len(label_encoder.classes_)
label_encoder.classes_

In [None]:
def clean_text(text):
    """Normalise a raw post into the form the classifier is trained on.

    Steps, in order: lowercase, delete every character that is neither a
    word character nor whitespace (punctuation, emojis, symbols), collapse
    whitespace runs into one space, squeeze any character repeated three or
    more times down to two, and trim both ends.

    Args:
        text: The raw text. ``None`` and NaN (what pandas yields for an empty
            CSV cell) are treated as empty text; any other non-string value
            is converted with ``str()`` first.

    Returns:
        The cleaned string (possibly empty).
    """
    # NaN is the only float that is not equal to itself; catching it here
    # avoids calling .lower() on a float, which raises AttributeError.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"(.)\1{2,}", r"\1\1", text)
    return text.strip()

# Overwrite the raw text with its cleaned form. Note that re-running this
# cell passes the already-cleaned text through clean_text again.
df["text"] = df["text"].apply(clean_text)
df.head()

## Train-Test Split + HuggingFace Dataset Conversion

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split so each class keeps its share in both partitions;
# the fixed random_state makes the split reproducible.
train_df, test_df = train_test_split(
    df, stratify=df["label_id"], test_size=0.2, random_state=42
)

# Keep only the model inputs. preserve_index=False stops the shuffled pandas
# index from being carried along as an extra "__index_level_0__" column.
train_ds = Dataset.from_pandas(train_df[["text", "label_id"]], preserve_index=False)
test_ds = Dataset.from_pandas(test_df[["text", "label_id"]], preserve_index=False)


### Word Cloud Visuals

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# One word cloud per class, built from every text carrying that label.
for label in df["label"].unique():
    class_texts = df.loc[df["label"] == label, "text"]
    cloud = WordCloud(width=800, height=400, background_color="white")
    cloud.generate(" ".join(class_texts))

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation='bilinear')
    ax.set_title(f"WordCloud for '{label}'")
    ax.axis("off")
    plt.show()


## Load Tokenizer & Tokenize Dataset

In [None]:
# Checkpoint identifier; the tokenizer is loaded from the same identifier
# that the classification model is loaded from further down.
model_name = "Davlan/afro-xlmr-mini"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_batch(batch):
    """Tokenize the "text" column of a batch into fixed-length encodings.

    Sequences are truncated and padded with ``max_length=128`` and
    ``padding="max_length"``.

    Args:
        batch: Mapping that exposes the texts under the key ``"text"``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = batch["text"]
    encoded = tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Apply tokenize_batch to both splits, batch by batch.
train_ds = train_ds.map(tokenize_batch, batched=True)
test_ds = test_ds.map(tokenize_batch, batched=True)


## Load Model

In [None]:
# Store the class names in the model config so the saved checkpoint maps
# logit positions to readable labels instead of generic LABEL_0, LABEL_1, ...
# LabelEncoder ids are positions in classes_, so enumerate() reproduces them.
id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

## Define Metrics

In [None]:
# Metric objects from the `evaluate` library; compute_metrics below calls
# their .compute() method.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Score one evaluation pass with accuracy and macro-averaged F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1_macro"``.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    acc_result = accuracy.compute(predictions=predicted_ids, references=labels)
    f1_result = f1.compute(predictions=predicted_ids, references=labels, average="macro")

    return {"accuracy": acc_result["accuracy"], "f1_macro": f1_result["f1"]}


## Set Training Arguments

In [None]:
training_args = TrainingArguments(
    output_dir="afro-xlmr-mini-output",
    # Optimisation schedule
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Mixed precision only when a CUDA device is present
    fp16=torch.cuda.is_available(),
    # Evaluate and checkpoint once per epoch, keep at most two checkpoints,
    # and reload the best one (by "f1_macro") when training finishes
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
)


In [None]:
# Rename the "label_id" column to "labels", the column name the Trainer
# expects for the targets.
train_ds = train_ds.rename_column("label_id", "labels")
test_ds = test_ds.rename_column("label_id", "labels")


## Create Trainer and Train

In [None]:
# Assemble the Trainer. The held-out split (test_ds) is also the per-epoch
# evaluation set, so the "best" checkpoint is selected on the same data that
# is later used to report final scores.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,  # NOTE(review): newer transformers releases appear to rename this keyword to `processing_class` — confirm against the installed version
    compute_metrics=compute_metrics
)

# Fine-tune using the settings in training_args.
trainer.train()


## Evaluate Model

In [None]:
# Evaluate on the Trainer's eval_dataset (test_ds) and display the result.
metrics = trainer.evaluate()
metrics

### Confusion Matrix

In [None]:
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# # Get true labels and predictions
# true_labels = test_df["label_id"].values
# test_texts = test_df["text"].tolist()

# pred_labels = []
# for text in test_texts:
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
#     # Move input tensors to the same device as the model
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     inputs = {k: v.to(device) for k, v in inputs.items()}
#     model.to(device)  # Ensure model is on the correct device

#     outputs = model(**inputs)
#     pred = torch.argmax(outputs.logits, dim=1).item()
#     pred_labels.append(pred)

# # Plot confusion matrix
# cm = confusion_matrix(true_labels, pred_labels)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)
# disp.plot(cmap="Blues", xticks_rotation=45)
# plt.title("Confusion Matrix")
# plt.show()

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import torch
import matplotlib.pyplot as plt

# Run inference on whichever device is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # evaluation mode (e.g. disables dropout)

# Ground-truth ids and texts of the held-out split. The texts were already
# passed through clean_text before the split was made.
true_labels = test_df["label_id"].values
test_texts = test_df["text"].tolist()

# Predict one text at a time; no_grad() skips gradient bookkeeping.
pred_labels = []
with torch.no_grad():
    for text in test_texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        outputs = model(**inputs)
        pred = torch.argmax(outputs.logits, dim=1).item()
        pred_labels.append(pred)

# Pin the label set to every encoded class so the matrix always has exactly
# one row/column per entry of display_labels, even when a class happens to be
# absent from both the true and the predicted labels of this split.
class_ids = np.arange(len(label_encoder.classes_))
cm = confusion_matrix(true_labels, pred_labels, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)

# Plot
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(cmap="Blues", xticks_rotation=45, ax=ax)
plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()

### Classification Report

In [None]:
from sklearn.metrics import classification_report

# Pass the full list of class ids explicitly: without `labels`, the report
# infers the classes from the data and raises if their count differs from the
# number of target_names.
print(
    classification_report(
        true_labels,
        pred_labels,
        labels=np.arange(len(label_encoder.classes_)),
        target_names=label_encoder.classes_,
    )
)


## Inference Examples

In [None]:
def predict_label(text):
    """Predict the class name for a single raw text.

    The text first goes through the same ``clean_text`` normalisation that
    was applied to the training data, so the model sees inference inputs in
    the form it was fine-tuned on (lowercased, punctuation and emojis
    removed).

    Args:
        text: Raw input string.

    Returns:
        The predicted class name, decoded through ``label_encoder``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # ensure the model is on the same device as the inputs
    model.eval()      # evaluation mode (e.g. disables dropout)

    inputs = tokenizer(
        clean_text(text),
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    return label_encoder.inverse_transform([predicted_class])[0]

# Test examples (trailing comments give the label the author expects)
examples = [
    "Umwana na se",  # should be normal
    "Abatutsi baracyariho",  # hate
    "Uwanga umwana ntakabone ijuru",  # normal
    "U Rwanda n igihugu kiza",  # normal
    "Inyenzi ninde wakubeshye ko zashize mu Rwanda",  # ambiguous
]

# The arrow separator was stored as mis-decoded bytes; restored to "→".
for ex in examples:
    print(f"{ex} → {predict_label(ex)}")

## More Examples (Adding Emojis)

In [None]:
# Classifier probes that mix in emojis and typographic apostrophes. These
# characters were stored as mis-decoded bytes; they are restored here to the
# intended Unicode characters.
examples = [
    "Sha murabeshya cyane 😂 ngo mwatsinze? Hmmm",
    "Abahutu ntacyo bamaze, rwose.",
    "Muraho neza, ndabashimiye ku bufasha bwanyu 🙏",
    "Ese wowe urumva nkwiye kwemera ibyo? Sha uransetsa kabisa",
    "Iyo myitwarire y’abatutsi ntitwakwihanganira pe!",
    "Turashimira Imana ko turi amahoro n’ubumwe",
    "Ngo uri umuyobozi? Hahahaha! Urimo kudusetsa sha.",
    "Inkotanyi nizindi nyenzi ziri kwica igihugu cyacu",
    "Imana ihe umugisha u Rwanda rwacu",
    "Kabisa uri umuntu mwiza, courage kuri byose",
    "Ngo mukunda abaturage, Ariko se ibyo murimo ni iki?",
    "Sha nta kindi nari ntegereje ku batutsi!",
]

for ex in examples:
    print(f"{ex} → {predict_label(ex)}")


## Save Model & Label Encoder

In [None]:
import joblib

# Write the fine-tuned model and its tokenizer into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("kinyarwanda-hatespeech-model")

# The fitted label encoder is stored as a separate pickle; it is what turns
# predicted ids back into class names.
joblib.dump(label_encoder, "label_encoder.pkl")
