In [1]:
# mount google drive to access files and save outputs persistently within Colab
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [30]:
# check for GPU availability
!nvidia-smi

Mon Aug  4 20:42:23 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   31C    P0             49W /  400W |    5879MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [31]:
import warnings

# Filter out the specific FutureWarning related to encoder_attention_mask
warnings.filterwarnings(
    "ignore",
    message="`encoder_attention_mask` is deprecated and will be removed in version 4.55.0 for `BertSdpaSelfAttention.forward`.",
    category=FutureWarning
)

In [2]:
# install wandb
!pip install wandb --quiet

In [32]:
# Disable Weights & Biases (wandb) to allow training without requiring an API key or logging
import os
os.environ["WANDB_DISABLED"] = "true"

In [33]:
# load the preprocessed train, validation and test datasets
import pandas as pd

train_df = pd.read_csv('/content/drive/MyDrive/Safaricom-processed-dataset/train_processed.csv')
val_df = pd.read_csv('/content/drive/MyDrive/Safaricom-processed-dataset/val_processed.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Safaricom-processed-dataset/test_processed.csv')

In [34]:
# view the first 5 rows in train dataset
train_df.head()

Unnamed: 0,Content,Likes,Retweets,Replies,Quotes,Views,Labels,cleaned_sentence,encoded_labels
0,@ledamalekina @safaricom Exactly,1,0,0,0,0,Neutral,ledamalekina safaricom Exactly,6
1,@InteriorKE And why are FanakaLotto still oper...,0,0,0,0,0,Data protection and privacy concern,InteriorKE And why are FanakaLotto still opera...,1
2,@safaricom rudisheni hii na mnipee bundles .sa...,1,0,0,0,16,Internet or airtime bundle complaint,safaricom rudisheni hii na mnipee bundles sasa...,3
3,@RobertAlai @safaricom @PeterNdegwa_ Hii manen...,0,0,0,0,48,Neutral,RobertAlai safaricom PeterNdegwa Hii maneno pe...,6
4,"@evenmaina @safaricom PLC I hate you , mlikwam...",0,0,0,0,16,Customer care complaint,evenmaina safaricom PLC I hate you mlikwamilia...,0


In [35]:
# Check the distribution of the target variable in the training data
label_counts = train_df['encoded_labels'].value_counts().sort_index()
print("Class distribution in training data:")
print(label_counts)

Class distribution in training data:
encoded_labels
0     327
1     155
2     238
3     260
4     215
5     218
6    1598
Name: count, dtype: int64


In [36]:
# initialize the pre-trained XLM-RoBERTa tokenizer
from transformers import BertTokenizer, XLMRobertaTokenizer, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')

In [37]:
'''
Tokenizes a list of texts using the pre-defined tokenizer.

Args:
    texts (list or pd.Series): List or Series of sentences to tokenize.
    max_length (int): Maximum sequence length after padding/truncation.

    Returns:
    dict: Dictionary of tokenized outputs as PyTorch tensors.
'''
def tokenize_texts(texts, max_length=256):
    return tokenizer(
        list(texts),
        padding='max_length', # pad shorter texts to max_length
        truncation=True, # truncate longer texts
        max_length=max_length,
        return_tensors='pt' # return PyTorch tensors
    )
# Tokenize the cleaned sentences from each dataset split for model input
train_tokens = tokenize_texts(train_df['cleaned_sentence'].tolist())
val_tokens = tokenize_texts(val_df['cleaned_sentence'].tolist())
test_tokens = tokenize_texts(test_df['cleaned_sentence'].tolist())

In [38]:
'''
Convert label columns from each dataset split into PyTorch tensors
For compatibility with model training and loss calculation
'''
import torch

train_labels = torch.tensor(train_df['encoded_labels'].values)
val_labels = torch.tensor(val_df['encoded_labels'].values)
test_labels = torch.tensor(test_df['encoded_labels'].values)

In [39]:
# define the custom Pytorch Dataset
from torch.utils.data import Dataset

class TweetDataset(Dataset):
  """
  A custom Pytorch dataset for handling tokenized inputs and labels for tweet classification
  It allows easy batching and data loading during training and evaluation
  """
  def __init__(self, tokens, labels):
    """
    Args:
        tokens (list): Tokenized inputs like input_ids and attention mask (Pytorch tensors).
        labels (torch.Tensor): Corresponding labels tensor
    """
    self.tokens = tokens
    self.labels = labels

  def __getitem__(self, idx):
    """
    Retrieve a single sample by index

    Returns:
        dict: Toknized inputs and corresponding labels
    """
    item = {key: val[idx] for key, val in self.tokens.items()}
    item['labels'] = self.labels[idx]
    return item

  def __len__(self):
    # returns the total number of samples
    return len(self.labels)

In [40]:
# create the dataset objects for train, validation and test sets
train_dataset = TweetDataset(train_tokens, train_labels)
val_dataset = TweetDataset(val_tokens, val_labels)
test_dataset = TweetDataset(test_tokens, test_labels)

## Model Training and Optimization

In [41]:
from transformers import BertForSequenceClassification, XLMRobertaForSequenceClassification, AutoModelForSequenceClassification

# load the pre-trained XLM-RoBERTa model with a classification head for 7 classes
model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-large', num_labels=7)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# # Set up the optimizer and learning rate scheduler for training

# from torch.optim import AdamW
# from transformers import get_linear_schedule_with_warmup

# batch_size = 16
# num_epochs = 3

# # AdamW optimizer is commonly used with transformer models for weight decay regularization
# optimizer = AdamW(model.parameters(), lr=2e-5)

# # Calculate total number of training steps (batches * epochs)
# num_training_steps = len(train_dataset) // batch_size * num_epochs

# # Linear learning rate scheduler with optional warmup steps (none here)
# # Gradually decreases the learning rate from the initial value to zero over training
# lr_scheduler = get_linear_schedule_with_warmup(
#     optimizer,
#     num_warmup_steps=0,
#     num_training_steps=num_training_steps
# )

In [42]:
# define the training configurations and hyperparameters for the trainer API
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='/content/results/',     # output directory
    num_train_epochs=5,                 # number of training epochs
    per_device_train_batch_size=16,     # training batch size per device
    per_device_eval_batch_size=16,      # evaluation batch size
    eval_strategy='epoch',              # evaluate at the end of each epoch
    save_strategy='epoch',              # save checkpoint every epoch
    learning_rate=2e-5,                 # learning rate
    weight_decay=0.01,                  # weight decay for regularization
    logging_dir=None,                   # logging directory
    logging_steps=50,                   # log every 50 steps
    load_best_model_at_end=False,       # load the best model at the end of training
    metric_for_best_model='f1',         # metric to monitor for best model
    greater_is_better=True              # higher metric values are better
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [43]:
# define the metric computation function
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """
    Compute accuracy, precision, recall, and F1-score for model evaluation.

    Args:
        eval_pred (tuple): Tuple containing logits (model outputs) and true labels.

    Returns:
        dict: Dictionary with 'accuracy', 'f1', 'precision', and 'recall' scores.
    """

    logits, labels = eval_pred
    # Convert logits to predicted class indices
    predictions = np.argmax(logits, axis=-1)

    # Calculate precision, recall, f1-score with weighted average (handles class imbalance)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')

    # Calculate overall accuracy
    acc = accuracy_score(labels, predictions)

    # Return all metrics in a dictionary format expected by Trainer
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
from transformers import Trainer

# initialize the Hugging Face trainer with model, datasets, tokenizer, and evaluation metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# initialize the training process
trainer.train()

  trainer = Trainer(
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.2806,0.96249,0.674671,0.642883,0.660637,0.674671
2,0.8691,0.742392,0.749032,0.734229,0.737821,0.749032
3,0.5916,0.705653,0.779241,0.771078,0.773241,0.779241
4,0.4317,0.69325,0.775368,0.774539,0.776855,0.775368
5,0.2886,0.730946,0.77227,0.771129,0.773421,0.77227


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=945, training_loss=0.7231592168252935, metrics={'train_runtime': 644.9981, 'train_samples_per_second': 23.341, 'train_steps_per_second': 1.465, 'total_flos': 7015231850411520.0, 'train_loss': 0.7231592168252935, 'epoch': 5.0})

In [45]:
# Save the fine-tuned model and tokenizer to the specified directory
# This allows loading the trained model/tokenizer later for inference or further training
model.save_pretrained('model/')
tokenizer.save_pretrained('model/')

('model/tokenizer_config.json',
 'model/special_tokens_map.json',
 'model/sentencepiece.bpe.model',
 'model/added_tokens.json',
 'model/tokenizer.json')

In [48]:
# evaluate on the test dataset
from sklearn.metrics import classification_report
import numpy as np
import torch

# Get predictions from the trainer
results = trainer.evaluate(test_dataset)
print(results)

  return forward_call(*args, **kwargs)


{'eval_loss': 0.7143315076828003, 'eval_accuracy': 0.7885032537960954, 'eval_f1': 0.7866287463898118, 'eval_precision': 0.7876804781308033, 'eval_recall': 0.7885032537960954, 'eval_runtime': 18.8787, 'eval_samples_per_second': 97.676, 'eval_steps_per_second': 6.144, 'epoch': 5.0}


In [49]:
# Extract logits and labels
predictions = trainer.predict(test_dataset)
logits = predictions.predictions
labels = predictions.label_ids

# Convert logits to predicted class indices
predicted_labels = np.argmax(logits, axis=-1)

# Print the classification report
print(classification_report(labels, predicted_labels))

  return forward_call(*args, **kwargs)


              precision    recall  f1-score   support

           0       0.65      0.55      0.59       201
           1       0.72      0.60      0.66        95
           2       0.61      0.66      0.63       146
           3       0.74      0.84      0.79       159
           4       0.70      0.80      0.75       132
           5       0.74      0.68      0.71       133
           6       0.88      0.88      0.88       978

    accuracy                           0.79      1844
   macro avg       0.72      0.72      0.71      1844
weighted avg       0.79      0.79      0.79      1844



## mBERT Fine Tuning

In [50]:
# define the mBERT tokenizer
from transformers import BertTokenizer, AutoTokenizer

bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

In [51]:
'''
Tokenizes a list of texts using the pre-defined tokenizer.

Args:
    texts (list or pd.Series): List or Series of sentences to tokenize.
    max_length (int): Maximum sequence length after padding/truncation.

    Returns:
    dict: Dictionary of tokenized outputs as PyTorch tensors.
'''
def bert_tokenize_texts(texts, max_length=128):
    return bert_tokenizer(
        list(texts),
        padding='max_length', # pad shorter texts to max_length
        truncation=True, # truncate longer texts
        max_length=max_length,
        return_tensors='pt' # return PyTorch tensors
    )
# Tokenize the cleaned sentences from each dataset split for model input
bert_train_tokens = bert_tokenize_texts(train_df['cleaned_sentence'].tolist())
bert_val_tokens = bert_tokenize_texts(val_df['cleaned_sentence'].tolist())
bert_test_tokens = bert_tokenize_texts(test_df['cleaned_sentence'].tolist())

In [52]:
# create the dataset objects for train, validation and test sets
bert_train_dataset = TweetDataset(bert_train_tokens, train_labels)
bert_val_dataset = TweetDataset(bert_val_tokens, val_labels)
bert_test_dataset = TweetDataset(bert_test_tokens, test_labels)

In [53]:
# define the BERT model
from transformers import AutoModelForSequenceClassification
bert_model = AutoModelForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=7)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [54]:
# setup the trainer and instantiate the train process
# initialize the Hugging Face trainer with model, datasets, tokenizer, and evaluation metrics
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=bert_train_dataset,
    eval_dataset=bert_val_dataset,
    tokenizer=bert_tokenizer,
    compute_metrics=compute_metrics
)

# initialize the training process
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.3206,1.138213,0.596437,0.512707,0.542236,0.596437
2,0.971,0.942054,0.687064,0.667162,0.68224,0.687064
3,0.6431,0.892908,0.706429,0.701238,0.700917,0.706429
4,0.4692,0.890552,0.718048,0.722392,0.732509,0.718048
5,0.3322,0.910906,0.72347,0.727008,0.732072,0.72347


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=945, training_loss=0.7596374249332165, metrics={'train_runtime': 128.4466, 'train_samples_per_second': 117.208, 'train_steps_per_second': 7.357, 'total_flos': 990328691424000.0, 'train_loss': 0.7596374249332165, 'epoch': 5.0})

In [55]:
# Save the fine-tuned model and tokenizer to the specified directory
# This allows loading the trained model/tokenizer later for inference or further training
bert_model.save_pretrained('bert_model/')
bert_tokenizer.save_pretrained('bert_model/')

('bert_model/tokenizer_config.json',
 'bert_model/special_tokens_map.json',
 'bert_model/vocab.txt',
 'bert_model/added_tokens.json',
 'bert_model/tokenizer.json')

In [56]:
# evaluate on the test dataset
from sklearn.metrics import classification_report
import numpy as np
import torch

predictions = trainer.evaluate(bert_test_dataset)
print(predictions)

{'eval_loss': 0.9302851557731628, 'eval_accuracy': 0.7131236442516269, 'eval_f1': 0.7185259273475617, 'eval_precision': 0.7266192824938815, 'eval_recall': 0.7131236442516269, 'eval_runtime': 3.3717, 'eval_samples_per_second': 546.909, 'eval_steps_per_second': 34.404, 'epoch': 5.0}


## Load Saved Models and Predict on Sample Tweets

In [57]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Define the paths to the saved models and tokenizers
xlm_roberta_model_path = 'model/'
bert_model_path = 'bert_model/'

# Load the XLM-RoBERTa model and tokenizer
loaded_xlm_roberta_tokenizer = AutoTokenizer.from_pretrained(xlm_roberta_model_path)
loaded_xlm_roberta_model = AutoModelForSequenceClassification.from_pretrained(xlm_roberta_model_path)

# Load the mBERT model and tokenizer
loaded_bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_path)
loaded_bert_model = AutoModelForSequenceClassification.from_pretrained(bert_model_path)

print("Models and tokenizers loaded successfully.")

Models and tokenizers loaded successfully.


In [58]:
# Define sample tweets
sample_tweets = [
    "This is a great service!",
    "I have a problem with my data bundle.",
    "Safaricom Your network is very slow today.",
    "Thank you for your help, your service has been amazing Safaricom."
]

# Define the mapping from encoded labels back to original labels
label_map = {
    0: "Customer care complaint",
    1: "Data protection and privacy concern",
    2: "Hate Speech",
    3: "Internet or airtime bundle complaint",
    4: "MPESA complaint",
    5: "Network reliability problem",
    6: "Neutral"
}

# Function to get predictions for a list of tweets
def get_predictions(tweets, model, tokenizer, label_map):
    # Tokenize the sample tweets
    inputs = tokenizer(tweets, padding=True, truncation=True, return_tensors="pt")

    # Move inputs to the same device as the model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)

    # Get predicted class indices
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    # Convert predicted indices back to labels
    predicted_labels = [label_map[prediction.item()] for prediction in predictions]

    return predicted_labels

# Get predictions using the loaded XLM-RoBERTa model
xlm_roberta_predictions = get_predictions(sample_tweets, loaded_xlm_roberta_model, loaded_xlm_roberta_tokenizer, label_map)
print("XLM-RoBERta Predictions:")
for tweet, prediction in zip(sample_tweets, xlm_roberta_predictions):
    print(f"Tweet: '{tweet}' -> Prediction: {prediction}")

print("\n---\n")

# Get predictions using the loaded mBERT model
bert_predictions = get_predictions(sample_tweets, loaded_bert_model, loaded_bert_tokenizer, label_map)
print("mBERT Predictions:")
for tweet, prediction in zip(sample_tweets, bert_predictions):
    print(f"Tweet: '{tweet}' -> Prediction: {prediction}")

  return forward_call(*args, **kwargs)


XLM-RoBERta Predictions:
Tweet: 'This is a great service!' -> Prediction: Neutral
Tweet: 'I have a problem with my data bundle.' -> Prediction: Internet or airtime bundle complaint
Tweet: 'Safaricom Your network is very slow today.' -> Prediction: Network reliability problem
Tweet: 'Thank you for your help, your service has been amazing Safaricom.' -> Prediction: Neutral

---

mBERT Predictions:
Tweet: 'This is a great service!' -> Prediction: Neutral
Tweet: 'I have a problem with my data bundle.' -> Prediction: Internet or airtime bundle complaint
Tweet: 'Safaricom Your network is very slow today.' -> Prediction: Network reliability problem
Tweet: 'Thank you for your help, your service has been amazing Safaricom.' -> Prediction: Neutral


In [59]:
# zip the two models for download
!zip -r /content/model.zip /content/model/
!zip -r /content/bert_model.zip /content/bert_model/

  adding: content/model/ (stored 0%)
  adding: content/model/model.safetensors (deflated 21%)
  adding: content/model/sentencepiece.bpe.model (deflated 49%)
  adding: content/model/special_tokens_map.json (deflated 52%)
  adding: content/model/tokenizer.json (deflated 76%)
  adding: content/model/config.json (deflated 55%)
  adding: content/model/tokenizer_config.json (deflated 76%)
  adding: content/bert_model/ (stored 0%)
  adding: content/bert_model/model.safetensors (deflated 7%)
  adding: content/bert_model/special_tokens_map.json (deflated 42%)
  adding: content/bert_model/vocab.txt (deflated 45%)
  adding: content/bert_model/tokenizer.json (deflated 67%)
  adding: content/bert_model/config.json (deflated 58%)
  adding: content/bert_model/tokenizer_config.json (deflated 75%)
