In [1]:
# mount google drive to access files and save outputs persistently within Colab
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# check for GPU availability
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [2]:
# load the dataset
import pandas as pd
safaricom_df = pd.read_csv('labeled_data_openai.csv')
safaricom_df.head()

Unnamed: 0,Tweet ID,URL,Content,Likes,Retweets,Replies,Quotes,Views,Date,Labels
0,1.95e+18,https://x.com/MawiaDorothy/status/194955836816...,How comes I have overdue debts.. na sijakopa.....,1,0,0,0,21,"July 27, 2025 at 07:51 PM",Customer care complaint
1,1.95e+18,https://x.com/KruiGeofrey/status/1949310365839...,@Monty_Hasashi @Safaricom 😂😂,0,0,0,0,22,"July 27, 2025 at 03:26 AM",Neutral
2,1.95e+18,https://x.com/martozgicha/status/1949022872242...,"@safaricom weka data ,wacheni jokes...Thank yo...",0,0,0,0,6,"July 26, 2025 at 08:23 AM",Neutral
3,1.95e+18,https://x.com/liyansmutembei/status/1948476756...,@SafaricomPLC Hello @SafaricomPLC @safaricom...,0,0,0,0,47,"July 24, 2025 at 08:13 PM",Neutral
4,1.95e+18,https://x.com/SsirNixoNdugire/status/194833516...,@PeterNdegwa_ @SafaricomPLC @Safaricom_Care @S...,0,0,0,0,5,"July 24, 2025 at 10:51 AM",Neutral


In [3]:
# show the basic info
safaricom_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6146 entries, 0 to 6145
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Tweet ID  6146 non-null   float64
 1   URL       6146 non-null   object 
 2   Content   6146 non-null   object 
 3   Likes     6146 non-null   int64  
 4   Retweets  6146 non-null   int64  
 5   Replies   6146 non-null   int64  
 6   Quotes    6146 non-null   int64  
 7   Views     6146 non-null   int64  
 8   Date      6146 non-null   object 
 9   Labels    6146 non-null   object 
dtypes: float64(1), int64(5), object(4)
memory usage: 480.3+ KB


In [4]:
# check for missing values
safaricom_df.isnull().sum()

Unnamed: 0,0
Tweet ID,0
URL,0
Content,0
Likes,0
Retweets,0
Replies,0
Quotes,0
Views,0
Date,0
Labels,0


In [5]:
# handle the missing value in Labels column
# safaricom_df.dropna(subset=['Labels'], inplace=True, axis=0)
# safaricom_df.isnull().sum()

In [6]:
# check for duplicate rows
safaricom_df.duplicated().sum()

np.int64(0)

In [9]:
# drop duplicate rows
# safaricom_df.drop_duplicates(inplace=True)
# safaricom_df.duplicated().sum()

In [7]:
# check the label distribution
print(safaricom_df['Labels'].value_counts())

Labels
Neutral                                 3261
Customer care complaint                  669
Internet or airtime bundle complaint     531
Hate Speech                              486
Network reliability problem              444
MPESA complaint                          439
Data protection and privacy concern      316
Name: count, dtype: int64


## Data Cleaning

In [8]:
# import the re library
import re

# define a dictionary of contractions
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

def expand_contractions(text, contractions_dict):
    """Expand contractions in text"""
    expanded_text = text
    for contraction, expansion in contractions_dict.items():
        expanded_text = re.sub(r'\b' + re.escape(contraction) + r'\b', expansion, expanded_text, flags=re.IGNORECASE)
    return expanded_text

def remove_repeated_characters(text):
  """Remove repeated characters (e.g., 'goooood' -> 'good')"""
  # Replace 3+ repeated characters with 2
  return re.sub(r'(.)\1{2,}', r'\1\1', text)


# define a cleaning function
def clean_text(text):
  """Basic text cleaning and contraction expansion while preserving important information"""

  # Expand contractions
  text = expand_contractions(text, contractions)

  # Remove URLs but keep @mentions for context
  text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

  # Remove only the @ and # signs, keep the words
  text = re.sub(r'[@#]', '', text)

  # Remove special characters (but keep numbers)
  text = re.sub(r'[^A-Za-z0-9\s]', '', text)

  # Remove repeated characters
  text = remove_repeated_characters(text)

  # Remove excessive whitespace
  text = re.sub(r'\s+', ' ', text)
  return text.strip()

# apply to the dataframe
safaricom_df['cleaned_sentence'] = safaricom_df['Content'].apply(clean_text)
safaricom_df.head()

Unnamed: 0,Tweet ID,URL,Content,Likes,Retweets,Replies,Quotes,Views,Date,Labels,cleaned_sentence
0,1.95e+18,https://x.com/MawiaDorothy/status/194955836816...,How comes I have overdue debts.. na sijakopa.....,1,0,0,0,21,"July 27, 2025 at 07:51 PM",Customer care complaint,How comes I have overdue debts na sijakopawhat...
1,1.95e+18,https://x.com/KruiGeofrey/status/1949310365839...,@Monty_Hasashi @Safaricom 😂😂,0,0,0,0,22,"July 27, 2025 at 03:26 AM",Neutral,MontyHasashi Safaricom
2,1.95e+18,https://x.com/martozgicha/status/1949022872242...,"@safaricom weka data ,wacheni jokes...Thank yo...",0,0,0,0,6,"July 26, 2025 at 08:23 AM",Neutral,safaricom weka data wacheni jokesThank you for...
3,1.95e+18,https://x.com/liyansmutembei/status/1948476756...,@SafaricomPLC Hello @SafaricomPLC @safaricom...,0,0,0,0,47,"July 24, 2025 at 08:13 PM",Neutral,SafaricomPLC Hello SafaricomPLC safaricom can ...
4,1.95e+18,https://x.com/SsirNixoNdugire/status/194833516...,@PeterNdegwa_ @SafaricomPLC @Safaricom_Care @S...,0,0,0,0,5,"July 24, 2025 at 10:51 AM",Neutral,PeterNdegwa SafaricomPLC SafaricomCare SafBusi...


In [9]:
# encode the target column
from sklearn.preprocessing import LabelEncoder

# initialize the label encoder
le = LabelEncoder()

# fit and transform the 'Labels' column to numeric labels
safaricom_df['encoded_labels'] = le.fit_transform(safaricom_df['Labels'])

# check the mapping
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label mapping:", label_mapping)

Label mapping: {'Customer care complaint': np.int64(0), 'Data protection and privacy concern': np.int64(1), 'Hate Speech': np.int64(2), 'Internet or airtime bundle complaint': np.int64(3), 'MPESA complaint': np.int64(4), 'Network reliability problem': np.int64(5), 'Neutral': np.int64(6)}


In [10]:
# omit the date column for data splitting
safaricom_df.drop(columns=['Date', 'Tweet ID', 'URL'], axis=1, inplace=True)
safaricom_df.head()

Unnamed: 0,Content,Likes,Retweets,Replies,Quotes,Views,Labels,cleaned_sentence,encoded_labels
0,How comes I have overdue debts.. na sijakopa.....,1,0,0,0,21,Customer care complaint,How comes I have overdue debts na sijakopawhat...,0
1,@Monty_Hasashi @Safaricom 😂😂,0,0,0,0,22,Neutral,MontyHasashi Safaricom,6
2,"@safaricom weka data ,wacheni jokes...Thank yo...",0,0,0,0,6,Neutral,safaricom weka data wacheni jokesThank you for...,6
3,@SafaricomPLC Hello @SafaricomPLC @safaricom...,0,0,0,0,47,Neutral,SafaricomPLC Hello SafaricomPLC safaricom can ...,6
4,@PeterNdegwa_ @SafaricomPLC @Safaricom_Care @S...,0,0,0,0,5,Neutral,PeterNdegwa SafaricomPLC SafaricomCare SafBusi...,6


In [11]:
# split the data into train and test sets
from sklearn.model_selection import train_test_split
train_val_df, test_df = train_test_split(
    safaricom_df,
    test_size=0.3,
    stratify=safaricom_df['Labels'], # maintain class proportions
    random_state=42
)

# split the train_val_df into train and validation sets
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.3,
    stratify=train_val_df['Labels'], # maintain class proportions
    random_state=42
)

# check the shape of the splits
# Check the shape of splits
print(f"Train size: {train_df.shape[0]}")
print(f"Validation size: {val_df.shape[0]}")
print(f"Test size: {test_df.shape[0]}")

# Check label distribution in each set
print("Train label distribution:")
print(train_df['Labels'].value_counts(normalize=True))
print("Validation label distribution:")
print(val_df['Labels'].value_counts(normalize=True))
print("Test label distribution:")
print(test_df['Labels'].value_counts(normalize=True))

Train size: 3011
Validation size: 1291
Test size: 1844
Train label distribution:
Labels
Neutral                                 0.530721
Customer care complaint                 0.108602
Internet or airtime bundle complaint    0.086350
Hate Speech                             0.079044
Network reliability problem             0.072401
MPESA complaint                         0.071405
Data protection and privacy concern     0.051478
Name: proportion, dtype: float64
Validation label distribution:
Labels
Neutral                                 0.530596
Customer care complaint                 0.109218
Internet or airtime bundle complaint    0.086754
Hate Speech                             0.079009
Network reliability problem             0.072037
MPESA complaint                         0.071263
Data protection and privacy concern     0.051123
Name: proportion, dtype: float64
Test label distribution:
Labels
Neutral                                 0.530369
Customer care complaint                 0

In [15]:
# install the transformers library
!pip install transformers



## Data Tokenization

In [13]:
from transformers import XLMRobertaTokenizer, AutoTokenizer

# initialize the XLM-RoBERTa tokenizer
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

In [14]:
# tokenize the texts
def tokenize_texts(texts, max_length=128):
  return tokenizer(
      texts,                    # list of texts
      padding='max_length',     # pad all to max_length
      truncation=True,          # truncate longer texts
      max_length=max_length,    # max token length
      return_tensors='pt'       # Pytorch tensors
)

# create the train, val and test tokens
train_tokens = tokenize_texts(train_df['cleaned_sentence'].tolist())
val_tokens = tokenize_texts(val_df['cleaned_sentence'].tolist())
test_tokens = tokenize_texts(test_df['cleaned_sentence'].tolist())

In [15]:
# convert the labels columns from each dataset split into Pytorch tensors
# this is required for compatibility with the Pytorch model during training and evaluation
import torch
train_labels = torch.tensor(train_df['encoded_labels'].values)
val_labels = torch.tensor(val_df['encoded_labels'].values)
test_labels = torch.tensor(test_df['encoded_labels'].values)

In [16]:
# create a custom Pytorch dataset
from torch.utils.data import Dataset

class TweetDataset(Dataset):
  """
  A custom Pytorch dataset for handling tokenized inputs and labels for tweet classification
  It allows easy batching and data loading during training and evaluation
  """
  def __init__(self, tokens, labels):
    """
    Args:
        tokens (list): Tokenized inputs like input_ids and attention mask (Pytorch tensors).
        labels (torch.Tensor): Corresponding labels tensor
    """
    self.tokens = tokens
    self.labels = labels

  def __getitem__(self, idx):
    """
    Retrieve a single sample by index

    Returns:
        dict: Toknized inputs and corresponding labels
    """
    item = {key: val[idx] for key, val in self.tokens.items()}
    item['labels'] = self.labels[idx]

  def __len__(self):
    # returns the total number of samples
    return len(self.labels)

# create the dataset objects for train, validation and test sets
train_dataset = TweetDataset(train_tokens, train_labels)
val_dataset = TweetDataset(val_tokens, val_labels)
test_dataset = TweetDataset(test_tokens, test_labels)

In [17]:
# save the processed training, validation and test dataframes as CSV files
train_df.to_csv('/content/drive/MyDrive/Safaricom-processed-dataset/train_processed.csv', index=False)
val_df.to_csv('/content/drive/MyDrive/Safaricom-processed-dataset/val_processed.csv', index=False)
test_df.to_csv('/content/drive/MyDrive/Safaricom-processed-dataset/test_processed.csv', index=False)

print("Train, validation, and test sets saved to the Google Drive folder.")

Train, validation, and test sets saved to the Google Drive folder.
