# Safaricom Tweets Classification for complaints & Hate Speech Detection

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# nltk libraries
import nltk
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

## Load the dataset

In [2]:
# load dataset
safaricom_df = pd.read_csv('data/safaricom_data.csv')
safaricom_df.head()

Unnamed: 0,Tweet ID,URL,Content,Likes,Retweets,Replies,Quotes,Views,Date,Labels
0,1.95e+18,https://x.com/MawiaDorothy/status/194955836816...,How comes I have overdue debts.. na sijakopa.....,1,0,0,0,21,"July 27, 2025 at 07:51 PM",Customer care complaint
1,1.95e+18,https://x.com/KruiGeofrey/status/1949310365839...,@Monty_Hasashi @Safaricom 😂😂,0,0,0,0,22,"July 27, 2025 at 03:26 AM",Neutral
2,1.95e+18,https://x.com/martozgicha/status/1949022872242...,"@safaricom weka data ,wacheni jokes...Thank yo...",0,0,0,0,6,"July 26, 2025 at 08:23 AM",Internet or airtime bundle complaint
3,1.95e+18,https://x.com/liyansmutembei/status/1948476756...,@SafaricomPLC Hello @SafaricomPLC @safaricom...,0,0,0,0,47,"July 24, 2025 at 08:13 PM",Customer care complaint
4,1.95e+18,https://x.com/SsirNixoNdugire/status/194833516...,@PeterNdegwa_ @SafaricomPLC @Safaricom_Care @S...,0,0,0,0,5,"July 24, 2025 at 10:51 AM",Customer care complaint


In [3]:
# check the dataset info and shape
print(safaricom_df.info())
print("\nDataset shape:\n", safaricom_df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2574 entries, 0 to 2573
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Tweet ID  2574 non-null   float64
 1   URL       2574 non-null   object 
 2   Content   2574 non-null   object 
 3   Likes     2574 non-null   int64  
 4   Retweets  2574 non-null   int64  
 5   Replies   2574 non-null   int64  
 6   Quotes    2574 non-null   int64  
 7   Views     2574 non-null   int64  
 8   Date      2574 non-null   object 
 9   Labels    2573 non-null   object 
dtypes: float64(1), int64(5), object(4)
memory usage: 201.2+ KB
None

Dataset shape:
 (2574, 10)


In [4]:
# check for duplicates
safaricom_df.duplicated().sum()

np.int64(0)

In [31]:
# check the distribution of the target variable
print("Distribution of the target variable:\n", safaricom_df['Labels'].value_counts())

fig = px.bar(
    safaricom_df['Labels'].value_counts().reset_index(name='count'),
    x='Labels',
    y='count',
    labels={'Labels': 'Label', 'count': 'Count'},
    title='Distribution of Target Variable'
)

# Get unique labels to assign a color to each
unique_labels = safaricom_df['Labels'].value_counts().reset_index(name='count')['Labels']
color_discrete_sequence = px.colors.qualitative.Plotly  # or use another palette like 'Set2', 'Pastel', etc.

# If there are more labels than colors in the palette, repeat the palette
if len(unique_labels) > len(color_discrete_sequence):
    color_discrete_sequence = color_discrete_sequence * (len(unique_labels) // len(color_discrete_sequence) + 1)

fig.update_traces(marker_color=color_discrete_sequence[:len(unique_labels)])
fig.show()

Distribution of the target variable:
 Labels
Neutral                                 1032
Customer care complaint                  397
Internet or airtime bundle complaint     299
Hate Speech                              297
MPESA complaint                          189
Network reliability problem              184
Data protection and privacy concern      175
Name: count, dtype: int64


## Feature Engineering

In [6]:
# function for generating columns with number of characters, words and sentences
def generate_columns(df):
    safaricom_df['chars'] = safaricom_df['Content'].apply(len)
    safaricom_df['words'] = safaricom_df['Content'].apply(lambda x: nltk.word_tokenize(x)).apply(len)
    safaricom_df['sentences'] = safaricom_df['Content'].apply(lambda x: nltk.sent_tokenize(x)).apply(len)
    return df
safaricom_df = generate_columns(safaricom_df)

# display safaricom_df with new columns
safaricom_df.head(10)

Unnamed: 0,Tweet ID,URL,Content,Likes,Retweets,Replies,Quotes,Views,Date,Labels,chars,words,sentences
0,1.95e+18,https://x.com/MawiaDorothy/status/194955836816...,How comes I have overdue debts.. na sijakopa.....,1,0,0,0,21,"July 27, 2025 at 07:51 PM",Customer care complaint,99,19,1
1,1.95e+18,https://x.com/KruiGeofrey/status/1949310365839...,@Monty_Hasashi @Safaricom 😂😂,0,0,0,0,22,"July 27, 2025 at 03:26 AM",Neutral,28,5,1
2,1.95e+18,https://x.com/martozgicha/status/1949022872242...,"@safaricom weka data ,wacheni jokes...Thank yo...",0,0,0,0,6,"July 26, 2025 at 08:23 AM",Internet or airtime bundle complaint,181,38,2
3,1.95e+18,https://x.com/liyansmutembei/status/1948476756...,@SafaricomPLC Hello @SafaricomPLC @safaricom...,0,0,0,0,47,"July 24, 2025 at 08:13 PM",Customer care complaint,280,53,4
4,1.95e+18,https://x.com/SsirNixoNdugire/status/194833516...,@PeterNdegwa_ @SafaricomPLC @Safaricom_Care @S...,0,0,0,0,5,"July 24, 2025 at 10:51 AM",Customer care complaint,255,48,2
5,1.95e+18,https://x.com/sokodirectory/status/19483095914...,"Update: @Safaricom PLC (NSE: SCOM), has annou...",0,0,0,0,80,"July 24, 2025 at 09:09 AM",Neutral,262,50,2
6,1.95e+18,https://x.com/obwedede/status/1948301344843919753,"Customer Growth: 50M Connected, The @Safaricom...",0,2,0,0,19,"July 24, 2025 at 08:36 AM",Neutral,196,39,4
7,1.95e+18,https://x.com/obwedede/status/1948295655346012530,"Through the @Safaricom and M-PESA Foundations,...",0,2,0,0,29,"July 24, 2025 at 08:14 AM",Neutral,226,39,3
8,1.95e+18,https://x.com/obwedede/status/1948295244396548199,"In the last financial year, @Safaricom contrib...",0,0,0,0,6,"July 24, 2025 at 08:12 AM",Neutral,224,44,4
9,1.95e+18,https://x.com/obwedede/status/1948294724365795439,"Beyond Kenya, @Safaricom is also winning in Et...",0,1,1,0,17,"July 24, 2025 at 08:10 AM",Neutral,243,43,3


In [7]:
# drop the Tweet ID and URL columns
safaricom_df = safaricom_df.drop(columns=['Tweet ID', 'URL'], axis=1)
safaricom_df.head()

Unnamed: 0,Content,Likes,Retweets,Replies,Quotes,Views,Date,Labels,chars,words,sentences
0,How comes I have overdue debts.. na sijakopa.....,1,0,0,0,21,"July 27, 2025 at 07:51 PM",Customer care complaint,99,19,1
1,@Monty_Hasashi @Safaricom 😂😂,0,0,0,0,22,"July 27, 2025 at 03:26 AM",Neutral,28,5,1
2,"@safaricom weka data ,wacheni jokes...Thank yo...",0,0,0,0,6,"July 26, 2025 at 08:23 AM",Internet or airtime bundle complaint,181,38,2
3,@SafaricomPLC Hello @SafaricomPLC @safaricom...,0,0,0,0,47,"July 24, 2025 at 08:13 PM",Customer care complaint,280,53,4
4,@PeterNdegwa_ @SafaricomPLC @Safaricom_Care @S...,0,0,0,0,5,"July 24, 2025 at 10:51 AM",Customer care complaint,255,48,2


## Data Cleaning

In [8]:
# define a data cleaning function
def clean_data(text):
    # remove URLs
    text = re.sub(r'http\S+|www.\S+', '', text)

    # remove only the @ and # signs, keep the words
    text = re.sub(r'[@#]', '', text)

    # remove special characters (but keep numbers)
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)

    # convert to lowercase
    text = text.lower()

    # remove additional whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# apply the function to the Content column
safaricom_df['Content'] = safaricom_df['Content'].apply(clean_data)

# display the first 10 rows of the cleaned dataset
safaricom_df.head(10)

Unnamed: 0,Content,Likes,Retweets,Replies,Quotes,Views,Date,Labels,chars,words,sentences
0,how comes i have overdue debts na sijakopawhat...,1,0,0,0,21,"July 27, 2025 at 07:51 PM",Customer care complaint,99,19,1
1,montyhasashi safaricom,0,0,0,0,22,"July 27, 2025 at 03:26 AM",Neutral,28,5,1
2,safaricom weka data wacheni jokesthank you for...,0,0,0,0,6,"July 26, 2025 at 08:23 AM",Internet or airtime bundle complaint,181,38,2
3,safaricomplc hello safaricomplc safaricom can ...,0,0,0,0,47,"July 24, 2025 at 08:13 PM",Customer care complaint,280,53,4
4,peterndegwa safaricomplc safaricomcare safbusi...,0,0,0,0,5,"July 24, 2025 at 10:51 AM",Customer care complaint,255,48,2
5,update safaricom plc nse scom has announced a ...,0,0,0,0,80,"July 24, 2025 at 09:09 AM",Neutral,262,50,2
6,customer growth 50m connected the safaricom st...,0,2,0,0,19,"July 24, 2025 at 08:36 AM",Neutral,196,39,4
7,through the safaricom and mpesa foundations 8m...,0,2,0,0,29,"July 24, 2025 at 08:14 AM",Neutral,226,39,3
8,in the last financial year safaricom contribut...,0,0,0,0,6,"July 24, 2025 at 08:12 AM",Neutral,224,44,4
9,beyond kenya safaricom is also winning in ethi...,0,1,1,0,17,"July 24, 2025 at 08:10 AM",Neutral,243,43,3


## Data Preprocessing

In [9]:
# tokenize the tweet content
safaricom_df['Tokenized Text'] = safaricom_df['Content'].apply(lambda document: str.lower(document)).apply(lambda doc: nltk.word_tokenize(doc))
safaricom_df.head()

Unnamed: 0,Content,Likes,Retweets,Replies,Quotes,Views,Date,Labels,chars,words,sentences,Tokenized Text
0,how comes i have overdue debts na sijakopawhat...,1,0,0,0,21,"July 27, 2025 at 07:51 PM",Customer care complaint,99,19,1,"[how, comes, i, have, overdue, debts, na, sija..."
1,montyhasashi safaricom,0,0,0,0,22,"July 27, 2025 at 03:26 AM",Neutral,28,5,1,"[montyhasashi, safaricom]"
2,safaricom weka data wacheni jokesthank you for...,0,0,0,0,6,"July 26, 2025 at 08:23 AM",Internet or airtime bundle complaint,181,38,2,"[safaricom, weka, data, wacheni, jokesthank, y..."
3,safaricomplc hello safaricomplc safaricom can ...,0,0,0,0,47,"July 24, 2025 at 08:13 PM",Customer care complaint,280,53,4,"[safaricomplc, hello, safaricomplc, safaricom,..."
4,peterndegwa safaricomplc safaricomcare safbusi...,0,0,0,0,5,"July 24, 2025 at 10:51 AM",Customer care complaint,255,48,2,"[peterndegwa, safaricomplc, safaricomcare, saf..."


In [10]:
# lemmatize the tweet content
safaricom_df['Lematized Text'] = safaricom_df['Tokenized Text'].apply(lambda word_token: [nltk.WordNetLemmatizer().lemmatize(token) for token in word_token])
safaricom_df.head()

Unnamed: 0,Content,Likes,Retweets,Replies,Quotes,Views,Date,Labels,chars,words,sentences,Tokenized Text,Lematized Text
0,how comes i have overdue debts na sijakopawhat...,1,0,0,0,21,"July 27, 2025 at 07:51 PM",Customer care complaint,99,19,1,"[how, comes, i, have, overdue, debts, na, sija...","[how, come, i, have, overdue, debt, na, sijako..."
1,montyhasashi safaricom,0,0,0,0,22,"July 27, 2025 at 03:26 AM",Neutral,28,5,1,"[montyhasashi, safaricom]","[montyhasashi, safaricom]"
2,safaricom weka data wacheni jokesthank you for...,0,0,0,0,6,"July 26, 2025 at 08:23 AM",Internet or airtime bundle complaint,181,38,2,"[safaricom, weka, data, wacheni, jokesthank, y...","[safaricom, weka, data, wacheni, jokesthank, y..."
3,safaricomplc hello safaricomplc safaricom can ...,0,0,0,0,47,"July 24, 2025 at 08:13 PM",Customer care complaint,280,53,4,"[safaricomplc, hello, safaricomplc, safaricom,...","[safaricomplc, hello, safaricomplc, safaricom,..."
4,peterndegwa safaricomplc safaricomcare safbusi...,0,0,0,0,5,"July 24, 2025 at 10:51 AM",Customer care complaint,255,48,2,"[peterndegwa, safaricomplc, safaricomcare, saf...","[peterndegwa, safaricomplc, safaricomcare, saf..."


In [11]:
# save the cleaned and preprocessed dataframe
safaricom_df.to_csv('data/cleaned_safaricom_data.csv', index=False)

In [20]:
# join the lemmatized tokens back into strings for vectorization
safaricom_df['processed_text'] = safaricom_df['Lematized Text'].apply(lambda x: ' '.join(x))

# fill any NaN values in the 'Labels' column
safaricom_df['Labels'].fillna('unknown', inplace=True)

# Filter out unknown labels before training
safaricom_df = safaricom_df[safaricom_df['Labels'] != 'unknown']

# split the data into train and test sets
X = safaricom_df.drop(['Labels', 'Date'], axis=1)
y = safaricom_df['Labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [21]:
# Feature Extraction using CountVectorizer (Bag of Words)

# initialize the CountVectorizer
vectorizer = CountVectorizer(max_features=5000, stop_words='english')

# fit on training data and transform both training and validation data
X_train_vectorized = vectorizer.fit_transform(X_train['processed_text'])
X_test_vectorized = vectorizer.transform(X_test['processed_text'])

print(f"Training features shape: {X_train_vectorized.shape}")
print(f"Testing features shape: {X_test_vectorized.shape}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

Training features shape: (2058, 5000)
Testing features shape: (515, 5000)
Vocabulary size: 5000


## Modeling

### 1. Logistic Regression

In [24]:
# instantiate the Logistic Regression model, and fit the data
log_model = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
log_model.fit(X_train_vectorized, y_train)

# make predictions on the test set
y_pred = log_model.predict(X_test_vectorized)

# evaluate the model
print(f"Logistic Regression F1-Score: {f1_score(y_test, y_pred, average='weighted', zero_division=0)}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Logistic Regression F1-Score: 0.620129656665342

Classification Report:
                                       precision    recall  f1-score   support

             Customer care complaint       0.42      0.47      0.45        72
 Data protection and privacy concern       0.55      0.31      0.39        39
                         Hate Speech       0.43      0.53      0.47        51
Internet or airtime bundle complaint       0.67      0.69      0.68        55
                     MPESA complaint       0.67      0.79      0.72        43
         Network reliability problem       0.68      0.55      0.61        42
                             Neutral       0.73      0.71      0.72       213

                            accuracy                           0.62       515
                           macro avg       0.59      0.58      0.58       515
                        weighted avg       0.63      0.62      0.62       515



### 2. Naive Bayes Model

In [25]:
# instantiate the Multinomial Naive Bayes model, and fit the data
nb_model = MultinomialNB()
nb_model.fit(X_train_vectorized, y_train)

# make predictions on the test set
y_pred = nb_model.predict(X_test_vectorized)

# evaluate the model
print(f"Naive Bayes F1-Score: {f1_score(y_test, y_pred, average='weighted', zero_division=0)}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Naive Bayes F1-Score: 0.5663385402113926

Classification Report:
                                       precision    recall  f1-score   support

             Customer care complaint       0.46      0.57      0.51        72
 Data protection and privacy concern       0.62      0.13      0.21        39
                         Hate Speech       0.53      0.37      0.44        51
Internet or airtime bundle complaint       0.61      0.65      0.63        55
                     MPESA complaint       0.70      0.49      0.58        43
         Network reliability problem       0.80      0.19      0.31        42
                             Neutral       0.63      0.83      0.71       213

                            accuracy                           0.60       515
                           macro avg       0.62      0.46      0.48       515
                        weighted avg       0.61      0.60      0.57       515



### 3. Random Forest Model

In [26]:
from sklearn.ensemble import RandomForestClassifier

# instantiate the Random Forest model, and fit the data
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_model.fit(X_train_vectorized, y_train)

# make predictions on the test set
y_pred = rf_model.predict(X_test_vectorized)

# evaluate the model
print(f"Random Forest F1-Score: {f1_score(y_test, y_pred, average='weighted', zero_division=0)}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Random Forest F1-Score: 0.5904624558969022

Classification Report:
                                       precision    recall  f1-score   support

             Customer care complaint       0.56      0.33      0.42        72
 Data protection and privacy concern       0.86      0.15      0.26        39
                         Hate Speech       0.63      0.37      0.47        51
Internet or airtime bundle complaint       0.60      0.78      0.68        55
                     MPESA complaint       0.62      0.74      0.67        43
         Network reliability problem       0.67      0.38      0.48        42
                             Neutral       0.63      0.85      0.72       213

                            accuracy                           0.62       515
                           macro avg       0.65      0.52      0.53       515
                        weighted avg       0.63      0.62      0.59       515



In [29]:
# test on unseen data
new_tweet = "There have been so many abductions in the country the last couple of months, and I bet on my life that Safaricom is taking part in it by sharing our information with those involved"

cleaned_new_tweet = clean_data(new_tweet)
new_tweet_tfidf = vectorizer.transform([cleaned_new_tweet])
predicted_label = log_model.predict(new_tweet_tfidf) # use the best performing model for inference

print(f"The predicted label for the new tweet is: {predicted_label[0]}")

The predicted label for the new tweet is: Data protection and privacy concern
