In [1]:
#importing the necessary libraries
import pandas as pd
import numpy as np
import os
import sys
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)



True

In [2]:
#checking files present
!ls

Cleaning.ipynb
README.md
notebook.ipynb
saf_tweets_cleaned.csv


In [4]:
#Loading the Safaricom tweets dataset
Saf_tweets=pd.read_csv('Safaricom tweets.csv')
#Displaying the first few rows of the dataset
Saf_tweets.head()


Unnamed: 0,Tweet ID,URL,Content,Likes,Retweets,Replies,Quotes,Views,Date,Labels
0,1.95e+18,https://x.com/MawiaDorothy/status/194955836816...,How comes I have overdue debts.. na sijakopa.....,1,0,0,0,21,"July 27, 2025 at 07:51 PM",Customer care complaint
1,1.95e+18,https://x.com/KruiGeofrey/status/1949310365839...,@Monty_Hasashi @Safaricom 😂😂,0,0,0,0,22,"July 27, 2025 at 03:26 AM",Neutral
2,1.95e+18,https://x.com/martozgicha/status/1949022872242...,"@safaricom weka data ,wacheni jokes...Thank yo...",0,0,0,0,6,"July 26, 2025 at 08:23 AM",Internet or airtime bundle complaint
3,1.95e+18,https://x.com/liyansmutembei/status/1948476756...,@SafaricomPLC Hello @SafaricomPLC @safaricom...,0,0,0,0,47,"July 24, 2025 at 08:13 PM",Customer care complaint
4,1.95e+18,https://x.com/SsirNixoNdugire/status/194833516...,@PeterNdegwa_ @SafaricomPLC @Safaricom_Care @S...,0,0,0,0,5,"July 24, 2025 at 10:51 AM",Customer care complaint


In [5]:
#checking info
Saf_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2574 entries, 0 to 2573
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Tweet ID  2574 non-null   float64
 1   URL       2574 non-null   object 
 2   Content   2574 non-null   object 
 3   Likes     2574 non-null   int64  
 4   Retweets  2574 non-null   int64  
 5   Replies   2574 non-null   int64  
 6   Quotes    2574 non-null   int64  
 7   Views     2574 non-null   int64  
 8   Date      2574 non-null   object 
 9   Labels    2573 non-null   object 
dtypes: float64(1), int64(5), object(4)
memory usage: 201.2+ KB


In [6]:
#Check unique counts in labels
Saf_tweets['Labels'].value_counts()

Neutral                                 1032
Customer care complaint                  397
Internet or airtime bundle complaint     299
Hate Speech                              297
MPESA complaint                          189
Network reliability problem              184
Data protection and privacy concern      175
Name: Labels, dtype: int64

In [7]:
#Check for duplicated values
Saf_tweets.duplicated().sum()

0

### Data cleaning and preparation

##### Contradiction dictionary

In [8]:
# Global tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Contractions dictionary
contractions = {
    "won't": "will not", "can't": "cannot", "n't": " not",
    "'re": " are", "'ve": " have", "'ll": " will",
    "'d": " would", "'m": " am", "it's": "it is",
    "that's": "that is", "what's": "what is",
    "there's": "there is", "here's": "here is"
}

In [9]:
def expand_contractions_text(text, contractions=contractions):
    """Expand contractions in the text."""
    for contraction, expansion in contractions.items():
        text = text.replace(contraction, expansion)
    return text


#### Removing repeated characters

In [10]:
def remove_repeated_characters(text):
    """Reduce repeated characters (e.g., soooo → soo)."""
    return re.sub(r'(.)\1{2,}', r'\1\1', text)


##### Basic cleaning function

In [11]:
def basic_cleaning(text, 
                   remove_urls=True,
                   remove_mentions=True,
                   remove_hashtags=True):
    """Apply basic regex cleaning to text."""
    if pd.isna(text):
        return ''
    
    text = str(text)

    if remove_urls:
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)

    if remove_mentions:
        text = re.sub(r'@\w+', '', text)

    if remove_hashtags:
        text = re.sub(r'#', '', text)

    text = re.sub(r'[^a-zA-Z\s!?]', '', text)  # Remove special chars but keep ! ?
    text = re.sub(r'!+', '!', text)
    text = re.sub(r'\?+', '?', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text


In [12]:
def clean_text_pipeline(text,
                        expand_contractions=True,
                        remove_repeated_chars=True,
                        remove_urls=True,
                        remove_mentions=True,
                        remove_hashtags=True):
    """Complete cleaning pipeline (no tokenizing or lemmatizing)."""
    if pd.isna(text):
        return ''
    
    text = str(text)

    if expand_contractions:
        text = expand_contractions_text(text)
    
    if remove_repeated_chars:
        text = remove_repeated_characters(text)

    text = basic_cleaning(
        text,
        remove_urls=remove_urls,
        remove_mentions=remove_mentions,
        remove_hashtags=remove_hashtags
    )

    return text


## Testing if it has worked with some tweets

In [13]:
tweets = [
    "My @safaricom network is misbehaving",
    "@safaricom rudisheni hii na mnipee bundles .sasa sms nazifanyia nini https://t.co/CvaD1kd5wM",
    "@Shikanda_00 @safaricom",
    "@safaricom you are a scam https://t.co/80BRkJ5uB2"
]

for t in tweets:
    print("Cleaned:", clean_text_pipeline(t))


Cleaned: My network is misbehaving
Cleaned: rudisheni hii na mnipee bundles sasa sms nazifanyia nini
Cleaned: 
Cleaned: you are a scam


In [15]:
## testing it on the Safaricom tweets dataset
Saf_tweets['Cleaned_Text'] = Saf_tweets['Content'].apply(clean_text_pipeline)

In [16]:
Saf_tweets[['Content', 'Cleaned_Text']].head()

Unnamed: 0,Content,Cleaned_Text
0,How comes I have overdue debts.. na sijakopa.....,How comes I have overdue debts na sijakopawhat...
1,@Monty_Hasashi @Safaricom 😂😂,
2,"@safaricom weka data ,wacheni jokes...Thank yo...",weka data wacheni jokesThank you for being par...
3,@SafaricomPLC Hello @SafaricomPLC @safaricom...,Hello can you borrow from Airtel and allow man...
4,@PeterNdegwa_ @SafaricomPLC @Safaricom_Care @S...,Jambo Kindly consider introducing a Narration ...


In [17]:
#savinga the cleaned tweets to a new CSV file
Saf_tweets[['Content', 'Cleaned_Text']].to_csv('saf_tweets_cleaned.csv', index=False)


In [19]:
!ls

Cleaning.ipynb
README.md
Safaricom tweets.csv
notebook.ipynb
saf_tweets_cleaned.csv
