In [11]:
#importing the necessary libraries
import pandas as pd
import numpy as np
import os
import sys
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from typing import Union, List, Optional, Dict, Any
import logging


In [2]:
#checking files present
!ls

Cleaning.ipynb
Safaricom tweets.csv
online_hate-speech_and_complaints_detection
saf_tweets_cleaned.csv


In [3]:
#Loading the Safaricom tweets dataset
Saf_tweets=pd.read_csv('Safaricom tweets.csv')
#Displaying the first few rows of the dataset
Saf_tweets.head()


Unnamed: 0,Tweet ID,URL,Content,Likes,Retweets,Replies,Quotes,Views,Date,Labels
0,1.95e+18,https://x.com/MawiaDorothy/status/194955836816...,How comes I have overdue debts.. na sijakopa.....,1,0,0,0,21,"July 27, 2025 at 07:51 PM",Customer care complaint
1,1.95e+18,https://x.com/KruiGeofrey/status/1949310365839...,@Monty_Hasashi @Safaricom 😂😂,0,0,0,0,22,"July 27, 2025 at 03:26 AM",Neutral
2,1.95e+18,https://x.com/martozgicha/status/1949022872242...,"@safaricom weka data ,wacheni jokes...Thank yo...",0,0,0,0,6,"July 26, 2025 at 08:23 AM",Internet or airtime bundle complaint
3,1.95e+18,https://x.com/liyansmutembei/status/1948476756...,@SafaricomPLC Hello @SafaricomPLC @safaricom...,0,0,0,0,47,"July 24, 2025 at 08:13 PM",Customer care complaint
4,1.95e+18,https://x.com/SsirNixoNdugire/status/194833516...,@PeterNdegwa_ @SafaricomPLC @Safaricom_Care @S...,0,0,0,0,5,"July 24, 2025 at 10:51 AM",Customer care complaint


In [4]:
#checking info
Saf_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2574 entries, 0 to 2573
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Tweet ID  2574 non-null   float64
 1   URL       2574 non-null   object 
 2   Content   2574 non-null   object 
 3   Likes     2574 non-null   int64  
 4   Retweets  2574 non-null   int64  
 5   Replies   2574 non-null   int64  
 6   Quotes    2574 non-null   int64  
 7   Views     2574 non-null   int64  
 8   Date      2574 non-null   object 
 9   Labels    2573 non-null   object 
dtypes: float64(1), int64(5), object(4)
memory usage: 201.2+ KB


In [5]:
#Check unique counts in labels
Saf_tweets['Labels'].value_counts()

Neutral                                 1032
Customer care complaint                  397
Internet or airtime bundle complaint     299
Hate Speech                              297
MPESA complaint                          189
Network reliability problem              184
Data protection and privacy concern      175
Name: Labels, dtype: int64

In [6]:
#Check for duplicated values
Saf_tweets.duplicated().sum()

0

### Data cleaning and preparation

##### Contradiction dictionary

In [8]:
# Global tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Contractions dictionary
contractions = {
    "won't": "will not", "can't": "cannot", "n't": " not",
    "'re": " are", "'ve": " have", "'ll": " will",
    "'d": " would", "'m": " am", "it's": "it is",
    "that's": "that is", "what's": "what is",
    "there's": "there is", "here's": "here is"
}

In [9]:
def expand_contractions_text(text, contractions=contractions):
    """Expand contractions in the text."""
    for contraction, expansion in contractions.items():
        text = text.replace(contraction, expansion)
    return text


#### Removing repeated characters

In [10]:
def remove_repeated_characters(text):
    """Reduce repeated characters (e.g., soooo → soo)."""
    return re.sub(r'(.)\1{2,}', r'\1\1', text)


##### Basic cleaning function

In [11]:
def basic_cleaning(text, 
                   remove_urls=True,
                   remove_mentions=True,
                   remove_hashtags=True):
    """Apply basic regex cleaning to text."""
    if pd.isna(text):
        return ''
    
    text = str(text)

    if remove_urls:
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)

    if remove_mentions:
        text = re.sub(r'@\w+', '', text)

    if remove_hashtags:
        text = re.sub(r'#', '', text)

    text = re.sub(r'[^a-zA-Z\s!?]', '', text)  # Remove special chars but keep ! ?
    text = re.sub(r'!+', '!', text)
    text = re.sub(r'\?+', '?', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text


In [12]:
def clean_text_pipeline(text,
                        expand_contractions=True,
                        remove_repeated_chars=True,
                        remove_urls=True,
                        remove_mentions=True,
                        remove_hashtags=True):
    """Complete cleaning pipeline (no tokenizing or lemmatizing)."""
    if pd.isna(text):
        return ''
    
    text = str(text)

    if expand_contractions:
        text = expand_contractions_text(text)
    
    if remove_repeated_chars:
        text = remove_repeated_characters(text)

    text = basic_cleaning(
        text,
        remove_urls=remove_urls,
        remove_mentions=remove_mentions,
        remove_hashtags=remove_hashtags
    )

    return text


## Testing if it has worked with some tweets

In [13]:
tweets = [
    "My @safaricom network is misbehaving",
    "@safaricom rudisheni hii na mnipee bundles .sasa sms nazifanyia nini https://t.co/CvaD1kd5wM",
    "@Shikanda_00 @safaricom",
    "@safaricom you are a scam https://t.co/80BRkJ5uB2"
]

for t in tweets:
    print("Cleaned:", clean_text_pipeline(t))


Cleaned: My network is misbehaving
Cleaned: rudisheni hii na mnipee bundles sasa sms nazifanyia nini
Cleaned: 
Cleaned: you are a scam


In [14]:
## testing it on the Safaricom tweets dataset
Saf_tweets['Cleaned_Text'] = Saf_tweets['Content'].apply(clean_text_pipeline)

In [15]:
Saf_tweets[['Content', 'Cleaned_Text']].head()

Unnamed: 0,Content,Cleaned_Text
0,How comes I have overdue debts.. na sijakopa.....,How comes I have overdue debts na sijakopawhat...
1,@Monty_Hasashi @Safaricom 😂😂,
2,"@safaricom weka data ,wacheni jokes...Thank yo...",weka data wacheni jokesThank you for being par...
3,@SafaricomPLC Hello @SafaricomPLC @safaricom...,Hello can you borrow from Airtel and allow man...
4,@PeterNdegwa_ @SafaricomPLC @Safaricom_Care @S...,Jambo Kindly consider introducing a Narration ...


In [16]:
Saf_tweets[['Content', 'Cleaned_Text']].to_csv('saf_tweets_cleaned.csv', index=False)


In [17]:
!ls

Cleaning.ipynb
Safaricom tweets.csv
online_hate-speech_and_complaints_detection
saf_tweets_cleaned.csv


## Preprocessing

In [12]:
class EnhancedTextCleaner:
    """A preprocessing class for Safaricom tweets analysis.
    
    This class handles:
    - Data cleaning (removing URLs, mentions, hashtags, special characters)
    - Text preprocessing (tokenization, lemmatization, stop words removal)
    - Feature extraction using TF-IDF or Count Vectorization

    This class can be implemented directly in a scikit-learn pipeline"""
    
    def __init__(self, 
                 lemmatizer=None, 
                 stop_words=None, 
                 min_length=2,
                 max_length=50,
                 remove_numbers=True,
                 custom_patterns=None):
        self.lemmatizer = lemmatizer or WordNetLemmatizer()
        self.stop_words = stop_words or set(stopwords.words('english'))
        self.min_length = min_length
        self.max_length = max_length
        self.remove_numbers = remove_numbers
        self.custom_patterns = custom_patterns or []
        
    def clean_and_lemmatize(self, tokens: List[str]) -> List[str]:
        """Clean tokens with enhanced filtering and lemmatization."""
        if not tokens:
            return []
            
        # Filter tokens
        filtered = []
        for token in tokens:
            # Length check
            if not (self.min_length <= len(token) <= self.max_length):
                continue
            # Stop words check
            if token.lower() in self.stop_words:
                continue
            # Numbers check
            if self.remove_numbers and token.isdigit():
                continue
            # Custom patterns check
            if any(re.search(pattern, token) for pattern in self.custom_patterns):
                continue
                
            filtered.append(token)
        
        # Lemmatize
        return [self.lemmatizer.lemmatize(token.lower()) for token in filtered]

class EnhancedTweetPreprocessor(BaseEstimator, TransformerMixin):
    """
    Enhanced tweet preprocessor with advanced features:
    - Better text cleaning with configurable options
    - Support for multiple input formats
    - Robust error handling and logging
    - Feature extraction statistics
    - Memory-efficient processing
    """
    
    def __init__(self,
                 text_cleaner: Optional[EnhancedTextCleaner] = None,
                 lowercase: bool = True,
                 use_tfidf: bool = True,
                 vectorizer_params: Optional[Dict[str, Any]] = None,
                 preserve_case_words: Optional[List[str]] = None,
                 enable_logging: bool = False):
        
        # Initialize text cleaner
        if text_cleaner is None:
            try:
                lemmatizer = WordNetLemmatizer()
                stop_words = set(stopwords.words('english'))
                self.text_cleaner = EnhancedTextCleaner(lemmatizer, stop_words)
            except LookupError:
                # Fallback if NLTK data not available
                self.text_cleaner = EnhancedTextCleaner()
        else:
            self.text_cleaner = text_cleaner
            
        self.lowercase = lowercase
        self.use_tfidf = use_tfidf
        self.preserve_case_words = set(preserve_case_words or [])
        self.vectorizer = None
        self.feature_stats_ = {}
        
        # Default vectorizer parameters
        default_params = {
            'max_features': 5000,
            'ngram_range': (1, 2),
            'min_df': 2,
            'max_df': 0.95,
            'stop_words': 'english'
        }
        self.vectorizer_params = {**default_params, **(vectorizer_params or {})}
        
        # Setup logging
        if enable_logging:
            logging.basicConfig(level=logging.INFO)
            self.logger = logging.getLogger(__name__)
        else:
            self.logger = None

    def _log(self, message: str):
        """Log message if logging is enabled."""
        if self.logger:
            self.logger.info(message)

    def clean_text(self, text: str) -> str:
        """Enhanced text cleaning with better pattern matching."""
        if pd.isna(text) or not text:
            return ""
        
        text = str(text)
        
        # Preserve case for specific words before lowercasing
        preserved_words = {}
        for word in self.preserve_case_words:
            if word in text:
                placeholder = f"__PRESERVE_{len(preserved_words)}__"
                preserved_words[placeholder] = word
                text = text.replace(word, placeholder)
        
        if self.lowercase:
            text = text.lower()
        
        # Enhanced cleaning patterns
        patterns = [
            (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ''),  # URLs
            (r'www\.(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ''),  # www URLs
            (r'@[A-Za-z0-9_]+', ''),  # Mentions
            (r'#[A-Za-z0-9_]+', ''),  # Hashtags
            (r'RT\s+', ''),  # Retweet indicators
            (r'\\d+\', ''),  # Numbers
            (r'[^\w\s]', ' '),  # Non-alphanumeric except spaces
            (r'\s+', ' ')  # Multiple spaces
        ]
        
        for pattern, replacement in patterns:
            text = re.sub(pattern, replacement, text, flags=re.MULTILINE)
        
        # Restore preserved words
        for placeholder, original in preserved_words.items():
            text = text.replace(placeholder, original)
        
        return text.strip()

    def preprocess_text(self, text: str) -> str:
        """Preprocess text with enhanced tokenization."""
        if not text:
            return ""
        
        tokens = text.split()
        cleaned_tokens = self.text_cleaner.clean_and_lemmatize(tokens)
        
        return ' '.join(cleaned_tokens)

    def _extract_text_from_input(self, X) -> List[str]:
        """Extract text from various input formats with better error handling."""
        if isinstance(X, pd.DataFrame):
            # Try common text column names
            text_columns = ['text', 'content', 'tweet', 'message', 'Content', 'Cleaned_Text', 'body']
            for col in text_columns:
                if col in X.columns:
                    self._log(f"Using column '{col}' for text extraction")
                    return X[col].fillna('').astype(str).tolist()
            
            # Fallback to first column
            self._log("Using first column for text extraction")
            return X.iloc[:, 0].fillna('').astype(str).tolist()
            
        elif isinstance(X, pd.Series):
            return X.fillna('').astype(str).tolist()
        elif isinstance(X, (list, tuple)):
            return [str(x) for x in X]
        elif isinstance(X, np.ndarray):
            return [str(x) for x in X.flatten()]
        else:
            return [str(X)]

    def fit(self, X, y=None):
        """Fit the preprocessor with enhanced statistics tracking."""
        self._log("Starting fit process")
        
        text_data = self._extract_text_from_input(X)
        self._log(f"Extracted {len(text_data)} text samples")
        
        # Preprocess texts
        processed_texts = []
        empty_count = 0
        
        for text in text_data:
            cleaned = self.clean_text(text)
            processed = self.preprocess_text(cleaned)
            
            if processed.strip():
                processed_texts.append(processed)
            else:
                empty_count += 1
        
        if not processed_texts:
            raise ValueError("No valid text data found after preprocessing")
        
        # Store statistics
        self.feature_stats_ = {
            'total_samples': len(text_data),
            'valid_samples': len(processed_texts),
            'empty_samples': empty_count,
            'avg_length': np.mean([len(text.split()) for text in processed_texts])
        }
        
        self._log(f"Preprocessing stats: {self.feature_stats_}")
        
        # Initialize vectorizer
        if self.use_tfidf:
            self.vectorizer = TfidfVectorizer(**self.vectorizer_params)
        else:
            # Adjust parameters for CountVectorizer
            count_params = self.vectorizer_params.copy()
            if count_params.get('min_df', 0) < 1:
                count_params['min_df'] = 1
            self.vectorizer = CountVectorizer(**count_params)

        self.vectorizer.fit(processed_texts)
        self._log("Vectorizer fitted successfully")
        
        return self

    def transform(self, X):
        """Transform with better error handling."""
        if self.vectorizer is None:
            raise ValueError("Preprocessor has not been fitted yet. Call fit() first.")
        
        text_data = self._extract_text_from_input(X)
        processed_texts = [
            self.preprocess_text(self.clean_text(text)) 
            for text in text_data
        ]
        
        return self.vectorizer.transform(processed_texts)

    def fit_transform(self, X, y=None):
        """Fit and transform in one step."""
        return self.fit(X, y).transform(X)

    def get_feature_names_out(self, input_features=None):
        """Get feature names with error handling."""
        if self.vectorizer is None:
            raise ValueError("Preprocessor has not been fitted yet. Call fit() first")
        return self.vectorizer.get_feature_names_out()

    def get_feature_stats(self) -> Dict[str, Any]:
        """Get preprocessing statistics."""
        return self.feature_stats_.copy()

    def get_top_features(self, n=20) -> List[str]:
        """Get top n features by importance (for TF-IDF)."""
        if self.vectorizer is None:
            raise ValueError("Preprocessor has not been fitted yet")
        
        if hasattr(self.vectorizer, 'idf_'):
            # For TF-IDF, get features with highest IDF scores
            feature_names = self.get_feature_names_out()
            idf_scores = self.vectorizer.idf_
            top_indices = np.argsort(idf_scores)[-n:][::-1]
            return [feature_names[i] for i in top_indices]
        else:
            # For CountVectorizer, just return first n features
            return list(self.get_feature_names_out()[:n])

print("Enhanced TweetPreprocessor created with advanced features:")

Enhanced TweetPreprocessor created with advanced features:


### Testing the whole class

In [23]:
def test_text_cleaning(tweets):
    for i, tweet in enumerate(tweets[:3], start=1):
        cleaned = preprocessor.preprocess_text(tweet)
        print(f"Tweet {i}:\nOriginal: {tweet}\nCleaned:  {cleaned}\n")
    print("=" * 50)

In [24]:
sample_tweets = [
    "Safaricom's network has been down all morning — can't even make a simple call. 🙄 #Fail",
    "Tried buying airtime on M-PESA and it vanished. No confirmation, no refund. Typical Safaricom. 😡",
    "Safaricom's data bundles expire faster than my morning coffee. Absolute scam. ☕📉"
]
test_text_cleaning(sample_tweets)


Tweet 1:
Original: Safaricom's network has been down all morning — can't even make a simple call. 🙄 #Fail
Cleaned:  safaricom network morning even make simple call fail

Tweet 2:
Original: Tried buying airtime on M-PESA and it vanished. No confirmation, no refund. Typical Safaricom. 😡
Cleaned:  tried buying airtime pesa vanished confirmation refund typical safaricom

Tweet 3:
Original: Safaricom's data bundles expire faster than my morning coffee. Absolute scam. ☕📉
Cleaned:  safaricom data bundle expire faster morning coffee absolute scam



In [26]:
print("Step 2: Testing fit and transform...\n")

# Fit and transform
preprocessor.fit(sample_tweets)
feature_matrix = preprocessor.transform(sample_tweets)

# Inspect the results
print(f"Feature matrix shape: {feature_matrix.shape}")
print(f"Feature matrix type: {type(feature_matrix)}")
print(f"Number of features: {len(preprocessor.get_feature_names())}\n")

# Preview feature names
feature_names = preprocessor.get_feature_names()
print("Sample features:")
print(", ".join(feature_names[:20]))

print("\n" + "=" * 50)

Step 2: Testing fit and transform...

Feature matrix shape: (3, 45)
Feature matrix type: <class 'scipy.sparse.csr.csr_matrix'>
Number of features: 45

Sample features:
absolute, absolute scam, airtime, airtime pesa, bundle, bundle expire, buying, buying airtime, call, call fail, coffee, coffee absolute, confirmation, confirmation refund, data, data bundle, even, even make, expire, expire faster

