In [1]:
import re
import pandas as pd
import spacy
from transformers import AutoTokenizer
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nlp = spacy.load("en_core_web_sm")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Error while downloading from https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt: HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out.
Trying to resume download...


In [3]:
df = pd.read_csv("../data/scam_detection_labeled.csv")
print("First sms:", df["message_content"].iloc[:5])

First sms: 0    Christa, finish your account sign-up and you c...
1    CONGRATULATION!\nYOUR ACCOUNT 254757547986 HAS...
2    🙏🙏 I can do all this through him who gives me ...
3    Hi, I'm Alice, a recruiter at VG Investments. ...
4    TAL6FH2DN CONFIRMED, YOU HAVE RECEIVED KES. 70...
Name: message_content, dtype: object


In [4]:
import re
import pandas as pd
import spacy

nlp = spacy.load("en_core_web_sm")


def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'\\n', ' ', text)
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'[\r\t\f\v]+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = re.sub(r'\s+', ' ', text.strip())  
    text = re.sub(r'http\S+|www\S+|https\S+|[A-Za-z0-9.-]+\.(com|org|net)\b', '', text, flags=re.MULTILINE)
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '', text)
    text = re.sub(r'\+\d{10,12}\b|\+\d{1,3}-\d{3}-\d{3}-\d{4}\b|\b(?:\+?254|0)(7\d{8}|11\d{7})\b', '', text)
    text = re.sub(r'\b[A-Z0-9]{10}\b|\bconfirmed\b|\bcompleted\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'[^\w\s]', ' ', text) 

    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            text = text.replace(ent.text, '')

    text = re.sub(r'\s+', ' ', text.strip())

    # Second NER pass to catch missed names
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            text = text.replace(ent.text, '')

    doc = nlp(text)

    tokens = []

    for token in doc:
        if not token.is_stop and token.is_alpha and len(token.text) > 2:
            lemma = token.lemma_
            tokens.append(lemma)


    # Join tokens for traditional feature engineering
    cleaned_text = " ".join(tokens)

    
    return tokens, cleaned_text


In [5]:
# Define batch BERT tokenization function
def batch_bert_tokenize(texts, batch_size=32):
    """Tokenize texts in batches using BERT tokenizer."""
    
    valid_texts = [text for text in texts if text.strip()]
    if not valid_texts:
        raise ValueError("No valid texts to tokenize")
    max_len = min(max(len(tokenizer.encode(text, truncation=True, max_length=512)) for text in valid_texts), 200)
    print(f"Computed max_len: {max_len}")

    all_input_ids = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        # Replace empty texts with placeholder to avoid tokenizer errors
        batch_texts = [text if text.strip() else "placeholder" for text in batch_texts]
        encodings = tokenizer(
            batch_texts,
            return_tensors="np",
            padding="max_length",
            truncation=True,
            max_length=max_len
        )
        # Print first batch's input IDs
        if i == 0:
            print("Sample input IDs from first batch:", encodings["input_ids"][:2])
        all_input_ids.append(encodings["input_ids"])
    return np.concatenate(all_input_ids, axis=0)

In [6]:

preprocessed_results = df["message_content"].apply(preprocess_text)

# Split results into tokens and cleaned_text
df["tokens"] = preprocessed_results.apply(lambda x: x[0])
df["cleaned_text"] = preprocessed_results.apply(lambda x: x[1])


empty_texts = df["cleaned_text"].isna() | (df["cleaned_text"] == "")
print(f"Empty cleaned_text rows: {sum(empty_texts)} ({sum(empty_texts)/len(df)*100:.2f}%)")


long_messages = df["message_content"].apply(lambda x: len(tokenizer.encode(str(x), truncation=True, max_length=512)) > 200)
print(f"Messages with >200 tokens: {sum(long_messages)}")


print("Generating BERT tokens...")
bert_input_ids = batch_bert_tokenize(df["cleaned_text"].tolist())
df["bert_input_ids"] = list(bert_input_ids)


# Save preprocessed data
df.to_pickle("preprocessed_scam_data.pkl")
print("Preprocessed data saved to 'preprocessed_scam_data.pkl'")

Empty cleaned_text rows: 6 (1.10%)
Messages with >200 tokens: 10
Generating BERT tokens...
Computed max_len: 200
Sample input IDs from first batch: [[  101  4070  3696 29535  2232 21462 21124  7514  2644  4895  6342  5910
  26775 20755   102     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0 

In [7]:
# Evaluate preprocessing
print("\n--- Preprocessing Evaluation ---")

# 1. Noise Removal
docs = list(nlp.pipe(df["cleaned_text"].fillna("")))
remaining_names = sum(1 for doc in docs for ent in doc.ents if ent.label_ == "PERSON")
print(f"Remaining names detected: {remaining_names} ({remaining_names/len(df)*100:.2f}% of rows)")

url_pattern = r'http\S+|www\S+|https\S+'
urls_left = sum(1 for text in df["cleaned_text"].fillna("") if re.search(url_pattern, text))
print(f"Remaining URLs: {urls_left} ({urls_left/len(df)*100:.2f}% of rows)")

non_alpha_tokens = sum(1 for tokens in df["tokens"] for token in tokens if not token.isalpha())
print(f"Non-alphabetic tokens: {non_alpha_tokens} (should be 0)")

# 2. Token Quality
avg_tokens = df["tokens"].apply(len).mean()
vocab = len(set(token for tokens in df["tokens"] for token in tokens))
print(f"Average tokens per message: {avg_tokens:.2f}")
print(f"Vocabulary size: {vocab}")

# 3. Data Retention
empty_tokens = sum(1 for tokens in df["tokens"] if not tokens)
print(f"Empty token lists: {empty_tokens} ({empty_tokens/len(df)*100:.2f}% of rows)")

# 4. BERT Tokenization
valid_bert = sum(1 for ids in df["bert_input_ids"] if ids[0] == 101 and any(id == 102 for id in ids))
print(f"Valid BERT input IDs (CLS/SEP): {valid_bert} ({valid_bert/len(df)*100:.2f}% of rows)")

# 5. Manual Inspection
print("\nSample of 5 rows for inspection:")
sample_df = df[["message_content", "tokens", "cleaned_text"]].head(5)
for i, row in sample_df.iterrows():
    bert_decoded = tokenizer.decode(df["bert_input_ids"][i], skip_special_tokens=True)
    print(f"\nRow {i+1}:")
    print(f"Original: {row['message_content']}")
    print(f"Tokens: {row['tokens']}")
    print(f"Cleaned Text: {row['cleaned_text']}")
    print(f"BERT Decoded: {bert_decoded}")

# Print tokens for the first 10 rows
print("\nLemmatized tokens for first 10 messages:")
for i, tokens in enumerate(df["tokens"][:10]):
    print(f"Message {i+1} tokens:", tokens)

# Print sample BERT input IDs
print("\nSample BERT input IDs for first 2 messages:")
for i in range(min(2, len(df))):
    print(f"Message {i+1} BERT input IDs:", df["bert_input_ids"][i][:10], "...")


--- Preprocessing Evaluation ---
Remaining names detected: 65 (11.93% of rows)
Remaining URLs: 0 (0.00% of rows)
Non-alphabetic tokens: 0 (should be 0)
Average tokens per message: 15.15
Vocabulary size: 1794
Empty token lists: 6 (1.10% of rows)
Valid BERT input IDs (CLS/SEP): 545 (100.00% of rows)

Sample of 5 rows for inspection:

Row 1:
Original: Christa, finish your account sign-up and you could make Ksh 287 hourly in Nairobi. * t.uber.com/lJ8929 Reply STOP 2 to +1 415-237-0403 to unsubscribe
Tokens: ['account', 'sign', 'ksh', 'hourly', 'nairobi', 'reply', 'stop', 'unsubscribe']
Cleaned Text: account sign ksh hourly nairobi reply stop unsubscribe
BERT Decoded: account sign ksh hourly nairobi reply stop unsubscribe

Row 2:
Original: CONGRATULATION!\nYOUR ACCOUNT 254757547986 HAS BEEN CREDITED WITH KES 62,950\n\nNew BONUS  Balance: KES 62,950 \n\nLOGIN>wekelea.com\n \n DEPOSIT&PLAY
Tokens: ['congratulation', 'account', 'credit', 'kes', 'new', 'bonus', 'balance', 'kes', 'login', 'depo