In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import fasttext
import fasttext.util
from scipy.sparse import save_npz
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the preprocessed dataset produced by an earlier step.
# NOTE: unpickling executes arbitrary code, so only read pickle files from a
# trusted source.
df = pd.read_pickle('../data/preprocessed_scam_data.pkl')

# Quick sanity check: frame dimensions plus the first two cleaned messages.
first_cleaned_texts = df['cleaned_text'].head(2).tolist()
print('Loaded preprocessed data:', df.shape)
print('Sample cleaned_text:', first_cleaned_texts)

Loaded preprocessed data: (545, 20)
Sample cleaned_text: ['account sign ksh hourly nairobi reply stop unsubscribe', 'congratulation account credit kes new bonus balance kes login deposit play']


In [3]:
# 1. TF-IDF Vectorization
#
# Fit a TF-IDF model on the cleaned messages, then persist both the sparse
# document-term matrix and the fitted vectorizer in the working directory.

# Missing entries become empty strings and surrounding whitespace is trimmed
# so the vectorizer only ever receives str values.
texts = df['cleaned_text'].fillna('').str.strip()

# Unigrams through trigrams; n-grams seen in fewer than 2 documents are
# dropped and the vocabulary is capped at 5000 features.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=2,
    max_features=5000,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

# The matrix goes to .npz (scipy sparse format); the vectorizer is pickled.
save_npz('tfidf_matrix.npz', tfidf_matrix)
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

# Report the matrix dimensions and the first ten vocabulary entries.
feature_names = tfidf_vectorizer.get_feature_names_out()
print('TF-IDF matrix shape:', tfidf_matrix.shape)
print('Sample TF-IDF features:', feature_names[:10])

TF-IDF matrix shape: (545, 3447)
Sample TF-IDF features: ['abroad' 'academic' 'acc' 'acc past' 'acc past yrs' 'acc secret'
 'acc secret pin' 'accept' 'accept deadline' 'accept deadline september']


In [None]:
# 2. Sentence-BERT Embeddings
#
# Encode every cleaned message with a pretrained SentenceTransformer and save
# the resulting dense matrix to 'sbert_embeddings.npy'.
#
# NOTE(review): the output stored with this cell mentions a different
# checkpoint name than the one loaded below, so it comes from an earlier
# version of the cell. Re-run the cell to refresh it.

sbert_model = SentenceTransformer('all-mpnet-base-v2')

batch_size = 32

# `texts` keeps the raw cleaned strings (empty where the message was missing)
# because later cells read it; only the encoder input gets the substitution.
texts = df['cleaned_text'].fillna('').str.strip().tolist()

# Empty messages are encoded as the literal token 'placeholder'.
sbert_inputs = [text if text else 'placeholder' for text in texts]

# `encode` batches internally, so a single call replaces the manual slicing
# loop and np.vstack. It also yields one progress bar covering all batches
# (the loop only ever showed a bar for the first batch) and does not fail on
# an empty input list the way np.vstack([]) does.
sbert_embeddings = sbert_model.encode(
    sbert_inputs,
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
)

np.save('sbert_embeddings.npy', sbert_embeddings)
print('Sentence-BERT embeddings shape:', sbert_embeddings.shape)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Error while downloading from https://huggingface.co/sentence-transformers/paraphrase-mpnet-base-v2/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
Error while downloading from https://huggingface.co/sentence-transformers/paraphrase-mpnet-base-v2/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


OSError: sentence-transformers/paraphrase-mpnet-base-v2 does not appear to have a file named pytorch_model.bin but there is a file for TensorFlow weights. Use `from_tf=True` to load this model from those weights.

In [6]:
# 3. FastText Word Embeddings

# Make sure the pretrained English vectors are available locally, then load
# the binary model. The download is large, so expect the first run to take a
# while.
fasttext_language = 'en'
fasttext_model_file = 'cc.en.300.bin'

fasttext.util.download_model(fasttext_language, if_exists='ignore')
ft_model = fasttext.load_model(fasttext_model_file)


def get_fasttext_embedding(text, model, dim=300):
    """Return the mean word vector of the in-vocabulary words of ``text``.

    Args:
        text: Whitespace-separated string to embed. Non-string values (for
            example ``None`` or a NaN coming from pandas) are treated like an
            empty string instead of raising.
        model: Object supporting ``word in model`` and
            ``model.get_word_vector(word)``.
        dim: Length of the zero vector returned when nothing can be embedded.

    Returns:
        A 1-D numpy array: the element-wise mean of the vectors of all words
        found in ``model``, or ``dim`` float32 zeros when ``text`` is empty,
        whitespace-only, not a string, or contains no known word.
    """
    # float32 zeros: the fallback previously defaulted to float64, which made
    # the stacked embedding matrix silently upcast whenever any row was empty
    # (fastText's Python bindings return float32 word vectors).
    if not isinstance(text, str) or not text.strip():
        return np.zeros(dim, dtype=np.float32)

    # Words missing from the model's vocabulary are skipped, not embedded.
    known_vectors = [
        model.get_word_vector(word) for word in text.split() if word in model
    ]
    if not known_vectors:
        return np.zeros(dim, dtype=np.float32)
    return np.mean(known_vectors, axis=0)


# Embed each entry of `texts` with the loaded FastText model and stack the
# per-message vectors into a single array.
document_vectors = [
    get_fasttext_embedding(document, ft_model) for document in texts
]
fasttext_embeddings = np.array(document_vectors)

# Persist the matrix in the working directory and report its dimensions.
np.save('fasttext_embeddings.npy', fasttext_embeddings)
print('FastText embeddings shape:', fasttext_embeddings.shape)

Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
 (5.96%) [==>                                                ]>                                                  ]>                                                  ]>                                                  ]>                                                  ]>                                                  ]>                                                  ]>                                                  ]>                                                  ]>                                                  ]>                                                  ]>                                                  ]>                                                  ]>                                                  ]>                                                  ]>                                                  ]>                                                  ]>                      

KeyboardInterrupt: 