In [1]:
import pandas as pd

# Load the tab-separated SMS dataset. Column names are supplied explicitly,
# so the first line of the file is read as data rather than as a header.
# NOTE(review): with pandas' default quoting, a message containing an
# unbalanced double quote could swallow the following lines — compare
# len(msgs) with the raw line count and consider quoting=csv.QUOTE_NONE.
msgs = pd.read_csv(
    'SMSSpamCollection.txt',
    sep='\t',
    names=["label", "messages"],
)

msgs

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


# Data Cleaning and Preprocessing

In [2]:
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Single Porter stemmer instance, reused for every token in the cleaning loop.
ps = PorterStemmer()

Stemming and Stopwords

In [3]:
# Build the stemmed corpus: one cleaned, space-joined string per message.
# The stopword set is built once up front; previously
# set(stopwords.words('english')) was rebuilt for every single token, which
# is loop-invariant work that does not change the result.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly instead of msgs['messages'][i], so
# the loop does not rely on the frame having a default 0..n-1 index.
for message in msgs['messages']:
    # Replace every character that is not an ASCII letter or digit with a space.
    review = re.sub('[^a-zA-Z0-9]', ' ', message)
    tokens = review.lower().split()

    # Drop stopwords, then reduce each remaining token to its Porter stem.
    stems = [ps.stem(word) for word in tokens if word not in english_stopwords]
    corpus.append(' '.join(stems))

In [4]:
# Preview the first ten cleaned messages.
corpus[:10]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030']

# Bag Of Words

In [5]:
# Bag-of-words features.
# ngram_range=(2, 2) builds the vocabulary from bigrams only (no single
# words), max_features keeps at most 2500 of them, and binary=True records
# presence/absence instead of counts.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so test messages influence the vocabulary.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(
    ngram_range=(2, 2),
    binary=True,
    max_features=2500,
)
X = cv.fit_transform(corpus).toarray()  # dense array, one row per message
X.shape

(5572, 2500)

In [6]:
# Binary target vector: 1 where the label is 'spam', 0 otherwise.
y = (
    pd.get_dummies(msgs['label'])['spam']
    .astype(int)
    .to_numpy()
)
y

array([0, 0, 1, ..., 0, 0, 0])

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [8]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the current training split.
model = MultinomialNB()
model.fit(X_train, y_train)


In [9]:
# Predict labels for the held-out rows; the bare name displays the array.
pred = model.predict(X_test)
pred

array([0, 1, 0, ..., 0, 1, 0])

In [10]:
from sklearn.metrics import accuracy_score, classification_report
# Overall accuracy first, then the per-class precision/recall/F1 table.
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.9721973094170404
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       955
           1       1.00      0.81      0.89       160

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115



# TF-IDF

In [11]:
# TF-IDF features over unigrams and bigrams, capped at 2500 terms.
# X is rebound here: it now holds the TF-IDF matrix instead of the
# bag-of-words matrix.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=2500,
)
X = tfidf.fit_transform(corpus).toarray()

In [12]:
# Same 80/20 split and seed as before, now applied to the TF-IDF matrix.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [13]:
from sklearn.ensemble import RandomForestClassifier

# A random forest is randomised (bootstrap samples and random feature
# subsets), so without a seed every run of this cell reports slightly
# different metrics. random_state=0 matches the seed used for the split.
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)

In [14]:
# Predict labels for the held-out rows; the bare name displays the array.
pred = model.predict(X_test)
pred

array([0, 1, 0, ..., 0, 1, 0])

In [15]:
from sklearn.metrics import accuracy_score, classification_report
# Overall accuracy first, then the per-class precision/recall/F1 table.
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.9829596412556054
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       955
           1       1.00      0.88      0.94       160

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



# Word2Vec

Lemmatizing

In [18]:
from nltk.stem import WordNetLemmatizer
# Single WordNet lemmatizer instance, reused for every token in the loop below.
lt = WordNetLemmatizer()

In [19]:
# Rebuild `corpus` with lemmatization instead of stemming (this replaces the
# stemmed corpus built earlier under the same name).
# The stopword set is built once up front; previously
# set(stopwords.words('english')) was rebuilt for every single token, which
# is loop-invariant work that does not change the result.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly instead of msgs['messages'][i], so
# the loop does not rely on the frame having a default 0..n-1 index.
for message in msgs['messages']:
    # Replace every character that is not an ASCII letter or digit with a space.
    review = re.sub('[^a-zA-Z0-9]', ' ', message)
    tokens = review.lower().split()

    # Drop stopwords, then map each remaining token to its WordNet lemma.
    lemmas = [lt.lemmatize(word) for word in tokens if word not in english_stopwords]
    corpus.append(' '.join(lemmas))

corpus[:10]

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry question std txt rate c apply 08452810075over18',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling 3 week word back like fun still tb ok xxx std chgs send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press 9 copy friend callertune',
 'winner valued network customer selected receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobile 11 month u r entitled update latest colour mobile camera free call mobile update co free 08002986030']

In [20]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [21]:
# Tokenise every cleaned message with gensim's simple_preprocess.
# Comparing the preview with `corpus` shows that digits and single-character
# tokens do not survive this step (e.g. '2' and 'u' disappear).
sentences = list(map(simple_preprocess, corpus))

sentences[:10]

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply',
  'over'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'selected',
  're

In [22]:
import gensim

# Train Word2Vec on the tokenised messages: words seen fewer than twice are
# ignored (min_count=2) and the context window spans up to 5 tokens.
# seed + workers=1 make training repeatable. With several worker threads the
# order in which examples are processed varies between runs, so the vectors,
# and every metric computed from them, would change on each Run All.
# NOTE(review): identical results across separate interpreter launches may
# additionally require a fixed PYTHONHASHSEED — see the gensim docs.
w2v_model = gensim.models.Word2Vec(
    sentences,
    window=5,
    min_count=2,
    seed=0,
    workers=1,
)
w2v_model.wv.index_to_key[:20]


['call',
 'get',
 'ur',
 'gt',
 'lt',
 'go',
 'ok',
 'day',
 'free',
 'know',
 'come',
 'like',
 'good',
 'time',
 'got',
 'text',
 'love',
 'want',
 'send',
 'need']

In [23]:
# Sentence count recorded by the model; the output matches the 5572 rows of the feature matrices.
w2v_model.corpus_count

5572

In [24]:
# Sanity check of the embedding: words most similar to 'happy'.
# NOTE(review): the displayed scores are all ~0.9995 for unrelated words,
# which suggests the vectors are barely differentiated — more training
# epochs or more data may be needed; confirm before relying on them.
w2v_model.wv.similar_by_word('happy')

[('amp', 0.9995289444923401),
 ('day', 0.9995237588882446),
 ('make', 0.9995118975639343),
 ('could', 0.9995074272155762),
 ('even', 0.9995034337043762),
 ('said', 0.9995020031929016),
 ('one', 0.9994964003562927),
 ('thing', 0.9994940161705017),
 ('would', 0.9994856119155884),
 ('go', 0.9994829893112183)]

Average Word2Vec

In [25]:
import numpy as np


def avg_word2vec(words, model=None):
    """Return the mean Word2Vec vector of the in-vocabulary tokens in ``words``.

    Parameters
    ----------
    words : iterable of str
        Tokens of one message.
    model : optional
        Object exposing ``wv`` (supporting ``token in wv`` and ``wv[token]``)
        and ``vector_size``. When omitted, the notebook-level ``w2v_model``
        is used, looked up at call time.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``. All zeros when ``words``
        is empty or none of its tokens is in the model's vocabulary.
    """
    if model is None:
        # Resolve the global lazily. A default of ``model=w2v_model`` is
        # evaluated once, when the def runs, so the function would silently
        # keep using the old model after the Word2Vec cell is re-run.
        model = w2v_model

    vectors = [model.wv[w] for w in words if w in model.wv]
    if not vectors:
        return np.zeros(model.vector_size, dtype=float)  # safe fallback
    return np.mean(vectors, axis=0)

In [26]:
from tqdm import tqdm

# One averaged embedding per message, with a progress bar over the messages.
X = [avg_word2vec(tokens) for tokens in tqdm(sentences)]

100%|██████████| 5572/5572 [00:00<00:00, 26079.08it/s]


In [27]:
# Stack the per-message vectors into a 2-D feature matrix (rows = messages).
X_new = np.array(X) #input features
X_new.shape

(5572, 100)

In [28]:
# Same 80/20 split and seed as before, now applied to the averaged embeddings.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    test_size=0.20,
    random_state=0,
)

In [29]:
from sklearn.ensemble import RandomForestClassifier

# A random forest is randomised (bootstrap samples and random feature
# subsets), so without a seed every run of this cell reports slightly
# different metrics. random_state=0 matches the seed used for the split.
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)

In [30]:
# Predict labels for the held-out rows; the bare name displays the array.
pred = model.predict(X_test)
pred

array([0, 0, 0, ..., 0, 1, 0])

In [31]:
from sklearn.metrics import accuracy_score, classification_report
# Overall accuracy first, then the per-class precision/recall/F1 table.
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.9650224215246637
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       955
           1       0.97      0.78      0.87       160

    accuracy                           0.97      1115
   macro avg       0.97      0.89      0.92      1115
weighted avg       0.97      0.97      0.96      1115



# TF-IDF Weighted Averaging

In [32]:
import numpy as np

# 1. Fit a TF-IDF vectoriser on the tokenised messages. Each token list is
#    re-joined with spaces because the vectoriser takes strings.
#    This rebinds `tfidf`, which earlier held the feature vectoriser.
tfidf = TfidfVectorizer()
tfidf.fit([" ".join(tokens) for tokens in sentences])

# Lookup table: vocabulary term -> its IDF weight.
tfidf_weights = {
    term: idf
    for term, idf in zip(tfidf.get_feature_names_out(), tfidf.idf_)
}

# 2. Weighted average function
def tfidf_weighted_avg(words, model=None, tfidf_weights=None):
    """Return the IDF-weighted mean Word2Vec vector of the tokens in ``words``.

    Only tokens present in both the embedding vocabulary and the weight table
    contribute. A token that occurs several times contributes once per
    occurrence, so its total influence grows with its in-message frequency.

    Parameters
    ----------
    words : iterable of str
        Tokens of one message.
    model : optional
        Object exposing ``wv`` (supporting ``token in wv`` and ``wv[token]``)
        and ``vector_size``. When omitted, the notebook-level ``w2v_model``
        is used, looked up at call time.
    tfidf_weights : mapping of str to float, optional
        Token -> weight. When omitted, the notebook-level ``tfidf_weights``
        dict is used, looked up at call time.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``. All zeros when no token
        qualifies (including an empty ``words``).
    """
    # Resolve both defaults lazily. Defaults written as ``model=w2v_model`` /
    # ``tfidf_weights=tfidf_weights`` are evaluated once, when the def runs,
    # so re-running an upstream cell would leave this function on stale objects.
    if model is None:
        model = w2v_model
    if tfidf_weights is None:
        # The parameter shadows the notebook-level dict of the same name, so
        # the global has to be fetched explicitly.
        tfidf_weights = globals()["tfidf_weights"]

    vectors = []
    weights = []
    for w in words:
        if w in model.wv and w in tfidf_weights:
            vectors.append(model.wv[w])
            weights.append(tfidf_weights[w])

    if not vectors:
        return np.zeros(model.vector_size, dtype=float)  # fallback for OOV/empty
    return np.average(vectors, axis=0, weights=weights)

In [33]:
from tqdm import tqdm

# One IDF-weighted embedding per message, with a progress bar over the messages.
X = [tfidf_weighted_avg(tokens) for tokens in tqdm(sentences)]

100%|██████████| 5572/5572 [00:00<00:00, 15164.76it/s]


In [34]:
# Stack the per-message vectors into a 2-D feature matrix (rows = messages).
X_new = np.array(X) #input features
X_new.shape

(5572, 100)

In [35]:
# Same 80/20 split and seed as before, now applied to the weighted embeddings.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    test_size=0.20,
    random_state=0,
)

In [36]:
from sklearn.ensemble import RandomForestClassifier

# A random forest is randomised (bootstrap samples and random feature
# subsets), so without a seed every run of this cell reports slightly
# different metrics. random_state=0 matches the seed used for the split.
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)

In [37]:
# Predict labels for the held-out rows; the bare name displays the array.
pred = model.predict(X_test)
pred

array([0, 0, 0, ..., 0, 1, 0])

In [38]:
from sklearn.metrics import accuracy_score, classification_report
# Overall accuracy first, then the per-class precision/recall/F1 table.
print(accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

0.9560538116591928
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       955
           1       0.97      0.72      0.82       160

    accuracy                           0.96      1115
   macro avg       0.96      0.86      0.90      1115
weighted avg       0.96      0.96      0.95      1115

