In [64]:
import gensim
import pandas as pd
from gensim.models import Word2Vec,KeyedVectors
import gensim.downloader as api
from nltk.stem import WordNetLemmatizer
import re
from nltk.corpus import stopwords
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import numpy as np

In [65]:
# Loading the Google Word2Vec Model
wv = api.load('word2vec-google-news-300')
v_k_king = wv['king']

In [66]:
v_k_king

array([ 1.25976562e-01,  2.97851562e-02,  8.60595703e-03,  1.39648438e-01,
       -2.56347656e-02, -3.61328125e-02,  1.11816406e-01, -1.98242188e-01,
        5.12695312e-02,  3.63281250e-01, -2.42187500e-01, -3.02734375e-01,
       -1.77734375e-01, -2.49023438e-02, -1.67968750e-01, -1.69921875e-01,
        3.46679688e-02,  5.21850586e-03,  4.63867188e-02,  1.28906250e-01,
        1.36718750e-01,  1.12792969e-01,  5.95703125e-02,  1.36718750e-01,
        1.01074219e-01, -1.76757812e-01, -2.51953125e-01,  5.98144531e-02,
        3.41796875e-01, -3.11279297e-02,  1.04492188e-01,  6.17675781e-02,
        1.24511719e-01,  4.00390625e-01, -3.22265625e-01,  8.39843750e-02,
        3.90625000e-02,  5.85937500e-03,  7.03125000e-02,  1.72851562e-01,
        1.38671875e-01, -2.31445312e-01,  2.83203125e-01,  1.42578125e-01,
        3.41796875e-01, -2.39257812e-02, -1.09863281e-01,  3.32031250e-02,
       -5.46875000e-02,  1.53198242e-02, -1.62109375e-01,  1.58203125e-01,
       -2.59765625e-01,  

# **Load the Data**

In [67]:
# Load the dataset
url = 'https://raw.githubusercontent.com/sumankumarsubudhi/Spam-Classifier/master/SMSSpamCollection'
df = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])

In [68]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [69]:
lemmatizer = WordNetLemmatizer()

In [70]:
corpus = []

for i in range(len(df)):
    review = re.sub('[^a-zA-z]',' ',df['message'][i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review]
    review = ' '.join(review)
    corpus.append(review)

In [105]:
a = list(map(len,corpus))
list(filter(lambda num: num<1,a))  # There are 3 rows in the corpus that have no words

[0, 0, 0]

In [71]:
corpus

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat',
 'ok lar joking wif u oni',
 'free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry question std txt rate t c s apply over s',
 'u dun say so early hor u c already then say',
 'nah i don t think he go to usf he life around here though',
 'freemsg hey there darling it s been week s now and no word back i d like some fun you up for it still tb ok xxx std chgs to send to rcv',
 'even my brother is not like to speak with me they treat me like aid patent',
 'a per your request melle melle oru minnaminunginte nurungu vettam ha been set a your callertune for all caller press to copy your friend callertune',
 'winner a a valued network customer you have been selected to receivea prize reward to claim call claim code kl valid hour only',
 'had your mobile month or more u r entitled to update to the latest colour mobile with camera for free call the mobile up

In [73]:
words = []

for sent in corpus:
    sent_token = sent_tokenize(sent)
    for word in sent_token:
        words.append(simple_preprocess(word))

In [74]:
words

[['go',
  'until',
  'jurong',
  'point',
  'crazy',
  'available',
  'only',
  'in',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'there',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'in',
  'wkly',
  'comp',
  'to',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'to',
  'to',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply',
  'over'],
 ['dun', 'say', 'so', 'early', 'hor', 'already', 'then', 'say'],
 ['nah',
  'don',
  'think',
  'he',
  'go',
  'to',
  'usf',
  'he',
  'life',
  'around',
  'here',
  'though'],
 ['freemsg',
  'hey',
  'there',
  'darling',
  'it',
  'been',
  'week',
  'now',
  'and',
  'no',
  'word',
  'back',
  'like',
  'some',
  'fun',
  'you',
  'up',
  'for',
  'it',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'to',
  'send',
  'to',
  'rcv'],
 ['even',
  'my',
  'brother',
  'is',
  'not',
  'like',
  'to',
  'spea

In [75]:
# Let's train this word2vec model from scratch
model = gensim.models.Word2Vec(words)

In [76]:
# to get all the vocabulary
model.wv.index_to_key

['to',
 'you',
 'the',
 'it',
 'and',
 'in',
 'is',
 'me',
 'my',
 'for',
 'your',
 'call',
 'of',
 'that',
 'have',
 'on',
 'now',
 'are',
 'can',
 'so',
 'but',
 'not',
 'or',
 'we',
 'do',
 'get',
 'at',
 'ur',
 'will',
 'if',
 'be',
 'with',
 'no',
 'just',
 'this',
 'gt',
 'lt',
 'go',
 'how',
 'up',
 'when',
 'ok',
 'day',
 'what',
 'free',
 'from',
 'all',
 'out',
 'know',
 'll',
 'come',
 'like',
 'good',
 'time',
 'am',
 'then',
 'got',
 'wa',
 'there',
 'he',
 'love',
 'text',
 'only',
 'want',
 'send',
 'one',
 'need',
 'txt',
 'today',
 'by',
 'going',
 'don',
 'stop',
 'home',
 'she',
 'about',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'our',
 'think',
 'tell',
 'week',
 'hi',
 'phone',
 'they',
 'new',
 'please',
 'later',
 'pls',
 'any',
 'her',
 'ha',
 'co',
 'did',
 'been',
 'msg',
 'min',
 'some',
 'an',
 'night',
 'make',
 'dear',
 'who',
 'here',
 'message',
 'say',
 'well',
 'where',
 're',
 'thing',
 'much',
 'oh',

In [77]:
model.corpus_count

5569

In [78]:
model.epochs

5

In [79]:
model.wv.similar_by_word('kid')

[('yet', 0.997535765171051),
 ('boy', 0.9975336790084839),
 ('went', 0.9974470138549805),
 ('too', 0.9974285960197449),
 ('care', 0.9974014163017273),
 ('bit', 0.9973664283752441),
 ('buy', 0.9973466396331787),
 ('smiling', 0.9973446726799011),
 ('finish', 0.9973424673080444),
 ('many', 0.997329592704773)]

In [80]:
model.wv.similar_by_word('good')

[('wa', 0.9984362125396729),
 ('morning', 0.9982641935348511),
 ('there', 0.9980887770652771),
 ('in', 0.9979994297027588),
 ('not', 0.9979330897331238),
 ('night', 0.9979327321052551),
 ('oh', 0.9978918433189392),
 ('here', 0.997826099395752),
 ('is', 0.9977712035179138),
 ('say', 0.9977651834487915)]

In [81]:
model.wv['good']  #100 differnt vector

array([-2.74536520e-01,  3.75517368e-01,  1.78862542e-01,  2.09223121e-01,
        8.87646824e-02, -6.64529622e-01,  2.32087672e-01,  7.33398199e-01,
       -3.68417829e-01, -2.43602470e-01, -2.16807902e-01, -5.81200123e-01,
       -4.81170416e-02,  1.10259481e-01,  3.05671841e-01, -2.65676111e-01,
        1.45925820e-01, -4.35182631e-01, -2.83537265e-02, -8.51378918e-01,
        3.90345722e-01,  1.14796609e-01,  1.95532471e-01, -2.59853512e-01,
       -4.33305986e-02,  5.80201624e-04, -2.14491025e-01, -2.52231359e-01,
       -4.33845758e-01,  9.17588770e-02,  3.89142632e-01,  1.77854039e-02,
        1.88349500e-01, -3.36074114e-01, -1.29471764e-01,  5.41183949e-01,
        6.15846440e-02, -2.05575451e-01, -1.89946786e-01, -6.93773031e-01,
        7.74313509e-02, -2.61471927e-01, -2.65188932e-01,  2.80265920e-02,
        2.38741234e-01, -8.63582417e-02, -2.04132587e-01, -8.89758095e-02,
        3.18487376e-01,  1.41086727e-01,  2.08698869e-01, -2.51598656e-01,
       -6.53693825e-02,  

**AVG Word2Vec**

In [82]:
words[0]  # In Word2Vec it will Provide 100 differnt vectors for every word 
#like: [go = 100 vectors, until = 100 vectors....n]  ## But in AVGword2Vec will get the Avg vectors and total [100 vectors] in sentence

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [83]:
model.wv['go']

array([-0.317337  ,  0.35581738,  0.2010746 ,  0.19520128,  0.13522665,
       -0.74474937,  0.20964098,  0.7760501 , -0.39785126, -0.2885965 ,
       -0.27992207, -0.5780501 , -0.11880627,  0.10843948,  0.33331925,
       -0.28124213,  0.15979137, -0.5170187 , -0.05867481, -0.8974257 ,
        0.33469155,  0.1194978 ,  0.18309467, -0.27617937, -0.01645119,
        0.04496595, -0.2712411 , -0.27226084, -0.44481608,  0.11881588,
        0.39285335,  0.03992462,  0.12839003, -0.3223249 , -0.16477738,
        0.5473086 ,  0.07274551, -0.23702994, -0.20713916, -0.7392015 ,
        0.09705719, -0.26616916, -0.28404087,  0.04654923,  0.22356494,
       -0.13203481, -0.19333792, -0.08843889,  0.30897492,  0.13480397,
        0.14979103, -0.28993642,  0.00819273,  0.08047166, -0.15932927,
        0.16606161,  0.2178085 ,  0.06960455, -0.5585416 ,  0.22294313,
        0.04776302,  0.18660212, -0.00942617, -0.11662532, -0.38333014,
        0.38648465,  0.12532477,  0.29962075, -0.3853001 ,  0.54

In [84]:
words

[['go',
  'until',
  'jurong',
  'point',
  'crazy',
  'available',
  'only',
  'in',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'there',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'in',
  'wkly',
  'comp',
  'to',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'to',
  'to',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply',
  'over'],
 ['dun', 'say', 'so', 'early', 'hor', 'already', 'then', 'say'],
 ['nah',
  'don',
  'think',
  'he',
  'go',
  'to',
  'usf',
  'he',
  'life',
  'around',
  'here',
  'though'],
 ['freemsg',
  'hey',
  'there',
  'darling',
  'it',
  'been',
  'week',
  'now',
  'and',
  'no',
  'word',
  'back',
  'like',
  'some',
  'fun',
  'you',
  'up',
  'for',
  'it',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'to',
  'send',
  'to',
  'rcv'],
 ['even',
  'my',
  'brother',
  'is',
  'not',
  'like',
  'to',
  'spea

In [86]:
def avg_word2vec(doc):
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key])

In [87]:
! pip install tqdm




[notice] A new release of pip is available: 24.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [89]:
# applying this Avg Word2Vec on every sentences
X = []

for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 5569/5569 [00:01<00:00, 5138.77it/s]


In [90]:
def avg_word2vec(doc):
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key])

In [91]:
def avg_word2vec(doc):
    # Collect vectors only for words that are in the model's vocabulary
    vectors = [model.wv[word] for word in doc if word in model.wv.index_to_key]
    
    # If no word is in the model, return a zero vector (or you can return None based on your use case)
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        # Return a zero vector with the same dimensions as Word2Vec vectors (adjust size based on model)
        return np.zeros(model.vector_size)

# Apply the Avg Word2Vec to every sentence
X = []

for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))


100%|██████████| 5569/5569 [00:00<00:00, 9086.98it/s]


In [92]:
X

[array([-0.18719213,  0.2509979 ,  0.12901096,  0.13669497,  0.07344408,
        -0.47757465,  0.14398809,  0.49434024, -0.2494788 , -0.18875067,
        -0.16050315, -0.38195795, -0.07123038,  0.07631177,  0.20976153,
        -0.17800172,  0.08802482, -0.32290995, -0.03383623, -0.574716  ,
         0.23253658,  0.07680644,  0.11979183, -0.17802097, -0.03028701,
         0.006355  , -0.16502377, -0.1656168 , -0.2924407 ,  0.07219968,
         0.24711508,  0.02091545,  0.09243449, -0.21471824, -0.09979898,
         0.3662319 ,  0.06060939, -0.15365642, -0.15752803, -0.48393637,
         0.05050123, -0.17389157, -0.19146736,  0.01626236,  0.15325119,
        -0.071288  , -0.13621014, -0.04795562,  0.22510321,  0.09665819,
         0.11497691, -0.18707456, -0.02792443,  0.03224042, -0.09373572,
         0.10196259,  0.1542742 ,  0.04742685, -0.37951046,  0.12676948,
         0.00733674,  0.13118769, -0.01366482, -0.09731547, -0.24372295,
         0.24584404,  0.08755852,  0.20172031, -0.2

In [93]:
# Independent variables
X_new = np.array(X)

In [98]:
df['message'].shape

(5572,)

In [94]:
X_new.shape

(5569, 100)

In [95]:
X_new[0].shape

(100,)

In [106]:
# Output feature
y = df[list(map(lambda x: len(x)>0,corpus))]
y = pd.get_dummies(y['label'],drop_first=True,dtype='int')
y = y.values

In [112]:
y.shape

(5569, 1)

In [119]:
new_df = pd.DataFrame(X_new)

In [120]:
new_df['output'] = y

In [121]:
new_df.shape

(5569, 101)

In [122]:
new_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,output
0,-0.187192,0.250998,0.129011,0.136695,0.073444,-0.477575,0.143988,0.49434,-0.249479,-0.188751,...,0.180523,0.0634,0.052094,0.501428,0.232469,0.140947,-0.237918,0.176107,0.037903,0
1,-0.170682,0.220576,0.111288,0.119657,0.070248,-0.41799,0.114414,0.436934,-0.219224,-0.159479,...,0.14942,0.051146,0.03844,0.425701,0.195773,0.123837,-0.216054,0.156589,0.025295,0
2,-0.189861,0.270108,0.143143,0.156136,0.057047,-0.51485,0.143052,0.481285,-0.260896,-0.223208,...,0.185897,0.041295,0.041136,0.511434,0.22651,0.083951,-0.269427,0.204013,0.065687,1
3,-0.258519,0.335335,0.169145,0.189663,0.101805,-0.645423,0.18652,0.675381,-0.341235,-0.245656,...,0.236276,0.085126,0.072772,0.663343,0.311677,0.206556,-0.326907,0.235924,0.041501,0
4,-0.226634,0.278177,0.150798,0.157035,0.094465,-0.543074,0.157219,0.576215,-0.292683,-0.216244,...,0.196622,0.073744,0.062925,0.561832,0.267637,0.166793,-0.286986,0.196341,0.039067,0


In [135]:
X = new_df.drop(columns='output')
y = new_df['output']

In [136]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=42)

In [138]:
model = RandomForestClassifier()

In [139]:
model.fit(X_train,y_train)

In [140]:
y_pred = model.predict(X_test)

In [143]:
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.973967684021544
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       955
           1       0.94      0.87      0.91       159

    accuracy                           0.97      1114
   macro avg       0.96      0.93      0.95      1114
weighted avg       0.97      0.97      0.97      1114



# **Thankyou**