The dataset contains 5573 emails where they are labeled as spam and ham, where 4825 are ham
(non spam) and 747 spam emails. We need to build a NLP classifier that specially uses word2vec
from Google. Divide the dataset into 80 and 20 percent and build 3 types of models
1. CBOW
2. Skipgram
3. Pretrained word2vec model from Google
Please do necessary pre-processing methods before building the NLP classification model.

In [1]:
from google.colab import files
uploaded = files.upload()

Saving spam.csv to spam.csv


In [2]:
import pandas as pd
import numpy as np

In [3]:
import gensim

In [4]:
df = pd.read_csv('/content/spam.csv', encoding='ISO-8859-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [6]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)

In [7]:
df.rename({'v1': 'category', 'v2': 'mail'}, axis=1, inplace=True)

In [8]:
df.head()

Unnamed: 0,category,mail
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
df.mail[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [10]:
df.category.value_counts()

ham     4825
spam     747
Name: category, dtype: int64

In [11]:
mail_processed = df['mail'].apply(gensim.utils.simple_preprocess)

In [12]:
mail_processed[0]

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [13]:
cbow_model = gensim.models.Word2Vec(size=300, window=9, min_count=2)
sgram_model = gensim.models.Word2Vec(size=300, window=9, 
                                       min_count=2, sg=1)

In [14]:
cbow_model.build_vocab(mail_processed)

In [15]:
sgram_model.build_vocab(mail_processed)

In [16]:
cbow_model.corpus_count

5572

In [17]:
sgram_model.corpus_count

5572

In [18]:
cbow_model.epochs

5

In [19]:
sgram_model.epochs

5

In [20]:
cbow_model.train(mail_processed, total_examples=cbow_model.corpus_count, 
                 epochs=cbow_model.epochs)

(300603, 391395)

In [21]:
sgram_model.train(mail_processed, total_examples=sgram_model.corpus_count, 
                  epochs=sgram_model.epochs)

(300328, 391395)

In [22]:
print('cbow model vocabulary size:', len(cbow_model.wv.vocab))

print('skgram model vocabulary size:', len(sgram_model.wv.vocab))

cbow model vocabulary size: 3892
skgram model vocabulary size: 3892


In [23]:
cbow_model.wv.most_similar('great')

[('as', 0.9999638795852661),
 ('dear', 0.9999634623527527),
 ('of', 0.9999613165855408),
 ('life', 0.9999600648880005),
 ('boy', 0.9999599456787109),
 ('night', 0.9999592304229736),
 ('all', 0.999958872795105),
 ('well', 0.999957263469696),
 ('even', 0.9999566674232483),
 ('hope', 0.9999562501907349)]

In [24]:
sgram_model.wv.most_similar('great')

[('princess', 0.9711344838142395),
 ('hope', 0.9667274355888367),
 ('year', 0.9658629298210144),
 ('thanks', 0.958254873752594),
 ('thank', 0.9561683535575867),
 ('semester', 0.9555940628051758),
 ('morning', 0.9529229402542114),
 ('love', 0.9508692026138306),
 ('wish', 0.9495470523834229),
 ('kiss', 0.9448017477989197)]

In [25]:
for token in mail_processed[1]:
    if token in sgram_model.wv.vocab:
        print(token)

ok
lar
joking
wif
oni


In [26]:
(np.mean(sgram_model.wv.word_vec('go'), axis=0)).tolist()

0.007954874075949192

In [27]:
file_name = '/content/drive/MyDrive/Ml_course/NLP_sessions/GoogleNews-vectors-negative300.bin'

In [None]:
google_model = gensim.models.KeyedVectors.load_word2vec_format(file_name, binary=True)

In [41]:
file_name = '/content/drive/MyDrive/Ml_course/NLP_sessions/GoogleNews-vectors-negative300.bin'

In [38]:
google_vectors = mail_processed.apply(lambda x: get_embedding_w2v(x))
google_vectors = pd.DataFrame(google_vectors.tolist())
print('X shape:', google_vectors.shape)

TypeError: ignored

In [37]:
google_vectors

0       [0.015089277, -0.12072642, -0.013997965, 0.085...
1       [0.045284443, -0.107318416, -0.032457147, 0.04...
2       [-0.12185624, -0.23072797, -0.07818787, 0.0452...
3       [0.07965654, -0.069833584, -0.013506977, 0.064...
4       [0.035352215, -0.092364766, 0.042695656, 0.170...
                              ...                        
5567    [-0.05794714, -0.18977302, -0.020355985, 0.101...
5568    [0.022935549, -0.11641582, -0.012877032, 0.082...
5569    [0.023593366, -0.11432418, 0.02159409, 0.14439...
5570    [-0.0008337673, -0.12838025, 0.011425349, 0.13...
5571    [0.00426941, -0.13072045, 0.029651565, 0.12330...
Name: mail, Length: 5572, dtype: object