# Text Classification Using Word2Vec and AverageWord2Vec

In [122]:
import gensim
from gensim.models import Word2Vec,KeyedVectors
from gensim.utils import simple_preprocess

import numpy as np
import pandas as pd

import chardet

import re
import nltk
nltk.download('stopwrods')
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize

from tqdm import tqdm

[nltk_data] Error loading stopwrods: Package 'stopwrods' not found in
[nltk_data]     index


In [123]:
# import gensim.downloader as api
# wv=api.load('word2vec-google-news-300')
# vec_king=wv['king']

In [124]:
# vec_king.shape

In [125]:
# Detect the encoding
with open('spam.csv', 'rb') as f:
    result = chardet.detect(f.read())
print(result['encoding'])

# Use the detected encoding
messages = pd.read_csv("spam.csv", encoding=result['encoding'])

Windows-1252


In [126]:
messages=messages.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)
messages=messages.rename(columns={'v1':'label','v2':'message'})

In [127]:
lemmatizer=WordNetLemmatizer()

In [128]:
corpus=[]
for i in range(0,len(messages)):
    review=re.sub('[^a-zA-Z]',' ',messages['message'][i])
    review=review.lower()
    review=review.split()
    review=[lemmatizer.lemmatize(word) for word in review]
    review=' '.join(review)
    corpus.append(review)

In [129]:
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [130]:
words

[['go',
  'until',
  'jurong',
  'point',
  'crazy',
  'available',
  'only',
  'in',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'there',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'in',
  'wkly',
  'comp',
  'to',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'to',
  'to',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply',
  'over'],
 ['dun', 'say', 'so', 'early', 'hor', 'already', 'then', 'say'],
 ['nah',
  'don',
  'think',
  'he',
  'go',
  'to',
  'usf',
  'he',
  'life',
  'around',
  'here',
  'though'],
 ['freemsg',
  'hey',
  'there',
  'darling',
  'it',
  'been',
  'week',
  'now',
  'and',
  'no',
  'word',
  'back',
  'like',
  'some',
  'fun',
  'you',
  'up',
  'for',
  'it',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'to',
  'send',
  'to',
  'rcv'],
 ['even',
  'my',
  'brother',
  'is',
  'not',
  'like',
  'to',
  'spea

### Train Word2Vec from Scratch

In [131]:
model=gensim.models.Word2Vec(words,vector_size=100)

In [132]:
# To get all vocabulary
model.wv.index_to_key

['you',
 'to',
 'the',
 'and',
 'it',
 'in',
 'is',
 'me',
 'my',
 'for',
 'your',
 'call',
 'of',
 'that',
 'have',
 'on',
 'now',
 'are',
 'can',
 'so',
 'but',
 'not',
 'or',
 'we',
 'do',
 'get',
 'at',
 'be',
 'if',
 'will',
 'ur',
 'with',
 'no',
 'just',
 'this',
 'gt',
 'lt',
 'how',
 'go',
 'up',
 'when',
 'ok',
 'day',
 'what',
 'free',
 'from',
 'out',
 'all',
 'know',
 'll',
 'come',
 'like',
 'time',
 'good',
 'then',
 'am',
 'got',
 'wa',
 'there',
 'he',
 'text',
 'only',
 'love',
 'want',
 'send',
 'txt',
 'need',
 'one',
 'today',
 'by',
 'going',
 'don',
 'home',
 'stop',
 'about',
 'she',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'our',
 'think',
 'dont',
 'tell',
 'week',
 'phone',
 'hi',
 'new',
 'later',
 'they',
 'her',
 'any',
 'pls',
 'ha',
 'please',
 'co',
 'did',
 'msg',
 'been',
 'min',
 'some',
 'an',
 'make',
 'dear',
 'here',
 'night',
 'who',
 'message',
 'say',
 'well',
 're',
 'where',
 'thing',
 'much',
 'clai

In [133]:
model.corpus_count

5569

In [134]:
model.epochs

5

In [135]:
model.wv.similar_by_word('good')

[('wa', 0.9987514615058899),
 ('morning', 0.9986362457275391),
 ('day', 0.9985965490341187),
 ('happy', 0.9985893368721008),
 ('about', 0.9984652400016785),
 ('well', 0.9984211325645447),
 ('night', 0.9983405470848083),
 ('all', 0.9983283281326294),
 ('very', 0.9983083009719849),
 ('same', 0.9982889890670776)]

In [136]:
model.wv['good']

array([-0.27921343,  0.27407393,  0.28975052,  0.20310763,  0.14451621,
       -0.677417  ,  0.2844815 ,  0.7838257 , -0.2713166 , -0.25750962,
       -0.26824263, -0.6497459 , -0.13059832,  0.1597682 ,  0.21275549,
       -0.27524504,  0.06780277, -0.46547294, -0.00509058, -0.72465736,
        0.24483879,  0.3041009 ,  0.21512511, -0.2785855 , -0.01606855,
        0.08972745, -0.29617676, -0.3538645 , -0.39694595, -0.02994239,
        0.38601255,  0.05644832,  0.23986661, -0.26114184, -0.18356837,
        0.5074632 ,  0.08571646, -0.3465934 , -0.11794188, -0.730683  ,
        0.10921338, -0.21472296, -0.26313335,  0.05839761,  0.33264032,
       -0.1791019 , -0.19135831, -0.09295879,  0.15113032,  0.29256478,
        0.18997635, -0.21409096, -0.17534447,  0.02315746, -0.15526448,
        0.16362919,  0.2242075 ,  0.1931119 , -0.4913801 ,  0.19540349,
       -0.06030245, -0.00864466,  0.10364847, -0.15063818, -0.5193698 ,
        0.32645884,  0.13583827,  0.25827765, -0.5081975 ,  0.42

In [137]:
model.wv['good'].shape

(100,)

In [138]:
def avg_word2vec(doc):
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)

In [139]:
X=[]
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 5569/5569 [00:00<00:00, 9533.88it/s]


In [140]:
X

[array([-0.20007722,  0.20160528,  0.19396503,  0.15541072,  0.09745434,
        -0.47538757,  0.17012215,  0.5490614 , -0.21519718, -0.16511682,
        -0.2093575 , -0.43636665, -0.08349514,  0.11761138,  0.14165753,
        -0.20123258,  0.04146515, -0.34325185, -0.00324214, -0.5008009 ,
         0.17152628,  0.18603747,  0.16118501, -0.16371669, -0.00426897,
         0.08090473, -0.20769484, -0.23643951, -0.27133495,  0.00219051,
         0.2599197 ,  0.02741282,  0.13890244, -0.18523486, -0.11223095,
         0.35876507,  0.05320287, -0.24066986, -0.08198079, -0.51642734,
         0.06414416, -0.16607812, -0.20634665,  0.04016157,  0.20341758,
        -0.12588151, -0.11857155, -0.07162106,  0.1080645 ,  0.19183922,
         0.13643773, -0.18384796, -0.11298994,  0.01658632, -0.12461051,
         0.12081357,  0.14162368,  0.12918502, -0.36096373,  0.14524154,
        -0.03101687, -0.01441275,  0.08665207, -0.09158726, -0.36021024,
         0.2352341 ,  0.09353409,  0.17987299, -0.3

In [141]:
len(X)

5569

### Independent Features

In [142]:
X_new = np.array(X, dtype=object)

In [143]:
messages.shape

(5572, 2)

In [144]:
X[1]

array([-0.18290591,  0.17358918,  0.16347197,  0.12851383,  0.08992246,
       -0.40625045,  0.14106399,  0.47109717, -0.18238078, -0.1481898 ,
       -0.18107101, -0.37311128, -0.06840485,  0.1018046 ,  0.11914653,
       -0.18387468,  0.03574204, -0.29947194, -0.00949675, -0.43516505,
        0.15342854,  0.15832964,  0.13416615, -0.13508247, -0.00260686,
        0.05678041, -0.18336074, -0.20553952, -0.23342042,  0.00358833,
        0.22614671,  0.01749574,  0.10755986, -0.15833221, -0.09838893,
        0.3045855 ,  0.03312296, -0.20288427, -0.06380151, -0.45109266,
        0.06161053, -0.14054324, -0.17698236,  0.03339133,  0.17531875,
       -0.11042278, -0.10225219, -0.06124637,  0.09282793,  0.17092007,
        0.11955005, -0.16495456, -0.09679697,  0.01673869, -0.10977407,
        0.10449005,  0.1220317 ,  0.10619886, -0.31281143,  0.13013792,
       -0.01986147, -0.01472339,  0.07336771, -0.0799538 , -0.3182175 ,
        0.19214424,  0.0786772 ,  0.15257886, -0.28968978,  0.28

In [145]:
X_new.shape

(5569,)

In [146]:
X_new[0].shape

(100,)

### Dependent Features

In [147]:
# Output Features
y = messages[list(map(lambda x: len(x)>0 ,corpus))]
y=pd.get_dummies(y['label'])
y=y.iloc[:,0].values

In [148]:
y.shape

(5569,)

In [149]:
X[0].reshape(1,-1).shape

(1, 100)

In [150]:
# # This is the final independent features
# df = pd.DataFrame()

# for i in range(len(X)):
#     df = pd.concat([df, pd.DataFrame(X[i].reshape(1, -1))], ignore_index=True)
#     # This is the final independent features
df = pd.DataFrame([x.reshape(1, -1)[0] for x in X])

# Check the final DataFrame
print(df.head())


         0         1         2         3         4         5         6   \
0 -0.200077  0.201605  0.193965  0.155411  0.097454 -0.475388  0.170122   
1 -0.182906  0.173589  0.163472  0.128514  0.089922 -0.406250  0.141064   
2 -0.221289  0.206126  0.235812  0.218010  0.109153 -0.530684  0.144043   
3 -0.288763  0.275317  0.256684  0.203615  0.135273 -0.647789  0.234489   
4 -0.235288  0.226806  0.221812  0.161132  0.123621 -0.543085  0.199137   

         7         8         9   ...        90        91        92        93  \
0  0.549061 -0.215197 -0.165117  ...  0.259301  0.211459 -0.010104  0.107170   
1  0.471097 -0.182381 -0.148190  ...  0.226626  0.177399 -0.007838  0.097346   
2  0.598543 -0.250346 -0.161609  ...  0.301391  0.227213 -0.016644  0.112252   
3  0.753850 -0.283387 -0.236449  ...  0.353915  0.272413 -0.010947  0.159333   
4  0.632165 -0.242063 -0.202828  ...  0.292224  0.221862 -0.008627  0.130277   

         94        95        96        97        98        99  
0  0

In [151]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.200077,0.201605,0.193965,0.155411,0.097454,-0.475388,0.170122,0.549061,-0.215197,-0.165117,...,0.259301,0.211459,-0.010104,0.10717,0.427405,0.238593,0.143463,-0.155768,0.019436,-0.085183
1,-0.182906,0.173589,0.163472,0.128514,0.089922,-0.40625,0.141064,0.471097,-0.182381,-0.14819,...,0.226626,0.177399,-0.007838,0.097346,0.357236,0.2015,0.121807,-0.150265,0.019282,-0.080151
2,-0.221289,0.206126,0.235812,0.21801,0.109153,-0.530684,0.144043,0.598543,-0.250346,-0.161609,...,0.301391,0.227213,-0.016644,0.112252,0.448947,0.26354,0.141514,-0.146512,-0.011659,-0.074466
3,-0.288763,0.275317,0.256684,0.203615,0.135273,-0.647789,0.234489,0.75385,-0.283387,-0.236449,...,0.353915,0.272413,-0.010947,0.159333,0.575828,0.316368,0.198064,-0.237109,0.028647,-0.108779
4,-0.235288,0.226806,0.221812,0.161132,0.123621,-0.543085,0.199137,0.632165,-0.242063,-0.202828,...,0.292224,0.221862,-0.008627,0.130277,0.485601,0.270226,0.163451,-0.204245,0.020748,-0.097352


In [152]:
df['Output']=y

In [153]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Output
0,-0.200077,0.201605,0.193965,0.155411,0.097454,-0.475388,0.170122,0.549061,-0.215197,-0.165117,...,0.211459,-0.010104,0.10717,0.427405,0.238593,0.143463,-0.155768,0.019436,-0.085183,True
1,-0.182906,0.173589,0.163472,0.128514,0.089922,-0.40625,0.141064,0.471097,-0.182381,-0.14819,...,0.177399,-0.007838,0.097346,0.357236,0.2015,0.121807,-0.150265,0.019282,-0.080151,True
2,-0.221289,0.206126,0.235812,0.21801,0.109153,-0.530684,0.144043,0.598543,-0.250346,-0.161609,...,0.227213,-0.016644,0.112252,0.448947,0.26354,0.141514,-0.146512,-0.011659,-0.074466,False
3,-0.288763,0.275317,0.256684,0.203615,0.135273,-0.647789,0.234489,0.75385,-0.283387,-0.236449,...,0.272413,-0.010947,0.159333,0.575828,0.316368,0.198064,-0.237109,0.028647,-0.108779,True
4,-0.235288,0.226806,0.221812,0.161132,0.123621,-0.543085,0.199137,0.632165,-0.242063,-0.202828,...,0.221862,-0.008627,0.130277,0.485601,0.270226,0.163451,-0.204245,0.020748,-0.097352,True


In [154]:
df.dropna(inplace=True)

In [155]:
df.isnull().sum()

0         0
1         0
2         0
3         0
4         0
         ..
96        0
97        0
98        0
99        0
Output    0
Length: 101, dtype: int64

In [156]:
## Independent Feature
X=df.drop(columns=['Output'],axis=1)

In [157]:
X.isnull().sum()

0     0
1     0
2     0
3     0
4     0
     ..
95    0
96    0
97    0
98    0
99    0
Length: 100, dtype: int64

In [158]:
y=df['Output']

In [159]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20)

In [160]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
5193,-0.207724,0.188849,0.218819,0.196363,0.103176,-0.495684,0.131138,0.55475,-0.232659,-0.152432,...,0.281324,0.212944,-0.018168,0.105379,0.417947,0.243775,0.128868,-0.140715,-0.003294,-0.07358
5447,-0.321341,0.298018,0.312987,0.269421,0.160326,-0.74689,0.219558,0.851384,-0.354215,-0.249728,...,0.40936,0.317569,-0.017812,0.15829,0.646056,0.373028,0.209142,-0.231281,0.00449,-0.103326
2516,-0.278613,0.291008,0.275137,0.252133,0.122419,-0.68072,0.226667,0.815862,-0.298364,-0.239791,...,0.432368,0.304318,-0.054245,0.127475,0.609391,0.346323,0.168162,-0.227478,-0.026139,-0.122645
1586,-0.254594,0.257602,0.247571,0.167037,0.143545,-0.621922,0.233478,0.72784,-0.275674,-0.243091,...,0.332648,0.260871,-0.007753,0.145082,0.573646,0.317003,0.178402,-0.240268,0.029038,-0.102748
1201,-0.261085,0.261009,0.24117,0.152645,0.149641,-0.618816,0.2298,0.716991,-0.269276,-0.245448,...,0.332357,0.233828,-0.01915,0.158554,0.549025,0.303459,0.174907,-0.257441,0.021747,-0.108126


In [161]:
y_train

5193    False
5447     True
2516     True
1586     True
1201     True
        ...  
4182     True
340      True
2698     True
4263     True
4554     True
Name: Output, Length: 4445, dtype: bool

In [162]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()

In [163]:
classifier.fit(X_train,y_train)

In [164]:
y_pred=classifier.predict(X_test)

In [165]:
from sklearn.metrics import accuracy_score,classification_report
print(accuracy_score(y_test,y_pred))

0.9694244604316546


In [166]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       False       0.90      0.85      0.87       137
        True       0.98      0.99      0.98       975

    accuracy                           0.97      1112
   macro avg       0.94      0.92      0.93      1112
weighted avg       0.97      0.97      0.97      1112

