In [1]:
# importing the Dataset

import pandas as pd

df = pd.read_csv('smsspamcollection/SMSSpamCollection', sep='\t',
                           names=["label", "message"])

In [2]:
df.head(3)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [3]:
df.isnull().sum()

label      0
message    0
dtype: int64

In [4]:
import re
from nltk.corpus import stopwords

### download it to perform lemmatization on the data 
import nltk

nltk.download('wordnet')

nltk.download('omw-1.4')

In [5]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [6]:
corpus = []
for i in range(0, len(df.message)):
    review = re.sub('[^a-zA-z]', " ", df.message[i])
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(words) for words in review if not words in stopwords.words('english')]
    review = " ".join(review)
    corpus.append(review)   

### map function is used to apply a fn on the whole list as in our case we are applying the len fn on corpus to get len of each sentence and map fn return us an obj, so to display it's result we need to convert it into list

### Zip fn is used to iterate through the 2 or more list at a time in our case we pass 3 lists
#### first is : len of tense (zip fn)
#### sec is : corpus itself
#### third is: the df column 

#### in the print statement it first display the len, corpus tense and the actuall message from df

In [7]:
print([[i, j, k]for i, j,k in zip(list(map(len,corpus)),corpus, df.message) if i<1])

[[0, '', 'What you doing?how are you?'], [0, '', 'Where @'], [0, '', '645'], [0, '', 'Can a not?'], [0, '', ':) '], [0, '', 'What you doing?how are you?'], [0, '', ':( but your not here....'], [0, '', ':-) :-)']]


In [8]:
corpus[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [43]:
corpus

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner valued network customer selected receivea prize reward claim call claim code kl valid hour',
 'mobile month u r entitled update latest colour mobile camera free call mobile update co free',
 'gonna home soon want talk stuff anymore tonight k cried enough today',
 'six chance win cash pound txt csh send cost p day day tsandcs apply reply hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw'

In [10]:
len(corpus)

5572

#### Here we again use map fn we are only considering those elements where the len is greater than 0

In [11]:
y = df[list(map(lambda x : len(x) > 0, corpus))]

In [12]:
y = pd.get_dummies(y.label)

In [13]:
y.head()

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


In [14]:
y = y.iloc[:,1]

In [15]:
y.shape

(5564,)

In [16]:
y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: spam, Length: 5564, dtype: uint8

In [55]:
import gensim 
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')

In [56]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

### In order to Create our Custom Model we need to pass words

In [57]:
data=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        data.append(simple_preprocess(sent))

In [58]:
data

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'selected',
  'receivea',
 

## Creating a model here

In [59]:
gen_model = gensim.models.Word2Vec(data , window = 5, min_count=2)

In [60]:
gen_model.corpus_count

5564

In [61]:
gen_model.wv['true']

array([-0.06955916,  0.19918968,  0.04472566,  0.00323539,  0.02958924,
       -0.24830404,  0.06872506,  0.3865112 , -0.15748863, -0.13047616,
       -0.16094357, -0.25219616, -0.00832602,  0.08995714,  0.05514419,
       -0.14880903, -0.00886152, -0.21105322,  0.00715304, -0.3859799 ,
        0.07948924,  0.08129758,  0.09995139, -0.10358785, -0.07191981,
        0.03750145, -0.12349802, -0.10249162, -0.1810371 ,  0.05846952,
        0.22536889,  0.02581494,  0.10104671, -0.1332759 , -0.10462256,
        0.2618104 , -0.01020282, -0.1225917 , -0.09598018, -0.27830774,
       -0.01892437, -0.18544014, -0.05047188,  0.06059856,  0.17091952,
       -0.11377553, -0.16383941, -0.01231472,  0.1074466 ,  0.14653024,
        0.0842393 , -0.17577203, -0.07261598, -0.02942367, -0.11227053,
        0.12445485,  0.07824145, -0.00204713, -0.24304931,  0.04486886,
        0.07429691,  0.07755686, -0.05959117,  0.01613203, -0.21256138,
        0.15580262,  0.10682927,  0.12675789, -0.2517039 ,  0.28

In [68]:
def avg_word2vec(doc):
    return np.mean([gen_model.wv[word] for word in doc if word in gen_model.wv.index_to_key], axis = 0)

In [69]:
## tqdm is used to display the progress bar in case of loop
from tqdm import tqdm
import numpy as np

In [70]:
x = []
for i in tqdm(range(0,len(data))):
    x.append(avg_word2vec(data[i]))

100%|████████████████████████████████████████████████████████████████████████████| 5564/5564 [00:01<00:00, 2967.32it/s]


In [89]:
x

[array([-0.08957642,  0.2760092 ,  0.06748349,  0.02053057,  0.03584648,
        -0.3330439 ,  0.10289612,  0.52521896, -0.20264691, -0.16287957,
        -0.20593466, -0.34974432, -0.01349413,  0.10987574,  0.06247725,
        -0.20727341, -0.02374696, -0.28687352,  0.00997861, -0.51630217,
         0.09860113,  0.10848317,  0.12866028, -0.13001294, -0.08785524,
         0.05499409, -0.17178251, -0.1483139 , -0.2395898 ,  0.07369483,
         0.2983684 ,  0.04245346,  0.12238222, -0.18482828, -0.15566894,
         0.34398344, -0.01571408, -0.16641775, -0.11269902, -0.3798634 ,
        -0.01239007, -0.25178644, -0.05770686,  0.08520132,  0.22819526,
        -0.14771695, -0.20684288, -0.01855484,  0.1403745 ,  0.20457979,
         0.10179054, -0.24353908, -0.08959202, -0.02787657, -0.15287411,
         0.16699553,  0.11136764, -0.00088235, -0.31364784,  0.05320905,
         0.08885884,  0.11512369, -0.09491283,  0.03084638, -0.2888398 ,
         0.21975745,  0.13298135,  0.1637237 , -0.3

In [72]:
len(x[0])

100

In [73]:
type(x)

list

In [97]:
list(map(float, x.flatten()))

AttributeError: 'list' object has no attribute 'flatten'

In [90]:
x_new = np.asarray(x,dtype="object")

In [91]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_new,y, train_size = 0.7, random_state = 4)

In [92]:
type(x_train)

numpy.ndarray

In [93]:
type(y_train)

pandas.core.series.Series

In [94]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()

In [95]:
model.fit([x_train],y_train)

  X = check_array(


ValueError: Unable to convert array of bytes/strings into decimal numbers with dtype='numeric'

In [87]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(x_train, y_train)

  array = np.asarray(array, order=order, dtype=dtype)


ValueError: Expected 2D array, got 1D array instead:
array=[array([-0.12063584,  0.37888396,  0.08983022,  0.02781296,  0.05388147,
        -0.45622817,  0.14654176,  0.721863  , -0.28364357, -0.22656696,
        -0.28275833, -0.47819152, -0.01682468,  0.15013824,  0.08243623,
        -0.28643772, -0.03454135, -0.39298046,  0.02213353, -0.7036156 ,
         0.1337224 ,  0.15181893,  0.1743403 , -0.18111785, -0.11714979,
         0.07999039, -0.22955878, -0.20065977, -0.32738587,  0.09284869,
         0.4127277 ,  0.06233066,  0.16936874, -0.25160047, -0.20957315,
         0.46780458, -0.01919525, -0.22960556, -0.14835031, -0.53310907,
        -0.01859528, -0.34887525, -0.08236568,  0.12233328,  0.3186731 ,
        -0.20355232, -0.2857738 , -0.02842459,  0.19981724,  0.27623013,
         0.13741906, -0.33269107, -0.12493903, -0.03446538, -0.21430822,
         0.23626107,  0.15661259, -0.00553119, -0.4260973 ,  0.07341314,
         0.12508593,  0.16771846, -0.12956162,  0.04112124, -0.4017924 ,
         0.30106732,  0.18215503,  0.23062496, -0.4730164 ,  0.52950644,
        -0.2425638 ,  0.21559764,  0.40816233, -0.0903269 ,  0.32729295,
         0.17945158,  0.02290123, -0.07154735, -0.28894392,  0.22540838,
        -0.05972994, -0.10815694, -0.3728266 ,  0.48345292, -0.05612162,
         0.02192461, -0.12374277,  0.4424329 ,  0.42503962,  0.20733075,
         0.5253569 ,  0.23790006,  0.05596765,  0.21305913,  0.5331757 ,
         0.37461445,  0.20145518, -0.3838552 ,  0.12983336, -0.07086682],
       dtype=float32)
 array([-0.10973924,  0.30764505,  0.0734492 ,  0.02335854,  0.03613159,
        -0.37304255,  0.11566176,  0.58931047, -0.22962476, -0.18089987,
        -0.22646928, -0.38988462, -0.01256499,  0.12420321,  0.07103506,
        -0.22964151, -0.02322757, -0.32398787,  0.01887593, -0.58427215,
         0.10764072,  0.12458944,  0.14059584, -0.14877747, -0.09763336,
         0.06521147, -0.19376254, -0.15874629, -0.2714342 ,  0.08008152,
         0.3388095 ,  0.05431941,  0.14022796, -0.20406614, -0.17709176,
         0.38691726, -0.01269506, -0.18548386, -0.1273507 , -0.42534605,
        -0.01102439, -0.28109616, -0.06689889,  0.10015011,  0.26249477,
        -0.16690631, -0.23441882, -0.01809707,  0.16522637,  0.2309813 ,
         0.11847108, -0.27168182, -0.10624518, -0.02687372, -0.16959675,
         0.19134247,  0.12566186, -0.00427523, -0.35165104,  0.06198911,
         0.1029428 ,  0.13634747, -0.10478481,  0.04212339, -0.32320789,
         0.23919095,  0.14774305,  0.18062842, -0.38562608,  0.42877802,
        -0.19360916,  0.18019457,  0.32759106, -0.0828365 ,  0.2655239 ,
         0.15520263,  0.02281827, -0.06048333, -0.2300859 ,  0.18784012,
        -0.05040294, -0.08807999, -0.30456287,  0.40073523, -0.04242945,
         0.01608059, -0.10851654,  0.3660945 ,  0.35386148,  0.17153247,
         0.4247456 ,  0.19175227,  0.04448129,  0.17328878,  0.43812618,
         0.29886386,  0.1608535 , -0.31228423,  0.11494669, -0.06188137],
       dtype=float32)
 array([-0.13594009,  0.42113513,  0.09945203,  0.02586159,  0.05340778,
        -0.50629926,  0.15859494,  0.7998657 , -0.30850285, -0.24134138,
        -0.30810827, -0.5271689 , -0.01766005,  0.16074662,  0.09167659,
        -0.3171172 , -0.03361077, -0.4357621 ,  0.0151244 , -0.7862808 ,
         0.14683554,  0.15821728,  0.19296448, -0.20308828, -0.1398577 ,
         0.08386941, -0.25252143, -0.22310814, -0.36333352,  0.10333937,
         0.4616297 ,  0.06509472,  0.18656376, -0.27892956, -0.23164964,
         0.52112895, -0.02682789, -0.2585006 , -0.1669757 , -0.5815506 ,
        -0.01325347, -0.37913632, -0.08944384,  0.12666854,  0.35029736,
        -0.22442095, -0.31837088, -0.0231641 ,  0.2185607 ,  0.30993605,
         0.15691602, -0.3701702 , -0.13818595, -0.0324818 , -0.2415617 ,
         0.26539004,  0.17027366, -0.00542794, -0.4777134 ,  0.07985117,
         0.13892722,  0.17037836, -0.14022867,  0.04774527, -0.4374631 ,
         0.33655816,  0.20337948,  0.24675564, -0.5222559 ,  0.58023244,
        -0.27084422,  0.2383719 ,  0.4425524 , -0.10415317,  0.36228874,
         0.20626703,  0.02829678, -0.08230017, -0.31993204,  0.2524889 ,
        -0.06692054, -0.11391248, -0.41331062,  0.54060984, -0.05749387,
         0.02917456, -0.13528934,  0.48896503,  0.47096816,  0.22522736,
         0.58376133,  0.25749817,  0.05018426,  0.2272733 ,  0.5841033 ,
         0.40705466,  0.21783705, -0.42706156,  0.1442761 , -0.0821398 ],
       dtype=float32)
 ...
 array([-0.05552269,  0.16166201,  0.03591287,  0.01356329,  0.02207286,
        -0.19313702,  0.06385811,  0.30554765, -0.11584522, -0.09625076,
        -0.11990505, -0.20436871, -0.00448155,  0.06314115,  0.03473741,
        -0.12419969, -0.0122303 , -0.16935048,  0.00430444, -0.29907957,
         0.05516896,  0.06184252,  0.07405461, -0.07555801, -0.05072185,
         0.03399616, -0.09603965, -0.08385868, -0.13918427,  0.03900159,
         0.17640296,  0.02537039,  0.07187341, -0.1056417 , -0.08645479,
         0.20127718, -0.00844204, -0.09858344, -0.06366743, -0.22125046,
        -0.00852706, -0.14707215, -0.03547122,  0.0490053 ,  0.1370529 ,
        -0.08274273, -0.11798032, -0.01208389,  0.08275799,  0.12043716,
         0.06170547, -0.1379132 , -0.05371585, -0.01586566, -0.09270046,
         0.0973108 ,  0.06466614, -0.00441613, -0.18758589,  0.03223964,
         0.05165968,  0.06656022, -0.05716197,  0.0143222 , -0.1668044 ,
         0.12653174,  0.07679288,  0.09433831, -0.20451646,  0.2248964 ,
        -0.10116687,  0.0898668 ,  0.17403683, -0.03775023,  0.13639173,
         0.08036463,  0.01574326, -0.03160815, -0.11776905,  0.09774558,
        -0.0292855 , -0.04418845, -0.15706638,  0.20292048, -0.02288581,
         0.01212236, -0.0531018 ,  0.18795255,  0.18201813,  0.08961976,
         0.21891546,  0.09483008,  0.02644743,  0.08823656,  0.2211036 ,
         0.15494828,  0.08532665, -0.16401757,  0.05443539, -0.02909806],
       dtype=float32)
 array([-0.05798753,  0.17731498,  0.04048159,  0.01140795,  0.02030213,
        -0.21209534,  0.0641645 ,  0.33400303, -0.13021617, -0.10348286,
        -0.131067  , -0.22018132, -0.00865837,  0.0710847 ,  0.03900975,
        -0.13695578, -0.01427647, -0.18240884,  0.00721135, -0.33046442,
         0.0644146 ,  0.0704384 ,  0.08052622, -0.08205788, -0.0589177 ,
         0.0337089 , -0.10977127, -0.09583579, -0.15170993,  0.04684811,
         0.19214523,  0.02892019,  0.07895392, -0.11780737, -0.0982327 ,
         0.2187694 , -0.00778947, -0.10429609, -0.0735653 , -0.24663253,
        -0.01049707, -0.15864277, -0.03764274,  0.05625904,  0.14995468,
        -0.09752192, -0.13157386, -0.01615825,  0.08928597,  0.12953816,
         0.06425001, -0.1565675 , -0.05733208, -0.02104555, -0.10042236,
         0.11079784,  0.06883318, -0.00305454, -0.20270684,  0.0344211 ,
         0.05857209,  0.07299273, -0.05849807,  0.01836634, -0.18384281,
         0.13743004,  0.08828773,  0.10382303, -0.2213916 ,  0.24727187,
        -0.11131122,  0.09871851,  0.18642025, -0.04149821,  0.15147357,
         0.08560459,  0.01441721, -0.03388711, -0.13128203,  0.10486598,
        -0.02853507, -0.05175673, -0.17559953,  0.22243896, -0.02503419,
         0.01118254, -0.06014832,  0.20586583,  0.19894275,  0.09693943,
         0.24515358,  0.10856266,  0.02810186,  0.0961035 ,  0.2422261 ,
         0.17148055,  0.09492617, -0.17613147,  0.06173676, -0.03539924],
       dtype=float32)
 array([-0.09420635,  0.30570945,  0.07581623,  0.02211615,  0.03710868,
        -0.3611524 ,  0.11577411,  0.57304406, -0.22140315, -0.17719798,
        -0.22350244, -0.38189557, -0.01255306,  0.1275772 ,  0.07182844,
        -0.22628792, -0.02611182, -0.309379  ,  0.01179878, -0.5662803 ,
         0.10383381,  0.11510762,  0.14088765, -0.14447841, -0.09957961,
         0.06350493, -0.1854921 , -0.16275176, -0.26607648,  0.07662739,
         0.32976374,  0.04904   ,  0.13215157, -0.20004351, -0.16398785,
         0.37599573, -0.01845703, -0.1804683 , -0.12636802, -0.42154637,
        -0.01181255, -0.27424487, -0.06754078,  0.09470016,  0.25147712,
        -0.16041048, -0.23070917, -0.02381893,  0.15557587,  0.22094174,
         0.11015059, -0.26946115, -0.10467819, -0.03021595, -0.17276007,
         0.19011196,  0.1256193 , -0.00653145, -0.34617102,  0.06027926,
         0.09645181,  0.12324057, -0.10343288,  0.03515136, -0.3213708 ,
         0.23833533,  0.14742228,  0.1753684 , -0.3785457 ,  0.42179045,
        -0.19150947,  0.1718037 ,  0.3159986 , -0.07025015,  0.26155865,
         0.14424999,  0.02863741, -0.05713363, -0.23387411,  0.1844568 ,
        -0.04809105, -0.08811957, -0.29755324,  0.39334196, -0.04215335,
         0.01852869, -0.09693482,  0.3516497 ,  0.34606808,  0.16568276,
         0.41948342,  0.18345003,  0.0417457 ,  0.16437925,  0.42258483,
         0.29432306,  0.16136757, -0.30777213,  0.10550495, -0.05522019],
       dtype=float32)                                                    ].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.