In [1]:
# importing the Dataset

import pandas as pd

messages = pd.read_csv('smsspamcollection/SMSSpamCollection', sep='\t',
                           names=["label", "message"])

In [2]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
messages.isna().sum()

label      0
message    0
dtype: int64

In [4]:
messages.shape

(5572, 2)

In [5]:
messages["message"].loc[100]

"Please don't text me anymore. I have nothing else to say."

#### data cleaning and preprocessing
1 Tokenization , stopwords , stemming , lemmatization  
2 text to vectors  -> BOW , TF-Idf , word2vec . avgword2vec

In [6]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
ps = PorterStemmer ()

In [8]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [9]:
corpus[:10]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl week word back like fun still tb ok xxx std chg send rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'winner valu network custom select receivea prize reward claim call claim code kl valid hour',
 'mobil month u r entitl updat latest colour mobil camera free call mobil updat co free']

In [10]:
# label encoding
y=pd.get_dummies(messages['label'])
y=y.iloc[:,1].values

### Train Test Split

In [11]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size = 0.20, random_state = 0)

# Creating the Bag of Words model + MultinomialNB

In [12]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500,binary=True)
X_train_BOW= cv.fit_transform(X_train).toarray()
X_test_BOW= cv.transform(X_test).toarray()

In [13]:
X_train_BOW

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [14]:
y_train

array([0, 0, 0, ..., 1, 0, 0], dtype=uint8)

In [15]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train_BOW, y_train)

In [16]:
#prediction
y_pred1=spam_detect_model.predict(X_test_BOW)

In [17]:

from sklearn.metrics import accuracy_score,classification_report
score=accuracy_score(y_test,y_pred1)
print(score)

0.9838565022421525


In [18]:
from sklearn.metrics import classification_report
print(classification_report(y_pred1,y_test))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       959
           1       0.93      0.96      0.94       156

    accuracy                           0.98      1115
   macro avg       0.96      0.97      0.97      1115
weighted avg       0.98      0.98      0.98      1115



# Creating the TFIDF model +MultinomialNB


In [19]:
# Creating the TFIDF model
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500)
X_train_TFIDF= tv.fit_transform(X_train).toarray()
X_test_TFIDF= tv.transform(X_test).toarray()

In [71]:
X_train_TFIDF[1]

array([0., 0., 0., ..., 0., 0., 0.])

In [20]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train_TFIDF, y_train)

In [21]:
#prediction
y_pred2=spam_detect_model.predict(X_test_TFIDF)


In [22]:
score=accuracy_score(y_test,y_pred2)
print(score)


0.9766816143497757


In [23]:
from sklearn.metrics import classification_report
print(classification_report(y_pred2,y_test))

              precision    recall  f1-score   support

           0       1.00      0.97      0.99       981
           1       0.84      1.00      0.91       134

    accuracy                           0.98      1115
   macro avg       0.92      0.99      0.95      1115
weighted avg       0.98      0.98      0.98      1115



# Creating the TFIDF model +  random forest

In [24]:
# random forest
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()
classifier.fit(X_train_TFIDF,y_train)

RandomForestClassifier()

In [25]:
#prediction
y_pred3=classifier.predict(X_test_TFIDF)
score=accuracy_score(y_test,y_pred3)
print(score)


0.9829596412556054


In [26]:
from sklearn.metrics import classification_report
print(classification_report(y_pred3,y_test))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       974
           1       0.88      1.00      0.94       141

    accuracy                           0.98      1115
   macro avg       0.94      0.99      0.96      1115
weighted avg       0.98      0.98      0.98      1115



# Creating the Bag of Words model + random forest

In [27]:
# random forest
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()
classifier.fit(X_train_BOW,y_train)

RandomForestClassifier()

In [28]:
#prediction
y_pred4=classifier.predict(X_test_BOW)
score=accuracy_score(y_test,y_pred4)
print(score)


0.9829596412556054


In [29]:
from sklearn.metrics import classification_report
print(classification_report(y_pred4,y_test))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       974
           1       0.88      1.00      0.94       141

    accuracy                           0.98      1115
   macro avg       0.94      0.99      0.96      1115
weighted avg       0.98      0.98      0.98      1115



## Word2vec Implementation

In [84]:
import gensim.downloader as api



In [85]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [86]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [87]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess 

In [88]:
len(corpus)

5572

In [89]:
corpus[0]


'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [36]:
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent)) #simple_preprocess Convert a document into a list of lowercase tokens, ignoring tokens that are too short or too long


In [37]:
words[0]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [38]:
len(words)

5564

In [39]:
[[i,j,k] for i,j,k in zip(list(map(len,corpus)),corpus, messages['message']) if i<1]

[[0, '', 'What you doing?how are you?'],
 [0, '', 'Where @'],
 [0, '', '645'],
 [0, '', 'Can a not?'],
 [0, '', ':) '],
 [0, '', 'What you doing?how are you?'],
 [0, '', ':( but your not here....'],
 [0, '', ':-) :-)']]

Because of this reason, 'simple_preprocess' ignores these rows in the corpus and the final dimensions of X and y do not match i.e 5564(X) vs 5572(y). Therefore train_test_split also throws an error.

To fix this, recalculate y by removing the messages dataframe rows corresponding to the above blank sentences as follows:

In [40]:
y = messages[list(map(lambda x: len(x)>0 ,corpus))]
y=pd.get_dummies(y['label'])
y=y.iloc[:,1].values

y.shape

(5564,)

In [41]:
# making word2vec from scratch

In [42]:
import gensim

In [43]:
### Lets train Word2vec from scratch
model=gensim.models.Word2Vec(words,window=5,vector_size=100,min_count=2)

In [44]:
model.wv.index_to_key[0:10]

['call', 'get', 'ur', 'gt', 'lt', 'go', 'ok', 'day', 'free', 'know']

In [45]:
model.corpus_count

5564

In [46]:
model.epochs

5

In [47]:
model.wv.similar_by_word('kid')

[('work', 0.9969403743743896),
 ('much', 0.996876060962677),
 ('money', 0.9968047738075256),
 ('went', 0.996778130531311),
 ('sent', 0.9967295527458191),
 ('oh', 0.9967256784439087),
 ('really', 0.9967249631881714),
 ('going', 0.9967221617698669),
 ('like', 0.996698260307312),
 ('would', 0.9966953992843628)]

In [48]:
model.wv.similar_by_word('price')

[('live', 0.9990087151527405),
 ('ur', 0.9989923238754272),
 ('go', 0.9989807605743408),
 ('min', 0.9989736080169678),
 ('txt', 0.9989727735519409),
 ('best', 0.9989722967147827),
 ('msg', 0.9989525079727173),
 ('next', 0.9989442825317383),
 ('back', 0.9989382028579712),
 ('give', 0.9989266991615295)]

In [49]:
model.wv.similar_by_word('happy')

[('year', 0.9994580149650574),
 ('hello', 0.9993191361427307),
 ('day', 0.9993175864219666),
 ('make', 0.9993124008178711),
 ('like', 0.9992846846580505),
 ('new', 0.999279797077179),
 ('dear', 0.9992725849151611),
 ('dont', 0.9992671012878418),
 ('keep', 0.9992635846138),
 ('money', 0.999251663684845)]

In [50]:
model.wv['happy'].shape  # dimension of single word 


(100,)

In [51]:
# now we have to apply avg word2vec

In [52]:
def avg_word2vec(doc):
    # remove out-of-vocabulary words
    #sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)
    
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)
                #or [np.zeros(len(model.wv.index_to_key))], axis=0)
        

In [53]:
!pip install tqdm



In [54]:
from tqdm import tqdm

In [55]:
words[73]


['performed']

In [56]:
type(model.wv.index_to_key)

list

In [57]:
import numpy as np

In [58]:
#apply for the entire sentences
X=[]
count=0
for i in tqdm(range(len(words))):
    count=count+1
    X.append(avg_word2vec(words[i]))
    
print ("total words",count)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
100%|████████████████████████████████████████████████████████████████████████████| 5564/5564 [00:00<00:00, 6663.08it/s]

total words 5564





In [59]:
type(X)

list

In [60]:
X_new=np.array(X)

  """Entry point for launching an IPython kernel.


In [61]:
X_new[0]

array([-0.10532334,  0.29218706,  0.14601143,  0.02279398,  0.03180818,
       -0.32373455,  0.08819105,  0.5146341 , -0.16565207, -0.15271027,
       -0.15452074, -0.3383404 , -0.00425214,  0.095595  ,  0.07988171,
       -0.27909568,  0.01902971, -0.33567894, -0.01489698, -0.44326127,
        0.07172848,  0.15127851,  0.10022926, -0.12536488, -0.10606787,
        0.02655315, -0.21024317, -0.14019944, -0.21742932,  0.05002345,
        0.29315338,  0.05448255,  0.15369211, -0.23121105, -0.12070017,
        0.26239416,  0.03629923, -0.19900161, -0.17211859, -0.37857762,
        0.06459386, -0.23299868, -0.08966768,  0.04602126,  0.2470609 ,
       -0.13115536, -0.17995109,  0.03126285,  0.14228906,  0.2386996 ,
        0.16908745, -0.27767193, -0.0568573 , -0.02133485, -0.12629987,
        0.20706278,  0.17147042, -0.03659447, -0.267844  ,  0.06406327,
        0.10153774,  0.1179468 , -0.12325127,  0.00103988, -0.2671925 ,
        0.1614449 ,  0.08290652,  0.19251798, -0.28059915,  0.32

In [62]:
X_new[0].shape

(100,)

In [63]:
words[0]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [64]:
y.shape

(5564,)

In [65]:
X_new.shape

(5564,)