In [181]:
import pandas as pd

In [182]:
messages = pd.read_csv('SMSSpamCollection.txt', sep='\t', names=["label", "message"])
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [183]:
messages.shape

(5572, 2)

### Text preprocessing

In [184]:
import nltk
import re
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [185]:
ps = PorterStemmer()
lemmatizer=WordNetLemmatizer()

In [186]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [187]:
corpus[10:15]

['gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash pound txt csh send cost p day day tsandc appli repli hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw',
 'search right word thank breather promis wont take help grant fulfil promis wonder bless time',
 'date sunday']

## Text to vector: BOW, TFIDF, Word2Vec

####  Bag of Words model

In [188]:
# Creating the Bag of Words model

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500, binary = True, ngram_range=(2,2))
X = cv.fit_transform(corpus).toarray()

In [189]:
X.shape

(5572, 2500)

In [190]:
y=pd.get_dummies(messages['label'])
y=y.iloc[:,1].values
y[:10]

array([False, False,  True, False, False,  True, False, False,  True,
        True])

In [191]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [192]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)


In [193]:
# Training model using Naive bayes classifier

from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [194]:
y_pred=spam_detect_model.predict(X_test)

In [195]:
from sklearn.metrics import accuracy_score,classification_report
print(accuracy_score(y_test,y_pred))

0.9730941704035875


In [196]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       False       0.97      1.00      0.98       955
        True       1.00      0.81      0.90       160

    accuracy                           0.97      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.97      0.97      0.97      1115



## TF-IDF

In [197]:
# Creating the TFIDF model
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500, binary=True, ngram_range=(2,2))
X = tv.fit_transform(corpus).toarray()

In [198]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [199]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [200]:
#prediction
y_pred=spam_detect_model.predict(X_test)

In [201]:
score=accuracy_score(y_test,y_pred)
print(score)

0.957847533632287


In [203]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       1.00      0.95      0.98      1002
        True       0.71      1.00      0.83       113

    accuracy                           0.96      1115
   macro avg       0.85      0.98      0.90      1115
weighted avg       0.97      0.96      0.96      1115



## Word2vec Implementation

In [204]:
#!pip install gensim

In [205]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')

In [206]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [207]:
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [208]:
words[:10]

[['go',
  'jurong',
  'point',
  'crazi',
  'avail',
  'bugi',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amor',
  'wat'],
 ['ok', 'lar', 'joke', 'wif', 'oni'],
 ['free',
  'entri',
  'wkli',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkt',
  'st',
  'may',
  'text',
  'fa',
  'receiv',
  'entri',
  'question',
  'std',
  'txt',
  'rate',
  'appli'],
 ['dun', 'say', 'earli', 'hor', 'alreadi', 'say'],
 ['nah', 'think', 'goe', 'usf', 'live', 'around', 'though'],
 ['freemsg',
  'hey',
  'darl',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chg',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'mell',
  'mell',
  'oru',
  'minnaminungint',
  'nurungu',
  'vettam',
  'set',
  'callertun',
  'caller',
  'press',
  'copi',
  'friend',
  'callertun'],
 ['winner',
  'valu',
  'network',
  'custom',
  'select',
  'receivea',
  'prize',
  'reward',
  

In [138]:
import numpy as np
import gensim

# Train Word2Vec
model = gensim.models.Word2Vec(sentences=words, vector_size=300, window=5, min_count=2, workers=4)

# Create feature vectors for each message (average of word embeddings)
X = []
for sent in corpus:
    tokens = simple_preprocess(sent)
    word_vecs = [model.wv[w] for w in tokens if w in model.wv]
    if len(word_vecs) > 0:
        X.append(np.mean(word_vecs, axis=0))
    else:
        X.append(np.zeros(300))  # fallback if no words in vocab
X = np.array(X)


In [139]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Logistic Regression works well with embeddings
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8565022421524664
              precision    recall  f1-score   support

       False       0.86      1.00      0.92       955
        True       0.00      0.00      0.00       160

    accuracy                           0.86      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.73      0.86      0.79      1115



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [141]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train SVM
clf = svm.SVC(kernel='linear')  # You can also try kernel='rbf' or 'poly'
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8565022421524664
              precision    recall  f1-score   support

       False       0.86      1.00      0.92       955
        True       0.00      0.00      0.00       160

    accuracy                           0.86      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.73      0.86      0.79      1115



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Avg Word2Vec

In [142]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word) for word in review]
    review = ' '.join(review)
    corpus.append(review)

In [143]:
[[i,j,k] for i,j,k in zip(list(map(len,corpus)),corpus, messages['message']) if i<1]

[[0, '', '645'], [0, '', ':) '], [0, '', ':-) :-)']]

In [144]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [209]:
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))


In [None]:
words[:10]

[['go',
  'jurong',
  'point',
  'crazi',
  'avail',
  'bugi',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amor',
  'wat'],
 ['ok', 'lar', 'joke', 'wif', 'oni'],
 ['free',
  'entri',
  'wkli',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkt',
  'st',
  'may',
  'text',
  'fa',
  'receiv',
  'entri',
  'question',
  'std',
  'txt',
  'rate',
  'appli'],
 ['dun', 'say', 'earli', 'hor', 'alreadi', 'say'],
 ['nah', 'think', 'goe', 'usf', 'live', 'around', 'though'],
 ['freemsg',
  'hey',
  'darl',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chg',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'mell',
  'mell',
  'oru',
  'minnaminungint',
  'nurungu',
  'vettam',
  'set',
  'callertun',
  'caller',
  'press',
  'copi',
  'friend',
  'callertun'],
 ['winner',
  'valu',
  'network',
  'custom',
  'select',
  'receivea',
  'prize',
  'reward',
  

In [147]:
## Lets train Word2vec from scratch
model=gensim.models.Word2Vec(words)

In [None]:
## To Get All the Vocabulary
model.wv.index_to_key[:10]

['to',
 'you',
 'the',
 'it',
 'and',
 'in',
 'is',
 'me',
 'my',
 'for',
 'your',
 'call',
 'of',
 'that',
 'have',
 'on',
 'now',
 'are',
 'can',
 'so',
 'but',
 'not',
 'or',
 'we',
 'do',
 'get',
 'at',
 'ur',
 'will',
 'if',
 'be',
 'with',
 'no',
 'just',
 'this',
 'gt',
 'lt',
 'go',
 'how',
 'up',
 'when',
 'ok',
 'day',
 'what',
 'free',
 'from',
 'all',
 'out',
 'know',
 'll',
 'come',
 'like',
 'good',
 'time',
 'am',
 'then',
 'got',
 'wa',
 'there',
 'he',
 'love',
 'text',
 'only',
 'want',
 'send',
 'one',
 'need',
 'txt',
 'today',
 'by',
 'going',
 'don',
 'stop',
 'home',
 'she',
 'about',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'our',
 'think',
 'tell',
 'week',
 'hi',
 'phone',
 'they',
 'new',
 'please',
 'later',
 'pls',
 'any',
 'her',
 'ha',
 'co',
 'did',
 'been',
 'msg',
 'min',
 'some',
 'an',
 'night',
 'make',
 'dear',
 'who',
 'here',
 'message',
 'say',
 'well',
 'where',
 're',
 'thing',
 'much',
 'oh',

In [149]:
model.corpus_count

5569

In [150]:
model.epochs

5

In [151]:
words[0]

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [152]:
def avg_word2vec(doc):
    # remove out-of-vocabulary words
    sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)
    
    if len(sent)>=1:
        return np.mean([model.wv[word] for word in sent], axis=0)
    else:
        return np.zeros(model.vector_size)  
    #return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)
                #or [np.zeros(len(model.wv.index_to_key))], axis=0)

In [153]:
from tqdm import tqdm

In [154]:
#apply for the entire sentences
import numpy as np
X=[]
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

100%|██████████| 5569/5569 [00:00<00:00, 8332.47it/s]


In [155]:
len(X)

5569

In [156]:
##independent Features
X_new =np.array(X)

In [157]:
messages.shape

(5572, 2)

In [158]:
X[1]

array([-0.15698993,  0.20694876,  0.09834898,  0.07755301,  0.08400182,
       -0.4218933 ,  0.13870007,  0.41419548, -0.23066734, -0.09094738,
       -0.15200637, -0.31590313, -0.04939945,  0.10998864,  0.16034976,
       -0.14339815,  0.10960845, -0.26590025, -0.06103827, -0.45611978,
        0.18076426,  0.1077554 ,  0.05601355, -0.18752453, -0.02259633,
       -0.01517238, -0.17785563, -0.17202045, -0.21790263,  0.02160325,
        0.27175573,  0.02140511,  0.09479968, -0.1556507 , -0.12238242,
        0.3297151 ,  0.05756973, -0.10476536, -0.09662916, -0.4159368 ,
        0.09501418, -0.21532187, -0.15927285,  0.01384784,  0.12360094,
        0.00283992, -0.10624774, -0.03769789,  0.18799657,  0.1248857 ,
        0.14938457, -0.15419427, -0.05021889,  0.05157485, -0.06040449,
        0.03681945,  0.13187328,  0.0057825 , -0.3189423 ,  0.16309984,
       -0.00396396,  0.14070949,  0.00106834, -0.09672053, -0.25189087,
        0.24241224,  0.08653629,  0.19638029, -0.30145267,  0.34

In [159]:
X_new.shape

(5569, 100)

In [160]:
X_new[0].shape

(100,)

In [161]:
## Dependent Features
## Output Features
y = messages[list(map(lambda x: len(x)>0 ,corpus))]
y=pd.get_dummies(y['label'])
y=y.iloc[:,0].values

In [162]:
y.shape

(5569,)

In [163]:
X[0].reshape(1,-1).shape

(1, 100)

In [164]:
df = pd.DataFrame()
for i in range(len(X)):
    row_df = pd.DataFrame(X[i].reshape(1, -1))
    df = pd.concat([df, row_df], ignore_index=True)


In [165]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.171442,0.237221,0.115663,0.089568,0.093416,-0.487923,0.172062,0.47344,-0.267016,-0.110276,...,0.354254,0.147274,0.046892,0.052907,0.417233,0.184016,0.160481,-0.192838,0.144671,0.005655
1,-0.15699,0.206949,0.098349,0.077553,0.084002,-0.421893,0.1387,0.414195,-0.230667,-0.090947,...,0.310038,0.119944,0.036354,0.040307,0.351499,0.15442,0.138169,-0.175064,0.132551,-0.001885
2,-0.186233,0.252332,0.123595,0.109194,0.079983,-0.525743,0.17183,0.470641,-0.28477,-0.134744,...,0.352695,0.150427,0.044846,0.038822,0.432584,0.170073,0.113106,-0.229661,0.175991,0.024483
3,-0.2345,0.318152,0.151221,0.12003,0.125789,-0.657432,0.225786,0.643896,-0.362468,-0.142589,...,0.479576,0.194275,0.059096,0.078023,0.551187,0.254435,0.231897,-0.259378,0.194622,-0.002981
4,-0.204751,0.262834,0.135939,0.098635,0.117269,-0.562479,0.188257,0.550859,-0.313852,-0.127308,...,0.415446,0.16564,0.052553,0.067948,0.472463,0.218312,0.190621,-0.231673,0.160307,-0.003048


In [166]:
df['Output']=y

In [167]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Output
0,-0.171442,0.237221,0.115663,0.089568,0.093416,-0.487923,0.172062,0.47344,-0.267016,-0.110276,...,0.147274,0.046892,0.052907,0.417233,0.184016,0.160481,-0.192838,0.144671,0.005655,True
1,-0.15699,0.206949,0.098349,0.077553,0.084002,-0.421893,0.1387,0.414195,-0.230667,-0.090947,...,0.119944,0.036354,0.040307,0.351499,0.15442,0.138169,-0.175064,0.132551,-0.001885,True
2,-0.186233,0.252332,0.123595,0.109194,0.079983,-0.525743,0.17183,0.470641,-0.28477,-0.134744,...,0.150427,0.044846,0.038822,0.432584,0.170073,0.113106,-0.229661,0.175991,0.024483,False
3,-0.2345,0.318152,0.151221,0.12003,0.125789,-0.657432,0.225786,0.643896,-0.362468,-0.142589,...,0.194275,0.059096,0.078023,0.551187,0.254435,0.231897,-0.259378,0.194622,-0.002981,True
4,-0.204751,0.262834,0.135939,0.098635,0.117269,-0.562479,0.188257,0.550859,-0.313852,-0.127308,...,0.16564,0.052553,0.067948,0.472463,0.218312,0.190621,-0.231673,0.160307,-0.003048,True


In [168]:
df.dropna(inplace=True)

In [169]:
df.isnull().sum()

0         0
1         0
2         0
3         0
4         0
         ..
96        0
97        0
98        0
99        0
Output    0
Length: 101, dtype: int64

In [170]:
## Independent Feature
X=df

In [171]:
X.isnull().sum()

0         0
1         0
2         0
3         0
4         0
         ..
96        0
97        0
98        0
99        0
Output    0
Length: 101, dtype: int64

In [172]:
y=df['Output']

In [173]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20)

In [174]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Output
4858,-0.219899,0.283738,0.137815,0.100681,0.115269,-0.588348,0.202687,0.571821,-0.32209,-0.132158,...,0.18005,0.060615,0.070504,0.506392,0.232929,0.202305,-0.230904,0.170331,0.004344,True
2862,-0.198163,0.257653,0.128772,0.096769,0.104425,-0.542443,0.177536,0.525851,-0.299312,-0.131802,...,0.171016,0.046816,0.054857,0.455556,0.205118,0.167005,-0.223642,0.150033,0.001514,False
3716,-0.262906,0.314713,0.155447,0.115671,0.148314,-0.663943,0.228118,0.650798,-0.379098,-0.13957,...,0.207209,0.059245,0.079107,0.579441,0.280018,0.232669,-0.265859,0.195955,-0.008788,True
4505,-0.22662,0.260919,0.13865,0.077735,0.148019,-0.584707,0.179997,0.583006,-0.324749,-0.140113,...,0.171937,0.047055,0.064101,0.456805,0.24725,0.203466,-0.234947,0.135184,-0.003646,True
3822,-0.145741,0.206291,0.09948,0.077947,0.068446,-0.417295,0.143692,0.392687,-0.226742,-0.102907,...,0.127026,0.040912,0.03984,0.350605,0.14651,0.118745,-0.17024,0.12949,0.005243,True


In [175]:
y_train[:10]

4858     True
2862    False
3716     True
4505     True
3822     True
3271    False
1078     True
337      True
1424     True
3370     True
Name: Output, dtype: bool

In [176]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()

In [177]:
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

classifier.fit(X_train,y_train)

In [178]:
y_pred=classifier.predict(X_test)

In [179]:
from sklearn.metrics import accuracy_score,classification_report
print(accuracy_score(y_test,y_pred))

0.9955116696588869


In [180]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       False       1.00      0.97      0.98       157
        True       0.99      1.00      1.00       957

    accuracy                           1.00      1114
   macro avg       1.00      0.98      0.99      1114
weighted avg       1.00      1.00      1.00      1114

