In [15]:
import pandas as pd

In [16]:
messages = pd.read_csv('SMSSpamCollection.txt', sep='\t', names=["label", "message"])
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
messages.shape

(5572, 2)

### Text preprocessing

In [18]:
import nltk
import re
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [19]:
ps = PorterStemmer()
lemmatizer=WordNetLemmatizer()

In [20]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [21]:
corpus[10:15]

['gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash pound txt csh send cost p day day tsandc appli repli hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw',
 'search right word thank breather promis wont take help grant fulfil promis wonder bless time',
 'date sunday']

## Text to vector: BOW, TFIDF, Word2Vec

####  Bag of Words model

In [22]:
# Creating the Bag of Words model

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500, binary = True, ngram_range=(2,2))
X = cv.fit_transform(corpus).toarray()

In [23]:
X.shape

(5572, 2500)

In [24]:
y=pd.get_dummies(messages['label'])
y=y.iloc[:,1].values
y[:10]

array([False, False,  True, False, False,  True, False, False,  True,
        True])

In [25]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [26]:
# Training model using Naive bayes classifier

from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [27]:
y_pred=spam_detect_model.predict(X_test)

In [28]:
from sklearn.metrics import accuracy_score,classification_report
print(accuracy_score(y_test,y_pred))

0.9730941704035875


In [29]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       False       0.97      1.00      0.98       955
        True       1.00      0.81      0.90       160

    accuracy                           0.97      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.97      0.97      0.97      1115



## TF-IDF

In [30]:
# Creating the TFIDF model
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500, binary=True, ngram_range=(2,2))
X = tv.fit_transform(corpus).toarray()

In [31]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [32]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [33]:
#prediction
y_pred=spam_detect_model.predict(X_test)

In [34]:
score=accuracy_score(y_test,y_pred)
print(score)

0.957847533632287


In [35]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       1.00      0.95      0.98      1002
        True       0.71      1.00      0.83       113

    accuracy                           0.96      1115
   macro avg       0.85      0.98      0.90      1115
weighted avg       0.97      0.96      0.96      1115



## Word2vec Implementation

In [36]:
#!pip install gensim

In [37]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')

In [38]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [39]:
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [40]:
#words[:10]

In [41]:
import numpy as np
import gensim

# Train Word2Vec
model = gensim.models.Word2Vec(sentences=words, vector_size=300, window=5, min_count=2)

# Create feature vectors for each message (average of word embeddings)
X = []
for sent in corpus:
    tokens = simple_preprocess(sent)
    word_vecs = [model.wv[w] for w in tokens if w in model.wv]
    if len(word_vecs) > 0:
        X.append(np.mean(word_vecs, axis=0))
    else:
        X.append(np.zeros(300))  # fallback if no words in vocab
X = np.array(X)


In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Logistic Regression works well with embeddings
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8565022421524664
              precision    recall  f1-score   support

       False       0.86      1.00      0.92       955
        True       0.00      0.00      0.00       160

    accuracy                           0.86      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.73      0.86      0.79      1115



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [43]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train SVM
clf = svm.SVC(kernel='linear')  # You can also try kernel='rbf' or 'poly'
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8565022421524664
              precision    recall  f1-score   support

       False       0.86      1.00      0.92       955
        True       0.00      0.00      0.00       160

    accuracy                           0.86      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.73      0.86      0.79      1115



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Avg Word2Vec

In [44]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word) for word in review]
    review = ' '.join(review)
    corpus.append(review)

In [45]:
[[i,j,k] for i,j,k in zip(list(map(len,corpus)),corpus, messages['message']) if i<1]

[[0, '', '645'], [0, '', ':) '], [0, '', ':-) :-)']]

In [46]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [47]:
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))


In [48]:
#words[:10]

In [49]:
## Lets train Word2vec from scratch
model=gensim.models.Word2Vec(words)

In [50]:
## To Get All the Vocabulary
model.wv.index_to_key[:10]

['to', 'you', 'the', 'it', 'and', 'in', 'is', 'me', 'my', 'for']

In [51]:
model.corpus_count

5569

In [52]:
model.epochs

5

In [53]:
words[0]

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [54]:
def avg_word2vec(doc):
    # remove out-of-vocabulary words
    sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)
    
    if len(sent)>=1:
        return np.mean([model.wv[word] for word in sent], axis=0)
    else:
        return np.zeros(model.vector_size)  
    #return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)
                #or [np.zeros(len(model.wv.index_to_key))], axis=0)

In [55]:
from tqdm import tqdm

In [56]:
#apply for the entire sentences
import numpy as np
X=[]
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

100%|██████████| 5569/5569 [00:00<00:00, 9294.13it/s]


In [57]:
len(X)

5569

In [58]:
##independent Features
X_new =np.array(X)

In [59]:
messages.shape

(5572, 2)

In [60]:
X[1]

array([-1.59108609e-01,  2.02145278e-01,  9.85729694e-02,  7.63481110e-02,
        8.50550830e-02, -4.18432683e-01,  1.36617213e-01,  4.10810918e-01,
       -2.27364406e-01, -8.75969455e-02, -1.58424124e-01, -3.14725012e-01,
       -5.20298928e-02,  1.09035999e-01,  1.58489794e-01, -1.44273698e-01,
        1.11859895e-01, -2.68507987e-01, -6.12729788e-02, -4.59902525e-01,
        1.79319024e-01,  1.06945112e-01,  5.86744100e-02, -1.84881806e-01,
       -2.07718331e-02, -1.44403875e-02, -1.73210189e-01, -1.75231844e-01,
       -2.17963636e-01,  2.33971700e-02,  2.74732202e-01,  2.15336196e-02,
        9.31747705e-02, -1.51477426e-01, -1.21562749e-01,  3.27181846e-01,
        5.59293553e-02, -1.04077019e-01, -8.88169929e-02, -4.13653255e-01,
        9.51609164e-02, -2.14954168e-01, -1.59748539e-01,  1.72670092e-02,
        1.22068822e-01,  1.97412143e-03, -1.10439554e-01, -4.11424860e-02,
        1.87304348e-01,  1.23775095e-01,  1.49369702e-01, -1.55040771e-01,
       -4.44277525e-02,  

In [61]:
X_new.shape

(5569, 100)

In [62]:
X_new[0].shape

(100,)

In [63]:
## Dependent Features
## Output Features
y = messages[list(map(lambda x: len(x)>0 ,corpus))]
y=pd.get_dummies(y['label'])
y=y.iloc[:,0].values

In [64]:
y.shape

(5569,)

In [65]:
X[0].reshape(1,-1).shape

(1, 100)

In [66]:
df = pd.DataFrame()
for i in range(len(X)):
    row_df = pd.DataFrame(X[i].reshape(1, -1))
    df = pd.concat([df, row_df], ignore_index=True)


In [67]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.174882,0.237811,0.117865,0.090352,0.092584,-0.490917,0.173914,0.476199,-0.264198,-0.109826,...,0.355807,0.14927,0.050715,0.053324,0.421908,0.185385,0.165887,-0.19164,0.147173,0.009049
1,-0.159109,0.202145,0.098573,0.076348,0.085055,-0.418433,0.136617,0.410811,-0.227364,-0.087597,...,0.312518,0.120728,0.038466,0.039349,0.351808,0.15565,0.14358,-0.17441,0.133352,0.00192
2,-0.185214,0.253473,0.126324,0.110499,0.074924,-0.528249,0.174914,0.470094,-0.280336,-0.13655,...,0.347033,0.149244,0.048464,0.040011,0.427262,0.167831,0.111661,-0.222528,0.174398,0.024002
3,-0.240059,0.315037,0.152924,0.119885,0.126817,-0.656782,0.224011,0.641553,-0.358653,-0.139065,...,0.483817,0.195657,0.062522,0.07681,0.556375,0.2556,0.237112,-0.262585,0.199402,0.004023
4,-0.206349,0.26209,0.137853,0.099122,0.115376,-0.561558,0.189683,0.549954,-0.309922,-0.125687,...,0.414035,0.166428,0.057105,0.067802,0.477216,0.218411,0.194449,-0.231584,0.165063,0.001729


In [68]:
df['Output']=y

In [69]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Output
0,-0.174882,0.237811,0.117865,0.090352,0.092584,-0.490917,0.173914,0.476199,-0.264198,-0.109826,...,0.14927,0.050715,0.053324,0.421908,0.185385,0.165887,-0.19164,0.147173,0.009049,True
1,-0.159109,0.202145,0.098573,0.076348,0.085055,-0.418433,0.136617,0.410811,-0.227364,-0.087597,...,0.120728,0.038466,0.039349,0.351808,0.15565,0.14358,-0.17441,0.133352,0.00192,True
2,-0.185214,0.253473,0.126324,0.110499,0.074924,-0.528249,0.174914,0.470094,-0.280336,-0.13655,...,0.149244,0.048464,0.040011,0.427262,0.167831,0.111661,-0.222528,0.174398,0.024002,False
3,-0.240059,0.315037,0.152924,0.119885,0.126817,-0.656782,0.224011,0.641553,-0.358653,-0.139065,...,0.195657,0.062522,0.07681,0.556375,0.2556,0.237112,-0.262585,0.199402,0.004023,True
4,-0.206349,0.26209,0.137853,0.099122,0.115376,-0.561558,0.189683,0.549954,-0.309922,-0.125687,...,0.166428,0.057105,0.067802,0.477216,0.218411,0.194449,-0.231584,0.165063,0.001729,True


In [70]:
df.dropna(inplace=True)

In [71]:
#df.isnull().sum()

In [72]:
## Independent Feature
X=df

In [73]:
X.isnull().sum()

0         0
1         0
2         0
3         0
4         0
         ..
96        0
97        0
98        0
99        0
Output    0
Length: 101, dtype: int64

In [74]:
y=df['Output']

In [75]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20)

In [76]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Output
3660,-0.241453,0.295197,0.158984,0.104243,0.139259,-0.62881,0.212046,0.61734,-0.348919,-0.142457,...,0.186973,0.060679,0.071482,0.528236,0.253054,0.224874,-0.261345,0.169463,0.017043,True
5416,-0.213955,0.28475,0.140308,0.10655,0.115649,-0.590091,0.208527,0.578581,-0.32251,-0.131372,...,0.182257,0.061092,0.066616,0.506817,0.234375,0.20827,-0.230259,0.172606,0.003918,True
2635,-0.242497,0.290697,0.155734,0.125456,0.112966,-0.644972,0.206692,0.600468,-0.359107,-0.155072,...,0.182643,0.065855,0.058448,0.525453,0.231372,0.169267,-0.27136,0.199076,0.024578,True
1588,-0.206828,0.261218,0.134267,0.094819,0.113412,-0.558678,0.184392,0.54646,-0.30558,-0.125041,...,0.169816,0.05588,0.070563,0.473382,0.227765,0.197202,-0.218161,0.15685,0.001473,True
5300,-0.218379,0.280137,0.136203,0.102899,0.11988,-0.59211,0.198281,0.579539,-0.323213,-0.128026,...,0.178222,0.058758,0.067945,0.504775,0.23351,0.207549,-0.237374,0.171566,0.000659,True


In [77]:
y_train[:5]

3660    True
5416    True
2635    True
1588    True
5300    True
Name: Output, dtype: bool

In [78]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()

In [79]:
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

classifier.fit(X_train,y_train)

In [80]:
y_pred=classifier.predict(X_test)

In [81]:
from sklearn.metrics import accuracy_score,classification_report
print(accuracy_score(y_test,y_pred))

0.9991023339317774


In [180]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       False       1.00      0.97      0.98       157
        True       0.99      1.00      1.00       957

    accuracy                           1.00      1114
   macro avg       1.00      0.98      0.99      1114
weighted avg       1.00      1.00      1.00      1114

