In [88]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [89]:
messages = pd.read_csv('SMSSpamCollection.txt', sep='\t', names=["label", "message"])
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [90]:
messages.shape

(5572, 2)

### Text preprocessing

In [91]:
import nltk
import re
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [92]:
ps = PorterStemmer()
lemmatizer=WordNetLemmatizer()

In [93]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [94]:
corpus[10:15]

['gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash pound txt csh send cost p day day tsandc appli repli hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw',
 'search right word thank breather promis wont take help grant fulfil promis wonder bless time',
 'date sunday']

## Text to vector: BOW, TFIDF, Word2Vec

####  Bag of Words model

In [95]:
# Creating the Bag of Words model

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500, binary = True, ngram_range=(2,2))
X = cv.fit_transform(corpus).toarray()

In [96]:
X.shape

(5572, 2500)

In [97]:
y=pd.get_dummies(messages['label'])
y=y.iloc[:,1].values
y[:10]

array([False, False,  True, False, False,  True, False, False,  True,
        True])

In [98]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [99]:
# Training model using Naive bayes classifier

from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [100]:
y_pred=spam_detect_model.predict(X_test)

In [101]:
from sklearn.metrics import accuracy_score,classification_report
print(accuracy_score(y_test,y_pred))

0.9730941704035875


In [102]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       False       0.97      1.00      0.98       955
        True       1.00      0.81      0.90       160

    accuracy                           0.97      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.97      0.97      0.97      1115



## TF-IDF

In [103]:
# Creating the TFIDF model
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500, binary=True, ngram_range=(2,2))
X = tv.fit_transform(corpus).toarray()

In [104]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [105]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [106]:
#prediction
y_pred=spam_detect_model.predict(X_test)

In [107]:
score=accuracy_score(y_test,y_pred)
print(score)

0.957847533632287


In [108]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       1.00      0.95      0.98      1002
        True       0.71      1.00      0.83       113

    accuracy                           0.96      1115
   macro avg       0.85      0.98      0.90      1115
weighted avg       0.97      0.96      0.96      1115



## Word2vec Implementation

In [109]:
#!pip install gensim

In [110]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')

In [111]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [112]:
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [113]:
#words[:10]

In [114]:
import numpy as np
import gensim

# Train Word2Vec
model = gensim.models.Word2Vec(sentences=words, vector_size=300, window=5, min_count=2)

# Create feature vectors for each message (average of word embeddings)
X = []
for sent in corpus:
    tokens = simple_preprocess(sent)
    word_vecs = [model.wv[w] for w in tokens if w in model.wv]
    if len(word_vecs) > 0:
        X.append(np.mean(word_vecs, axis=0))
    else:
        X.append(np.zeros(300))  # fallback if no words in vocab
X = np.array(X)


In [115]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Logistic Regression works well with embeddings
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8565022421524664
              precision    recall  f1-score   support

       False       0.86      1.00      0.92       955
        True       0.00      0.00      0.00       160

    accuracy                           0.86      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.73      0.86      0.79      1115



In [116]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train SVM
clf = svm.SVC(kernel='linear')  # You can also try kernel='rbf' or 'poly'
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8565022421524664
              precision    recall  f1-score   support

       False       0.86      1.00      0.92       955
        True       0.00      0.00      0.00       160

    accuracy                           0.86      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.73      0.86      0.79      1115



## Avg Word2Vec

In [117]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word) for word in review]
    review = ' '.join(review)
    corpus.append(review)

In [118]:
[[i,j,k] for i,j,k in zip(list(map(len,corpus)),corpus, messages['message']) if i<1]

[[0, '', '645'], [0, '', ':) '], [0, '', ':-) :-)']]

In [119]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [120]:
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))


In [121]:
#words[:10]

In [122]:
## Lets train Word2vec from scratch
model=gensim.models.Word2Vec(words)

In [123]:
## To Get All the Vocabulary
model.wv.index_to_key[:10]

['to', 'you', 'the', 'it', 'and', 'in', 'is', 'me', 'my', 'for']

In [124]:
model.corpus_count

5569

In [125]:
model.epochs

5

In [126]:
words[0]

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [127]:
def avg_word2vec(doc):
    # remove out-of-vocabulary words
    sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)
    
    if len(sent)>=1:
        return np.mean([model.wv[word] for word in sent], axis=0)
    else:
        return np.zeros(model.vector_size)  
    #return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)
                #or [np.zeros(len(model.wv.index_to_key))], axis=0)

In [128]:
from tqdm import tqdm

In [129]:
#apply for the entire sentences
import numpy as np
X=[]
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

100%|██████████| 5569/5569 [00:00<00:00, 8580.42it/s]


In [130]:
len(X)

5569

In [131]:
##independent Features
X_new =np.array(X)

In [132]:
messages.shape

(5572, 2)

In [133]:
X[1]

array([-0.15813592,  0.20628825,  0.09883745,  0.07780463,  0.08844206,
       -0.42327774,  0.1367866 ,  0.4153632 , -0.23288128, -0.0910508 ,
       -0.15298262, -0.31630844, -0.04828515,  0.11177237,  0.16231343,
       -0.14389886,  0.10960796, -0.26661217, -0.06120687, -0.4568581 ,
        0.18122873,  0.10891304,  0.05708148, -0.18651149, -0.02250941,
       -0.01462258, -0.17839006, -0.17155111, -0.21673435,  0.02356395,
        0.27177846,  0.01962412,  0.09230398, -0.15340431, -0.12324122,
        0.3288319 ,  0.05432113, -0.10620004, -0.09313025, -0.41694784,
        0.09856156, -0.21763575, -0.15759435,  0.0146166 ,  0.12185354,
        0.00283993, -0.10670035, -0.03778572,  0.18609628,  0.12430638,
        0.14842062, -0.15504752, -0.04814173,  0.05281594, -0.05999708,
        0.03886792,  0.13011205,  0.0049193 , -0.3168648 ,  0.16437304,
       -0.00165583,  0.13954699,  0.00204779, -0.0934314 , -0.25104457,
        0.24216655,  0.08594633,  0.19452628, -0.29950473,  0.34

In [134]:
X_new.shape

(5569, 100)

In [135]:
X_new[0].shape

(100,)

In [136]:
## Dependent Features
## Output Features
y = messages[list(map(lambda x: len(x)>0 ,corpus))]
y=pd.get_dummies(y['label'])
y=y.iloc[:,0].values

In [137]:
y.shape

(5569,)

In [138]:
X[0].reshape(1,-1).shape

(1, 100)

In [139]:
df = pd.DataFrame()
for i in range(len(X)):
    row_df = pd.DataFrame(X[i].reshape(1, -1))
    df = pd.concat([df, row_df], ignore_index=True)


In [140]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.170072,0.238321,0.116029,0.091164,0.094253,-0.488225,0.171596,0.472685,-0.268256,-0.112176,...,0.353995,0.148204,0.045717,0.053237,0.417125,0.18296,0.157308,-0.193699,0.145064,0.006613
1,-0.158136,0.206288,0.098837,0.077805,0.088442,-0.423278,0.136787,0.415363,-0.232881,-0.091051,...,0.314665,0.120929,0.03465,0.040367,0.351016,0.155592,0.137464,-0.177634,0.132412,-0.000584
2,-0.181461,0.251296,0.122674,0.109218,0.079805,-0.521759,0.171689,0.468691,-0.283686,-0.134998,...,0.349708,0.150155,0.044228,0.040833,0.426301,0.168881,0.111171,-0.224256,0.172022,0.022945
3,-0.236513,0.321809,0.153439,0.122647,0.131486,-0.666426,0.225869,0.651303,-0.3697,-0.146344,...,0.487983,0.197203,0.057233,0.078637,0.557533,0.25734,0.230564,-0.26763,0.197915,-0.00031
4,-0.201154,0.263933,0.135753,0.099299,0.117644,-0.562669,0.188795,0.551348,-0.314562,-0.12957,...,0.414122,0.166787,0.052223,0.069639,0.474087,0.218538,0.190127,-0.232033,0.161079,-0.003231


In [141]:
df['Output']=y

In [142]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Output
0,-0.170072,0.238321,0.116029,0.091164,0.094253,-0.488225,0.171596,0.472685,-0.268256,-0.112176,...,0.148204,0.045717,0.053237,0.417125,0.18296,0.157308,-0.193699,0.145064,0.006613,True
1,-0.158136,0.206288,0.098837,0.077805,0.088442,-0.423278,0.136787,0.415363,-0.232881,-0.091051,...,0.120929,0.03465,0.040367,0.351016,0.155592,0.137464,-0.177634,0.132412,-0.000584,True
2,-0.181461,0.251296,0.122674,0.109218,0.079805,-0.521759,0.171689,0.468691,-0.283686,-0.134998,...,0.150155,0.044228,0.040833,0.426301,0.168881,0.111171,-0.224256,0.172022,0.022945,False
3,-0.236513,0.321809,0.153439,0.122647,0.131486,-0.666426,0.225869,0.651303,-0.3697,-0.146344,...,0.197203,0.057233,0.078637,0.557533,0.25734,0.230564,-0.26763,0.197915,-0.00031,True
4,-0.201154,0.263933,0.135753,0.099299,0.117644,-0.562669,0.188795,0.551348,-0.314562,-0.12957,...,0.166787,0.052223,0.069639,0.474087,0.218538,0.190127,-0.232033,0.161079,-0.003231,True


In [143]:
df.dropna(inplace=True)

In [144]:
#df.isnull().sum()

In [145]:
## Independent Feature
X=df

In [146]:
X.isnull().sum()

0         0
1         0
2         0
3         0
4         0
         ..
96        0
97        0
98        0
99        0
Output    0
Length: 101, dtype: int64

In [147]:
y=df['Output']

In [148]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20)

In [149]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Output
4350,-0.19097,0.267104,0.126557,0.098083,0.09791,-0.540069,0.189256,0.525421,-0.29944,-0.128409,...,0.169774,0.061578,0.066496,0.46568,0.212258,0.182379,-0.209439,0.153881,0.000345,True
25,-0.162982,0.214307,0.108158,0.080389,0.089209,-0.455714,0.149784,0.439397,-0.248981,-0.106291,...,0.133393,0.044859,0.052341,0.378333,0.177719,0.145262,-0.184575,0.129832,0.001693,True
2117,-0.188519,0.249629,0.12704,0.088497,0.10592,-0.519454,0.18077,0.504822,-0.279042,-0.118441,...,0.16155,0.061363,0.063056,0.452793,0.220289,0.181663,-0.206807,0.145469,0.000676,True
1833,-0.225829,0.301233,0.144638,0.104343,0.125376,-0.646672,0.216887,0.639028,-0.359274,-0.148745,...,0.197287,0.06533,0.071479,0.538773,0.253498,0.219811,-0.252653,0.182323,-0.011086,True
2253,-0.134294,0.17598,0.084191,0.063511,0.075015,-0.358243,0.119848,0.351706,-0.196653,-0.078402,...,0.103534,0.030825,0.042293,0.307364,0.150134,0.122261,-0.147283,0.100983,0.003383,True


In [150]:
y_train[:5]

4350    True
25      True
2117    True
1833    True
2253    True
Name: Output, dtype: bool

In [151]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()

In [152]:
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

classifier.fit(X_train,y_train)

In [153]:
y_pred=classifier.predict(X_test)

In [154]:
from sklearn.metrics import accuracy_score,classification_report
print(accuracy_score(y_test,y_pred))

1.0


In [156]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00       146
        True       1.00      1.00      1.00       968

    accuracy                           1.00      1114
   macro avg       1.00      1.00      1.00      1114
weighted avg       1.00      1.00      1.00      1114

