In [211]:
import pandas as pd

In [212]:
messages = pd.read_csv('SMSSpamCollection.txt', sep='\t', names=["label", "message"])
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [213]:
messages.shape

(5572, 2)

### Text preprocessing

In [214]:
import nltk
import re
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [215]:
ps = PorterStemmer()
lemmatizer=WordNetLemmatizer()

In [216]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [217]:
corpus[10:15]

['gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash pound txt csh send cost p day day tsandc appli repli hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw',
 'search right word thank breather promis wont take help grant fulfil promis wonder bless time',
 'date sunday']

## Text to vector: BOW, TFIDF, Word2Vec

####  Bag of Words model

In [218]:
# Creating the Bag of Words model

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500, binary = True, ngram_range=(2,2))
X = cv.fit_transform(corpus).toarray()

In [219]:
X.shape

(5572, 2500)

In [220]:
y=pd.get_dummies(messages['label'])
y=y.iloc[:,1].values
y[:10]

array([False, False,  True, False, False,  True, False, False,  True,
        True])

In [221]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [222]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)


In [223]:
# Training model using Naive bayes classifier

from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [224]:
y_pred=spam_detect_model.predict(X_test)

In [225]:
from sklearn.metrics import accuracy_score,classification_report
print(accuracy_score(y_test,y_pred))

0.9730941704035875


In [226]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       False       0.97      1.00      0.98       955
        True       1.00      0.81      0.90       160

    accuracy                           0.97      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.97      0.97      0.97      1115



## TF-IDF

In [227]:
# Creating the TFIDF model
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500, binary=True, ngram_range=(2,2))
X = tv.fit_transform(corpus).toarray()

In [228]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [229]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [230]:
#prediction
y_pred=spam_detect_model.predict(X_test)

In [231]:
score=accuracy_score(y_test,y_pred)
print(score)

0.957847533632287


In [232]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       1.00      0.95      0.98      1002
        True       0.71      1.00      0.83       113

    accuracy                           0.96      1115
   macro avg       0.85      0.98      0.90      1115
weighted avg       0.97      0.96      0.96      1115



## Word2vec Implementation

In [233]:
#!pip install gensim

In [234]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')

In [235]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [236]:
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [282]:
#words[:10]

In [238]:
import numpy as np
import gensim

# Train Word2Vec
model = gensim.models.Word2Vec(sentences=words, vector_size=300, window=5, min_count=2, workers=4)

# Create feature vectors for each message (average of word embeddings)
X = []
for sent in corpus:
    tokens = simple_preprocess(sent)
    word_vecs = [model.wv[w] for w in tokens if w in model.wv]
    if len(word_vecs) > 0:
        X.append(np.mean(word_vecs, axis=0))
    else:
        X.append(np.zeros(300))  # fallback if no words in vocab
X = np.array(X)


In [239]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Logistic Regression works well with embeddings
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8565022421524664
              precision    recall  f1-score   support

       False       0.86      1.00      0.92       955
        True       0.00      0.00      0.00       160

    accuracy                           0.86      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.73      0.86      0.79      1115



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [240]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train SVM
clf = svm.SVC(kernel='linear')  # You can also try kernel='rbf' or 'poly'
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8565022421524664
              precision    recall  f1-score   support

       False       0.86      1.00      0.92       955
        True       0.00      0.00      0.00       160

    accuracy                           0.86      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.73      0.86      0.79      1115



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Avg Word2Vec

In [241]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word) for word in review]
    review = ' '.join(review)
    corpus.append(review)

In [242]:
[[i,j,k] for i,j,k in zip(list(map(len,corpus)),corpus, messages['message']) if i<1]

[[0, '', '645'], [0, '', ':) '], [0, '', ':-) :-)']]

In [243]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [244]:
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))


In [283]:
#words[:10]

In [246]:
## Lets train Word2vec from scratch
model=gensim.models.Word2Vec(words)

In [247]:
## To Get All the Vocabulary
model.wv.index_to_key[:10]

['to', 'you', 'the', 'it', 'and', 'in', 'is', 'me', 'my', 'for']

In [248]:
model.corpus_count

5569

In [249]:
model.epochs

5

In [250]:
words[0]

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [251]:
def avg_word2vec(doc):
    # remove out-of-vocabulary words
    sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)
    
    if len(sent)>=1:
        return np.mean([model.wv[word] for word in sent], axis=0)
    else:
        return np.zeros(model.vector_size)  
    #return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)
                #or [np.zeros(len(model.wv.index_to_key))], axis=0)

In [252]:
from tqdm import tqdm

In [253]:
#apply for the entire sentences
import numpy as np
X=[]
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

100%|██████████| 5569/5569 [00:00<00:00, 9267.30it/s]


In [254]:
len(X)

5569

In [255]:
##independent Features
X_new =np.array(X)

In [256]:
messages.shape

(5572, 2)

In [257]:
X[1]

array([-0.16093197,  0.20453805,  0.09947707,  0.07885838,  0.08950631,
       -0.42499968,  0.13644703,  0.4153893 , -0.23689635, -0.09217185,
       -0.15480791, -0.31651616, -0.04980419,  0.11148873,  0.16321428,
       -0.14428031,  0.11125355, -0.268739  , -0.06181652, -0.45928854,
        0.18141152,  0.10963971,  0.06097827, -0.18918177, -0.0210139 ,
       -0.01466911, -0.18033618, -0.17238685, -0.21711223,  0.02482686,
        0.27419204,  0.01994079,  0.09131002, -0.15160473, -0.12291606,
        0.33027926,  0.05416757, -0.10768913, -0.09164776, -0.42103297,
        0.10214926, -0.22027598, -0.15843156,  0.01745847,  0.12118668,
        0.00501826, -0.1102303 , -0.03827666,  0.18302591,  0.1238932 ,
        0.1482834 , -0.15634958, -0.04515098,  0.0552966 , -0.06084734,
        0.04053638,  0.12938622,  0.00288954, -0.31999993,  0.16675076,
       -0.00207383,  0.13900374,  0.00422619, -0.09293123, -0.25152925,
        0.24376994,  0.08775419,  0.19498402, -0.30141303,  0.34

In [258]:
X_new.shape

(5569, 100)

In [259]:
X_new[0].shape

(100,)

In [260]:
## Dependent Features
## Output Features
y = messages[list(map(lambda x: len(x)>0 ,corpus))]
y=pd.get_dummies(y['label'])
y=y.iloc[:,0].values

In [261]:
y.shape

(5569,)

In [262]:
X[0].reshape(1,-1).shape

(1, 100)

In [263]:
df = pd.DataFrame()
for i in range(len(X)):
    row_df = pd.DataFrame(X[i].reshape(1, -1))
    df = pd.concat([df, row_df], ignore_index=True)


In [264]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.17184,0.237848,0.116681,0.092085,0.093744,-0.488509,0.172578,0.472578,-0.270512,-0.113086,...,0.354839,0.148715,0.044914,0.053749,0.418307,0.184064,0.157891,-0.193095,0.144914,0.007853
1,-0.160932,0.204538,0.099477,0.078858,0.089506,-0.425,0.136447,0.415389,-0.236896,-0.092172,...,0.316866,0.121679,0.032952,0.03932,0.352571,0.156026,0.136275,-0.180448,0.133265,0.001035
2,-0.184822,0.255262,0.125107,0.113157,0.076727,-0.527064,0.172973,0.469597,-0.286648,-0.138787,...,0.34519,0.147961,0.044297,0.041335,0.424344,0.167367,0.107998,-0.22398,0.172577,0.022986
3,-0.238567,0.318978,0.15116,0.123601,0.130564,-0.660074,0.225503,0.643615,-0.369913,-0.143764,...,0.484397,0.196816,0.055703,0.078192,0.556731,0.254269,0.227454,-0.267032,0.198668,0.002803
4,-0.206898,0.26395,0.1366,0.101101,0.119167,-0.562229,0.188328,0.54985,-0.31829,-0.129121,...,0.416289,0.16691,0.051844,0.069406,0.477507,0.219134,0.188823,-0.235534,0.162929,0.001017


In [265]:
df['Output']=y

In [266]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Output
0,-0.17184,0.237848,0.116681,0.092085,0.093744,-0.488509,0.172578,0.472578,-0.270512,-0.113086,...,0.148715,0.044914,0.053749,0.418307,0.184064,0.157891,-0.193095,0.144914,0.007853,True
1,-0.160932,0.204538,0.099477,0.078858,0.089506,-0.425,0.136447,0.415389,-0.236896,-0.092172,...,0.121679,0.032952,0.03932,0.352571,0.156026,0.136275,-0.180448,0.133265,0.001035,True
2,-0.184822,0.255262,0.125107,0.113157,0.076727,-0.527064,0.172973,0.469597,-0.286648,-0.138787,...,0.147961,0.044297,0.041335,0.424344,0.167367,0.107998,-0.22398,0.172577,0.022986,False
3,-0.238567,0.318978,0.15116,0.123601,0.130564,-0.660074,0.225503,0.643615,-0.369913,-0.143764,...,0.196816,0.055703,0.078192,0.556731,0.254269,0.227454,-0.267032,0.198668,0.002803,True
4,-0.206898,0.26395,0.1366,0.101101,0.119167,-0.562229,0.188328,0.54985,-0.31829,-0.129121,...,0.16691,0.051844,0.069406,0.477507,0.219134,0.188823,-0.235534,0.162929,0.001017,True


In [267]:
df.dropna(inplace=True)

In [279]:
#df.isnull().sum()

In [269]:
## Independent Feature
X=df

In [270]:
X.isnull().sum()

0         0
1         0
2         0
3         0
4         0
         ..
96        0
97        0
98        0
99        0
Output    0
Length: 101, dtype: int64

In [271]:
y=df['Output']

In [272]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20)

In [273]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Output
1917,-0.242129,0.302604,0.156357,0.115841,0.134746,-0.64171,0.207236,0.631827,-0.360631,-0.13699,...,0.188519,0.048976,0.064713,0.534513,0.247505,0.216159,-0.266904,0.190675,0.008659,True
4371,-0.204872,0.289223,0.141374,0.124831,0.096361,-0.587584,0.199027,0.538516,-0.324427,-0.148328,...,0.173356,0.054258,0.055853,0.487123,0.19857,0.146315,-0.24287,0.184238,0.017684,False
5344,-0.201249,0.275656,0.133988,0.102877,0.104621,-0.559112,0.193726,0.538688,-0.308219,-0.134195,...,0.171113,0.058206,0.066634,0.4757,0.214544,0.176101,-0.221496,0.162051,0.010521,True
4143,-0.182206,0.253263,0.124225,0.094907,0.089002,-0.5158,0.178785,0.496899,-0.286015,-0.120435,...,0.158374,0.050006,0.058527,0.438551,0.187354,0.161657,-0.198826,0.150776,0.004994,True
532,-0.214095,0.287503,0.146064,0.109201,0.115348,-0.589061,0.205739,0.575844,-0.326846,-0.141876,...,0.18617,0.05763,0.07112,0.507255,0.226062,0.192785,-0.237758,0.175151,0.009259,True


In [281]:
y_train[:5]

1917     True
4371    False
5344     True
4143     True
532      True
Name: Output, dtype: bool

In [275]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()

In [276]:
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

classifier.fit(X_train,y_train)

In [277]:
y_pred=classifier.predict(X_test)

In [278]:
from sklearn.metrics import accuracy_score,classification_report
print(accuracy_score(y_test,y_pred))

1.0


In [180]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       False       1.00      0.97      0.98       157
        True       0.99      1.00      1.00       957

    accuracy                           1.00      1114
   macro avg       1.00      0.98      0.99      1114
weighted avg       1.00      1.00      1.00      1114

