In [1]:
import pandas as pd 
messages = pd.read_csv('SMSSpamCollection.txt',sep='\t',
                       names = ["label","message"])

messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Nabeel
[nltk_data]     Ali\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [4]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z0-9]',' ', messages['message'][i])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)


print(corpus)

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat', 'ok lar joke wif u oni', 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18', 'u dun say earli hor u c alreadi say', 'nah think goe usf live around though', 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send 1 50 rcv', 'even brother like speak treat like aid patent', 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun', 'winner valu network custom select receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour', 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030', 'gonna home soon want talk stuff anymor tonight k cri enough today', 'six chanc win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6day 16 tsandc appli repli hl 4 info', 'urgent 1 week free membership 100 

In [5]:
# Simple Bag of Words

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500, binary=True, ngram_range=(2,2))
X = cv.fit_transform(corpus).toarray()
print(X)
X.shape

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


(5572, 2500)

In [6]:
y = pd.get_dummies(messages['label'])
y = y.iloc[:,1]


In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20, random_state=0)

In [8]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train,y_train)

In [9]:
y_pred = spam_detect_model.predict(X_test)

In [10]:
from sklearn.metrics import accuracy_score, classification_report

In [11]:
bow_score = accuracy_score(y_test,y_pred)
print(bow_score)

0.9721973094170404


In [12]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       1.00      0.97      0.98       986
        True       0.81      1.00      0.89       129

    accuracy                           0.97      1115
   macro avg       0.90      0.98      0.94      1115
weighted avg       0.98      0.97      0.97      1115



In [13]:
#USing TF IDF

from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer (max_features=2500, ngram_range=(1,2))
X = tv.fit_transform(corpus).toarray()

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20, random_state=0)

In [15]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train,y_train)

In [16]:
y_pred = spam_detect_model.predict(X_test)

In [17]:
tfidf_score = accuracy_score(y_test,y_pred)
print(tfidf_score)

0.9811659192825112


In [18]:
# Using randomForestClassifier

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

RandomForestClassifier()

In [19]:
y_pred = classifier.predict(X_test)

In [20]:
rf_score = accuracy_score(y_pred,y_test)
print(rf_score)

0.9856502242152466


In [21]:
# Using Word to vector 
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [22]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z0-9]',' ', messages['message'][i])
    review = review.lower()
    review = review.split()

    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)


print(corpus)



In [23]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [24]:
words = []
for sent in corpus:
    sentences = sent_tokenize(sent)
    for sentence in sentences:
        words.append(simple_preprocess(sentence))

In [25]:
import gensim

In [55]:
model = gensim.models.Word2Vec(words,window=5,min_count=2)

In [27]:
model.wv.index_to_key

['call',
 'get',
 'ur',
 'gt',
 'lt',
 'go',
 'ok',
 'day',
 'free',
 'know',
 'come',
 'like',
 'good',
 'time',
 'got',
 'text',
 'love',
 'want',
 'send',
 'need',
 'one',
 'txt',
 'today',
 'going',
 'stop',
 'home',
 'lor',
 'sorry',
 'see',
 'still',
 'take',
 'mobile',
 'back',
 'da',
 'reply',
 'dont',
 'think',
 'tell',
 'week',
 'phone',
 'hi',
 'new',
 'later',
 'please',
 'pls',
 'co',
 'msg',
 'dear',
 'make',
 'night',
 'message',
 'say',
 'well',
 'min',
 'thing',
 'much',
 'claim',
 'great',
 'hope',
 'oh',
 'hey',
 'give',
 'number',
 'happy',
 'work',
 'friend',
 'wat',
 'yes',
 'way',
 'www',
 'let',
 'prize',
 'right',
 'tomorrow',
 'already',
 'ask',
 'said',
 'win',
 'life',
 'amp',
 'cash',
 'yeah',
 'im',
 'really',
 'tone',
 'babe',
 'meet',
 'find',
 'miss',
 'morning',
 'service',
 'last',
 'uk',
 'thanks',
 'care',
 'would',
 'com',
 'anything',
 'year',
 'lol',
 'also',
 'nokia',
 'every',
 'feel',
 'keep',
 'sure',
 'pick',
 'sent',
 'urgent',
 'contact',


In [28]:
import numpy as np

In [None]:

def avg_word2vec(doc):
    vectors = [model.wv[word] for word in doc if word in model.wv.index_to_key]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)  
    return np.mean(vectors, axis=0)

In [30]:
from tqdm import tqdm

In [57]:
X = []
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

  0%|          | 0/5565 [00:00<?, ?it/s]

100%|██████████| 5565/5565 [00:02<00:00, 2779.80it/s]


In [58]:
X_new = np.array(X)

X[0]

array([-0.16339228,  0.31434676,  0.08484024,  0.03199365,  0.05317536,
       -0.3650975 ,  0.07757656,  0.49155906, -0.16316451, -0.08975241,
       -0.15939094, -0.38339427, -0.02848244,  0.04947598,  0.06121175,
       -0.25359562, -0.00072654, -0.35746333, -0.02359009, -0.46697998,
        0.09783884,  0.08609986,  0.03602529, -0.12954241, -0.13327561,
        0.06098592, -0.22170198, -0.14080334, -0.23811589,  0.04187902,
        0.30845013,  0.03925674,  0.09220978, -0.14627358, -0.06650322,
        0.2605175 ,  0.05202959, -0.21656819, -0.21261464, -0.48734224,
       -0.08360386, -0.24322213, -0.04760214,  0.04930492,  0.15344895,
       -0.10357296, -0.10076157, -0.08712839,  0.1625008 ,  0.19511886,
        0.15917313, -0.27001405, -0.06354239,  0.0075532 , -0.18500102,
        0.05427645,  0.15362032, -0.02484323, -0.34419855,  0.05704695,
        0.04458815,  0.08408305, -0.12456974, -0.00049226, -0.29428807,
        0.20359392,  0.10510063,  0.17995596, -0.2874107 ,  0.29

In [59]:
y = messages['label'][:len(X_new)]

In [60]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_new,y,test_size=0.20, random_state=0)

In [66]:
from sklearn.ensemble import RandomForestClassifier

In [64]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier()

In [70]:
y_pred = rf.predict(X_test)
avg_word = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8598382749326146
              precision    recall  f1-score   support

         ham       0.87      0.98      0.92       972
        spam       0.14      0.02      0.04       141

    accuracy                           0.86      1113
   macro avg       0.51      0.50      0.48      1113
weighted avg       0.78      0.86      0.81      1113



In [None]:
print(avg_word)

print(bow_score)

print(tfidf_score)

0.8598382749326146
0.9721973094170404
0.9811659192825112
