In [1]:
import numpy as np
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv("SMSSpamCollection.txt", sep='\t', names=['label', 'message'])

In [3]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
## Preprocess the data
# Remove punc
# remove stopword
# do stemming

In [5]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [6]:
## Lets first try on one message.

In [7]:
message = df['message'][0]

In [8]:
print(message)

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...


In [9]:
type(message)

str

In [10]:
nonpunc = [c for c in message if c not in string.punctuation]

In [11]:
nonpunc

['G',
 'o',
 ' ',
 'u',
 'n',
 't',
 'i',
 'l',
 ' ',
 'j',
 'u',
 'r',
 'o',
 'n',
 'g',
 ' ',
 'p',
 'o',
 'i',
 'n',
 't',
 ' ',
 'c',
 'r',
 'a',
 'z',
 'y',
 ' ',
 'A',
 'v',
 'a',
 'i',
 'l',
 'a',
 'b',
 'l',
 'e',
 ' ',
 'o',
 'n',
 'l',
 'y',
 ' ',
 'i',
 'n',
 ' ',
 'b',
 'u',
 'g',
 'i',
 's',
 ' ',
 'n',
 ' ',
 'g',
 'r',
 'e',
 'a',
 't',
 ' ',
 'w',
 'o',
 'r',
 'l',
 'd',
 ' ',
 'l',
 'a',
 ' ',
 'e',
 ' ',
 'b',
 'u',
 'f',
 'f',
 'e',
 't',
 ' ',
 'C',
 'i',
 'n',
 'e',
 ' ',
 't',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'g',
 'o',
 't',
 ' ',
 'a',
 'm',
 'o',
 'r',
 'e',
 ' ',
 'w',
 'a',
 't']

In [12]:
nonpunc = ''.join(nonpunc)

In [13]:
nonpunc

'Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amore wat'

In [14]:
type(nonpunc)

str

In [15]:
## Now stopword

In [16]:
stopwords = stopwords.words('english')

In [17]:
print(stopwords)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [18]:
nonpunc = nonpunc.lower()

In [19]:
nonpunc = nonpunc.split()

In [20]:
print(nonpunc)

['go', 'until', 'jurong', 'point', 'crazy', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'there', 'got', 'amore', 'wat']


In [21]:
clean_message = [word for word in nonpunc if word not in stopwords]

In [22]:
print(clean_message)

['go', 'jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amore', 'wat']


In [23]:
clean_message = " ".join(clean_message)

In [24]:
clean_message

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [25]:
stemmer = PorterStemmer()

In [26]:
clean_message = [stemmer.stem(word) for word in clean_message]

In [27]:
clean_message

['g',
 'o',
 ' ',
 'j',
 'u',
 'r',
 'o',
 'n',
 'g',
 ' ',
 'p',
 'o',
 'i',
 'n',
 't',
 ' ',
 'c',
 'r',
 'a',
 'z',
 'y',
 ' ',
 'a',
 'v',
 'a',
 'i',
 'l',
 'a',
 'b',
 'l',
 'e',
 ' ',
 'b',
 'u',
 'g',
 'i',
 's',
 ' ',
 'n',
 ' ',
 'g',
 'r',
 'e',
 'a',
 't',
 ' ',
 'w',
 'o',
 'r',
 'l',
 'd',
 ' ',
 'l',
 'a',
 ' ',
 'e',
 ' ',
 'b',
 'u',
 'f',
 'f',
 'e',
 't',
 ' ',
 'c',
 'i',
 'n',
 'e',
 ' ',
 'g',
 'o',
 't',
 ' ',
 'a',
 'm',
 'o',
 'r',
 'e',
 ' ',
 'w',
 'a',
 't']

In [28]:
clean_message = ''.join(clean_message)

In [29]:
clean_message

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [30]:
## Now for wholw this process let us make a function

In [31]:
def text_process(message):
    nonpunc = [c for c in message if c not in string.punctuation]
    nonpunc = ''.join(nonpunc)
    nonpunc = nonpunc.lower().split()
    clean_message = [word for word in nonpunc if word not in stopwords]
    clean_message = " ".join(clean_message)
    clean_message = [stemmer.stem(word) for word in clean_message]
    clean_message = ''.join(clean_message)
    return clean_message

In [32]:
df['message'] = df['message'].apply(text_process)

In [33]:
df.head()

Unnamed: 0,label,message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah dont think goes usf lives around though


In [34]:
## Using mapping to get binary outputs

In [35]:
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

In [36]:
df.head()

Unnamed: 0,label,message
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni
2,1,free entry 2 wkly comp win fa cup final tkts 2...
3,0,u dun say early hor u c already say
4,0,nah dont think goes usf lives around though


### 1. Countvectorizer

In [37]:
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(df['message'])

In [38]:
print(X)
print(type(X))

  (0, 959)	1
  (0, 1699)	1
  (0, 572)	1
  (0, 289)	1
  (0, 394)	1
  (0, 986)	1
  (0, 2434)	1
  (0, 1245)	1
  (0, 500)	1
  (0, 977)	1
  (0, 2351)	1
  (1, 1578)	1
  (1, 1255)	1
  (1, 1200)	1
  (1, 2395)	1
  (1, 1589)	1
  (2, 894)	1
  (2, 753)	2
  (2, 2416)	1
  (2, 528)	1
  (2, 2400)	1
  (2, 798)	2
  (2, 586)	1
  (2, 848)	1
  (2, 2175)	1
  :	:
  (5567, 720)	1
  (5567, 37)	1
  (5567, 136)	1
  (5567, 1710)	1
  (5567, 389)	1
  (5568, 1071)	1
  (5568, 966)	1
  (5568, 891)	1
  (5568, 761)	1
  (5569, 1492)	1
  (5570, 894)	1
  (5570, 1113)	1
  (5570, 1302)	1
  (5570, 2368)	1
  (5570, 1536)	1
  (5570, 2278)	1
  (5570, 1988)	1
  (5570, 732)	1
  (5570, 932)	1
  (5570, 403)	1
  (5570, 1152)	1
  (5570, 999)	1
  (5571, 1521)	1
  (5571, 2218)	1
  (5571, 1827)	1
<class 'scipy.sparse._csr.csr_matrix'>


In [39]:
X.shape

(5572, 2500)

In [40]:
df.shape

(5572, 2)

In [41]:
X_train, X_test, y_train, y_test = train_test_split(
    X, df['label'], test_size=0.33, random_state=42)

In [42]:
model1 = MultinomialNB()

In [43]:
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)

In [44]:
print(accuracy_score(y_test,y_pred))
print('\n')
print(classification_report(y_pred,y_test))

0.9825992387166939


              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1583
           1       0.96      0.92      0.94       256

    accuracy                           0.98      1839
   macro avg       0.97      0.96      0.96      1839
weighted avg       0.98      0.98      0.98      1839



### 2. TFIDF

In [45]:
tv = TfidfVectorizer(max_features=2500)
X = tv.fit_transform(df['message'])

In [46]:
X_train, X_test, y_train, y_test = train_test_split(
    X, df['label'], test_size=0.33, random_state=42)

In [47]:
model2 = MultinomialNB()
model2.fit(X_train, y_train)
y_pred = model2.predict(X_test)

In [48]:
print(accuracy_score(y_test,y_pred))
print('\n')
print(classification_report(y_pred,y_test))

0.9815116911364872


              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1627
           1       0.86      1.00      0.93       212

    accuracy                           0.98      1839
   macro avg       0.93      0.99      0.96      1839
weighted avg       0.98      0.98      0.98      1839



### 3. Wordtovec

In [49]:
df['message_token'] = df['message'].apply(simple_preprocess)

In [50]:
df.head()

Unnamed: 0,label,message,message_token
0,0,go jurong point crazy available bugis n great ...,"[go, jurong, point, crazy, available, bugis, g..."
1,0,ok lar joking wif u oni,"[ok, lar, joking, wif, oni]"
2,1,free entry 2 wkly comp win fa cup final tkts 2...,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,0,u dun say early hor u c already say,"[dun, say, early, hor, already, say]"
4,0,nah dont think goes usf lives around though,"[nah, dont, think, goes, usf, lives, around, t..."


In [51]:
model3 = Word2Vec(sentences=df['message_token'], window=10, min_count=2, workers=4)

In [52]:
model3.corpus_count

5572

In [53]:
model3.epochs

5

In [54]:
vector = model3.wv['hello']  

In [55]:
vector

array([-0.31630352,  0.31304297,  0.05043954,  0.06534324,  0.05266415,
       -0.5875819 ,  0.17087816,  0.91545194, -0.17708129, -0.29298258,
       -0.20044567, -0.5562762 ,  0.00381371,  0.2003792 ,  0.14968428,
       -0.31341085, -0.03397866, -0.45752874, -0.03751825, -0.7652368 ,
        0.19694984,  0.21621338,  0.21015713, -0.10011743, -0.16762912,
        0.02919729, -0.22882107, -0.31654015, -0.41917044, -0.01491172,
        0.37789184,  0.10792051,  0.11938658, -0.22327115, -0.20367745,
        0.34641004,  0.01906409, -0.3483623 , -0.28835574, -0.7583202 ,
        0.08962689, -0.31603178, -0.16332223,  0.07661263,  0.21350747,
       -0.25313473, -0.3020954 , -0.01795379,  0.14823979,  0.31998828,
        0.20471054, -0.38425586, -0.12009192, -0.0993387 , -0.14267355,
        0.2775564 ,  0.2597659 , -0.0815615 , -0.42383412,  0.09940104,
        0.21117485,  0.26163098, -0.1690979 , -0.17784655, -0.5728942 ,
        0.34268323,  0.07931728,  0.23023972, -0.5290045 ,  0.37

In [56]:
def get_avg_vector(message, model):
    vectors = [model.wv[word] for word in message if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

In [57]:
df['message_vector'] = df['message_token'].apply(lambda x: get_avg_vector(x, model3))

In [58]:
type(df['message_vector'][0])

numpy.ndarray

In [59]:
model3.wv.similar_by_word('kid')

[('south', 0.9673158526420593),
 ('india', 0.96656334400177),
 ('cool', 0.965729296207428),
 ('digital', 0.9656597971916199),
 ('information', 0.9655689001083374),
 ('accept', 0.9652981162071228),
 ('mayb', 0.9652918577194214),
 ('probably', 0.9652125239372253),
 ('delivery', 0.9651215076446533),
 ('doin', 0.9651201367378235)]

In [60]:
X = np.vstack(df['message_vector'].values)
X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.33, random_state=42)

In [61]:
X_train.shape

(3733, 100)

In [62]:
model4 = LogisticRegression()
model4.fit(X_train, y_train)
y_pred = model4.predict(X_test)

In [63]:
print(accuracy_score(y_test,y_pred))
print('\n')
print(classification_report(y_pred,y_test))

0.866231647634584


              precision    recall  f1-score   support

           0       1.00      0.87      0.93      1837
           1       0.00      0.50      0.01         2

    accuracy                           0.87      1839
   macro avg       0.50      0.68      0.47      1839
weighted avg       1.00      0.87      0.93      1839

