In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [2]:
dataset = pd.read_csv('../Dataset/spam.csv')

In [3]:
dataset.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [4]:
dataset.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)

In [5]:
dataset.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
X = dataset['v2']
y = dataset['v1']

In [7]:
X.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: v2, dtype: object

In [8]:
y.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: v1, dtype: object

# 1.) Exploratory Data Analysis

In [9]:
dataset.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
spam_dataset = dataset[dataset['v1'] == 'spam']

In [11]:
spam_dataset.head()

Unnamed: 0,v1,v2
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."


In [12]:
dataset['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

##### There is imbalance in dataset.

### 1.1) Exploring most common words in spam messages.

In [13]:
def clean_sentences(sentence):
  sentence = re.sub('[^a-zA-Z]',  ' ',sentence)
  sentence = re.sub('  ',  ' ',sentence)
  return sentence

In [14]:
spam_dataset['v2'] = spam_dataset['v2'].apply(clean_sentences)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [15]:
spam_dataset.head()

Unnamed: 0,v1,v2
2,spam,Free entry in a wkly comp to win FA Cup final...
5,spam,FreeMsg Hey there darling it s been week s no...
8,spam,WINNER As a valued network customer you have ...
9,spam,Had your mobile months or more U R entitled t...
11,spam,SIX chances to win CASH From to pounds tx...


In [16]:
wordsDict = dict()

In [17]:
stopWords = stopwords.words('English')

In [18]:
stopWords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [19]:
def get_getwords(sentence):
  wordsList = sentence.split(' ') #nltk.word_tokenize(sentence)

  for word in wordsList:
    if not word in stopWords:
      word = word.strip()
      wordsDict[word] = wordsDict.get(word, 0) + 1

In [20]:
spam_dataset['v2'].apply(get_getwords)

2       None
5       None
8       None
9       None
11      None
        ... 
5537    None
5540    None
5547    None
5566    None
5567    None
Name: v2, Length: 747, dtype: object

In [21]:
wordsDict

{'Free': 43,
 'entry': 25,
 '': 7315,
 'wkly': 10,
 'comp': 10,
 'win': 38,
 'FA': 4,
 'Cup': 3,
 'final': 11,
 'tkts': 4,
 'st': 29,
 'May': 2,
 'Text': 43,
 'receive': 30,
 'question': 7,
 'std': 11,
 'txt': 83,
 'rate': 27,
 'T': 77,
 'C': 54,
 'apply': 28,
 'FreeMsg': 11,
 'Hey': 5,
 'darling': 2,
 'week': 59,
 'word': 25,
 'back': 21,
 'I': 54,
 'like': 13,
 'fun': 9,
 'still': 7,
 'Tb': 1,
 'ok': 3,
 'XxX': 2,
 'chgs': 1,
 'send': 49,
 'rcv': 4,
 'WINNER': 3,
 'As': 12,
 'valued': 11,
 'network': 22,
 'customer': 39,
 'selected': 25,
 'receivea': 2,
 'prize': 74,
 'reward': 9,
 'To': 75,
 'claim': 78,
 'call': 208,
 'Claim': 32,
 'code': 12,
 'KL': 2,
 'Valid': 24,
 'hours': 6,
 'Had': 15,
 'mobile': 97,
 'months': 5,
 'U': 113,
 'R': 19,
 'entitled': 8,
 'Update': 14,
 'latest': 26,
 'colour': 17,
 'mobiles': 10,
 'camera': 20,
 'Call': 140,
 'The': 24,
 'Mobile': 31,
 'Co': 5,
 'FREE': 119,
 'SIX': 2,
 'chances': 2,
 'CASH': 14,
 'From': 11,
 'pounds': 17,
 'CSH': 2,
 'Cost': 2

In [22]:
wordsList = list(wordsDict.items())

In [23]:
wordsList

[('Free', 43),
 ('entry', 25),
 ('', 7315),
 ('wkly', 10),
 ('comp', 10),
 ('win', 38),
 ('FA', 4),
 ('Cup', 3),
 ('final', 11),
 ('tkts', 4),
 ('st', 29),
 ('May', 2),
 ('Text', 43),
 ('receive', 30),
 ('question', 7),
 ('std', 11),
 ('txt', 83),
 ('rate', 27),
 ('T', 77),
 ('C', 54),
 ('apply', 28),
 ('FreeMsg', 11),
 ('Hey', 5),
 ('darling', 2),
 ('week', 59),
 ('word', 25),
 ('back', 21),
 ('I', 54),
 ('like', 13),
 ('fun', 9),
 ('still', 7),
 ('Tb', 1),
 ('ok', 3),
 ('XxX', 2),
 ('chgs', 1),
 ('send', 49),
 ('rcv', 4),
 ('WINNER', 3),
 ('As', 12),
 ('valued', 11),
 ('network', 22),
 ('customer', 39),
 ('selected', 25),
 ('receivea', 2),
 ('prize', 74),
 ('reward', 9),
 ('To', 75),
 ('claim', 78),
 ('call', 208),
 ('Claim', 32),
 ('code', 12),
 ('KL', 2),
 ('Valid', 24),
 ('hours', 6),
 ('Had', 15),
 ('mobile', 97),
 ('months', 5),
 ('U', 113),
 ('R', 19),
 ('entitled', 8),
 ('Update', 14),
 ('latest', 26),
 ('colour', 17),
 ('mobiles', 10),
 ('camera', 20),
 ('Call', 140),
 ('The'

In [24]:
wordsList.sort(key = lambda x : x[1], reverse =True)

In [25]:
wordsList

[('', 7315),
 ('call', 208),
 ('p', 173),
 ('Call', 140),
 ('FREE', 119),
 ('U', 113),
 ('ur', 107),
 ('mobile', 97),
 ('www', 96),
 ('txt', 83),
 ('You', 82),
 ('claim', 78),
 ('T', 77),
 ('Txt', 76),
 ('To', 75),
 ('text', 75),
 ('prize', 74),
 ('u', 73),
 ('Your', 72),
 ('free', 66),
 ('STOP', 63),
 ('week', 59),
 ('reply', 58),
 ('com', 57),
 ('cash', 57),
 ('contact', 56),
 ('C', 54),
 ('I', 54),
 ('uk', 53),
 ('stop', 52),
 ('service', 50),
 ('Nokia', 50),
 ('send', 49),
 ('No', 49),
 ('co', 49),
 ('get', 47),
 ('ppm', 46),
 ('per', 46),
 ('NOW', 46),
 ('Reply', 44),
 ('W', 44),
 ('Cs', 44),
 ('Free', 43),
 ('Text', 43),
 ('new', 42),
 ('This', 42),
 ('URGENT', 41),
 ('min', 41),
 ('msg', 41),
 ('customer', 39),
 ('win', 38),
 ('tone', 38),
 ('Please', 37),
 ('We', 37),
 ('awarded', 37),
 ('mins', 37),
 ('Get', 34),
 ('draw', 34),
 ('M', 34),
 ('phone', 34),
 ('line', 33),
 ('Claim', 32),
 ('Box', 32),
 ('every', 32),
 ('Mobile', 31),
 ('shows', 31),
 ('PO', 31),
 ('Just', 31),
 

In [26]:
sortedWords = pd.DataFrame(wordsList, columns=['Word', 'Count'])

In [27]:
sortedWords.head()

Unnamed: 0,Word,Count
0,,7315
1,call,208
2,p,173
3,Call,140
4,FREE,119


In [28]:
fig = px.bar(sortedWords[1:50].sort_values(by=['Count']), x = 'Count', y='Word', title='Most common words in spam messages', height=1000, width=800)
fig.show()

# 2.) Model Creation

In [29]:
dataset.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [30]:
dataset['v2'] = dataset['v2'].apply(clean_sentences)

In [31]:
dataset.head()

Unnamed: 0,v1,v2
0,ham,Go until jurong point crazy Available only in...
1,ham,Ok lar Joking wif u oni
2,spam,Free entry in a wkly comp to win FA Cup final...
3,ham,U dun say so early hor U c already then say
4,ham,Nah I don t think he goes to usf he lives arou...


In [32]:
ps = PorterStemmer()
lm = WordNetLemmatizer()

In [33]:
def tokenizeWords(sentence, method):
  wordsList = sentence.split(' ')
  # using stemming 
  stemmedWordList = [ps.stem(word) for word in wordsList if not word in stopWords]
  # usnifn lemmatization
  lemmatizedWordList = [lm.lemmatize(word) for word in wordsList if not word in stopWords]
  if method == 'Stemming':
    return ' '.join(stemmedWordList)
  else:
    return ' '.join(lemmatizedWordList)

In [34]:
dataset['Stemmed Messages'] = dataset['v2'].apply(tokenizeWords, method= 'Stemming')
dataset['Lemmatized Messages'] = dataset['v2'].apply(tokenizeWords, method='Lemmatize')

In [35]:
dataset.head()

Unnamed: 0,v1,v2,Stemmed Messages,Lemmatized Messages
0,ham,Go until jurong point crazy Available only in...,go jurong point crazi avail bugi n great worl...,Go jurong point crazy Available bugis n great...
1,ham,Ok lar Joking wif u oni,ok lar joke wif u oni,Ok lar Joking wif u oni
2,spam,Free entry in a wkly comp to win FA Cup final...,free entri wkli comp win fa cup final tkt st...,Free entry wkly comp win FA Cup final tkts s...
3,ham,U dun say so early hor U c already then say,u dun say earli hor u c alreadi say,U dun say early hor U c already say
4,ham,Nah I don t think he goes to usf he lives arou...,nah i think goe usf live around though,Nah I think go usf life around though


In [36]:
dataset.shape

(5572, 4)

In [37]:
le = LabelEncoder()

In [38]:
X = dataset['Stemmed Messages']
y = le.fit_transform(dataset['v1'])

In [39]:
X.head()

0    go jurong point crazi  avail bugi n great worl...
1                             ok lar  joke wif u oni  
2    free entri  wkli comp win fa cup final tkt  st...
3               u dun say earli hor  u c alreadi say  
4               nah i think goe usf live around though
Name: Stemmed Messages, dtype: object

In [40]:
y

array([0, 0, 1, ..., 0, 0, 0])

## 2.1) Using Bag of Words

In [41]:
cv = CountVectorizer(max_features=4500)
X_countVectorized = cv.fit_transform(X).toarray()

In [42]:
import seaborn as sns

In [43]:
print(X_countVectorized)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [44]:
X_countVectorized.shape

(5572, 4500)

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X_countVectorized, y, test_size=.3, random_state=101)

In [46]:
nb = MultinomialNB()
nb.fit(X_train, y_train)

MultinomialNB()

In [47]:
train_predictions = nb.predict(X_train)

In [48]:
test_predictions = nb.predict(X_test)

In [49]:
print('Accuracy on training data: ', accuracy_score(y_train, train_predictions))
print('Accuracy on test data: ', accuracy_score(y_test, test_predictions))

Accuracy on training data:  0.9928205128205129
Accuracy on test data:  0.9844497607655502


## 2.2) Tfidf

In [50]:
tfidf = TfidfVectorizer(max_features=4500)
X_TfidfVectorized = tfidf.fit_transform(X).toarray()

In [51]:
X_countVectorized.shape

(5572, 4500)

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X_TfidfVectorized, y, test_size=.3, random_state=101)

In [53]:
nb.fit(X_train, y_train)
train_predictions = nb.predict(X_train)
test_predictions = nb.predict(X_test)

In [54]:
print('Accuracy on training data: ', accuracy_score(y_train, train_predictions))
print('Accuracy on test data: ', accuracy_score(y_test, test_predictions))

Accuracy on training data:  0.978974358974359
Accuracy on test data:  0.9766746411483254


## Making Predictions 

In [141]:
text = ['you have won']

In [142]:
X_CountVectorizer = CountVectorizer(stop_words='english')
X_train_counts = X_CountVectorizer.fit_transform(dataset['v2'])

In [152]:
X_train_counts.shape

(5572, 7414)

In [144]:
y.shape

(5572,)

In [145]:
nb.fit(X_train_counts, y)

MultinomialNB()

In [146]:
accuracy_score(y, nb.predict(X_train_counts))

0.991564967695621

In [147]:
trans = X_CountVectorizer.transform(text)

In [148]:
trans.shape

(1, 7414)

In [149]:
nb.predict(trans)

array([1])