# Understanding data set

In [1]:
import pandas as pd
import numpy as np
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [4]:
data = pd.read_csv('D:/Project series/AI/Spam Classification/spam.csv',encoding='cp1252')

In [5]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
data = data[['v1','v2']]

In [7]:
data['v1'] = data['v1'].apply(lambda x:0 if x=='ham' else 1)

In [8]:
data

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


# Text Pre-Processing

In [9]:
def process(x):
    temp = []
    document = nlp(x.lower())
    print(document)
    for i in document:
        if i.is_stop!=True and i.is_punct!= True:
            print(i)
            temp.append(i.lemma_)
            print(temp)
        else:
            pass
        
    return (' '.join(temp))

In [10]:
data['v2'] = data['v2'].apply(lambda x: process(x))

go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat...
jurong
['jurong']
point
['jurong', 'point']
crazy
['jurong', 'point', 'crazy']
available
['jurong', 'point', 'crazy', 'available']
bugis
['jurong', 'point', 'crazy', 'available', 'bugis']
n
['jurong', 'point', 'crazy', 'available', 'bugis', 'n']
great
['jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great']
world
['jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world']
la
['jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la']
e
['jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e']
buffet
['jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet']
cine
['jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine']
got
['jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'get

In [11]:
data.head()

Unnamed: 0,v1,v2
0,0,jurong point crazy available bugis n great wor...
1,0,ok lar joke wif u oni
2,1,free entry 2 wkly comp win fa cup final tkts 2...
3,0,u dun early hor u c
4,0,nah think go usf live


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
vectorizer = TfidfVectorizer(analyzer='word',stop_words='english')

In [14]:
text_vector = vectorizer.fit_transform(data['v2'].values.tolist())

In [15]:
print(text_vector)

  (0, 7022)	0.19889398319567994
  (0, 1017)	0.360053918924906
  (0, 1835)	0.3041750056811069
  (0, 1585)	0.3437103845105694
  (0, 3864)	0.3041750056811069
  (0, 7189)	0.2413691324300752
  (0, 3147)	0.19816108817777053
  (0, 1587)	0.3041750056811069
  (0, 1205)	0.2693085890519748
  (0, 2063)	0.27379188480006533
  (0, 5080)	0.24585242817816577
  (0, 3755)	0.360053918924906
  (1, 4755)	0.5647537939557097
  (1, 7113)	0.4459451111953121
  (1, 3721)	0.47451057922863127
  (1, 3898)	0.4218684931830353
  (1, 4731)	0.2811632882742994
  (2, 77)	0.23954402497920027
  (2, 1088)	0.17052900260807233
  (2, 5363)	0.1647461730977881
  (2, 6734)	0.12679962529284416
  (2, 6154)	0.20060001748596265
  (2, 5318)	0.16821875652773552
  (2, 5405)	0.1647461730977881
  (2, 775)	0.22519382816846675
  :	:
  (5567, 460)	0.2432695511937228
  (5567, 6703)	0.19245234323971885
  (5567, 4522)	0.2549972248204478
  (5567, 6558)	0.16418937860062793
  (5567, 5134)	0.24588982940401446
  (5567, 1841)	0.19350952661778717
  (556

# Splitting Data set

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
x_train, x_test, y_train, y_test = train_test_split(text_vector.toarray(),data['v1'],test_size=0.2,random_state=20)

In [18]:
len(x_test)

1115

# Model Building

In [19]:
from sklearn.naive_bayes import BernoulliNB

In [20]:
modelB = BernoulliNB()
modelB.fit(x_train,y_train)
print(modelB.score(x_train,y_train))

0.9849674669059906


In [21]:
y_predictedB = modelB.predict(x_test)

In [22]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test,y_predictedB))

0.9829596412556054


## Best model is BernoulliNB with  98% Accuracy