In [1]:
import pandas as pd
import numpy as np

## Importing Data

In [2]:
df = pd.read_csv('spam.csv', encoding="ISO-8859-1")

In [3]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [5]:
columns_to_drop = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']

In [6]:
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')

In [9]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Data Preprocessing

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
le = LabelEncoder()

In [12]:
# Replace the text labels in column 'v1' with the integer codes produced by
# the encoder (the frame shown afterwards has 0 for ham rows and 1 for spam).
df['v1'] = le.fit_transform(df['v1'])

In [14]:
df.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## Label encoding: 0 -> ham, 1 -> spam

In [15]:
y = df.v1

In [16]:
np.unique(y, return_counts=True)

(array([0, 1]), array([4825,  747]))

In [18]:
# Peek at the raw text of the row labelled 4 with a single label-based lookup.
df.loc[4, 'v2']

"Nah I don't think he goes to usf, he lives around here though"

## Natural Language Toolkit for Cleaning, Stemming and Lemmatizing of Words

In [19]:
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer

In [20]:
# English stopwords as a set, so they can be combined with the punctuation set
# and subtracted from token sets later on.
# NOTE(review): presumably requires the NLTK 'stopwords' and 'wordnet' corpora
# to be downloaded beforehand -- confirm for a fresh environment.
stops = set(stopwords.words('english'))
# Stemmer and lemmatizer instances used by clean_text.
stemmer = LancasterStemmer()
lamet = WordNetLemmatizer()

In [21]:
import string
punc = set(string.punctuation)

In [22]:
bad = punc.union(stops)

In [23]:
import re

In [24]:
from nltk.tokenize import word_tokenize

In [25]:
def clean_text(message):
    """Normalize a raw message into a space-separated string of stemmed tokens.

    Steps: lowercase, replace every run of non-letter characters with a
    space, tokenize, drop repeated tokens, drop tokens found in ``bad``
    (stopwords and punctuation), lemmatize, then stem.

    Parameters
    ----------
    message : str
        Raw message text. Non-string values (e.g. ``None`` or the NaN pandas
        uses for a missing cell) are treated as an empty message.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces, in order of first
        appearance in the message.
    """
    if not isinstance(message, str):
        return ""

    message = re.sub("[^a-zA-Z]+", " ", message.lower())

    # dict.fromkeys() drops repeated tokens just like set() did, but keeps
    # first-occurrence order. Iterating a set of strings depends on string
    # hashing, so the previous output order could change between kernels.
    unique_words = dict.fromkeys(word_tokenize(message))
    important_words = [word for word in unique_words if word not in bad]

    # Lemmatize first, then stem the lemma (same order as before).
    important_words = [stemmer.stem(lamet.lemmatize(word)) for word in important_words]

    return " ".join(important_words)

In [27]:
clean_text(df['v2'][4])

'though go usf lif around think nah'

## Apply text cleaning to all rows of column 'v2'

In [28]:
df['v2'] = df['v2'].apply(clean_text)

In [30]:
df.head()

Unnamed: 0,v1,v2
0,0,e wat avail la got am crazy point jurong go bu...
1,0,on u ok lar jok wif
2,1,cup text win std may wkly receiv txt fin fre s...
3,0,u ear say dun already c hor
4,0,though go usf lif around think nah


## Vectorization of Words

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

In [32]:
# Bag-of-words vectorizer: min_df=2 ignores terms that appear in fewer than
# two messages, and max_features=500 caps the vocabulary at the 500 most
# frequent remaining terms (hence the 500 columns of the transformed matrix).
vect = CountVectorizer(max_features=500, min_df=2)

In [33]:
X = df.v2.values  # cleaned message strings as a 1-D NumPy array

In [34]:
X.shape

(5572,)

In [35]:
y = df.v1.values

In [36]:
y.shape

(5572,)

In [37]:
# Learn the vocabulary from the cleaned messages.
# NOTE(review): this fits on every row of X, and the train/test split only
# happens later, so test messages influence which terms become features.
# Consider splitting first and fitting on the training portion only.
vect.fit(X)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=500, min_df=2,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [38]:
# toarray() returns a plain ndarray. todense() returns np.matrix, which
# current scikit-learn estimators refuse as input.
X_mod = vect.transform(X).toarray()

In [39]:
X_mod.shape

(5572, 500)

In [40]:
from sklearn.model_selection import train_test_split

In [42]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split reproducible. (The pasted "..." continuation prompt was removed: it
# only worked because IPython strips prompts and is invalid plain Python.)
X_train, X_test, y_train, y_test = train_test_split(
    X_mod, y, test_size=0.33, random_state=42)

## Naive Bayes

In [43]:
from sklearn.naive_bayes import BernoulliNB

In [44]:
bnb = BernoulliNB()

In [45]:
bnb.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [47]:
print(bnb.score(X_test, y_test))

0.9804241435562806


## Accuracy score on the held-out test set (33% of the data): 98.04%

## Testing on hand-written example messages
- Spam : 1
- Ham : 0

In [50]:
# Hand-written example that should be classified as spam (label 1).
text = "IMPORTANT - You could be entitled up to £3,160 in compensation from mis-sold PPI on a credit card or loan. Please reply PPI for info or STOP to opt out."

In [51]:
# Run the message through the same clean_text() pipeline that was applied to
# the training column, so its stemmed tokens match the fitted vocabulary.
# Lower-casing alone leaves unstemmed words that the vectorizer never saw.
# toarray() gives a plain ndarray instead of the np.matrix from todense().
test_vec = vect.transform([clean_text(text)]).toarray()

In [70]:
bnb.predict(test_vec)


array([1])

In [71]:
# Hand-written example that should be classified as ham (label 0).
text_2 = "let's meet tomorrow for the meeting"

In [72]:
# Apply the same clean_text() preprocessing used for training before
# vectorizing, and use toarray() to get a plain ndarray rather than np.matrix.
test_vec_2 = vect.transform([clean_text(text_2)]).toarray()

In [73]:
bnb.predict(test_vec_2)


array([0])