# Spam Classifier

### Imports

In [1]:
import csv
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
import nltk
nltk.download('punkt')
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tilak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Reading file

In [20]:
# Load the dataset: the first CSV record is the header, every remaining record
# is kept as one example row.
# `with` closes the file even if parsing raises; newline="" is what the csv
# module documentation asks for so the reader handles line endings itself.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm the file's encoding and pass it explicitly if this must run elsewhere.
with open("spam.csv", newline="") as file:
    csvreader = csv.reader(file)
    header = next(csvreader)
    rows = list(csvreader)
print(header)
# Printing first 10 examples.
print(rows[: 10])

['v1', 'v2', '', '', '']
[['ham', 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', '', '', ''], ['ham', 'Ok lar... Joking wif u oni...', '', '', ''], ['spam', "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", '', '', ''], ['ham', 'U dun say so early hor... U c already then say...', '', '', ''], ['ham', "Nah I don't think he goes to usf, he lives around here though", '', '', ''], ['spam', "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv", '', '', ''], ['ham', 'Even my brother is not like to speak with me. They treat me like aids patent.', '', '', ''], ['ham', "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertu

### Tokenization and stemming

In [4]:
# Count how often each lower-cased, stemmed token occurs across the text
# column (index 1) of every row.
pst = PorterStemmer()
fdist = FreqDist()
for record in rows:
    for token in word_tokenize(record[1]):
        stem = pst.stem(token.lower())
        fdist[stem] += 1
# Printing 30 most common words in the email body.
fdist.most_common(30)

[('.', 4865),
 ('i', 2900),
 ('to', 2241),
 ('you', 2228),
 (',', 1871),
 ('?', 1541),
 ('a', 1424),
 ('!', 1381),
 ('the', 1324),
 ('u', 1133),
 ('...', 1131),
 ('and', 977),
 ('it', 951),
 ('&', 916),
 ('is', 898),
 ('in', 887),
 ('me', 804),
 (';', 764),
 ('my', 759),
 (':', 717),
 ('for', 703),
 ('your', 703),
 ('..', 681),
 ('call', 656),
 ('have', 638),
 ('do', 632),
 ('of', 619),
 ('that', 613),
 ('on', 536),
 (')', 494)]

### Selecting the most common words

In [5]:
# Keep only the token from each (token, count) pair of the 100 most frequent stems.
most_common = [token for token, _count in fdist.most_common(100)]
print(most_common)

['.', 'i', 'to', 'you', ',', '?', 'a', '!', 'the', 'u', '...', 'and', 'it', '&', 'is', 'in', 'me', ';', 'my', ':', 'for', 'your', '..', 'call', 'have', 'do', 'of', 'that', 'on', ')', "'s", 'are', 'now', '2', 'so', 'go', 'get', 'not', 'but', 'be', 'or', 'can', 'at', 'we', "'m", 'will', 'if', 'ur', 'with', 'just', "n't", 'no', 'thi', 'how', 'gt', 'lt', '*', 'up', 'what', 'come', 'when', '4', "''", '#', 'from', 'free', 'know', 'all', 'out', 'ok', 'like', 'love', 'got', 'time', 'wa', 'want', 'good', 'day', 'then', "'ll", 'there', '-', 'he', 'text', 'am', 'onli', 'send', 'hi', 'need', 'one', 'txt', 'as', 'today', 'see', 'by', 'take', 'about', 'think', 'did', 'home']


### Text preprocessing (tokenizing and stemming helper)

In [6]:
def scaling(st):
    """Tokenize ``st`` and return its tokens lower-cased and stemmed.

    The text is split with ``word_tokenize``; each token is lower-cased and
    passed through the module-level stemmer ``pst``.

    Returns:
        list: the stemmed tokens, in the order they appear in ``st``.
    """
    return [pst.stem(piece.lower()) for piece in word_tokenize(st)]

In [19]:
# Turn the text column (index 1) of every row into a list of stemmed tokens.
temprow = [scaling(record[1]) for record in rows]
# Printing first 10 examples tokenized.
print(temprow[: 10])

[['go', 'until', 'jurong', 'point', ',', 'crazi', '..', 'avail', 'onli', 'in', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'cine', 'there', 'got', 'amor', 'wat', '...'], ['ok', 'lar', '...', 'joke', 'wif', 'u', 'oni', '...'], ['free', 'entri', 'in', '2', 'a', 'wkli', 'comp', 'to', 'win', 'fa', 'cup', 'final', 'tkt', '21st', 'may', '2005', '.', 'text', 'fa', 'to', '87121', 'to', 'receiv', 'entri', 'question', '(', 'std', 'txt', 'rate', ')', 't', '&', 'c', "'s", 'appli', '08452810075over18', "'s"], ['u', 'dun', 'say', 'so', 'earli', 'hor', '...', 'u', 'c', 'alreadi', 'then', 'say', '...'], ['nah', 'i', 'do', "n't", 'think', 'he', 'goe', 'to', 'usf', ',', 'he', 'live', 'around', 'here', 'though'], ['freemsg', 'hey', 'there', 'darl', 'it', "'s", 'been', '3', 'week', "'s", 'now', 'and', 'no', 'word', 'back', '!', 'i', "'d", 'like', 'some', 'fun', 'you', 'up', 'for', 'it', 'still', '?', 'tb', 'ok', '!', 'xxx', 'std', 'chg', 'to', 'send', ',', 'å£1.50', 'to', 'rcv'], ['even', '

### Getting features

In [8]:
def getFeatures(arr , most_common):
    """Build a binary presence vector for ``arr`` over ``most_common``.

    Args:
        arr: container of tokens belonging to one example.
        most_common: ordered vocabulary; one output entry is produced per word.

    Returns:
        list: for each word of ``most_common`` (in order), 1 if the word is
        contained in ``arr`` and 0 otherwise.
    """
    return [1 if word in arr else 0 for word in most_common]

In [17]:
# One binary feature vector per tokenized example, in the same order as temprow.
X = [getFeatures(tokens, most_common) for tokens in temprow]
#Printing feature vectors of first 10 examples.
print(X[: 10])

[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

### Getting target

In [11]:
# Encode the label in column 0: 'ham' becomes 0, any other value becomes 1.
y = [0 if record[0] == 'ham' else 1 for record in rows]
# Printing the first 10 target values.
print(y[: 10])


[0, 0, 1, 0, 0, 1, 0, 0, 1, 1]


### Splitting dataset into train and test

In [12]:
# Convert the Python lists to arrays, then hold out 20% of the examples for testing.
X = np.array(X)
y = np.array(y)
# random_state pins the shuffle so a re-run reproduces the same split (and the
# same downstream scores); stratify=y keeps the class proportions equal in the
# train and test partitions.
X_train , X_test , y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42, stratify = y
)
# Shapes of the four resulting partitions.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4457, 100)
(1115, 100)
(4457,)
(1115,)


### Model

#### Logistic Regression

In [13]:
# Fit a logistic-regression classifier, constructed with no arguments, on the
# training split.
clf = LogisticRegression()
clf.fit(X_train , y_train)

LogisticRegression()

### Prediction

In [15]:
# Predict labels for the held-out split and show them next to the true labels.
pred = clf.predict(X_test)
# Checking first 50 predictions.
preview = slice(50)
print("Predicted: " , pred[preview])
print("Real: " , y_test[preview])

Predicted:  [0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0
 0 0 0 0 0 0 1 1 0 0 0 0 1]
Real:  [0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0
 0 0 0 0 0 0 1 1 0 0 0 0 1]


### F1 score

In [16]:
# F1 on the test split. average='weighted' averages the per-class F1 scores
# weighted by each class's support (number of true instances).
f1_score(y_test, pred, average='weighted')

0.9640356446116066

### Taking input

In [81]:
# Read the text to classify from the user; execution waits here until it is entered.
st = input("Enter your Email body: ")

Enter your Email body:  Can't read or see images?  View this email in a browser  	 https://campaign-image.com/zohocampaigns/133052000002457092_zc_v53_artboard_1.png Participate Now Dear MyPeers,  Today is Day 1 of the live sessions at The EnthusiasTech Summit 2022! Register yourself at tes.myways.ai If you have already registered, "Enroll" yourself in specific events Today's session are planned from 5pm to 8pm, Details are attached below. Please note that all sessions today will have the same Zoom link. Please join 10 mins prior to the event you are interested in! Or stay for the whole 3 hours, we would be happy to host you! Please find the detailed agenda below!


### Data Preprocessing

In [82]:
# Run the entered text through the same two steps used for the dataset rows:
# tokenize/stem with scaling(), then encode against the most_common vocabulary.
stemmed_st = scaling(st)
x_in = getFeatures(stemmed_st, most_common)
print(x_in)


[1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]


### Ham or Spam

In [83]:
# The single feature vector is wrapped in a list so predict receives one row;
# label 0 is reported as "ham", anything else as "spam".
predicted = clf.predict([x_in])
print("ham" if predicted[0] == 0 else "spam")

spam
