# Spam Classifier

### Imports

In [2]:
import csv
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
import nltk
nltk.download('punkt')
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tilak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Reading file

In [3]:
# Load the CSV into memory: `header` is the first line and `rows` holds every
# remaining line as a list of column strings.
# newline="" is what the csv module asks for so that line breaks inside quoted
# fields are parsed correctly, and the `with` block closes the file even if
# parsing raises part-way through.
# NOTE(review): no encoding is passed, so the platform default is used — confirm
# it matches how spam.csv was saved.
with open("spam.csv", newline="") as file:
    csvreader = csv.reader(file)
    header = next(csvreader)
    rows = list(csvreader)
print(header)
# Show the row count and a small sample instead of dumping every row.
print(len(rows))
print(rows[:5])

['v1', 'v2', '', '', '']


### Tokenization and stemming

In [4]:
# Count how often each lower-cased, Porter-stemmed token occurs across all
# messages (the message text is column 1 of every row).
pst = PorterStemmer()
fdist = FreqDist()
for row in rows:
    for word in word_tokenize(row[1]):
        stem = pst.stem(word.lower())
        fdist[stem] += 1
fdist.most_common(30)

[('.', 4865),
 ('i', 2900),
 ('to', 2241),
 ('you', 2228),
 (',', 1871),
 ('?', 1541),
 ('a', 1424),
 ('!', 1381),
 ('the', 1324),
 ('u', 1133),
 ('...', 1131),
 ('and', 977),
 ('it', 951),
 ('&', 916),
 ('is', 898),
 ('in', 887),
 ('me', 804),
 (';', 764),
 ('my', 759),
 (':', 717),
 ('for', 703),
 ('your', 703),
 ('..', 681),
 ('call', 656),
 ('have', 638),
 ('do', 632),
 ('of', 619),
 ('that', 613),
 ('on', 536),
 (')', 494)]

### Selecting the most common words

In [5]:
# Keep only the tokens (dropping their counts) of the 30 most frequent entries,
# in descending-frequency order as returned by most_common().
most_common = [token for token, _count in fdist.most_common(30)]
print(most_common)

['.', 'i', 'to', 'you', ',', '?', 'a', '!', 'the', 'u', '...', 'and', 'it', '&', 'is', 'in', 'me', ';', 'my', ':', 'for', 'your', '..', 'call', 'have', 'do', 'of', 'that', 'on', ')']


### Preprocessing messages (tokenize and stem)

In [6]:
def scaling(st):
    """Tokenize a message and return its lower-cased, Porter-stemmed tokens.

    Despite the name, no numeric scaling happens here: the text is split with
    ``word_tokenize`` and each token is lower-cased and passed through the
    notebook-level stemmer ``pst``.

    Args:
        st: The raw message text.

    Returns:
        A list of stemmed tokens, in the order they appear in the message.
    """
    return [pst.stem(token.lower()) for token in word_tokenize(st)]

In [7]:
# Tokenized and stemmed form of every message (column 1), in the same order as `rows`.
temprow = [scaling(row[1]) for row in rows]
print(temprow)

[['go', 'until', 'jurong', 'point', ',', 'crazi', '..', 'avail', 'onli', 'in', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'cine', 'there', 'got', 'amor', 'wat', '...'], ['ok', 'lar', '...', 'joke', 'wif', 'u', 'oni', '...'], ['free', 'entri', 'in', '2', 'a', 'wkli', 'comp', 'to', 'win', 'fa', 'cup', 'final', 'tkt', '21st', 'may', '2005', '.', 'text', 'fa', 'to', '87121', 'to', 'receiv', 'entri', 'question', '(', 'std', 'txt', 'rate', ')', 't', '&', 'c', "'s", 'appli', '08452810075over18', "'s"], ['u', 'dun', 'say', 'so', 'earli', 'hor', '...', 'u', 'c', 'alreadi', 'then', 'say', '...'], ['nah', 'i', 'do', "n't", 'think', 'he', 'goe', 'to', 'usf', ',', 'he', 'live', 'around', 'here', 'though'], ['freemsg', 'hey', 'there', 'darl', 'it', "'s", 'been', '3', 'week', "'s", 'now', 'and', 'no', 'word', 'back', '!', 'i', "'d", 'like', 'some', 'fun', 'you', 'up', 'for', 'it', 'still', '?', 'tb', 'ok', '!', 'xxx', 'std', 'chg', 'to', 'send', ',', 'å£1.50', 'to', 'rcv'], ['even', '

### Getting features

In [16]:
def getFeatures(arr , most_common):
    """Build a binary presence vector for one tokenized message.

    Args:
        arr: Container of tokens for a single message; only membership
            (``in``) is checked, so repeated tokens make no difference.
        most_common: Vocabulary words; one feature is produced per word,
            in this order.

    Returns:
        A list with one int per entry of ``most_common``: 1 if that word is
        in ``arr``, otherwise 0.
    """
    # `in` always yields a bool, and int(True/False) is exactly 1/0.
    return [int(word in arr) for word in most_common]

In [17]:
# One binary feature row per message, in the same order as `temprow`.
X = [getFeatures(tokens, most_common) for tokens in temprow]
print(X)

[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1], [1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0], [1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 

### Getting target

In [19]:
# Target labels: 0 when the first column is exactly 'ham', 1 for anything else.
y = [0 if row[0] == 'ham' else 1 for row in rows]
print(y)


[0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 

### Splitting dataset into train and test

In [20]:
# Convert to arrays and hold out 20% of the messages for evaluation.
X = np.array(X)
y = np.array(y)
# A fixed random_state makes the shuffle, and therefore every score computed
# from this split, reproducible across kernel restarts. Without it the split
# changes on every run.
X_train , X_test , y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4457, 30)
(1115, 30)
(4457,)
(1115,)


### Model

#### Logistic Regression

In [21]:
# Fit a logistic-regression classifier with scikit-learn's default settings.
# fit() returns the estimator itself, so it can be bound directly; the bare
# `clf` on the last line keeps the cell's displayed output.
clf = LogisticRegression().fit(X_train, y_train)
clf

LogisticRegression()

### Prediction

In [22]:
# Predict labels for the held-out messages, then print the first 50 predictions
# next to the first 50 true labels for a quick visual comparison.
pred = clf.predict(X_test)
for label, values in (("Predicted: ", pred), ("Real: ", y_test)):
    print(label, values[: 50])

Predicted:  [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
Real:  [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0]


### F1 score

In [24]:
# Weighted-average F1 of the predictions on the held-out split.
f1_score(y_true=y_test, y_pred=pred, average='weighted')

0.9373132489998129

### Taking input

In [57]:
# Read one message to classify from the user; the cell waits until text is entered.
st = input("Enter your Email: ")

Enter your Email: tilak


### Data Preprocessing

In [58]:
# Run the entered text through the same two steps used on the dataset rows:
# tokenize/stem with scaling(), then build the presence vector over `most_common`.
stemmed_st = scaling(st)
x_in = getFeatures(stemmed_st, most_common)
print(x_in)


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


### Ham or Spam

In [56]:
# predict() works on a batch, so the single feature vector is wrapped in a list
# and the result is a one-element array. Index that element explicitly instead
# of using the whole array comparison as the `if` condition, which only works
# when the batch has exactly one row.
predicted = clf.predict([x_in])
print("ham" if predicted[0] == 0 else "spam")

ham
