In [80]:
from google.colab import files
upload = files.upload()

Saving spam_ham_dataset.csv to spam_ham_dataset (2).csv


In [81]:
import pandas as pd
import numpy as np
import nltk
nltk.download('popular')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from math import log, sqrt

mails = pd.read_csv('spam_ham_dataset.csv')
mails.drop('Unnamed: 0', axis=1, inplace=True)
mails.drop('label',axis=1,inplace=True)
mails.columns = ['email', 'label'] #spam=1 ham=0

total = mails.shape[0]
train_index = list()
test_index = list()

for i in range(total):
  if np.random.uniform(0,1) < 0.80:
    train_index += [i]
  else:
    test_index += [i]

train = mails.loc[train_index]
test = mails.loc[test_index]

train.reset_index(inplace=True)
train.drop(['index'],axis=1, inplace=True)

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

In [82]:
def Preprocess(email):
    email = email.lower()
    words = word_tokenize(email)
    wrds = []
    for i in words:
      if len(i) > 2:
        wrds.append(i)
   
    sw = stopwords.words('english')
    wrds1 = []
    for i in wrds:
      if i not in sw:
        wrds1.append(i)
    
    stemmer = PorterStemmer()
    wrds2 = []
    for i in wrds1:
      wrds2.append(stemmer.stem(i))   
    
    return wrds2

In [86]:
class Classifier(object):
    def __init__(self, train):
        self.mails, self.labels = train['email'], train['label']

    def TrainAlgo(self):
        self.WordFrequency()
        self.CalcProb()
   
    def WordFrequency(self):
        noOfEmails = self.mails.shape[0]
        self.SpamMails = self.labels.value_counts()[1]
        self.HamMails = self.labels.value_counts()[0]
        self.TotalMails = self.SpamMails + self.HamMails
        self.SpamWords = 0
        self.HamWords = 0
        self.EmailSpam = {}
        self.EmailHam = {}
        self.DataSpam = {}
        self.DataHam = {}
        for i in range(noOfEmails):
            emails_processed = Preprocess(self.mails[i])
            unique_vocab = [] 
            for word in emails_processed:
                if self.labels[i]:
                    self.EmailSpam[word] = self.EmailSpam.get(word, 0) + 1
                    self.SpamWords += 1
                else:
                    self.EmailHam[word] = self.EmailHam.get(word, 0) + 1
                    self.HamWords += 1
                if word not in unique_vocab:
                    unique_vocab += [word]
            for word in unique_vocab:
                if self.labels[i]:
                    self.DataSpam[word] = self.DataSpam.get(word, 0) + 1
                else:
                    self.DataHam[word] = self.DataHam.get(word, 0) + 1
    
    def CalcProb(self):
        self.ProbSpam = {}
        self.ProbHam = {}
        self.SumSpam = 0
        self.SumHam = 0
        for word in self.EmailSpam:
            self.ProbSpam[word] = (self.EmailSpam[word]) * log((self.SpamMails + self.HamMails)/ (self.DataSpam[word] + self.DataHam.get(word, 0)))
            self.SumSpam += self.ProbSpam[word]
        for word in self.EmailSpam:
            self.ProbSpam[word] = (self.ProbSpam[word] + 1) / (self.SumSpam + len(list(self.ProbSpam.keys())))
            
        for word in self.EmailHam:
            self.ProbHam[word] = (self.EmailHam[word]) * log((self.SpamMails + self.HamMails)/ (self.DataSpam.get(word, 0) + self.DataHam[word]))
            self.SumHam += self.ProbHam[word]
        for word in self.EmailHam:
            self.ProbHam[word] = (self.ProbHam[word] + 1) / (self.SumHam + len(list(self.ProbHam.keys())))
            
    
        self.ProbSpamMail = self.SpamMails / self.TotalMails
        self.ProbHamMail = self.HamMails / self.TotalMails

    def Classify(self, ProcessedEmails):
        pSpam, pHam = 0, 0
        for word in ProcessedEmails:                
            if word in self.ProbSpam:
                pSpam += log(self.ProbSpam[word])
            else:
                pSpam -= log(self.SumSpam + len(list(self.ProbSpam.keys())))
            if word in self.ProbHam:
                pHam += log(self.ProbHam[word])
            else:
                pHam -= log(self.SumHam + len(list(self.ProbHam.keys()))) 
            pSpam += log(self.ProbSpamMail)
            pHam += log(self.ProbHamMail)
        return pSpam >= pHam
    
    def Predict(self, test):
        result = {}
        for (i, email) in enumerate(test):
            ProcessedEmails = Preprocess(email)
            result[i] = int(self.Classify(ProcessedEmails))
        return result         

In [87]:
def main():
  c = Classifier(train)
  c.TrainAlgo()
  preds = c.Predict(test['email'])

  Test = []
  for i in test['label']:
    Test+= [i]

  l = len(preds)
  Pred = []
  for i in range(l):
    Pred += [preds[i]]

  TP = 0 #true positive
  TN = 0 #true negative
  FP = 0 #flase positive
  FN = 0 #flase negative

  for i in range(len(Test)):
    if Test[i] == 1 and Pred[i] == 1 :
      TP += 1
      continue
    if Test[i] == 0 and Pred[i] == 0:
      TN += 1
      continue
    if Test[i] == 0 and Pred[i] == 1:
      FP += 1
      continue
    if Test[i] == 1 and Pred[i] == 0:
      FN += 1

  A = (TP + TN) / (TP + TN + FP + FN)
  print('Accuracy: ',A*100)

In [88]:
main()

Accuracy:  85.62091503267973
