In [111]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

In [112]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [113]:
df=pd.read_csv('emails.csv')
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [114]:
len_spam=(df.spam==1).sum()
len_spam

np.int64(1368)

In [115]:
len_ham=(df.spam==0).sum()
len_ham

np.int64(4360)

In [116]:
print(f"Proportion of spam emails: {len_spam/(len_spam+len_ham)}")

Proportion of spam emails: 0.2388268156424581


In [117]:
df.text[0][9:]

"naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affordability : your  marketing b

In [118]:
def preprocess_email(df):
  #shuffle
  df=df.sample(frac=1,ignore_index=True,random_state=42)
  #removing Subject word
  text=np.array(df.text.apply(lambda x: x[9:]))
  labels=np.array(df.spam)
  return text,labels

In [119]:
text,labels=preprocess_email(df)

In [120]:
len(text)

5728

In [121]:
labels[:5]

array([0, 0, 0, 0, 0])

In [122]:
type(text[1])

str

In [123]:
type(text)

numpy.ndarray

In [124]:
def preprocess_text(text):
  stop=set(stopwords.words('english')+list(string.punctuation))
  processed_text=[]
  for email in text:
    words=word_tokenize(email.lower())
    processed_text.append([word for word in words if word not in stop])
  return processed_text

In [125]:
processed_text=preprocess_text(text)

In [126]:
#till now we have got processed text, labels

In [127]:
TRAIN_SIZE = int(0.80*len(processed_text)) # 80% of the samples will be used to train.

X_train = processed_text[:TRAIN_SIZE]
Y_train = labels[:TRAIN_SIZE]
X_test = processed_text[TRAIN_SIZE:]
Y_test = labels[TRAIN_SIZE:]

In [128]:
print(f"Proportion of spam in train dataset: {sum(Y_train == 1)/len(Y_train):.4f}")
print(f"Proportion of spam in test dataset: {sum(Y_test == 1)/len(Y_test):.4f}")

Proportion of spam in train dataset: 0.2431
Proportion of spam in test dataset: 0.2216


### Naive bayes algorithm

In [129]:
#first get probablity of spam and ham
p_spam=sum(Y_train==1)/len(Y_train)
p_ham=sum(Y_train==0)/len(Y_train)
print(f"Probability of spam: {p_spam:.4f}")
print(f"Probability of ham: {p_ham:.4f}")

Probability of spam: 0.2431
Probability of ham: 0.7569


In [130]:
#check how many times individual words are seen in  spam and ham
word_freq={}
for label,email in zip(Y_train,X_train):
  email=set(email) #To remove repeated words in the email.
  for word in email:
    if word not in word_freq:
      word_freq[word]={"spam":1,"ham":1}

    if label==1:
      word_freq[word]["spam"]=word_freq[word]["spam"]+1
    else:
      word_freq[word]["ham"]=word_freq[word]["ham"]+1


In [131]:
def predict(email,p_spam,p_ham,word_freq):
  spam_chances=p_spam
  ham_chance=p_ham

  for word in email:
    if word in word_freq:
      spam_chances=spam_chances*word_freq[word]["spam"]
      ham_chance=ham_chance*word_freq[word]["ham"]

  if spam_chances>=ham_chance:
    return 1
  else:
    return 0

In [132]:
#test
correctly_predicted=0
wrong_predicted=0
for email,label in zip(X_test,Y_test):
  prediction=predict(email,p_spam,p_ham,word_freq)
  if prediction==label:
    correctly_predicted+=1
  else:
    wrong_predicted+=1

print(f"Correctly predicted: {correctly_predicted}")
print(f"Wrongly predicted: {wrong_predicted}")
print(f"Accuracy: {correctly_predicted/len(Y_test):.4f}")

Correctly predicted: 895
Wrongly predicted: 251
Accuracy: 0.7810


  ham_chance=ham_chance*word_freq[word]["ham"]
  spam_chances=spam_chances*word_freq[word]["spam"]


The warning is due to underflow, which is solved in log naive bayes algorithm

In [133]:
def log_bayes_algo(email,p_spam,p_ham,word_freq):
  spam_chances=np.log(p_spam)
  ham_chance=np.log(p_ham)

  for word in email:
    if word in word_freq:
      spam_chances=spam_chances+np.log(word_freq[word]["spam"])
      ham_chance=ham_chance+np.log(word_freq[word]["ham"])

  if spam_chances>ham_chance:
    return 1
  else:
    return 0

In [134]:
#test
correctly_predicted=0
wrong_predicted=0
for email,label in zip(X_test,Y_test):
  prediction=log_bayes_algo(email,p_spam,p_ham,word_freq)
  if prediction==label:
    correctly_predicted+=1
  else:
    wrong_predicted+=1

print(f"Correctly predicted: {correctly_predicted}")
print(f"Wrongly predicted: {wrong_predicted}")
print(f"Accuracy: {correctly_predicted/len(Y_test):.4f}")

Correctly predicted: 971
Wrongly predicted: 175
Accuracy: 0.8473
