In [1]:
import pandas as pd
import re

In [2]:
mail_df = pd.read_csv("spam.csv" , usecols=[0,1] , names = ["label" , "mail"])
mail_df = mail_df.drop(axis = 0 , labels = 0)

In [3]:
mail_df.head()

Unnamed: 0,label,mail
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...
5,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Cleaning the data
def cleaner(mail):
    mail = re.sub(r'[^\w\s]' , '' , mail) # removing punctuations
    mail = mail.lower()
    return mail.split()

mail_df["mail"] = mail_df["mail"].apply(cleaner)

In [5]:
# Splitting into train and test set
train = mail_df.sample(frac = 0.8 , random_state = 0)
test = mail_df.drop(train.index)
train = train.reset_index(drop = True)
test = test.reset_index(drop = True)

In [6]:
train["label"].value_counts() , test["label"].value_counts()

(ham     3857
 spam     601
 Name: label, dtype: int64,
 ham     968
 spam    146
 Name: label, dtype: int64)

In [7]:
vocabulary = {word : 0 for word in train["mail"].sum()}

In [8]:
data_vocabulary = []
for row in train.iterrows():
    temp = vocabulary.copy()
    for word in row[1][1]:
        temp[word] = 1
    data_vocabulary.append(temp)
vocb_df = pd.DataFrame(data_vocabulary)
vocb_df.head()

Unnamed: 0,aight,should,i,just,plan,to,come,up,later,tonight,...,toyota,camry,olayiwolas,mileage,kits,landing,kane,shud,feelin,nuther
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
vocb_df.tail()

Unnamed: 0,aight,should,i,just,plan,to,come,up,later,tonight,...,toyota,camry,olayiwolas,mileage,kits,landing,kane,shud,feelin,nuther
4453,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,0,0,1,0,0,1,0,0,0,0,...,1,1,1,1,1,1,0,0,0,0
4456,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
4457,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [10]:
counts = train["label"].value_counts()

In [11]:
counts

ham     3857
spam     601
Name: label, dtype: int64

In [12]:
phi_ham , phi_spam = counts["ham"]/train.shape[0] , counts["spam"]/train.shape[0]
phi_ham , phi_spam

(0.8651861821444594, 0.1348138178555406)

In [13]:
ham_word_cnt = train[train["label"] == "ham"].apply(len).sum()
spam_word_cnt = train[train["label"] == "spam"].apply(len).sum()
ham_word_cnt , spam_word_cnt

(7714, 1202)

In [14]:
vocab_cnt = len(vocabulary.keys())
vocab_cnt

8409

In [15]:
def p_word(word):
    
    if word in vocb_df.columns:
        p_ham = (vocb_df[train["label"] == "ham"][word].sum() + 1)/(ham_word_cnt + vocab_cnt)
        p_spam = (vocb_df[train["label"] == "ham"][word].sum() + 1)/(spam_word_cnt + vocab_cnt)
        return p_ham , p_spam
    
    else:
        return 1 , 1

def predict(mail):
    
    p_ham_mail , p_spam_mail = phi_ham , phi_spam
    
    for word in mail:
        pham , pspam = p_word(word)
        p_ham_mail *= pham
        p_spam_mail *= pspam
    
    if p_ham_mail >= p_spam_mail:
        return "ham"
    elif p_ham_mail < p_spam_mail:
        return "spam"

In [19]:
mail = "haha , buy our product and get lost"
predict(cleaner(mail))

'spam'