In [1]:
import numpy as np
import pandas as pd

#Data Gathering

data = [
    {   "text":"Congrats, You have won!! reply to our sms for a free nokia mobile + free camcorder.",
        "label" : "spam"},
    {   "text":"Congrats! 1 year special cinema pass for 2 is yours. reply to this sms to claim your prize.",
        "label" : "spam"},
    {   "text":"I am pleased to tell you that you are awarded with a 1500 Bonus Prize, reply to this sms to claim your prize.",
        "label" : "spam"},
    {   "text":"Dont worry. I guess he is busy.","label" : "not spam"},
    {   "text":"Going for dinner. msg you later.", "label" : "not spam"}
]

df = pd.DataFrame(data)

In [2]:
#Data Preprocessing
# getting the tokens for one hot encoding
def filter_word(word):
    stop_words = [".", ",", "!", "+"]
    if(len(word) > 1):
        while(word[-1] in stop_words):
            word = word[:-1]            
        return word.lower()

def filter_data(corpus):
    tokens = set()
    for sentence in corpus:
        if(len(sentence) > 1):
            for token in sentence.split(" "):
                token = filter_word(token)
                if(token):
                    tokens.add(token)
    return tokens

# assigning to all the columns in a dataframe
def initialize_df(df):
    row = df.shape[0]
    col = df.shape[1]
    df.loc[row] = [0 for n in range(col)]
    return df

# add train data to the dataframe
def add_to_df(text, label, df):
    df = initialize_df(df)
    row = df.shape[0]-1
    for word in text.lower().split(" "):
        word = filter_word(word)

        if(word in df.columns):
            df.loc[row, word] = df.loc[row, word] + 1
        df.loc[row, "label"] = int(1 if label == "spam" else 0)
    return df
        
            

In [3]:
# Creating a model
# Calculating the probablity
def calculate_prob(sentence, df2):
    words_count = dict()
    total = len(df2["label"])
    positive = len(df2[(df2["label"] == 1)])
    negative = total-positive
    positive_prob = positive/total
    negative_prob = negative/total
    
    prop_yes_given_word = 1
    prop_no_given_word = 1
    
#     Calculating the positive probablity
    for word in sentence.split(" "):
        word = filter_word(word)
        if(word and word in df2.columns.tolist()):
            prop_yes_given_word *= (np.sum(df2[(df2["label"] == 1)][word]) + 1)/positive
    
#     Calculating the negative probablity
    for word in sentence.split(" "):
        word = filter_word(word)
        if(word and word in df2.columns.tolist()):
            prop_no_given_word *= (np.sum(df2[(df2["label"] == 0)][word]) + 1)/negative
    
    prop_yes_given_word = positive_prob * prop_yes_given_word    
    prop_no_given_word = negative_prob * prop_no_given_word
    
    if(prop_yes_given_word > prop_no_given_word):
        return "Spam"
    
    return "Not Spam"

In [4]:
#making predictions
def predict(sentence):
    corpus = df['text']
    tokens = filter_data(corpus)

    data_collection = [{"text" : text, "label" : label} for text, label in zip(df['text'].tolist(), df['label'].tolist())]

    df2 = pd.DataFrame(columns = tokens)
    for data in data_collection:
        df2 = add_to_df(df = df2, **data)

    return calculate_prob(sentence, df2)


In [14]:
predict("I am busy. I will msg you later.")

'Not Spam'

# Testing

In [6]:
corpus = df['text']
tokens = filter_data(corpus)
tokens

{'1500',
 'am',
 'are',
 'awarded',
 'bonus',
 'busy',
 'camcorder',
 'cinema',
 'claim',
 'congrats',
 'dinner',
 'dont',
 'for',
 'free',
 'going',
 'guess',
 'have',
 'he',
 'is',
 'later',
 'mobile',
 'msg',
 'nokia',
 'our',
 'pass',
 'pleased',
 'prize',
 'reply',
 'sms',
 'special',
 'tell',
 'that',
 'this',
 'to',
 'with',
 'won',
 'worry',
 'year',
 'you',
 'your',
 'yours'}

In [7]:
data_collection = [{"text" : text, "label" : label} for text, label in zip(df['text'].tolist(),
                                                                           df['label'].tolist())]
df2 = pd.DataFrame(columns = tokens)
add_to_df(df = df2, text = "Dont worry. I guess he is busy.", label = "not spam")

Unnamed: 0,he,prize,for,guess,cinema,pleased,bonus,this,claim,to,...,have,dinner,won,worry,you,reply,am,with,are,label
0,1,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0.0


In [8]:
df2 = pd.DataFrame(columns = tokens)
for data in data_collection:
    df2 = add_to_df(df = df2, **data)


In [9]:
total = len(df2["label"])
positive = len(df2[(df2["label"] == 1)])
negative = total-positive
positive_prob = positive/total
negative_prob = negative/total

print(" total :", total,"\n","positive :", positive, "\n","negative : ", negative)
print()
print(" positive probablity :", positive_prob, "\n","negative probalbity : ", negative_prob)


 total : 5 
 positive : 3 
 negative :  2

 positive probablity : 0.6 
 negative probalbity :  0.4


In [10]:
df2[(df2["label"] == 1)]["mobile"]

0    1
1    0
2    0
Name: mobile, dtype: int64

In [11]:
# prop_yes_given_word = (np.sum(df2[(df2["label"] == 1)]["you"]) + 1)/positive
# prop_yes_given_word

In [12]:
prop_yes_given_word = 1
sentence = "I am busy. I will msg you later."
for word in sentence.split(" "):
        word = filter_word(word)
        if(word and word in df2.columns.tolist()):
            prop_yes_given_word *= (np.sum(df2[(df2["label"] == 1)][word]) + 1)/positive
prop_no_given_word = 1

for word in sentence.split(" "):
    word = filter_word(word)
    if(word and word in df2.columns.tolist()):
        prop_no_given_word *= (np.sum(df2[(df2["label"] == 0)][word]) + 1)/negative


print("Spam : ", prop_yes_given_word)
print("Not Spam: ",prop_no_given_word)

Spam :  0.03292181069958847
Not Spam:  0.5


In [16]:
!open .