In [1]:
import pandas as pd
import re

ham_spam_sms = pd.read_csv("SMSSpamCollection.txt",
                            delimiter = "\t",header=None, names=['Class', 'SMS'])
ham_spam_sms


Unnamed: 0,Class,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [2]:
#we are counting how many are spam and how many are hams in the dataset

ham_spam_sms['Class'].value_counts(normalize = False)

ham     4825
spam     747
Name: Class, dtype: int64

In [3]:
#we are randomizing the dataset so that our model can learn properly from distributed datasets.

from sklearn.model_selection import KFold

training_test = round(len(ham_spam_sms) * 0.80)

train_set_final = ham_spam_sms[:training_test].reset_index(drop = True)
test_set_final = ham_spam_sms[training_test:].reset_index(drop = True)

# prepare cross validation
kfold = KFold(5)
train_set = []
test_set = []

# enumerate splits
for train, test in kfold.split(train_set_final):
    train_set.append(train)
    test_set.append(test)

In [4]:
# train_set_final[train_set_final.index.isin(train_set[1])]

In [5]:
# vocab = ("A","B")
# per_sms_words_count = {unique_word : [0] * len(train_set_one['SMS']) for unique_word in vocab}
# word_count = pd.DataFrame(per_sms_words_count)
# cleaned_train_set = pd.concat([train_set_one, word_count], axis = 1)
# cleaned_train_set

In [6]:
# Training the model for k folds with training datset (80% of orginal data)

# declaring empty dictionaries and variables to store parameters for every fold
para_spam_k = {}
para_ham_k = {}
prob_spam_k = 0
prob_ham_k = 0

k = 5
    
for i in range(0, k):
    
    train_set_i = train_set_final[train_set_final.index.isin(train_set[i])]
    validation_set_i = train_set_final[train_set_final.index.isin(test_set[i])]
    
    
    train_set_i['SMS'] = train_set_i['SMS'].str.replace('\W', ' ', regex = True) # It Removes punctuation
    
    #It is used to lowercase all strings
    train_set_i['SMS'] = train_set_i['SMS'].str.lower()

################################################

    train_set_i['SMS'] = train_set_i['SMS'].str.split() #splitting the string at the space character in train_set

    vocabs = [] #declaring a list to contain all words

    for message in train_set_i['SMS']: #appending all the training words in list
        for word in message:
            vocabs.append(word)

    vocabs = list(set(vocabs)) #this is done to eliminate the duplicates by using set function
    

####################################################
#initialising a dictionary with all value 0 and length of each dictionary is words in vocabs

    per_sms_words_count = {unique_word : [0] * len(train_set_i['SMS']) for unique_word in vocabs}

    for index, sms in enumerate(train_set_i['SMS']):
        for word in sms:
            per_sms_words_count[word][index] += 1
        
#####################################################
        
    #it will display how many times word is used in a sentence for an index

    word_count = pd.DataFrame(per_sms_words_count)
    
    cleaned_train_set = pd.concat([train_set_i, word_count], axis = 1)

#done with cleaning the dataset


########################################################

     #isolating sam and ham messages first

    spam_messages  = cleaned_train_set[cleaned_train_set['Class'] == 'spam']
    ham_messages = cleaned_train_set[cleaned_train_set['Class'] == 'ham']

     #probability of spam and ham messages
    prob_ham = len(ham_messages)/len(cleaned_train_set)
    prob_spam = len(spam_messages)/len(cleaned_train_set)
    
    prob_ham_k += (prob_ham)
    prob_spam_k += (prob_spam)

    #no. of words in all spam msgs
    words_spam_messages = spam_messages['SMS'].apply(len)
    n_spam = words_spam_messages.sum()

    #no. of words in all ham msgs
    words_ham_messages = ham_messages['SMS'].apply(len)
    n_ham = words_ham_messages.sum()

    #no. of vacabulary
    n_vocabs = len(vocabs)

    #laplace smoothening
    alpha = 1

#laplace smoothening is used so that prob can't be zero otherwise our classifier will classify that
#message as only a single classifier no matter how many times spam words has been occured in that message
        
    
###############################################################

    #initiating parameters    (conditional probabilities)
    parameters_spam = {unique_word:0 for unique_word in vocabs}
    parameters_ham = {unique_word:0 for unique_word in vocabs}


#calculating the parameters   (conditional probabilities)
    for words in vocabs:
        n_words_given_spam = spam_messages[words].sum()
        prob_words_given_spam = (n_words_given_spam + alpha)/(n_spam + alpha*n_vocabs)
        
        parameters_spam[words] = prob_words_given_spam
        if words in para_spam_k.keys():
            para_spam_k[words] += prob_words_given_spam
        else:    
            para_spam_k[words] = prob_words_given_spam
    
        n_words_given_ham = ham_messages[words].sum()
        prob_words_given_ham = (n_words_given_ham + alpha)/(n_ham + alpha*n_vocabs)
        
        parameters_ham[words] = prob_words_given_ham
        if words in para_ham_k.keys():
            para_ham_k[words] += prob_words_given_ham
        else:    
            para_ham_k[words] = prob_words_given_ham
        
        

##################################################################

#writing the function to classify the message

    def classify_test_set(msg):
        msg = re.sub('\W', ' ', msg)
        msg = msg.lower().split()
    
        prob_spam_giv_msg = prob_spam #initial guess for spam based on the training dataset : prior probability
        prob_ham_giv_msg = prob_ham #initial guess for ham based on the training dataset : prior
    
        for word in msg:
            if word in parameters_spam:
                prob_spam_giv_msg *= parameters_spam[word]
        
            if word in parameters_ham:
                prob_ham_giv_msg *= parameters_ham[word]
    
        if prob_spam_giv_msg > prob_ham_giv_msg:
            return 'spam'
        else:
            return 'ham'

    validation_set_i['predicted'] = validation_set_i['SMS'].apply(classify_test_set)
    
    correct = 0
    total = validation_set_i.shape[0]

    for row in validation_set_i.iterrows():
        row = row[1]
        if row['Class'] == row['predicted']:
            correct += 1

    print(i+1, end = '')
    print(" fold result is")
    print('Correct:', correct)
    print('Incorrect:', total - correct)
    print('Accuracy:', correct/total*100)
    print("##################")

#acuracy can be adjusted by providing more training set
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set_i['SMS'] = train_set_i['SMS'].str.replace('\W', ' ', regex = True) # It Removes punctuation
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set_i['SMS'] = train_set_i['SMS'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set_i['SMS'] = train_set_i['SMS'].str.split()

1 fold result is
Correct: 750
Incorrect: 142
Accuracy: 84.08071748878923
##################
2 fold result is
Correct: 822
Incorrect: 70
Accuracy: 92.152466367713
##################
3 fold result is
Correct: 873
Incorrect: 19
Accuracy: 97.86995515695067
##################
4 fold result is
Correct: 876
Incorrect: 15
Accuracy: 98.31649831649831
##################
5 fold result is
Correct: 879
Incorrect: 12
Accuracy: 98.65319865319864
##################


In [7]:
# # no_ham = train_set_final['Class'].value_counts()[0]
# # no_spam = train_set_final['Class'].value_counts()[1]
# # total_msg = no_ham + no_spam
# # prob_h = no_ham/total_msg
# # prob_s = no_spam/total_msg
# print(prob_h,prob_s)
# print(prob_ham_k /5,prob_spam_k /5)

In [8]:
for para in para_spam_k:
    para_spam_k[para] /= 5

for para in para_ham_k:
    para_ham_k[para] /= 5

def classify_test_set_final(msg):
    msg = re.sub('\W', ' ', msg)
    msg = msg.lower().split()
    
    prob_spam_giv_msg = prob_spam_k /5 #initial guess for spam based on the training dataset
    prob_ham_giv_msg = prob_ham_k /5 #initial guess for ham based on the training dataset

    for word in msg:
        if word in para_spam_k:
            prob_spam_giv_msg *= para_spam_k[word]

        if word in parameters_ham:
            prob_ham_giv_msg *= para_ham_k[word]
    
    if prob_spam_giv_msg > prob_ham_giv_msg:
        return 'spam'
    else:
        return 'ham'

        
test_set_final['predicted'] = test_set_final['SMS'].apply(classify_test_set_final)
    
correct = 0
total = test_set_final.shape[0]

for row in test_set_final.iterrows():
    row = row[1]
    if row['Class'] == row['predicted']:
        correct += 1

print("Final result is")
print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total*100)
print("##################")            

Final result is
Correct: 1078
Incorrect: 36
Accuracy: 96.76840215439856
##################
