In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import nltk
# nltk.download('punkt')

In [2]:
spam_data = pd.read_csv("SMSSpamCollection.csv",sep='\t',header=None,names=["Label","Message"])

In [3]:
spam_data.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
spam_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Label    5572 non-null   object
 1   Message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


## P(SPAM/SMS) = (P(SMS/SPAM)*P(SPAM)) / P(SMS)

In [5]:
for i in range(len(spam_data["Message"])):
    spam_data["Message"][i]=spam_data["Message"][i].lower() # lower the message
    sentences = nltk.sent_tokenize(spam_data["Message"][i]) #extract the sentences
    for j in range(len(sentences)):
        spam_data["Message"][i]=re.sub('[^a-zA-Z]',' ', sentences[j]) #remove un necessary punctuation

In [6]:
X=spam_data["Message"]
y=spam_data["Label"]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,stratify=y,random_state=42)

In [8]:
print("Length of X_Train",len(X_train))
print("Length of Y_Train",len(y_train))
print("Length of X_Test",len(X_test))
print("Length of Y_Test",len(y_test))

Length of X_Train 4457
Length of Y_Train 4457
Length of X_Test 1115
Length of Y_Test 1115


## Training dataset

In [9]:
d = {'Label': y_train, 'Message': X_train}
training_dateset = pd.DataFrame(data=d)

In [10]:
training_dateset.reset_index(drop=True,inplace=True)

## Testing Dataset

In [11]:
d = {'Label': y_test, 'Message': X_test}
testing_dateset = pd.DataFrame(data=d)

In [12]:
testing_dateset.reset_index(drop=True,inplace=True)

### Bag of words

In [13]:
#create a corpus based on the words in the training dataset

In [14]:
corpus_words=[]
for i in range(len(training_dateset["Message"])):
    words = training_dateset["Message"][i].split()
    corpus_words.extend(words)
    training_dateset["Message"][i]=training_dateset["Message"][i].split()

In [15]:
corpus = set(corpus_words)
corpus_len= len(corpus)
print("Total length of corpus words is",corpus_len)

Total length of corpus words is 5089


In [16]:
word_counts_per_sms = {unique_word: [0] * len(training_dateset["Message"]) for unique_word in corpus}

In [17]:
for index, sms in enumerate(training_dateset['Message']):
    for word in sms:
        word_counts_per_sms[word][index] = 1      

In [18]:
word_counts = pd.DataFrame(word_counts_per_sms)

In [19]:
training_dateset_new = pd.concat([training_dateset, word_counts], axis=1)
training_dateset_new.head(2)

Unnamed: 0,Label,Message,orc,fast,philosophy,panalam,calls,li,rply,winaweek,...,throw,decimal,staff,website,her,renewal,er,outsider,woozles,raining
0,ham,"[he, will, you, guys, close]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[can, i, please, come, up, now, imin, town, do...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### verify if the bag of words are correct are not

In [20]:
training_dateset_new[["Message","he","will","you","guys","can","i","please"]].loc[0:3]

Unnamed: 0,Message,he,will,you,guys,can,i,please
0,"[he, will, you, guys, close]",1,1,1,1,0,0,0
1,"[can, i, please, come, up, now, imin, town, do...",0,0,0,0,1,1,1
2,"[ok, k, sry, i, knw, siva, tats, y, i, askd]",0,0,0,0,0,1,0
3,"[i, ll, see, but, prolly, yeah]",0,0,0,0,0,1,0


In [21]:
# code the soam filter based on the training dataset

P(SPAM/given a word wi) = P(Wi/SPAM)*P(SPAM) /  P(wi)

P(wi/SPAM) = (no of spam messgaes containing this word)+alpha/(total no of spam messages+alpha*corpus_len)

alpha = laplacian smoothening factor

P(wi/HAM) = (no of ham messgaes containing this word)+alpha/(total no of ham messages+alpha*corpus_len)

alpha = laplacian smoothening factor

In [22]:
spam_messages = training_dateset_new[training_dateset_new["Label"]=="spam"]
ham_messages = training_dateset_new[training_dateset_new["Label"]=="ham"]

Probability_SPAM = len(spam_messages) / len(training_dateset_new)
Probability_HAM = len(ham_messages) / len(training_dateset_new)

n_spam = len(spam_messages)
n_ham = len(ham_messages)
alpha = 1

print("Probability of spam", Probability_SPAM)
print("Probability of ham", Probability_HAM)

Probability of spam 0.13417096701817366
Probability of ham 0.8658290329818263


In [23]:
#calculate P(wi/SPAM) and P(wi/HAM)

In [24]:
parameters_spam = {unique_word:[0] for unique_word in corpus_words}
parameters_ham = {unique_word:[0] for unique_word in corpus_words}

In [25]:
for word in corpus_words:
    #P(unique_word/givenspam)
    unique_word_count_in_spam = spam_messages[word].sum()
    prob_word_given_spam = (unique_word_count_in_spam + alpha) / (n_spam + alpha*corpus_len)
    parameters_spam[word] = prob_word_given_spam
    
    #P(unique_word/givenham)
    unique_word_count_in_ham = ham_messages[word].sum()
    prob_word_given_ham = (unique_word_count_in_ham + alpha) / (n_ham + alpha*corpus_len)
    parameters_ham[word] = prob_word_given_ham

### create a classification function

In [26]:
def spam_classification(message):
    #data cleaning
    # lower the message
    message = message.lower().split()  
    
    for word in message:
        if word in parameters_spam:
            p_spam_given_message = parameters_spam[word]*Probability_SPAM

        if word in parameters_ham: 
            p_ham_given_message  = parameters_ham[word]*Probability_HAM
    
    prob_total = p_spam_given_message+p_ham_given_message
    
    p_spam_given_message = (p_spam_given_message)/(prob_total)
    p_ham_given_message = (p_ham_given_message)/(prob_total)
    
    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)
    
    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')

In [27]:
spam_classification("Hi.. how are you? I am fine")

P(Spam|message): 0.010961242412235202
P(Ham|message): 0.9890387575877647
Label: Ham


In [28]:
spam_classification("Hurray.. secret code to win")

P(Spam|message): 0.5059394938530497
P(Ham|message): 0.4940605061469503
Label: Spam


### testing this classifier on test data

In [29]:
def spam_classification_test(message):
    #data cleaning
    #lower the message
    message = message.lower().split() 
    
    p_spam_given_message = Probability_SPAM
    p_ham_given_message = Probability_HAM
    
    for word in message:
        if word in parameters_spam:
            p_spam_given_message = parameters_spam[word]*p_spam_given_message
        if word in parameters_ham: 
            p_ham_given_message  = parameters_ham[word]*p_ham_given_message
    
    prob_total = p_spam_given_message+ p_ham_given_message
    
    p_spam_given_message = (p_spam_given_message)/(prob_total)
    p_ham_given_message = (p_ham_given_message)/(prob_total)
    
    if p_ham_given_message > p_spam_given_message:
        return "ham"
    elif p_ham_given_message < p_spam_given_message:
        return "spam"
    else:
        return "needs human intervention"

In [30]:
testing_dateset["Predicted"] = testing_dateset["Message"].apply(spam_classification_test)

In [31]:
testing_dateset.head()

Unnamed: 0,Label,Message,Predicted
0,ham,no need to buy lunch for me i eat maggi mee,ham
1,ham,love me xxx,ham
2,ham,reach home already,ham
3,spam,to claim call,spam
4,ham,if you r home then come down within min,ham


### Metrics Accuracy

In [32]:
correctly_classfied=0
wrongly_classfied=0
for i in range(len(testing_dateset)):
    if(testing_dateset["Label"][i]==testing_dateset["Predicted"][i]):
        correctly_classfied+=1
    else:
        wrongly_classfied+=1

In [33]:
print('Correct:', correctly_classfied)
print('Incorrect:', wrongly_classfied)
print('Accuracy:', correctly_classfied/len(testing_dateset))

Correct: 1055
Incorrect: 60
Accuracy: 0.9461883408071748
