In [1]:
pip install problog

Note: you may need to restart the kernel to use updated packages.


In [2]:
from problog.tasks import sample
from problog.program import PrologString

import pandas as pd
import math
import string
import random
from tabulate import tabulate

## Pre-processing the dataset

### 1. Read the dataset

In [3]:
sms = pd.read_csv('sms_spam', sep='\t', names=['Label', 'Message'])
sms.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
sms.shape

(5572, 2)

### 2. Splitting the dataset into training set and test set

In [5]:
# Randomize the dataset
sms_randomized = sms.sample(frac=1, random_state=1)

# 20% test set and 80% training set
test_index = math.floor(5572 * 0.2)

test_sms = sms_randomized[:test_index]
training_sms = sms_randomized[test_index:]

test_sms = test_sms.reset_index(drop = True)
training_sms = training_sms.reset_index(drop = True)

print(test_sms.head())
print(training_sms.head())
type(training_sms['Message'][0])

  Label                                            Message
0   ham                       Yep, by the pretty sculpture
1   ham      Yes, princess. Are you going to make me moan?
2   ham                         Welp apparently he retired
3   ham                                            Havent.
4   ham  I forgot 2 ask ü all smth.. There's a card on ...
  Label                                            Message
0   ham  Yeah do! Don‘t stand to close tho- you‘ll catc...
1   ham  Hi , where are you? We're at  and they're not ...
2   ham        If you r @ home then come down within 5 min
3   ham  When're you guys getting back? G said you were...
4   ham  Tell my  bad character which u Dnt lik in me. ...


str

## Data Cleaning

### 1. Remove all the punctuation and transform words to lower case in the messages

In [6]:
punctuation = string.punctuation

# Remove all the punctuation in the messages of testing data
for i in range(len(test_sms)):
    for l in test_sms["Message"][i]:
        if l in punctuation:
            test_sms["Message"][i] = test_sms["Message"][i].replace(l, "")
    # Transform to lower case
    test_sms["Message"][i] = test_sms["Message"][i].lower()

# Remove all the punctuation in the messages of training data
for i in range(len(training_sms)):
    for l in training_sms["Message"][i]:
        if l in punctuation:
            training_sms["Message"][i] = training_sms["Message"][i].replace(l, "")
            
    # Transform to lower case
    training_sms["Message"][i] = training_sms["Message"][i].lower()
            
print(test_sms.head())
print(training_sms.head())

  Label                                            Message
0   ham                        yep by the pretty sculpture
1   ham         yes princess are you going to make me moan
2   ham                         welp apparently he retired
3   ham                                             havent
4   ham  i forgot 2 ask ü all smth theres a card on da ...
  Label                                            Message
0   ham  yeah do don‘t stand to close tho you‘ll catch ...
1   ham  hi  where are you were at  and theyre not keen...
2   ham         if you r  home then come down within 5 min
3   ham  whenre you guys getting back g said you were t...
4   ham  tell my  bad character which u dnt lik in me i...


### 2. Splitting the message into words for training dataset

#### a. Finding all the words in the dataset

In [7]:
words = []

for message in training_sms['Message']:
    words_sms = message.split()
    for word in words_sms:
        #empty word, numbers and single character will not be counted as valid words
        if word not in words and word != "" and any(char.isdigit() for char in word) == False and len(word) > 1:
            words.append(word)

In [8]:
len(words) # There are 8484 unique words in all training messages

7322

#### b. Counting the number times of each words appear in each message

In [9]:
words_count = {}
for word in words:
    
    words_count[word] = [] 

for message in training_sms['Message']:
    for word in words:
        count = message.count(word)
        words_count[word].append(count)

In [10]:
words_count_data = pd.DataFrame(words_count)
words_count_data.head()

Unnamed: 0,yeah,do,don‘t,stand,to,close,tho,you‘ll,catch,something,...,skyving,kkyesterday,arr,oscar,assumed,ceri,rebel,dreamz,buddy,recdthirtyeight
0,1,2,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,2,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### c. Combine the words_count dataframe with the training_sms dataset

In [11]:
training_sms_count = pd.concat([training_sms, words_count_data], axis = 1)
training_sms_count.head()

Unnamed: 0,Label,Message,yeah,do,don‘t,stand,to,close,tho,you‘ll,...,skyving,kkyesterday,arr,oscar,assumed,ceri,rebel,dreamz,buddy,recdthirtyeight
0,ham,yeah do don‘t stand to close tho you‘ll catch ...,1,2,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,ham,hi where are you were at and theyre not keen...,0,2,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,if you r home then come down within 5 min,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,whenre you guys getting back g said you were t...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,tell my bad character which u dnt lik in me i...,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
training_sms_count.shape

(4458, 7324)

## Problog Example

In [13]:
modeltext = """
    my_uniform(0,10)::a.
    0.5::b.
    c :- value(a, A), A >= 3; b.
    query(a).
    query(b).
    query(c).
"""

# Define a function that generates a sample.
def integer_uniform(a, b):
    return math.floor(random.uniform(a, b))

model = PrologString(modeltext)
# Pass the mapping between name and function using the distributions parameter.
result = sample.sample(model, n=3, format='dict', distributions={'my_uniform': integer_uniform})

In [14]:
for expression in result:
    print(expression)

{a: 8, b: True, c: True}
{a: 1, b: True, c: True}
{a: 0, b: False, c: False}


## Creating the spam filter

#### a. Calculating the constants we need

In [15]:
#Splits the "spam" and "ham" messages
training_spam = training_sms_count[training_sms_count["Label"] == "spam"]
training_ham = training_sms_count[training_sms_count["Label"] == "ham"]

#Calculate N_Spam: the number of words in all the spam messages
n_Spam = 0
for eachmessage in training_spam["Message"]:
    eachmessage = eachmessage.split(" ")
    curcount = 0
    for eachword in eachmessage:
        #empty word and numbers will not be counted as valid words
        if eachword != "" and any(char.isdigit() for char in eachword) == False and len(word) > 1:
            curcount += 1
    n_Spam += curcount

#Calculate N_Ham: the number of words in all the non-spam messages
n_Ham = 0
for eachmessage in training_ham["Message"]:
    eachmessage = eachmessage.split(" ")
    curcount = 0
    for eachword in eachmessage:
        #empty word and numbers will not be counted as valid words
        if eachword != "" and any(char.isdigit() for char in eachword) == False and len(word) > 1:
            curcount += 1
    n_Ham += curcount

#Calculate n_Words
n_Words = len(words)
    
print("n_Spam:", n_Spam)
print("n_Ham:", n_Ham)
print("n_Words:", n_Words)

n_Spam: 11843
n_Ham: 53084
n_Words: 7322


#### b. Calculating the probabilities of Pr(wi|spam) and Pr(wi|ham)

In [16]:
pSpamWord = {}
pHamWord = {}

for w in words:
    # For Spam message
    # Calculate N(wi|Spam)
    count = 0
    for i in training_spam[w]:
        count += i
    # Calculate the probability Pr(wi|spam)
    prob = count / n_Spam
    pSpamWord[w] = prob
    
    # For Ham message
    # Calculate N(wi|Ham)
    count = 0
    for i in training_ham[w]:
        count += i
    # Calculate the probability Pr(wi|spam)
    prob = count / n_Ham
    pHamWord[w] = prob

#### c. Building the model

In [17]:
pSpam = len(training_spam) / len(training_sms_count) # Pr(Spam)
pHam = len(training_ham) / len(training_sms_count) # Pr(Ham)

def classify(message):
    message = message.split(" ")
    # Calculating Pr(Spam|w1,w2...) and Pr(Ham|w1,w2...)
    pSpamWords = 1
    pHamWords = 1
    for word in message:
        if word in pSpamWord:
            pSpamWords *= pSpamWord[word]
            
        if word in pHamWord:
            pHamWords *= pHamWord[word]
    
    pSpamMessage = pSpam * pSpamWords
    pHamMessage = pHam * pHamWords
    
    if pSpamMessage > pHamMessage:
        result = "spam"
    elif pSpamMessage < pHamMessage:
        result = "ham"
    else:
        result = "Can't not classify"
    
    return result

#### d. Testing

In [18]:
testing = []
for message in test_sms['Message']:
    prediction = classify(message)
    testing.append(prediction)
    
test_result = test_sms.copy()
test_result['Prediction'] = testing
test_result.head()

Unnamed: 0,Label,Message,Prediction
0,ham,yep by the pretty sculpture,ham
1,ham,yes princess are you going to make me moan,ham
2,ham,welp apparently he retired,ham
3,ham,havent,ham
4,ham,i forgot 2 ask ü all smth theres a card on da ...,ham


## Accuracy

In [19]:
correct = 0
n_test = len(test_result)
    
for i in range(n_test):
    if test_result['Label'][i] == test_result['Prediction'][i]:
        correct += 1

incorrect = n_test - correct
accuracy = correct/n_test

print('Correct:' + str(correct))
print('Incorrect:' + str(incorrect))
print('Accuracy:' + str(accuracy))

Correct:1045
Incorrect:69
Accuracy:0.9380610412926391


### Increase Accuracy by using Lapace Smoothing

In [20]:
alpha = 10

for w in words:
    # For Spam message
    # Calculate N(wi|Spam)
    count = 0
    for i in training_spam[w]:
        count += i
    # Calculate the probability Pr(wi|spam) by using Lapace Smoothing
    prob = (count + alpha) / (n_Spam + alpha * n_Words)
    pSpamWord[w] = prob
    
    # For Ham message
    # Calculate N(wi|Ham)
    count = 0
    for i in training_ham[w]:
        count += i
    # Calculate the probability Pr(wi|spam) by using Lapace Smoothing
    prob = (count + alpha) / (n_Ham + alpha * n_Words)
    pHamWord[w] = prob

In [21]:
# Finding some words commonly associated with spam
spam_words = []
for key in pSpamWord:
    if pSpamWord[key] - pHamWord[key] > 0.0011 and len(key) > 2:
        spam_words.append(key)
        
# Making table of spam words
table = {}
for word in spam_words:
    table[word] = [round(pSpamWord[word], 6), round(pHamWord[word], 6)]

print(tabulate(table, headers='keys', showindex=('Spam', 'Ham')))

          call      stop      free       txt    mobile     claim       top      tone       mob       cal
----  --------  --------  --------  --------  --------  --------  --------  --------  --------  --------
Spam  0.004068  0.001517  0.002504  0.002187  0.001599  0.001211  0.001716  0.00134   0.002222  0.004115
Ham   0.002154  0.000388  0.000515  0.000206  0.00015   7.9e-05   0.000546  0.000135  0.00015   0.002423


In [22]:
testing = []
for message in test_sms['Message']:
    prediction = classify(message)
    testing.append(prediction)
    
test_result_improve = test_sms.copy()
test_result_improve['Prediction'] = testing
test_result_improve.head()

Unnamed: 0,Label,Message,Prediction
0,ham,yep by the pretty sculpture,ham
1,ham,yes princess are you going to make me moan,ham
2,ham,welp apparently he retired,ham
3,ham,havent,ham
4,ham,i forgot 2 ask ü all smth theres a card on da ...,ham


In [23]:
correct = 0
n_test = len(test_result_improve)
    
for i in range(n_test):
    if test_result_improve['Label'][i] == test_result_improve['Prediction'][i]:
        correct += 1

incorrect = n_test - correct
accuracy = correct/n_test

print('Correct:' + str(correct))
print('Incorrect:' + str(incorrect))
print('Accuracy:' + str(accuracy))

Correct:1050
Incorrect:64
Accuracy:0.9425493716337523
