In [2]:
pip install problog


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
from problog.tasks import sample
from problog.program import PrologString

import pandas as pd
import math
import string
import random

## Pre-processing the dataset

### 1. Read the dataset

In [20]:
sms = pd.read_csv('sms_spam', sep='\t', names=['label', 'message'])
sms.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [21]:
sms.shape

(5572, 2)

### 2. Splitting the dataset into training set and test set

In [24]:
# Randomize the dataset
sms_randomized = sms.sample(frac=1, random_state=1)

# 20% test set and 80% training set
test_index = math.floor(5572 * 0.2)

test_sms = sms_randomized[:test_index]
training_sms = sms_randomized[test_index:]

test_sms = test_sms.reset_index(drop = True)
training_sms = training_sms.reset_index(drop = True)

print(test_sms.head())
print(training_sms.head())
type(training_sms['message'][0])

  label                                            message
0   ham                       Yep, by the pretty sculpture
1   ham      Yes, princess. Are you going to make me moan?
2   ham                         Welp apparently he retired
3   ham                                            Havent.
4   ham  I forgot 2 ask ü all smth.. There's a card on ...
  label                                            message
0   ham  Yeah do! Don‘t stand to close tho- you‘ll catc...
1   ham  Hi , where are you? We're at  and they're not ...
2   ham        If you r @ home then come down within 5 min
3   ham  When're you guys getting back? G said you were...
4   ham  Tell my  bad character which u Dnt lik in me. ...


str

## Data Cleaning

### 1. Remove all the punctuation and transform words to lower case in the messages

In [26]:
punctuation = string.punctuation

# Remove all the punctuation in the messages of testing data
for i in range(len(test_sms)):
    for l in test_sms["message"][i]:
        if l in punctuation:
            test_sms["message"][i] = test_sms["message"][i].replace(l, "")
    # Transform to lower case
    test_sms["message"][i] = test_sms["message"][i].lower()

# Remove all the punctuation in the messages of training data
for i in range(len(training_sms)):
    for l in training_sms["message"][i]:
        if l in punctuation:
            training_sms["message"][i] = training_sms["message"][i].replace(l, "")
            
    # Transform to lower case
    training_sms["message"][i] = training_sms["message"][i].lower()
            
print(test_sms.head())
print(training_sms.head())

  label                                            message
0   ham                        yep by the pretty sculpture
1   ham         yes princess are you going to make me moan
2   ham                         welp apparently he retired
3   ham                                             havent
4   ham  i forgot 2 ask ü all smth theres a card on da ...
  label                                            message
0   ham  yeah do don‘t stand to close tho you‘ll catch ...
1   ham  hi  where are you were at  and theyre not keen...
2   ham         if you r  home then come down within 5 min
3   ham  whenre you guys getting back g said you were t...
4   ham  tell my  bad character which u dnt lik in me i...


### 2. Splitting the message into words for training dataset

#### a. Finding all the words in the dataset

In [27]:
words = []

for message in training_sms['message']:
    words_sms = message.split()
    for word in words_sms:
        if word not in words:
            words.append(word)

In [28]:
len(words) # There are 8484 unique words in all training messages

8484

#### b. Counting the number times of each words appear in each message

In [29]:
words_count = {}
for word in words:
    words_count[word] = [] 

for message in training_sms['message']:
    for word in words:
        count = message.count(word)
        words_count[word].append(count)

In [30]:
words_count_data = pd.DataFrame(words_count)
words_count_data.head()

Unnamed: 0,yeah,do,don‘t,stand,to,close,tho,you‘ll,catch,something,...,skyving,kkyesterday,arr,oscar,assumed,ceri,rebel,dreamz,buddy,recdthirtyeight
0,1,2,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,2,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### c. Combine the words_count dataframe with the training_sms dataset

In [31]:
training_sms_count = pd.concat([training_sms, words_count_data], axis = 1)
training_sms_count.head()

Unnamed: 0,label,message,yeah,do,don‘t,stand,to,close,tho,you‘ll,...,skyving,kkyesterday,arr,oscar,assumed,ceri,rebel,dreamz,buddy,recdthirtyeight
0,ham,yeah do don‘t stand to close tho you‘ll catch ...,1,2,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,ham,hi where are you were at and theyre not keen...,0,2,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,if you r home then come down within 5 min,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,whenre you guys getting back g said you were t...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,tell my bad character which u dnt lik in me i...,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
training_sms_count.shape

(4458, 8486)

## Problog Example

In [15]:
modeltext = """
    my_uniform(0,10)::a.
    0.5::b.
    c :- value(a, A), A >= 3; b.
    query(a).
    query(b).
    query(c).
"""

# Define a function that generates a sample.
def integer_uniform(a, b):
    return math.floor(random.uniform(a, b))

model = PrologString(modeltext)
# Pass the mapping between name and function using the distributions parameter.
result = sample.sample(model, n=3, format='dict', distributions={'my_uniform': integer_uniform})

In [16]:
for expression in result:
    print(expression)

{a: 7, b: False, c: True}
{a: 4, b: False, c: True}
{a: 9, b: False, c: True}


## Creating the spam filter

#### a. Calculating the constants we need

In [56]:
#Splits the "spam" and "ham" messages
training_spam = training_sms[training_sms["label"] == "spam"]
training_ham = training_sms[training_sms["label"] == "ham"]

#Calculate N_Spam
n_Spam = 0
for eachmessage in training_spam["message"]:
    eachmessage = eachmessage.split(" ")
    curcount = 0
    for eachword in eachmessage:
        #empty word and numbers will not be counted as valid words
        if eachword != "" and eachword.isdigit() == False:
            curcount += 1
    n_Spam += curcount

#Calculate N_Ham
n_Ham = 0
for eachmessage in training_ham["message"]:
    eachmessage = eachmessage.split(" ")
    curcount = 0
    for eachword in eachmessage:
        #empty word and numbers will not be counted as valid words
        if eachword != "" and eachword.isdigit() == False:
            curcount += 1
    n_Ham += curcount

#Calculate n_Words
n_Words = len(words)
    
print("n_Spam:", n_Spam)
print("n_Ham:", n_Ham)
print("n_Words:", n_Words)

n_Spam: 12960
n_Ham: 53405
n_Words: 8484
