In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import nltk

In [22]:
sms_data = pd.read_csv('/home/piotr/Documents/Uczelnia/dydaktyka/wyklady/DataMining/repos_EAD/lab-ead/src/_resources/lab_12/SMSSpamCollection.bin', header=None, sep='\t', names=['Label', 'SMS'])

In [5]:
sms_data

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
def prep(string):
    
    # Remove HTML tags.
    string = BeautifulSoup(string,'html.parser').get_text()
    
    # Remove non-letters
    string = re.sub("[^a-zA-Z]", " ", string)
    
    # Lower case
    string = string.lower()
    
    # Tokenize to each word.
    token = nltk.word_tokenize(string)
    
    # Stemming
    string = [nltk.stem.SnowballStemmer('english').stem(w) for w in token]
    
    # Join the words back into one string separated by space, and return the result.
    return string

In [7]:
sms_data['SMS'].iloc[:1].apply(prep).iloc[0]

['go',
 'until',
 'jurong',
 'point',
 'crazi',
 'avail',
 'onli',
 'in',
 'bugi',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'there',
 'got',
 'amor',
 'wat']

In [8]:
sms_data['SMS'].iloc[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [9]:
sms_data['clean_sms'] = sms_data['SMS'].apply(prep)

In [10]:
sms_data

Unnamed: 0,Label,SMS,clean_sms
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazi, avail, onli,..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entri, in, a, wkli, comp, to, win, fa, ..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, so, earli, hor, u, c, alreadi, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, i, don, t, think, he, goe, to, usf, he, ..."
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,"[this, is, the, nd, time, we, have, tri, conta..."
5568,ham,Will ü b going to esplanade fr home?,"[will, b, go, to, esplanad, fr, home]"
5569,ham,"Pity, * was in mood for that. So...any other s...","[piti, was, in, mood, for, that, so, ani, othe..."
5570,ham,The guy did some bitching but I acted like i'd...,"[the, guy, did, some, bitch, but, i, act, like..."


In [12]:
train_data = sms_data.sample(frac=0.8,random_state=1).reset_index(drop=True)
test_data = sms_data.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

In [13]:
train_data

Unnamed: 0,Label,SMS,clean_sms
0,ham,"Yep, by the pretty sculpture","[yep, by, the, pretti, sculptur]"
1,ham,"Yes, princess. Are you going to make me moan?","[yes, princess, are, you, go, to, make, me, moan]"
2,ham,Welp apparently he retired,"[welp, appar, he, retir]"
3,ham,Havent.,[havent]
4,ham,I forgot 2 ask ü all smth.. There's a card on ...,"[i, forgot, ask, all, smth, there, s, a, card,..."
...,...,...,...
4453,ham,"Sorry, I'll call later in meeting any thing re...","[sorri, i, ll, call, later, in, meet, ani, thi..."
4454,ham,Babe! I fucking love you too !! You know? Fuck...,"[babe, i, fuck, love, you, too, you, know, fuc..."
4455,spam,U've been selected to stay in 1 of 250 top Bri...,"[u, ve, been, select, to, stay, in, of, top, b..."
4456,ham,Hello my boytoy ... Geeee I miss you already a...,"[hello, my, boytoy, geeee, i, miss, you, alrea..."


In [14]:
train_data['Label'].value_counts() / train_data.shape[0] * 100

ham     86.54105
spam    13.45895
Name: Label, dtype: float64

In [15]:
test_data['Label'].value_counts() / test_data.shape[0] * 100

ham     86.983842
spam    13.016158
Name: Label, dtype: float64

In [16]:
vocabulary = list(set(train_data['clean_sms'].sum()))

['lou',
 'xxxxxxxxx',
 'dump',
 'paint',
 'cupboard',
 'shirt',
 'dracula',
 'registr',
 'done',
 'colour',
 'ship',
 'geoenvironment',
 'heater',
 'sourc',
 'enemi',
 'bridal',
 'oreo',
 'birth',
 'tiz',
 'pataistha',
 'dock',
 'lololo',
 'sway',
 'tok',
 'common',
 'sheet',
 'thrown',
 'chapel',
 'morrow',
 'mobileupd',
 'mahav',
 'awww',
 'ari',
 'graviti',
 'rang',
 'edha',
 'away',
 'pro',
 'fightng',
 'nothin',
 'headset',
 'ji',
 'fear',
 'went',
 'um',
 'paracetamol',
 'giv',
 'ee',
 'ubandu',
 'squat',
 'aathi',
 'ntimat',
 'ctter',
 'ba',
 'trivia',
 'landlineon',
 'lost',
 'monkeespeopl',
 'best',
 'jackpot',
 'knock',
 'sleepin',
 'mymobi',
 'iam',
 'ae',
 'stuf',
 'wow',
 'syd',
 'honeybe',
 'web',
 'wthout',
 'terribl',
 'anji',
 'astound',
 'hour',
 'agidhan',
 'rimac',
 'rdi',
 'nuerologist',
 'approv',
 'gmw',
 'charact',
 'reslov',
 'roomat',
 'txting',
 'dont',
 'flow',
 'hope',
 'goodi',
 'volcano',
 'sur',
 'behind',
 'daili',
 'batsman',
 'question',
 'coop',
 'mi

In [16]:
word_counts_per_sms = pd.DataFrame([
    [row[1].count(word) for word in vocabulary]
    for _, row in train_data.iterrows()], columns=vocabulary)

In [17]:
train_data = pd.concat([train_data.reset_index(), word_counts_per_sms], axis=1).iloc[:,1:]

In [18]:
train_data

Unnamed: 0,Label,SMS,clean_sms,master,prakasamanu,hangin,en,spontan,merememberin,beforehand,...,soooo,offdam,wallpap,taunton,goodo,fireplac,terrif,smartcal,hog,purpl
0,ham,"Yep, by the pretty sculpture","[yep, by, the, pretti, sculptur]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"Yes, princess. Are you going to make me moan?","[yes, princess, are, you, go, to, make, me, moan]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,Welp apparently he retired,"[welp, appar, he, retir]",0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,Havent.,[havent],0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,I forgot 2 ask ü all smth.. There's a card on ...,"[i, forgot, ask, all, smth, there, s, a, card,...",0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453,ham,"Sorry, I'll call later in meeting any thing re...","[sorri, i, ll, call, later, in, meet, ani, thi...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,ham,Babe! I fucking love you too !! You know? Fuck...,"[babe, i, fuck, love, you, too, you, know, fuc...",0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,spam,U've been selected to stay in 1 of 250 top Bri...,"[u, ve, been, select, to, stay, in, of, top, b...",0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4456,ham,Hello my boytoy ... Geeee I miss you already a...,"[hello, my, boytoy, geeee, i, miss, you, alrea...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
alpha = 1

In [19]:
Nvoc = len(train_data.columns) - 3

In [20]:
Pspam = train_data['Label'].value_counts()['spam'] / train_data.shape[0]

In [21]:
Pham = train_data['Label'].value_counts()['ham'] / train_data.shape[0]

In [23]:
Nspam = train_data.loc[train_data['Label'] == 'spam', 'clean_sms'].apply(len).sum()

In [24]:
Nham = train_data.loc[train_data['Label'] == 'ham', 'clean_sms'].apply(len).sum()

In [25]:
Pham

0.8654104979811574

In [26]:
Pspam

0.13458950201884254

In [None]:
def p_w_spam(word):
    if word in train_data.columns[4:]:
        return (train_data.loc[train_data['Label'] == 'spam', word].sum()) / (Nspam )
    else:
        return 1
def p_w_ham(word):
    if word in train_data.columns[4:]:
        return (train_data.loc[train_data['Label'] == 'ham', word].sum()) / (Nham)
    else:
        return 1

In [27]:
def p_w_spam(word):
    if word in train_data.columns[4:]:
        return (train_data.loc[train_data['Label'] == 'spam', word].sum() + alpha) / (Nspam + alpha*Nvoc)
    else:
        return 1
def p_w_ham(word):
    if word in train_data.columns[4:]:
        return (train_data.loc[train_data['Label'] == 'ham', word].sum() + alpha) / (Nham + alpha*Nvoc)
    else:
        return 1

In [28]:
def classify(message):
    p_spam_given_message = Pspam
    p_ham_given_message = Pham
    for word in message:
        p_spam_given_message *= p_w_spam(word)
        p_ham_given_message *= p_w_ham(word)
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_ham_given_message < p_spam_given_message:
        return 'spam'
    else:
        return 'needs human classification'

In [29]:
test_data['predicted'] = test_data['clean_sms'].apply(classify)

In [36]:
classify(prep("You"))

'ham'

In [34]:
prep("You")

['you']

In [30]:


correct = (test_data['predicted'] == test_data['Label']).sum() / test_data.shape[0] * 100
correct



98.29443447037703