In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import nltk

sms_data = pd.read_csv('SMSSpamCollection', header=None, sep='\t', names=['Label', 'SMS'])

In [2]:
sms_data

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
def prep(string):
    
    # Remove HTML tags.
    string = BeautifulSoup(string,'html.parser').get_text()
    
    # Remove non-letters
    string = re.sub("[^a-zA-Z]", " ", string)
    
    # Lower case
    string = string.lower()
    
    # Tokenize to each word.
    token = nltk.word_tokenize(string)
    
    
    # Stemming
    string = [nltk.stem.SnowballStemmer('english').stem(w) for w in token]
    
    # Join the words back into one string separated by space, and return the result.
    return string

In [5]:
sms_data.iloc[0:2,1]

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
Name: SMS, dtype: object

In [8]:
sms_data['SMS'].iloc[:3].apply(prep).iloc[:]

0    [go, until, jurong, point, crazi, avail, onli,...
1                         [ok, lar, joke, wif, u, oni]
2    [free, entri, in, a, wkli, comp, to, win, fa, ...
Name: SMS, dtype: object

In [9]:
sms_data['clean_sms'] = sms_data['SMS'].apply(prep)

In [10]:
sms_data

Unnamed: 0,Label,SMS,clean_sms
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazi, avail, onli,..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entri, in, a, wkli, comp, to, win, fa, ..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, so, earli, hor, u, c, alreadi, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, i, don, t, think, he, goe, to, usf, he, ..."
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,"[this, is, the, nd, time, we, have, tri, conta..."
5568,ham,Will ü b going to esplanade fr home?,"[will, b, go, to, esplanad, fr, home]"
5569,ham,"Pity, * was in mood for that. So...any other s...","[piti, was, in, mood, for, that, so, ani, othe..."
5570,ham,The guy did some bitching but I acted like i'd...,"[the, guy, did, some, bitch, but, i, act, like..."


In [11]:
train_data = sms_data.sample(frac=0.8,random_state=1).reset_index(drop=True)
test_data = sms_data.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

In [12]:
Pspam = train_data['Label'].value_counts()['spam'] / train_data.shape[0]
Pham = train_data['Label'].value_counts()['ham'] / train_data.shape[0]
Nvoc = len(train_data.columns) - 3 #całkowita liczba unikalnych wyrazów
Nspam = train_data.loc[train_data['Label'] == 'spam', 'clean_sms'].apply(len).sum() #liczba wyrazów we wiadomościach typu spam
Nham = train_data.loc[train_data['Label'] == 'ham', 'clean_sms'].apply(len).sum() #liczba wyrazów we wiadomościach typu ham

In [16]:
vocabulary = list(set(train_data['clean_sms'].sum()))

In [21]:
word_counts_per_sms = pd.DataFrame([
    [row[1].count(word) for word in vocabulary]
    for _, row in train_data.iterrows()], columns=vocabulary)
train_data = pd.concat([train_data.reset_index(), word_counts_per_sms], axis=1).iloc[:,1:]

In [22]:
train_data_data

Unnamed: 0,Label,SMS,clean_sms,sf,dwn,smsreward,yest,wine,kz,fromwrk,...,dust,disturb,fav,outdoor,abt,oppos,content,requir,modul,math
0,ham,"Yep, by the pretty sculpture","[yep, by, the, pretti, sculptur]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"Yes, princess. Are you going to make me moan?","[yes, princess, are, you, go, to, make, me, moan]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,Welp apparently he retired,"[welp, appar, he, retir]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,Havent.,[havent],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,I forgot 2 ask ü all smth.. There's a card on ...,"[i, forgot, ask, all, smth, there, s, a, card,...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453,ham,"Sorry, I'll call later in meeting any thing re...","[sorri, i, ll, call, later, in, meet, ani, thi...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,ham,Babe! I fucking love you too !! You know? Fuck...,"[babe, i, fuck, love, you, too, you, know, fuc...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,spam,U've been selected to stay in 1 of 250 top Bri...,"[u, ve, been, select, to, stay, in, of, top, b...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4456,ham,Hello my boytoy ... Geeee I miss you already a...,"[hello, my, boytoy, geeee, i, miss, you, alrea...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
def p_w_spam(word, alpha=1):
    if word in train_data.columns[4:]:
        return (train_data.loc[train_data['Label'] == 'spam', word].sum() + alpha) / (Nspam + alpha*Nvoc)
    else:
        return 1

In [26]:
p_w_spam('requir')

7.28279076542131e-05

In [27]:
def classify(message):
    p_spam_given_message = Pspam
    p_ham_given_message = Pham
    for word in message:
        p_spam_given_message  = ... # Do zaimplementowania
        p_ham_given_message = ... # Do zaimplementowania
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_ham_given_message < p_spam_given_message:
        return 'spam'
    else:
        return 'unknown'

In [None]:
test_data['predicted'] = test_data['clean_sms'].apply(classify)

In [None]:
correct = (test_data['predicted'] == test_data['Label']).sum() / test_data.shape[0] * 100