# Spam filter

In [1]:
import math
import re
from collections import Counter

import en_core_web_sm
import pandas as pd
from spacy.lang.en.stop_words import STOP_WORDS

### Step 1. Preprocessing

In [2]:
# Load the raw data set and keep only its first two columns, renamed to
# 'Target' (the label) and 'SMS' (the message text).
df = pd.read_csv('spam.csv', encoding='iso-8859-1')
# Explicit copy: df_new gets its 'SMS' column overwritten below and should
# never share data with the raw frame `df`.
df_new = df.iloc[:, [0, 1]].rename(columns={'v1': 'Target', 'v2': 'SMS'}).copy()

# conversion of the text to lowercase
rows = df_new['SMS'].str.lower().tolist()

# lemmatization, punctuation removal
en_sm_model = en_core_web_sm.load()

# Patterns are compiled once instead of once per token.
# POS_TAG_RE: any three characters followed by upper-case letters. It is used
# to cut the " – POS" suffix that is appended to every lemma below.
POS_TAG_RE = re.compile('...[A-Z]+')
# NUMBER_RE: a whole token that contains at least one digit. Raw string, so
# that "\d" is not treated as an (invalid) Python string escape.
NUMBER_RE = re.compile(r'.*\d+.*')
PUNCT_RE = re.compile('[!"#$%&\'()*+,-./:;<=>?@^_`{|}~]+')
BACKSLASH_RE = re.compile('\\\\')

a = []
for text in rows:
    doc = en_sm_model(text)
    tokens = ["{0} – {1}".format(tok.lemma_, tok.pos_) for tok in doc]

    # removal of pos tags, initial form, replacement of numbers,
    # removal of words consisting of one letter
    tokenss = []
    for token in tokens:
        token = POS_TAG_RE.sub('', token).strip()
        token = NUMBER_RE.sub('aanumbers', token).strip()
        token = PUNCT_RE.sub('', token).strip()
        token = BACKSLASH_RE.sub('', token).strip()
        if len(token) > 1:
            tokenss.append(token)

    a.append(tokenss)

# removal of stop words
sw = []
for words in a:
    kept = []
    for w in words:
        if w in STOP_WORDS:
            continue
        # Normalise the singular placeholder to the plural one used everywhere else.
        kept.append('aanumbers' if w == 'aanumber' else w)
    sw.append(kept)

# One cleaned, space-separated string per message.
swn = [' '.join(words) for words in sw]

# dataframe upgrade: overwrite the raw text with the cleaned text
df_new['SMS'] = swn

pd.options.display.max_columns = df_new.shape[1]
pd.options.display.max_rows = df_new.shape[0]

print(df_new.head(200))

    Target                                                SMS
0      ham  jurong point crazy available bugis great world...
1      ham                                ok lar joke wif oni
2     spam  free entry aanumbers wkly comp win fa cup fina...
3      ham                                      dun early hor
4      ham                                 nah think usf live
5     spam  freemsg hey darle aanumbers week word like fun...
6      ham          brother like speak treat like aids patent
7      ham  request melle melle oru minnaminunginte nurung...
8     spam  winner value network customer select receivea ...
9     spam  mobile aanumbers month entitle update late col...
10     ham  home soon want talk stuff anymore tonight ve c...
11    spam  chance win cash aanumbers aanumbers pound txt ...
12    spam  urgent win aanumbers week free membership aanu...
13     ham  ve search right word thank breather promise he...
14     ham                                        date sunday
15    sp

### Step 2. Naive Bayes classifier

In [3]:
# train-test split
# Shuffle every row reproducibly (fixed random_state), then use the first 80%
# of the shuffled rows for training and the remainder for testing.
df = df_new.sample(n=df_new.shape[0], random_state=43)
train_last_index = int(df.shape[0] * 0.8)
# reset_index()/copy() return new, independent frames, so later edits to
# train_set / test_set are not made on a slice of `df`.
train_set = df.iloc[:train_last_index].reset_index(drop=True)
test_set = df.iloc[train_last_index:].copy()



In [4]:
# Vocabulary: every distinct whitespace-separated word that occurs in the
# training messages, as a sorted list.
vocab = sorted({word for message in train_set['SMS'] for word in message.split()})


In [5]:
def bayes(data1, data2=train_set['SMS'], vocab=vocab):
    """Estimate add-one smoothed word probabilities for one class of messages.

    For every word ``w`` in ``vocab`` the value is::

        (occurrences of w in data1['SMS'] + 1) / (total words in data1['SMS'] + len(vocab))

    Parameters
    ----------
    data1 : mapping / DataFrame with an 'SMS' entry
        Messages of a single class; each message is a whitespace-separated string.
    data2 : unused
        Not read by the function; kept only so existing calls stay valid.
    vocab : sequence of str
        Words to compute a probability for. The default is the module-level
        ``vocab`` as it was when this function was defined.

    Returns
    -------
    list of str
        One probability per vocabulary word, in ``vocab`` order, each rendered
        with ``format(p, 'f')`` (fixed-point, six decimals).
    """
    # A single pass over the messages replaces the former per-word scan of
    # the whole corpus (which built a len(vocab) x len(words) 0/1 matrix).
    word_counts = Counter(
        word for message in data1['SMS'] for word in message.split()
    )
    total_words = sum(word_counts.values())
    denominator = total_words + len(vocab)

    # Counter returns 0 for words that never occur, so unseen vocabulary
    # words get the smoothed probability 1 / denominator.
    return [format((word_counts[word] + 1) / denominator, 'f') for word in vocab]

In [6]:
# Per-word probabilities, computed separately on the spam and the ham
# training messages.
spam = bayes(train_set[train_set['Target'] == 'spam'])
ham = bayes(train_set[train_set['Target'] == 'ham'])

# One row per vocabulary word; 'ind' repeats the index as a regular column.
d = {'Spam Probability': spam, 'Ham Probability': ham}
df_bayes = pd.DataFrame(data=d, index=vocab)
df_bayes['ind'] = df_bayes.index

pd.set_option('display.max_rows', 204)
print(df_bayes.head(5))


          Spam Probability  ...        ind
aa                0.000066  ...         aa
aah               0.000066  ...        aah
aanumbers         0.156118  ...  aanumbers
aathi             0.000066  ...      aathi
abbey             0.000066  ...      abbey

[5 rows x 3 columns]


### Step 3. Prior probabilities and classification

In [7]:
def prob_s_h(s_h, data=train_set):
    """Return the fraction of the messages in ``data`` that are in ``s_h``.

    Both arguments are indexed with ``['SMS']``; the result is the number of
    entries in ``s_h['SMS']`` divided by the number of entries in
    ``data['SMS']``. ``data`` defaults to the module-level ``train_set`` as it
    was when this function was defined.
    """
    total_size = len(data['SMS'])
    subset_size = len(s_h['SMS'])
    return subset_size / total_size


# Class priors: share of spam / ham messages among the training messages.
p_spam = prob_s_h(train_set[train_set['Target'] == 'spam'])
p_ham = prob_s_h(train_set[train_set['Target'] == 'ham'])

In [8]:
def _safe_log(p):
    """Natural logarithm of ``p``; returns -inf instead of raising when p <= 0."""
    return math.log(p) if p > 0 else float('-inf')


def _log_score(words, word_probs, prior):
    """Log of ``prior`` times the probabilities of the known words in ``words``.

    Words without an entry in ``word_probs`` are skipped. Summing logarithms
    instead of multiplying the raw probabilities keeps long messages from
    underflowing to 0.0, which would make both classes compare as equal.
    """
    return _safe_log(prior) + sum(
        _safe_log(word_probs[w]) for w in words if w in word_probs
    )


# Each training message as a list of words.
sents = [str(message).split() for message in train_set['SMS']]

# word -> probability lookups, keyed by the 'ind' column. astype(float)
# accepts the probability columns whether they hold strings or floats.
spam_word_probs = dict(zip(df_bayes['ind'], df_bayes['Spam Probability'].astype(float)))
ham_word_probs = dict(zip(df_bayes['ind'], df_bayes['Ham Probability'].astype(float)))

log_probs = [_log_score(words, spam_word_probs, p_spam) for words in sents]
log_probs_h = [_log_score(words, ham_word_probs, p_ham) for words in sents]

# The class with the larger score wins; an exact tie is labelled 'unknown'.
data = []
for spam_score, ham_score in zip(log_probs, log_probs_h):
    if spam_score > ham_score:
        data.append('spam')
    elif spam_score < ham_score:
        data.append('ham')
    else:
        data.append('unknown')

# train_set is rebound to a fresh two-column frame (prediction next to the
# true label) rather than edited in place. The model-performance cell reads
# the 'Predicted' and 'Actual' columns from train_set.
# NOTE: the 'SMS' and 'Target' columns are gone afterwards, so the split cell
# has to be re-run before this cell can be run again.
train_set = pd.DataFrame(
    {'Predicted': data, 'Actual': train_set['Target'].to_numpy()},
    index=train_set.index,
)

print(train_set)



     Predicted Actual
0          ham    ham
1          ham    ham
2          ham    ham
3          ham    ham
4          ham    ham
...        ...    ...
4452       ham    ham
4453       ham    ham
4454       ham    ham
4455       ham    ham
4456       ham    ham

[4457 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


### Step 4. Model performance

In [9]:
def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


predict = train_set['Predicted'].tolist()
act = train_set['Actual'].tolist()

# Confusion-matrix counts with 'ham' as the positive class. Every prediction
# other than 'ham' (i.e. 'spam' or 'unknown') is counted on the negative side.
tp = tn = fp = fn = 0
for predicted, actual in zip(predict, act):
    if predicted == 'ham':
        if predicted == actual:
            tp += 1
        else:
            fp += 1
    elif predicted == actual:
        tn += 1
    else:
        fn += 1

# _ratio guards each division, so an empty frame or a run without any 'ham'
# prediction reports 0.0 instead of raising ZeroDivisionError.
accuracy = _ratio(tp + tn, tp + tn + fp + fn)
recall = _ratio(tp, tp + fn)
precision = _ratio(tp, tp + fp)
f1 = _ratio(2 * precision * recall, precision + recall)
d = {'Accuracy': accuracy, 'Recall': recall, 'Precision': precision, 'F1': f1}
print(d)


{'Accuracy': 0.9896791563832175, 'Recall': 0.9922239502332815, 'Precision': 0.9958376690946931, 'F1': 0.9940275253180992}
