## SMS SPAM

The project aims to create an algorithm that it's able to filter the messagges in Spam and not Spam. We have to check if the probability of receiving  a Spam message is greater than receiving a non Spam message

In [1]:
import pandas as pd
df= pd.read_csv("SMSSpamCollection", sep='\t', header=None, names=['Label', 'SMS'])

In [2]:
df.shape

(5572, 2)

In [3]:
df.head(5)

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df["Label"].value_counts(normalize=True)*100

ham     86.593683
spam    13.406317
Name: Label, dtype: float64

In [5]:
df_random= df.sample(frac=1, random_state=1)

In [6]:
from sklearn.model_selection import train_test_split

training_data, testing_data = train_test_split(df_random, test_size=0.2)

In [7]:
training_data=training_data.reset_index(drop=True)
testing_data=testing_data.reset_index(drop=True)

In [8]:
training_data.shape

(4457, 2)

In [9]:
testing_data.shape

(1115, 2)

In [10]:
training_data["Label"].value_counts(normalize=True)*100

ham     86.291227
spam    13.708773
Name: Label, dtype: float64

In [11]:
testing_data["Label"].value_counts(normalize=True)*100

ham     87.802691
spam    12.197309
Name: Label, dtype: float64

In [12]:
training_data["SMS"]=training_data["SMS"].str.replace(r"\W", " ").str.lower()

In [13]:
training_data["SMS"]=training_data["SMS"].str.split()
vocabulary=[]
for i in training_data["SMS"]:
    for l in i:
        vocabulary.append(l)


In [14]:
vocabulary= set(vocabulary)

In [15]:
vocabulary =list(vocabulary)
len(vocabulary)

7792

In [16]:
word_counts_per_sms = {unique_word: [0] * len(training_data['SMS']) for unique_word in vocabulary}

for index, sms in enumerate(training_data['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [17]:
word_counts_per_sms=pd.DataFrame(word_counts_per_sms)
training_data1=pd.concat([training_data,word_counts_per_sms], axis=1)

In [18]:
training_data1.head(5)

Unnamed: 0,Label,SMS,0,00,000,000pes,008704050406,0089,0121,01223585236,...,zogtorius,zoom,zouk,zyada,èn,é,ú1,ü,〨ud,鈥
0,ham,"[can, you, just, come, in, for, a, sec, there,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,spam,"[a, 400, xmas, reward, is, waiting, for, you, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[do, you, know, why, god, created, gap, betwee...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,"[can, you, do, online, transaction]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,spam,"[this, message, is, free, welcome, to, the, ne...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
training_data1["Label"].value_counts()

ham     3846
spam     611
Name: Label, dtype: int64

In [20]:
spam_df=training_data1[training_data1["Label"]=="spam"]
n_spam_mex= spam_df["SMS"].apply(len)
n_spam=n_spam_mex.sum()
n_spam

15416

In [21]:
hpam_df=training_data1[training_data1["Label"]=="ham"]
p_spam = len(spam_df) / len(training_data1)
p_ham = len(hpam_df) / len(training_data1)
h_spam_mex= spam_df["SMS"].apply(len)
h_spam=h_spam_mex.sum()
n_vocabulary=len(vocabulary)
alpha=1

In [22]:
spam_word = {unique_word: 0  for unique_word in vocabulary}
ham_word = {unique_word: 0  for unique_word in vocabulary}

In [23]:
for i in vocabulary:
    pwsp=(spam_df[i].sum()+alpha)/(n_spam+alpha*n_vocabulary)
    spam_word[i]=pwsp    

In [24]:
for l in vocabulary:
    hwsp=(hpam_df[l].sum()+alpha)/(h_spam+alpha*n_vocabulary)
    ham_word[l]=hwsp

In [25]:
import re

def classify(message):

    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()

    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    for i in message:
        if i in spam_word:
            p_spam_given_message *=spam_word[i]
        if i in ham_word:
            p_ham_given_message *= ham_word[i]

    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')

In [26]:
classify('WINNER!! This is the secret code to unlock the money: C3421.')

P(Spam|message): 1.7556961675327057e-25
P(Ham|message): 2.0476660987271066e-23
Label: Ham


In [27]:
classify("Sounds good, Tom, then see u there")

P(Spam|message): 3.3394054290545134e-25
P(Ham|message): 5.304790650470401e-18
Label: Ham


In [29]:
def classify_test_set(message):

    message = re.sub('\W', ' ', message)
    message = message.lower()
    message = message.split()

    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in message:
        if word in spam_word:
            p_spam_given_message *= spam_word[word]

        if word in ham_word:
            p_ham_given_message *= ham_word[word]

    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_spam_given_message > p_ham_given_message:
        return 'spam'
    else:
        return 'needs human classification'

In [32]:
testing_data['predicted'] = testing_data['SMS'].apply(classify_test_set)
testing_data.head()

Unnamed: 0,Label,SMS,predicted
0,ham,He also knows about lunch menu only da. . I know,ham
1,ham,Yeah confirmed for you staying at that weekend,ham
2,ham,This pain couldn't have come at a worse time.,ham
3,ham,Wat r u doing?,ham
4,ham,U still painting ur wall?,ham


In [43]:
correct=0
total=1115
for i in testing_data.iterrows():
    rows=i[1]
    if rows["Label"]==rows["predicted"]:
        correct+=1
        

In [44]:
correct/total

0.9426008968609866

It's a good spam filter with an accurancy of 94,26%. Using the testing data set in order to test it.

In [45]:
inaccurancy=total-correct
inaccurancy

64

In [54]:
spam_ham=testing_data[(testing_data["Label"]=="spam")&(testing_data["predicted"]=="ham")]
ham_spam=testing_data[(testing_data["Label"]=="ham")&(testing_data["predicted"]=="spam")]

In [58]:
spam_ham.shape

(63, 3)

In [59]:
ham_spam.shape

(0, 3)

It's interesting that all the messages classified as inaccurate are effectivily spam messages classified as not spam. On the other hand, the non spam messages misclassified are 0. So,  the last one is under the "a human needs to control it". 