# Homework 2 - SMS Spam detection

Import Modules 

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import scipy as sp

Read data and change column names 

In [3]:
colnames = ['label', 'sms_message']
df = pd.read_csv('SMSSpamCollection', sep='\t', names=colnames)

Preview data

In [4]:
df.head(n=5)

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Stats about data 

In [5]:
df.groupby('label').describe()

Unnamed: 0_level_0,sms_message,sms_message,sms_message,sms_message
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [20]:
print(df.shape)

(5572, 2)


We can see asymetric data (label) distribution

In [7]:
print(df["label"].value_counts())

ham     4825
spam     747
Name: label, dtype: int64


## Data Processing 

In [8]:
df['label'] = df['label'].map({'ham': 0, 'spam': 1}) #Getting binary values for spam or not spam

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df['sms_message'], df['label'], random_state=11)

In [10]:
count_vector = CountVectorizer() #stop words not used because SMS messages are very short
training_data = count_vector.fit_transform(X_train)
testing_data = count_vector.transform(X_test)

## Naive Bayes built in solution

In [11]:
multi_NB = MultinomialNB()
multi_NB.fit(training_data, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### F-score as primary quality control due to bayes approach of imbalanced sample

In [12]:
print(classification_report(y_test, multi_NB.predict(testing_data)))

             precision    recall  f1-score   support

          0       0.99      1.00      0.99      1216
          1       0.98      0.90      0.94       177

avg / total       0.98      0.98      0.98      1393



## Naive Bayes from scratch

Probability of drawing random sample in test sample and receiving spam or ham respectively 

In [13]:
prob_spam = sum(y_train) / len(y_train)
prob_ham = 1 - prob_spam

Probability of a text given(!) ham 

In [14]:
ham_loc = np.where(y_train == 0)
ham = training_data.tocsr()[ham_loc]

ham_freq = ham.toarray().sum(axis=0)+1
prob_ham2 = ham_freq / (sum(ham_freq))

Probability of a text given(!) spam 

In [15]:
spam_loc = np.where(y_train == 1)
spam = training_data.tocsr()[spam_loc]

spam_freq = spam.toarray().sum(axis=0)+1
prob_spam2 = spam_freq / (sum(spam_freq))

Function to determine log and bayes probabilities 

In [16]:
def decider(key):
    val = sp.sparse.find(key)
    prob_ham3 = np.log(prob_ham)
    prob_spam3 = np.log(prob_spam)
    
    
    for a in range(len(val[1])):
        prob_ham3 =+ np.log(prob_ham2[val[1][a]])*val[2][a]
        prob_spam3 =+ np.log(prob_spam2[val[1][a]])*val[2][a]

    if prob_spam3 >= prob_ham3:
        return 1
    else:
        return 0

Running the loop through testing data

In [17]:
report = []
for a in testing_data:
    report.append(decider(a))

Get results for accuracy

In [18]:
print(classification_report(y_test, report))

             precision    recall  f1-score   support

          0       0.94      0.78      0.85      1216
          1       0.30      0.63      0.41       177

avg / total       0.86      0.76      0.80      1393



## Conclusion 

Due to built-in optimization of the MultinomialNB module, the hands-on method of coding a Naive Bayes solution yields less accurate results. The hands-on method does, however, yield reasonable results for classifying an SMS as either spam or not spam.   