In [1]:
from sklearn import preprocessing
import numpy as np
import pandas as pd

### Data Loading

In [2]:
data = pd.read_csv('dataset/spam.csv', encoding='latin-1')
# Show first 5 in dataset
data.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Drop soem unwanted columns that do not provide insight
data = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
data = data.rename(columns={"v1":"label", "v2":"text"})
# Show last 5 in dataset
data.tail(5)

Unnamed: 0,label,text
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [4]:
data["label_tag"] = data.label.map({'ham':0, 'spam':1})
data.head(5)

Unnamed: 0,label,text,label_tag
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [5]:
# get the size of our dataset
print(data.count())
data.label.value_counts()

label        5572
text         5572
label_tag    5572
dtype: int64


ham     4825
spam     747
Name: label, dtype: int64

### Data Preparation

Training data

In [6]:
# first 4572/5572 emails
training_data = data[0:4572]
training_data_length = len(training_data.label)
training_data.head()

Unnamed: 0,label,text,label_tag
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


Testing data

In [7]:
# last 1000/5572
test_data = data[-1000:]
test_data_length = len(test_data.label)
test_data.head()

Unnamed: 0,label,text,label_tag
4572,spam,\URGENT! This is the 2nd attempt to contact U!...,1
4573,ham,:( but your not here....,0
4574,ham,Not directly behind... Abt 4 rows behind Ì_...,0
4575,spam,Congratulations ur awarded 500 of CD vouchers ...,1
4576,spam,Had your contract mobile 11 Mnths? Latest Moto...,1


#### What is the shape of our input data

Training data

In [8]:
print(training_data.shape)
print(training_data.label.shape)

(4572, 3)
(4572,)


There are 3 features and 4572 samples in our trtaining set

Test data

In [9]:
print(test_data.shape)
print(test_data.label.shape)

(1000, 3)
(1000,)


### Develop a Predictive Theory

In [10]:
import random

In [11]:
def pretty_print_text_and_label(i):
    print(training_data.label[i] + "\t:\t" + training_data.text[i][:80] + "...")

In [12]:
print("labels \t : \t texts\n")
# choose  a random spam set to analyse
# random.randrange(start, stop, step)
pretty_print_text_and_label(random.randrange(0,4572))
pretty_print_text_and_label(random.randrange(0,4572,4))
pretty_print_text_and_label(random.randrange(0,4572,50))
pretty_print_text_and_label(random.randrange(0,4572,100))
pretty_print_text_and_label(random.randrange(0,4572,200))
pretty_print_text_and_label(random.randrange(0,4572,500))
pretty_print_text_and_label(random.randrange(0,4572,800))
pretty_print_text_and_label(random.randrange(0,4572,1000))

labels 	 : 	 texts

ham	:	Sorry me going home first... Daddy come fetch Ì_ later......
ham	:	I jokin oni lar.. ÌÏ busy then i wun disturb Ì_....
ham	:	Feb  &lt;#&gt;  is \I LOVE U\" day. Send dis to all ur \"VALUED FRNDS\" evn me. ...
ham	:	Pathaya enketa maraikara pa'...
ham	:	K.then any other special?...
ham	:	No..but heard abt tat.....
ham	:	Nice.nice.how is it working?...
ham	:	But i'll b going 2 sch on mon. My sis need 2 take smth....


It is very easy to distinguish a spam text from a non-spam text (in this case ham) . Spam text occasionaly contain words like **free**, **sell**, **promotion**, **deal**, **offer**, **discount**, **lucky** e.t.c. This way we can let our network learn some of the words assocaiated with spams and based on such criteria we can classify a text as a spam or not.

#### Theory Validation

In [13]:
from collections import Counter
import numpy as np
import pprint 

In [14]:
spam_counts = Counter()
ham_counts = Counter()
total_counts = Counter()
spam_ham_ratios = Counter()

pp = pprint.PrettyPrinter(indent=4)

In [15]:
for i in range(training_data_length):
    if(training_data.label[i] == 0):
        for word in training_data.text[i].split(" "):
            ham_counts[word] += 1
            total_counts[word] += 1
    else:
        for word in training_data.text[i].split(" "):
            spam_counts[word] += 1
            total_counts[word] += 1

In [16]:
pp.pprint(spam_counts.most_common()[0:30])

[   ('to', 1758),
    ('you', 1368),
    ('I', 1204),
    ('a', 1094),
    ('the', 989),
    ('and', 736),
    ('in', 652),
    ('is', 648),
    ('i', 612),
    ('u', 567),
    ('for', 529),
    ('my', 522),
    ('', 521),
    ('of', 498),
    ('me', 465),
    ('your', 447),
    ('on', 410),
    ('have', 402),
    ('2', 371),
    ('that', 358),
    ('are', 327),
    ('it', 313),
    ('or', 304),
    ('call', 303),
    ('at', 300),
    ('be', 299),
    ('not', 292),
    ('with', 281),
    ('get', 270),
    ('will', 266)]


In [17]:
for word,count in list(total_counts.most_common()):
    if(count > 100):
        spam_ham_ratio = spam_counts[word] / float(ham_counts[word]+1)
        spam_ham_ratios[word] = spam_ham_ratio

for word,ratio in spam_ham_ratios.most_common():
    if(ratio > 1):
        spam_ham_ratios[word] = np.log(ratio)
    else:
        spam_ham_ratios[word] = -np.log((1 / (ratio+0.01)))

In [18]:
# words most frequently seen in a text with a "spam" label
pp.pprint(spam_ham_ratios.most_common()[0:30])

[   ('to', 7.4719320782451222),
    ('you', 7.2211050981824956),
    ('I', 7.0934046258687662),
    ('a', 6.9975959829819265),
    ('the', 6.8966943316227125),
    ('and', 6.6012301187288767),
    ('in', 6.4800445619266531),
    ('is', 6.4738906963522744),
    ('i', 6.4167322825123261),
    ('u', 6.3403593037277517),
    ('for', 6.2709884318582994),
    ('my', 6.2576675878826391),
    ('', 6.2557500417533669),
    ('of', 6.2106000770246528),
    ('me', 6.1420374055873559),
    ('your', 6.1025585946135692),
    ('on', 6.0161571596983539),
    ('have', 5.9964520886190211),
    ('2', 5.916202062607435),
    ('that', 5.8805329864007003),
    ('are', 5.7899601708972535),
    ('it', 5.7462031905401529),
    ('or', 5.7170277014062219),
    ('call', 5.7137328055093688),
    ('at', 5.7037824746562009),
    ('be', 5.7004435733906869),
    ('not', 5.6767538022682817),
    ('with', 5.6383546693337454),
    ('get', 5.598421958998375),
    ('will', 5.5834963087816991)]


In [19]:
# words most frequently seen in a text with a "ham" label
pp.pprint(list(reversed(spam_ham_ratios.most_common()))[0:30])

[   ('there', 4.6151205168412597),
    ('he', 4.6151205168412597),
    ('no', 4.6249728132842707),
    ('our', 4.6443908991413725),
    ('one', 4.6443908991413725),
    ('been', 4.6443908991413725),
    ('If', 4.6634390941120669),
    ('No', 4.6728288344619058),
    ('But', 4.7004803657924166),
    ('still', 4.7095302013123339),
    ('text', 4.7184988712950942),
    ('need', 4.7361984483944957),
    ('as', 4.7449321283632502),
    ('only', 4.7449321283632502),
    ('n', 4.7621739347977563),
    ("I'll", 4.7706846244656651),
    ('what', 4.7706846244656651),
    ('How', 4.7706846244656651),
    ('then', 4.7874917427820458),
    ('going', 4.7874917427820458),
    ('Call', 4.7957905455967413),
    ('...', 4.8040210447332568),
    ('time', 4.8202815656050371),
    ('want', 4.8441870864585912),
    ('about', 4.8520302639196169),
    ('send', 4.8675344504555822),
    ('by', 4.8978397999509111),
    ('was', 5.0304379213924353),
    ('?', 5.0498560072495371),
    ('now', 5.0498560072495371)]


### Transform Text into Numbers

Neural Networks only understand numbers hence we have to find a way to represent our text inputs in a way it can understand

In [20]:
vocab = set(total_counts.keys())
vocab_size = len(vocab)
print(vocab_size)

13874


We can see that from all our dataset, we have a total of **13874** unique words. Use this to build up our vocabulary vector containing columns of all these words.

Because, **13874**, can be a large size in memory (a matrix of size **13874 by 4572**), lets allocate its memory once with default zeros and will only change its contents accordingly later.

In [21]:
vocab_vector = np.zeros((1, vocab_size))
pp.pprint(vocab_vector.shape)
pp.pprint(vocab_vector)

(1, 13874)
array([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])


Now, let's create a dictionary that allows us to look at every word in our vocabulary and map it to the `vocab_vector` column.

In [22]:
#  Maps a word to its column in the vocab_vector
word_column_dict = {}

for i, word in enumerate(vocab):
    # {key: value} is {word: column}
    word_column_dict[word] = i
    
pp.pprint(word_column_dict)

{   '': 0,
    '!': 6573,
    '!!': 6076,
    '!!!': 2577,
    '!!!!': 6201,
    "!!''.": 8155,
    '!1': 3601,
    '#': 6048,
    '#150': 12446,
    '#5000': 7177,
    '$': 3087,
    '$1': 1367,
    '$2': 13666,
    '$350': 7222,
    '$5.00': 13068,
    '$50': 1484,
    '$50...': 7035,
    '$700': 11279,
    '$900': 4446,
    '$95/pax,': 3780,
    '%': 6745,
    '%.': 6231,
    '%of': 2277,
    '&': 9799,
    '&SAM': 3861,
    '&XXX': 789,
    '&amp;': 11599,
    '&gt;:(': 13352,
    '&it': 6281,
    '&lt;#&gt;': 13672,
    '&lt;)': 6224,
    '&lt;3': 8044,
    '&lt;DECIMAL&gt;': 11391,
    '&lt;EMAIL&gt;': 7317,
    '&lt;TIME&gt;': 11333,
    '&lt;URL&gt;': 11315,
    '&othrs': 10011,
    "'": 3155,
    "''": 162,
    "''OK'',": 9038,
    "'An": 10245,
    "'Comfort'": 6624,
    "'IF": 10045,
    "'Luxury'": 6885,
    "'MARRIED'": 1036,
    "'Maangalyam": 9707,
    "'Melle": 13753,
    "'Need'": 1538,
    "'SIMPLE'": 5834,
    "'Uptown": 6183,
    "'Wnevr": 9641,
    "'anything'": 35

We are going to use the count of words as the input to our neural network. The `vocab_vector` will have columns for all the words in our training data in the form of `{key: value}` i.e `{word: count}` as held by the `word_column_dict`  python `Dictionary`. The individual word counts in any particular text is updated from 0 to a number based on a word's total count in any single text.

This means that the words with a higher count might have a higher weight in determining whether a text is a spam or not.

In [23]:
def update_input_layer(text):
    pp.pprint(text)
    global vocab_vector
    
    # clear out previous state, reset the vector to be all 0s
    vocab_vector *= 0
    for word in text.split(" "):
        vocab_vector[0][word_column_dict[word]] += 1

update_input_layer(training_data["text"][random.randrange(0,4572,4)])

'Can you open the door?'


### Build the SpamClassificationNeuralNetwork

In [24]:
import time
import sys

In [28]:
# Let's tweak our network from before to model these phenomena
class SpamClassificationNeuralNetwork(object):
    def __init__(self, training_data, num_hidden_nodes = 10, num_epochs = 10, learning_rate = 0.1):
        # set our random number generator 
        np.random.seed(1)
        # pre-process data
        self.pre_process_data(training_data)
        
        self.num_features = len(self.vocab)
        self.vocab_vector = np.zeros((1, len(self.vocab)))
        self.num_input_nodes = self.num_features
        self.num_hidden_nodes = num_hidden_nodes
        self.num_epochs = num_epochs
        self.num_output_nodes = 1
        self.learning_rate = learning_rate

        # Initialize weights
        self.weights_i_h = np.random.randn(self.num_input_nodes, self.num_hidden_nodes)
        self.weights_h_o = np.random.randn(self.num_hidden_nodes, self.num_output_nodes)
        
    def forward_backward_propagate(self, text, label):
        ### Forward pass ###
        # Input Layer
        self.update_input_layer(text)
        # Hidden layer
        hidden_layer = self.vocab_vector.dot(self.weights_i_h)
        # Output layer
        output_layer = self.sigmoid(hidden_layer.dot(self.weights_h_o))
        
        ### Backward pass ###
        # Output error
        output_layer_error = output_layer - label 
        output_layer_delta = output_layer_error * self.sigmoid_derivative(output_layer)

        # Backpropagated error - to the hidden layer
        hidden_layer_error = output_layer_delta.dot(self.weights_h_o.T)
        # hidden layer gradients - no nonlinearity so it's the same as the error
        hidden_layer_delta = output_layer_error 

        # update the weights - with grdient descent
        self.weights_h_o -= hidden_layer.T.dot(output_layer_delta) * self.learning_rate 
        self.weights_i_h -= self.vocab_vector.T.dot(hidden_layer_delta) * self.learning_rate 
        
        if(np.abs(output_layer_error) < 0.5):
                self.correct_so_far += 1
        
        
    
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))
    
    
    def sigmoid_derivative(self,x):
        return x * (1 - x)

        
    def train(self):
        for epoch in range(self.num_epochs):
            self.correct_so_far = 0
            start = time.time()

            for i in range(len(training_data)):
                # Forward and Back Propagation
                self.forward_backward_propagate(training_data["text"][i], training_data["label_tag"][i])

                samples_per_second = i / float(time.time() - start + 0.001)

                sys.stdout.write("\rEpoch: "+ str(epoch)
                                 +" Progress: " + str(100 * i/float(len(training_data)))[:4] 
                                 + " % Speed(samples/sec): " + str(samples_per_second)[0:5] 
                                 + " #Correct: " + str(self.correct_so_far) 
                                 + " #Trained: " + str(i+1) 
                                 + " Training Accuracy: " + str(self.correct_so_far * 100 / float(i+1))[:4] + "%")
            print("")
        
    def pre_process_data(self, training_data):
        vocab = set()
        
        for review in training_data["text"]:
            for word in review.split(" "):
                vocab.add(word)
                
        self.vocab = list(vocab)
        
        self.word_to_column = {}
        for i, word in enumerate(self.vocab):
            self.word_to_column[word] = i
            
    def update_input_layer(self, text):
        global vocab_vector

        # clear out previous state, reset the vector to be all 0s
        self.vocab_vector *= 0
        for word in text.split(" "):
            self.vocab_vector[0][word_column_dict[word]] += 1
            

In [29]:
nn = SpamClassificationNeuralNetwork(training_data, num_epochs = 10, learning_rate=0.01)

In [30]:
nn.train()

Epoch: 0 Progress: 99.9 % Speed(texts/sec): 2107. #Correct: 3296 #Trained: 4572 Training Accuracy: 72.0%
Epoch: 1 Progress: 99.9 % Speed(texts/sec): 2165. #Correct: 3887 #Trained: 4572 Training Accuracy: 85.0%
Epoch: 2 Progress: 99.9 % Speed(texts/sec): 2187. #Correct: 4042 #Trained: 4572 Training Accuracy: 88.4%
Epoch: 3 Progress: 99.9 % Speed(texts/sec): 2136. #Correct: 4139 #Trained: 4572 Training Accuracy: 90.5%
Epoch: 4 Progress: 99.9 % Speed(texts/sec): 2157. #Correct: 4192 #Trained: 4572 Training Accuracy: 91.6%
Epoch: 5 Progress: 99.9 % Speed(texts/sec): 2181. #Correct: 4251 #Trained: 4572 Training Accuracy: 92.9%
Epoch: 6 Progress: 99.9 % Speed(texts/sec): 2144. #Correct: 4290 #Trained: 4572 Training Accuracy: 93.8%
Epoch: 7 Progress: 99.9 % Speed(texts/sec): 2137. #Correct: 4314 #Trained: 4572 Training Accuracy: 94.3%
Epoch: 8 Progress: 99.9 % Speed(texts/sec): 2158. #Correct: 4353 #Trained: 4572 Training Accuracy: 95.2%
Epoch: 9 Progress: 99.9 % Speed(texts/sec): 2150. #Corr