In [238]:
from sklearn import preprocessing
import numpy as np
import pandas as pd

### Data Loading

In [239]:
data = pd.read_csv('dataset/spam.csv', encoding='latin-1')
# Show first 5 in dataset
data.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [240]:
# Drop soem unwanted columns that do not provide insight
data = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
data = data.rename(columns={"v1":"label", "v2":"text"})
# Show last 5 in dataset
data.tail(5)

Unnamed: 0,label,text
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [241]:
data["label_tag"] = data.label.map({'ham':0, 'spam':1})
data.head(5)

Unnamed: 0,label,text,label_tag
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [242]:
# get the size of our dataset
print(data.count())
data.label.value_counts()

label        5572
text         5572
label_tag    5572
dtype: int64


ham     4825
spam     747
Name: label, dtype: int64

### Data Preparation

Training data

In [243]:
# first 4572/5572 emails
training_data = data[0:4572]
training_data_length = len(training_data.label)
training_data.head()

Unnamed: 0,label,text,label_tag
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


Testing data

In [244]:
# last 1000/5572
test_data = data[-1000:]
test_data_length = len(test_data.label)
test_data.head()

Unnamed: 0,label,text,label_tag
4572,spam,\URGENT! This is the 2nd attempt to contact U!...,1
4573,ham,:( but your not here....,0
4574,ham,Not directly behind... Abt 4 rows behind Ì_...,0
4575,spam,Congratulations ur awarded 500 of CD vouchers ...,1
4576,spam,Had your contract mobile 11 Mnths? Latest Moto...,1


#### What is the shape of our input data

Training data

In [245]:
print(training_data.shape)
print(training_data.label.shape)

(4572, 3)
(4572,)


There are 3 features and 4572 samples in our trtaining set

Test data

In [246]:
print(test_data.shape)
print(test_data.label.shape)

(1000, 3)
(1000,)


### Develop a Predictive Theory

In [247]:
import random

In [248]:
def pretty_print_text_and_label(i):
    print(training_data.label[i] + "\t:\t" + training_data.text[i][:80] + "...")

In [249]:
print("labels \t : \t texts\n")
# choose  a random spam set to analyse
# random.randrange(start, stop, step)
pretty_print_text_and_label(random.randrange(0,4572))
pretty_print_text_and_label(random.randrange(0,4572,4))
pretty_print_text_and_label(random.randrange(0,4572,50))
pretty_print_text_and_label(random.randrange(0,4572,100))
pretty_print_text_and_label(random.randrange(0,4572,200))
pretty_print_text_and_label(random.randrange(0,4572,500))
pretty_print_text_and_label(random.randrange(0,4572,800))
pretty_print_text_and_label(random.randrange(0,4572,1000))

labels 	 : 	 texts

ham	:	I can probably come by, everybody's done around  &lt;#&gt;  right?...
ham	:	S da..al r above  &lt;#&gt;...
ham	:	Okay name ur price as long as its legal! Wen can I pick them up? Y u ave x ams x...
ham	:	Hi! This is Roger from CL. How are you?...
spam	:	This message is free. Welcome to the new & improved Sex & Dogging club! To unsub...
ham	:	says the  &lt;#&gt;  year old with a man and money. I'm down to my last  &lt;#&g...
spam	:	Last Chance! Claim ur å£150 worth of discount vouchers today! Text SHOP to 85023...
ham	:	But i'll b going 2 sch on mon. My sis need 2 take smth....


It is very easy to distinguish a spam text from a non-spam text (in this case ham) . Spam text occasionaly contain words like **free**, **sell**, **promotion**, **deal**, **offer**, **discount**, **lucky** e.t.c. This way we can let our network learn some of the words assocaiated with spams and based on such criteria we can classify a text as a spam or not.

#### Theory Validation

In [250]:
from collections import Counter
import numpy as np
import pprint 

In [251]:
spam_counts = Counter()
ham_counts = Counter()
total_counts = Counter()
spam_ham_ratios = Counter()

pp = pprint.PrettyPrinter(indent=4)

In [252]:
for i in range(training_data_length):
    if(training_data.label[i] == 0):
        for word in training_data.text[i].split(" "):
            ham_counts[word] += 1
            total_counts[word] += 1
    else:
        for word in training_data.text[i].split(" "):
            spam_counts[word] += 1
            total_counts[word] += 1

In [253]:
pp.pprint(spam_counts.most_common()[0:30])

[   ('to', 1758),
    ('you', 1368),
    ('I', 1204),
    ('a', 1094),
    ('the', 989),
    ('and', 736),
    ('in', 652),
    ('is', 648),
    ('i', 612),
    ('u', 567),
    ('for', 529),
    ('my', 522),
    ('', 521),
    ('of', 498),
    ('me', 465),
    ('your', 447),
    ('on', 410),
    ('have', 402),
    ('2', 371),
    ('that', 358),
    ('are', 327),
    ('it', 313),
    ('or', 304),
    ('call', 303),
    ('at', 300),
    ('be', 299),
    ('not', 292),
    ('with', 281),
    ('get', 270),
    ('will', 266)]


In [254]:
for word,count in list(total_counts.most_common()):
    if(count > 100):
        spam_ham_ratio = spam_counts[word] / float(ham_counts[word]+1)
        spam_ham_ratios[word] = spam_ham_ratio

for word,ratio in spam_ham_ratios.most_common():
    if(ratio > 1):
        spam_ham_ratios[word] = np.log(ratio)
    else:
        spam_ham_ratios[word] = -np.log((1 / (ratio+0.01)))

In [255]:
# words most frequently seen in a text with a "spam" label
pp.pprint(spam_ham_ratios.most_common()[0:30])

[   ('to', 7.4719320782451222),
    ('you', 7.2211050981824956),
    ('I', 7.0934046258687662),
    ('a', 6.9975959829819265),
    ('the', 6.8966943316227125),
    ('and', 6.6012301187288767),
    ('in', 6.4800445619266531),
    ('is', 6.4738906963522744),
    ('i', 6.4167322825123261),
    ('u', 6.3403593037277517),
    ('for', 6.2709884318582994),
    ('my', 6.2576675878826391),
    ('', 6.2557500417533669),
    ('of', 6.2106000770246528),
    ('me', 6.1420374055873559),
    ('your', 6.1025585946135692),
    ('on', 6.0161571596983539),
    ('have', 5.9964520886190211),
    ('2', 5.916202062607435),
    ('that', 5.8805329864007003),
    ('are', 5.7899601708972535),
    ('it', 5.7462031905401529),
    ('or', 5.7170277014062219),
    ('call', 5.7137328055093688),
    ('at', 5.7037824746562009),
    ('be', 5.7004435733906869),
    ('not', 5.6767538022682817),
    ('with', 5.6383546693337454),
    ('get', 5.598421958998375),
    ('will', 5.5834963087816991)]


In [256]:
# words most frequently seen in a text with a "ham" label
pp.pprint(list(reversed(spam_ham_ratios.most_common()))[0:30])

[   ('he', 4.6151205168412597),
    ('there', 4.6151205168412597),
    ('no', 4.6249728132842707),
    ('been', 4.6443908991413725),
    ('one', 4.6443908991413725),
    ('our', 4.6443908991413725),
    ('If', 4.6634390941120669),
    ('No', 4.6728288344619058),
    ('But', 4.7004803657924166),
    ('still', 4.7095302013123339),
    ('text', 4.7184988712950942),
    ('need', 4.7361984483944957),
    ('only', 4.7449321283632502),
    ('as', 4.7449321283632502),
    ('n', 4.7621739347977563),
    ('How', 4.7706846244656651),
    ("I'll", 4.7706846244656651),
    ('what', 4.7706846244656651),
    ('going', 4.7874917427820458),
    ('then', 4.7874917427820458),
    ('Call', 4.7957905455967413),
    ('...', 4.8040210447332568),
    ('time', 4.8202815656050371),
    ('want', 4.8441870864585912),
    ('about', 4.8520302639196169),
    ('send', 4.8675344504555822),
    ('by', 4.8978397999509111),
    ('was', 5.0304379213924353),
    ('?', 5.0498560072495371),
    ('now', 5.0498560072495371)]


### Transform Text into Numbers

Neural Networks only understand numbers hence we have to find a way to represent our text inputs in a way it can understand

In [257]:
vocab = set(total_counts.keys())
vocab_size = len(vocab)
print(vocab_size)

13874


We can see that from all our dataset, we have a total of **13874** unique words. Use this to build up our vocabulary vector containing columns of all these words.

Because, **13874**, can be a large size in memory (a matrix of size **13874 by 4572**), lets allocate its memory once with default zeros and will only change its contents accordingly later.

In [258]:
vocab_vector = np.zeros((1, vocab_size))
pp.pprint(vocab_vector.shape)
pp.pprint(vocab_vector)

(1, 13874)
array([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])


Now, let's create a dictionary that allows us to look at every word in our vocabulary and map it to the `vocab_vector` column.

In [261]:
#  Maps a word to its column in the vocab_vector
word_column_dict = {}

for i, word in enumerate(vocab):
    # {key: value} is {word: column}
    word_column_dict[word] = i
    
pp.pprint(word_column_dict)

{   '': 0,
    '!': 2147,
    '!!': 13153,
    '!!!': 1502,
    '!!!!': 6397,
    "!!''.": 4423,
    '!1': 3267,
    '#': 9205,
    '#150': 8932,
    '#5000': 7650,
    '$': 3945,
    '$1': 7241,
    '$2': 10719,
    '$350': 8268,
    '$5.00': 13527,
    '$50': 2758,
    '$50...': 13657,
    '$700': 5488,
    '$900': 6048,
    '$95/pax,': 3217,
    '%': 7706,
    '%.': 8614,
    '%of': 6631,
    '&': 12062,
    '&SAM': 7235,
    '&XXX': 1297,
    '&amp;': 13869,
    '&gt;:(': 5164,
    '&it': 8398,
    '&lt;#&gt;': 5318,
    '&lt;)': 6334,
    '&lt;3': 11479,
    '&lt;DECIMAL&gt;': 12981,
    '&lt;EMAIL&gt;': 6195,
    '&lt;TIME&gt;': 13218,
    '&lt;URL&gt;': 849,
    '&othrs': 9707,
    "'": 13649,
    "''": 8873,
    "''OK'',": 4486,
    "'An": 12637,
    "'Comfort'": 8025,
    "'IF": 5575,
    "'Luxury'": 7864,
    "'MARRIED'": 1227,
    "'Maangalyam": 7401,
    "'Melle": 6143,
    "'Need'": 11133,
    "'SIMPLE'": 5186,
    "'Uptown": 11565,
    "'Wnevr": 5505,
    "'anything'": 51

We are going to use the count of words as the input to our neural network. The `vocab_vector` will have columns for all the words in our training data in the form of `{key: value}` i.e `{word: count}` as held by the `word_column_dict`  python `Dictionary`. The individual word counts in any particular text is updated from 0 to a number based on a word's total count in any single text.

This means that the words with a higher count might have a higher weight in determining whether a text is a spam or not.

In [263]:
def update_input_layer(text):
    pp.pprint(text)
    global vocab_vector
    
    # clear out previous state, reset the vector to be all 0s
    vocab_vector *= 0
    for word in text.split(" "):
        vocab_vector[0][word_column_dict[word]] += 1

update_input_layer(training_data["text"][random.randrange(0,4572,4)])

('Camera - You are awarded a SiPix Digital Camera! call 09061221066 fromm '
 'landline. Delivery within 28 days')


### Build the SpamClassificationNeuralNetwork