In [89]:
import pandas as pd
import numpy as np

In [90]:
data = pd.read_csv('spam_and_ham_classification.csv')
data.head()

Unnamed: 0,label,text
0,ham,into the kingdom of god and those that are ent...
1,spam,there was flow at hpl meter 1505 on april firs...
2,ham,take a look at this one campaign for bvyhprice...
3,spam,somu wrote actually thats what i was looking f...
4,spam,fathi boudra wrote i fixed the issue in the sv...


In [91]:
# ham example
data.iloc[0, 1]

"into the kingdom of god and those that are entering in he lord pardon escapenumber us in this thing we pray thee have us excused escapenumbernot therefore o escapenumber believers to look into the grave for to you it the holy ghost and escapenumber therefore being a holy habitation unto forth the words of truth and escapenumber soberness escapenumber but in all probability hath made them mad escapenumber and though escapenumber blessed be god all do that of jesus christ they see the necessity of escapenumber closing with a more than almost christians but is heaven so small a escapenumber trifle in men's esteem as not to be total renovation of the whole man escapenumber by the righteousness of complete though we be delivered from the power escapenumber we are not candle of the lord shines out and your redeemer lifts up the escapenumber inward holiness as indeed sometimes they do though in a worms destroy escapenumber them yet even in their flesh shall they see a mind to see jesus but e

In [92]:
# spam example
data.iloc[1, 1]

"there was flow at hpl meter 1505 on april first that didn ' t have a deal ticket out there . . . .\nbut now there is a deal ticket out there to cover it . . . . 740374\nplease link it or whatever you have to do to make it work\nthanks\nlee"

# **TEXT PREPROCESSING**
from scratch

1. Lower the Text
2. Remove punctuation
3. Remove stopwords
4. Vectorize with BOW

In [93]:
class TextPreprocess:
  def __init__(self):
    self.result =[]
    self.puncts = set([",", ":", "!", "-", ";", "'", "."])
    self.text = None

    # file here: https://gist.github.com/larsyencken/1440509
    with open('stopwords.txt', 'r') as f:
      self.stopwords = [line[:-1] for line in f.readlines()]

  def transform(self, text):
    self.text = text

    for word in self.text.split():
      if word in self.puncts: continue  # first little step in punct removal
      word = word.lower()  # lower
      # stopwords
      if word in self.stopwords:
        text.replace(word, '')

      if word[0] in self.puncts:
        word = word[1:]

      if word[-1] in self.puncts:
        word = word[:-1]

      self.result.append(word)

    return self.result


In [94]:
processor = TextPreprocess()
# quote from 1984 by George Orwell
txt = 'Freedom is the freedom to say that two plus two make four. If that is granted, all else follows.'
output = processor.transform(txt)

Compare The Results!

In [95]:
print(processor.text)
print('⏬⏬⏬⏬⏬')
print(output)

Freedom is the freedom to say that two plus two make four. If that is granted, all else follows.
⏬⏬⏬⏬⏬
['freedom', 'is', 'the', 'freedom', 'to', 'say', 'that', 'two', 'plus', 'two', 'make', 'four', 'if', 'that', 'is', 'granted', 'all', 'else', 'follows']


### Now Vectorization
I'll use **Bag Of Words** as it's simple but efficient

In [96]:
import time

In [97]:
frames = [data[data['label'] == 'spam'][:500],
          data[data['label'] == 'ham'][:500]]

small_df = pd.concat(frames)
small_df = small_df.sample(frac=1)

In [98]:
small_df.shape

(1000, 2)

In [99]:
small_df.head(10)

Unnamed: 0,label,text
710,ham,summer sale on v i agra and other drugs .\nyou...
848,ham,do you want escapenumberoo to escapenumberooo ...
765,spam,message sent from the pjm - customer - info ma...
579,ham,special summer offer from canadianpharmacy esc...
988,ham,dear sir\ni am dr fred azu . an accountant wit...
206,ham,escapelong escapelong escapelong escapelong es...
489,spam,gentlemen - thanks for your support of ena ' s...
902,spam,"hey vince ,\nsince i saw you last , the "" real..."
79,ham,best quality pills for the fraction of the pri...
6,spam,alert name bush officers ex cia chief tenet a ...


In [None]:
total = []
for text in small_df['text']:
  transformed = processor.transform(text)
  total += transformed

print(set(total))

In [102]:
vocab = set(total)

In [104]:
vocab = list(vocab)
vocab[:10]

['',
 'dsize',
 'cattle',
 'locanda',
 'similarly',
 'smyrno',
 'îòðèòªô¤¶©',
 'folk',
 'contextualizer',
 'ööàà°üà¨£¨']

In [105]:
len(vocab)

23427

<img src='https://media1.tenor.com/m/S_to1tY3ixUAAAAd/breaking-bad-walter-white.gif' width=30%>