In [5]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

# What I Have So Far...

In [6]:
# Grab and process the raw data.
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
print(data_path)
sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
print(sms_raw.head())
sms_raw.columns = ['spam', 'message']
print(sms_raw.columns)
print(sms_raw.head())

https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/master/sms_spam_collection/SMSSpamCollection
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
Index(['spam', 'message'], dtype='object')
   spam                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [7]:
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'sale', 'limited','urgent']

for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    sms_raw[str(key)] = sms_raw.message.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )
print(sms_raw.head())

   spam                                            message  click  offer  \
0   ham  Go until jurong point, crazy.. Available only ...  False  False   
1   ham                      Ok lar... Joking wif u oni...  False  False   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...  False  False   
3   ham  U dun say so early hor... U c already then say...  False  False   
4   ham  Nah I don't think he goes to usf, he lives aro...  False  False   

   winner    buy   free   cash   sale  limited  urgent  
0   False  False  False  False  False    False   False  
1   False  False  False  False  False    False   False  
2   False  False  False  False  False    False   False  
3   False  False  False  False  False    False   False  
4   False  False  False  False  False    False   False  


In [8]:
sms_raw['allcaps'] = sms_raw.message.str.isupper()
print(sms_raw.head())

   spam                                            message  click  offer  \
0   ham  Go until jurong point, crazy.. Available only ...  False  False   
1   ham                      Ok lar... Joking wif u oni...  False  False   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...  False  False   
3   ham  U dun say so early hor... U c already then say...  False  False   
4   ham  Nah I don't think he goes to usf, he lives aro...  False  False   

   winner    buy   free   cash   sale  limited  urgent  allcaps  
0   False  False  False  False  False    False   False    False  
1   False  False  False  False  False    False   False    False  
2   False  False  False  False  False    False   False    False  
3   False  False  False  False  False    False   False    False  
4   False  False  False  False  False    False   False    False  


In [9]:
sms_raw['spam'] = (sms_raw['spam'] == 'spam')
# Note that if you run this cell a second time everything will become false.
# So... Don't.
print(sms_raw.head())

    spam                                            message  click  offer  \
0  False  Go until jurong point, crazy.. Available only ...  False  False   
1  False                      Ok lar... Joking wif u oni...  False  False   
2   True  Free entry in 2 a wkly comp to win FA Cup fina...  False  False   
3  False  U dun say so early hor... U c already then say...  False  False   
4  False  Nah I don't think he goes to usf, he lives aro...  False  False   

   winner    buy   free   cash   sale  limited  urgent  allcaps  
0   False  False  False  False  False    False   False    False  
1   False  False  False  False  False    False   False    False  
2   False  False  False  False  False    False   False    False  
3   False  False  False  False  False    False   False    False  
4   False  False  False  False  False    False   False    False  


In [10]:
data = sms_raw[keywords + ['allcaps']]
print(data.head())
target = sms_raw['spam']
print(target.head())

   click  offer  winner    buy   free   cash   sale  limited  urgent  allcaps
0  False  False   False  False  False  False  False    False   False    False
1  False  False   False  False  False  False  False    False   False    False
2  False  False   False  False  False  False  False    False   False    False
3  False  False   False  False  False  False  False    False   False    False
4  False  False   False  False  False  False  False    False   False    False
0    False
1    False
2     True
3    False
4    False
Name: spam, dtype: bool


In [12]:
# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

print(bnb)

# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

print(y_pred)
# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))
print(data.shape)

print('success rate = %s' % ((data.shape[0] - (target != y_pred).sum())/data.shape[0]))

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
[False False False ..., False False False]
Number of mislabeled points out of a total 5572 points : 604
(5572, 10)
success rate = 0.89160086145


# Going to try to increase the success rate!

In [None]:
sms_raw[lower_message] = sms_raw.message.str