In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# nltk.download('stopwords')

In [6]:
# get data files
TRAIN_DATA_URL = "https://raw.githubusercontent.com/beaucarnes/fcc_python_curriculum/master/sms/train-data.tsv"
TEST_DATA_URL = "https://raw.githubusercontent.com/beaucarnes/fcc_python_curriculum/master/sms/valid-data.tsv"

train_file_path = tf.keras.utils.get_file("train-data.tsv", TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file("valid-data.tsv", TEST_DATA_URL)


Downloading data from https://raw.githubusercontent.com/beaucarnes/fcc_python_curriculum/master/sms/train-data.tsv
Downloading data from https://raw.githubusercontent.com/beaucarnes/fcc_python_curriculum/master/sms/valid-data.tsv


In [7]:
train_data = pd.read_csv(train_file_path,sep="\t")
test_data = pd.read_csv(test_file_path,sep="\t")

In [8]:
train_data.head()

Unnamed: 0,ham,"ahhhh...just woken up!had a bad dream about u tho,so i dont like u right now :) i didnt know anything about comedy night but i guess im up for it."
0,ham,you can never do nothing
1,ham,"now u sound like manky scouse boy steve,like! ..."
2,ham,mum say we wan to go then go... then she can s...
3,ham,never y lei... i v lazy... got wat? dat day ü ...
4,ham,in xam hall boy asked girl tell me the startin...


In [9]:
test_data.head()

Unnamed: 0,ham,i am in hospital da. . i will return home in evening
0,ham,"not much, just some textin'. how bout you?"
1,ham,i probably won't eat at all today. i think i'm...
2,ham,don‘t give a flying monkeys wot they think and...
3,ham,who are you seeing?
4,ham,your opinion about me? 1. over 2. jada 3. kusr...


In [10]:
def process_message(text):
    
    #text = text.to_list()

    processed = [w.lower() for w in nltk.word_tokenize(text) if w not in set(stopwords.words('english'))]
    
    return processed

print(test_data.iloc[0,1])
print(process_message(test_data.iloc[0,1]))


not much, just some textin'. how bout you?
['much', ',', 'textin', "'", '.', 'bout', '?']


In [11]:

ham_train = train_data.loc[train_data.iloc[:,0] == 'ham']

ham_proc_train = ham_train.iloc[:,1].apply(process_message)

vocabulary_ham = [inner for outer in ham_proc_train.to_list() for inner in outer]

freq = nltk.FreqDist(vocabulary_ham)
print(freq.most_common(10))

[('.', 2855), (',', 1156), ('?', 1006), ('...', 988), ('u', 715), ('!', 610), (';', 590), ('&', 569), (':', 410), ("'s", 324)]


In [12]:

spam_train = train_data.loc[train_data.iloc[:,0] == 'spam']

spam_proc_train = spam_train.iloc[:,1].apply(process_message)

vocabulary_spam = [inner for outer in spam_proc_train.to_list() for inner in outer]

freq = nltk.FreqDist(vocabulary_spam)
print(freq.most_common(10))

[('.', 661), ('!', 413), (',', 292), ('call', 252), ('free', 167), ('&', 137), ('?', 137), (':', 134), ('2', 128), ('txt', 121)]


In [14]:
from sklearn.feature_extraction.text import CountVectorizer


def stemmed_words(doc):
    return (ps.stem(w) for w in analyzer(doc))


In [15]:
analyzer = CountVectorizer().build_analyzer()
ps = PorterStemmer()
stem_analyzer = CountVectorizer(analyzer=stemmed_words)

# Vectorize and Stem corpus
X_train = stem_analyzer.fit_transform(train_data.iloc[:,1]).toarray()
X_test = stem_analyzer.transform(test_data.iloc[:,1]).toarray()

# Is it a spam or not
y_train = pd.get_dummies(train_data.iloc[:,0]).iloc[:,1].values
y_test = pd.get_dummies(test_data.iloc[:,0]).iloc[:,1].values

In [16]:
print(X_train.shape)
print(X_test.shape)

(4178, 6302)
(1391, 6302)


In [23]:
tuned_parameters = [{'alpha': [0.001, 0.01, 0.05, 0.1, 0.5, 1, 3, 5], 'fit_prior': [True, False]}]


clf = GridSearchCV(
    MultinomialNB(), tuned_parameters, scoring='precision'
)
clf.fit(X_train, y_train)

print("Best parameters :\n")
print(clf.best_params_)

print("\nGrid scores [Mean, Parameters] :\n")

for mean, params in zip(clf.cv_results_['mean_test_score'], clf.cv_results_['params']):
    print(params, " : ", round(mean,3))

    
y_true, y_pred = y_test, clf.predict(X_test)
print("\nClassification Report :\n")
print(classification_report(y_true, y_pred))


Best parameters :

{'alpha': 5, 'fit_prior': True}

Grid scores [Mean, Parameters] :

{'alpha': 0.001, 'fit_prior': True}  :  0.934
{'alpha': 0.001, 'fit_prior': False}  :  0.885
{'alpha': 0.01, 'fit_prior': True}  :  0.928
{'alpha': 0.01, 'fit_prior': False}  :  0.873
{'alpha': 0.05, 'fit_prior': True}  :  0.912
{'alpha': 0.05, 'fit_prior': False}  :  0.844
{'alpha': 0.1, 'fit_prior': True}  :  0.899
{'alpha': 0.1, 'fit_prior': False}  :  0.826
{'alpha': 0.5, 'fit_prior': True}  :  0.904
{'alpha': 0.5, 'fit_prior': False}  :  0.813
{'alpha': 1, 'fit_prior': True}  :  0.93
{'alpha': 1, 'fit_prior': False}  :  0.817
{'alpha': 3, 'fit_prior': True}  :  0.969
{'alpha': 3, 'fit_prior': False}  :  0.898
{'alpha': 5, 'fit_prior': True}  :  0.982
{'alpha': 5, 'fit_prior': False}  :  0.911

Classification Report :

              precision    recall  f1-score   support

           0       0.97      1.00      0.99      1204
           1       0.99      0.82      0.89       187

    accuracy     

In [24]:
model = MultinomialNB(alpha=5, fit_prior=True)
model.fit(X_train, y_train)

MultinomialNB(alpha=5, class_prior=None, fit_prior=True)

In [25]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):
    
    stemmed_text = stem_analyzer.transform([pred_text]).toarray()
    
    result = "ham"
    
    if model.predict(stemmed_text) == 1:
        result = "spam"
    
    prediction = [pred_text, result]


    return (prediction)

pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

['how are you doing today?', 'ham']


In [26]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()


You passed the challenge. Great job!
