In [15]:
import numpy as np
import pandas as pd
import io, pkgutil

In [22]:
df = pd.read_csv('../combined-csv-files.csv', encoding='latin1')

In [24]:
df.head()

Unnamed: 0,S. No.,Message_body,Label
0,1,"UpgrdCentre Orange customer, you may now claim...",Spam
1,2,"Loan for any purpose £500 - £75,000. Homeowner...",Spam
2,3,Congrats! Nokia 3650 video camera phone is you...,Spam
3,4,URGENT! Your Mobile number has been awarded wi...,Spam
4,5,Someone has contacted our dating service and e...,Spam


In [28]:
df.drop('S. No.', axis=1, inplace= True)

In [34]:
# Text Pre-processing
import string
from nltk.corpus import stopwords

In [32]:
def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    # Check characters to see if they are in punctuation
    nopunc = [char for char in mess if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Now just remove any stopwords
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [35]:
# Vectorization
from sklearn.feature_extraction.text import CountVectorizer

In [37]:
bow_transformer = CountVectorizer(analyzer=text_process).fit(df['Message_body'])

# Print total number of vocab words
print(len(bow_transformer.vocabulary_))

4232


In [38]:
messages_bow = bow_transformer.transform(df['Message_body'])

In [39]:
print('Shape of Sparse Matrix: ', messages_bow.shape)
print('Amount of Non-Zero occurences: ', messages_bow.nnz)

Shape of Sparse Matrix:  (1082, 4232)
Amount of Non-Zero occurences:  10188


In [40]:
sparsity = (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1]))
print('sparsity: {}'.format(round(sparsity)))

sparsity: 0


In [41]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer

In [42]:
tfidf_transformer = TfidfTransformer().fit(messages_bow)

In [44]:
messages_tfidf = tfidf_transformer.transform(messages_bow)
print(messages_tfidf.shape)

(1082, 4232)


In [45]:
# Training the model
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(messages_tfidf, df['Label'])

In [47]:
# model evaluation
all_predictions = spam_detect_model.predict(messages_tfidf)
print(all_predictions)

['Spam' 'Spam' 'Spam' ... 'Non-Spam' 'Non-Spam' 'Non-Spam']


In [48]:
from sklearn.metrics import classification_report
print (classification_report(df['Label'], all_predictions))

              precision    recall  f1-score   support

    Non-Spam       0.97      1.00      0.98       884
        Spam       1.00      0.84      0.92       198

    accuracy                           0.97      1082
   macro avg       0.98      0.92      0.95      1082
weighted avg       0.97      0.97      0.97      1082



In [50]:
# Train Test Split
from sklearn.model_selection import train_test_split

msg_train, msg_test, label_train, label_test = train_test_split(df['Message_body'], df['Label'], test_size=0.2)

print(len(msg_train), len(msg_test), len(msg_train) + len(msg_test))

865 217 1082


In [51]:
# Creating a pipeilne
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [52]:
pipeline.fit(msg_train,label_train)

Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x00000131EA67E670>)),
                ('tfidf', TfidfTransformer()),
                ('classifier', MultinomialNB())])

In [53]:
predictions = pipeline.predict(msg_test)

In [54]:
print(classification_report(predictions,label_test))

              precision    recall  f1-score   support

    Non-Spam       1.00      0.90      0.95       189
        Spam       0.61      1.00      0.76        28

    accuracy                           0.92       217
   macro avg       0.80      0.95      0.85       217
weighted avg       0.95      0.92      0.93       217

