In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split 
from sklearn import metrics

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/paulivanespiritu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#data set from kaggle: may not be fully english, but this data has distinction of which is spam and not --"ham"
messages = pd.read_csv('/Users/paulivanespiritu/Documents/COMMS3/FINALS/FINAL FINAL/spam.csv',encoding = 'latin-1')
messages.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["tag", "message"]

In [4]:
messages.head()#check if unncessary columns are removed | data preparation:

Unnamed: 0,tag,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
messages.describe()

Unnamed: 0,tag,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [6]:
messages.groupby('tag').describe().T #transposing for better view

Unnamed: 0,tag,ham,spam
message,count,4825,747
message,unique,4516,653
message,top,"Sorry, I'll call later",Please call our customer service representativ...
message,freq,30,4


In [7]:
messages['length'] = messages['message'].apply(len)
messages.head()

Unnamed: 0,tag,message,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [8]:
#check for the most common texts
messages['message'].value_counts().rename_axis(['message']).reset_index(name='counts').head()

Unnamed: 0,message,counts
0,"Sorry, I'll call later",30
1,I cant pick the phone right now. Pls send a me...,12
2,Ok...,10
3,7 wonders in My WORLD 7th You 6th Ur style 5th...,4
4,"Say this slowly.? GOD,I LOVE YOU &amp; I NEED ...",4


In [9]:
#data cleansing by removing unnecessary characters
def text_preprocess(mess):
    # Check characters to see if they are in punctuation
    nopunc = [char for char in mess if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    nopunc = nopunc.lower()
    
    # Now just remove any stopwords and non alphabets
    nostop=[word for word in nopunc.split() if word.lower() not in stopwords.words('english') and word.isalpha()]
    
    return nostop

In [10]:
spam_messages = messages[messages["tag"] == "spam"]["message"]
ham_messages = messages[messages["tag"] == "ham"]["message"]
print("No of spam messages : ",len(spam_messages))
print("No of ham messages : ",len(ham_messages))

No of spam messages :  747
No of ham messages :  4825


In [11]:
spam_words = text_preprocess(spam_messages)

In [12]:
messages["message"] = messages["message"].apply(text_preprocess) ##cleansing again

In [13]:
messages["message"] = messages["message"].agg(lambda x: ' '.join(map(str, x)))#converting to strings

In [14]:
messages["message"][7]#checking

'per request melle melle oru minnaminunginte nurungu vettam set callertune callers press copy friends callertune'

In [15]:
messages["message"][6]#checking

'even brother like speak treat like aids patent'

In [16]:
#using bag of words model, we can modify weights of messages, this is very useful in weighting recurring messages or words

In [17]:
vectorizer = CountVectorizer()
bow_transformer = vectorizer.fit(messages['message'])

print("20 Bag of Words (BOW) Features: \n")
print(vectorizer.get_feature_names_out()[20:40])

print("\nTotal number of vocab words : ",len(vectorizer.vocabulary_))

20 Bag of Words (BOW) Features: 

['absence' 'absolutely' 'abstract' 'abt' 'abta' 'aburo' 'abuse' 'abusers'
 'ac' 'academic' 'acc' 'accent' 'accenture' 'accept' 'access' 'accessible'
 'accidant' 'accident' 'accidentally' 'accommodation']

Total number of vocab words :  8084


In [18]:
messages_bow = bow_transformer.transform(messages['message'])

In [19]:
print('Shape of Sparse Matrix: ', messages_bow.shape)
print('Amount of Non-Zero occurences: ', messages_bow.nnz)

Shape of Sparse Matrix:  (5572, 8084)
Amount of Non-Zero occurences:  44211


In [20]:
#using TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer().fit(messages_bow)

In [21]:
messages_tfidf = tfidf_transformer.transform(messages_bow)
print(messages_tfidf.shape)

(5572, 8084)


In [22]:
messages["message"][:10]

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry wkly comp win fa cup final tkts may...
3                  u dun say early hor u c already say
4          nah dont think goes usf lives around though
5    freemsg hey darling weeks word back id like fu...
6       even brother like speak treat like aids patent
7    per request melle melle oru minnaminunginte nu...
8    winner valued network customer selected receiv...
9    mobile months u r entitled update latest colou...
Name: message, dtype: object

In [23]:
#processing data into a representation to use for ML
from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer(encoding = "latin-1", strip_accents = "unicode", stop_words = "english")
features = vec.fit_transform(messages["message"])
print(features.shape)

print(len(vec.vocabulary_))

(5572, 7927)
7927


In [24]:
msg_train, msg_test, tag_train, tag_test = \
train_test_split(messages_tfidf, messages['tag'], test_size=0.2)

In [25]:
print("train dataset features size : ",msg_train.shape)
print("train dataset tag size", tag_train.shape)

print("\n")

print("test dataset features size", msg_test.shape)
print("test dataset lable size", tag_test.shape)

train dataset features size :  (4457, 8084)
train dataset tag size (4457,)


test dataset features size (1115, 8084)
test dataset lable size (1115,)


In [26]:
#using naive bayes model 
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
spam_detect_model = clf.fit(msg_train, tag_train)

In [27]:
predict_train = spam_detect_model.predict(msg_train)

In [28]:
print("Classification Report \n",metrics.classification_report(tag_train, predict_train))
print("\n")
print("Confusion Matrix \n",metrics.confusion_matrix(tag_train, predict_train))
print("\n")
print("Accuracy of Train dataset : {0:0.3f}".format(metrics.accuracy_score(tag_train, predict_train)))

Classification Report 
               precision    recall  f1-score   support

         ham       0.97      1.00      0.98      3857
        spam       1.00      0.80      0.89       600

    accuracy                           0.97      4457
   macro avg       0.98      0.90      0.94      4457
weighted avg       0.97      0.97      0.97      4457



Confusion Matrix 
 [[3857    0]
 [ 122  478]]


Accuracy of Train dataset : 0.973


In [29]:
#evaluation of model
tag_predictions = spam_detect_model.predict(msg_test)
print(tag_predictions)

['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']


In [30]:
print(metrics.classification_report(tag_test, tag_predictions))
print(metrics.confusion_matrix(tag_test, tag_predictions))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       968
        spam       1.00      0.76      0.86       147

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

[[968   0]
 [ 35 112]]


In [31]:
print("Accuracy: {0:0.3f}".format(metrics.accuracy_score(tag_test, tag_predictions)))

Accuracy: 0.969
