In [1]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# English stop words handed to the vectorisers below, kept in their original
# order. The last four entries ('let', 'll', 're', 've') are bare fragments
# rather than words; presumably they cover the pieces of contractions such as
# "let's" or "we'll" after tokenisation -- confirm.
stopwords = """
a about above after again against all am an and any are as at be
because been before being below between both but by could did
do does doing down during each few for from further
had has have having he he'd he'll he's her here
here's hers herself him himself his how how's i
i'd i'll i'm i've if in into is it it's its
itself let's me more most my myself nor of on
once only or other ought our ours ourselves out
over own same she she'd she'll she's should so
some such than that that's the their theirs them
themselves then there there's these they they'd
they'll they're they've this those through to too
under until up very was we we'd we'll we're
we've were what what's when when's where where's
which while who who's whom why why's with would
you you'd you'll you're you've your yours yourself
yourselves let ll re ve
""".split()

In [4]:
# Load the SMS corpus. read_csv already returns a DataFrame, so there is no
# need to wrap it in pd.DataFrame again; `df` is kept as a second name for the
# same frame in case other cells refer to it.
datasets = pd.read_csv('spam2.csv')
df = datasets
# Select the label column ('v1': ham/spam) and the message text ('v2') by name
# rather than by position, so a reordered or widened CSV cannot silently feed
# the wrong columns into the vectoriser.
dataset = df[['v1', 'v2']]
dataset

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will �_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
# Bag-of-words vectoriser with the default n-gram range (single words) that
# drops every token listed in `stopwords`.
vc_doc = CountVectorizer(
    stop_words=stopwords,
)
# Echo the estimator so its active configuration shows up in the cell output.
print(vc_doc)

CountVectorizer(stop_words=['a', 'about', 'above', 'after', 'again', 'against',
                            'all', 'am', 'an', 'and', 'any', 'are', 'as', 'at',
                            'be', 'because', 'been', 'before', 'being', 'below',
                            'between', 'both', 'but', 'by', 'could', 'did',
                            'do', 'does', 'doing', 'down', ...])


In [6]:
# Learn the vocabulary from every message and build the document-term matrix.
X = vc_doc.fit_transform(dataset['v2'])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installations;
# list() keeps the printed form identical to the old API's plain list.
if hasattr(vc_doc, 'get_feature_names_out'):
    feature_names = list(vc_doc.get_feature_names_out())
else:
    feature_names = vc_doc.get_feature_names()
print(feature_names)
# toarray() materialises the full dense matrix just for this preview.
print(X.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]




In [7]:
from sklearn.model_selection import train_test_split

# Class labels (ham / spam) that the classifiers will learn to predict.
newtarget = dataset['v1']

# Partition the unigram count matrix and the labels with a fixed seed.
# NOTE(review): test_size = 0.85 holds out 85% of the rows for testing and
# trains on the remaining 15% -- confirm this ratio is intended.
# NOTE(review): X was fitted on the whole corpus before this split, so the
# vocabulary has already seen the test messages.
data_train, data_test, target_train, target_test = train_test_split(
    X,
    newtarget,
    test_size=0.85,
    random_state=109,
)

1). Using Multinomial Naive Bayes Classifier

In [8]:
import numpy as np

# Train a multinomial Naive Bayes model on the unigram counts. fit() returns
# the fitted estimator, so construction and training are chained.
mnb = MultinomialNB().fit(data_train, target_train)

# Predict a label for every held-out message and preview the result.
target_pred = mnb.predict(data_test)
print(target_pred)

['ham' 'ham' 'ham' ... 'ham' 'ham' 'spam']


In [9]:
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score

# Score the unigram Naive Bayes predictions against the held-out labels.
# average=None reports precision/recall per class rather than one aggregate.
accuracy = metrics.accuracy_score(y_true=target_test, y_pred=target_pred)
precision = precision_score(
    y_true=target_test, y_pred=target_pred, average=None
)
recall = recall_score(
    y_true=target_test, y_pred=target_pred, average=None
)

print('Accuracy is:', accuracy)
print('Precision is:', precision)
print('Recall is:', recall)

Accuracy is: 0.9759341355288157
Precision is: [0.97843795 0.95737123]
Recall is: [0.99415774 0.85691574]


2). Using Decision Tree Classifier

In [10]:
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier

# Gini-impurity decision tree trained on the unigram counts.
# The tree is seeded because scikit-learn permutes the candidate features at
# every split; without random_state the fitted tree, and therefore the scores
# reported below, can change from one run to the next. 109 matches the seed
# used for the train/test split.
clf = DecisionTreeClassifier(criterion='gini', random_state=109)
clf = clf.fit(data_train, target_train)

# Predict a label for every held-out message and preview the result.
y_pred = clf.predict(data_test)
print(y_pred)

['ham' 'ham' 'ham' ... 'ham' 'ham' 'spam']


In [11]:
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score

# Score the unigram decision-tree predictions against the held-out labels.
# average=None reports precision/recall per class rather than one aggregate.
accuracy = metrics.accuracy_score(y_true=target_test, y_pred=y_pred)
precision = precision_score(
    y_true=target_test, y_pred=y_pred, average=None
)
recall = recall_score(
    y_true=target_test, y_pred=y_pred, average=None
)

print('Accuracy is:', accuracy)
print('Precision is:', precision)
print('Recall is:', recall)

Accuracy is: 0.9301245514038421
Precision is: [0.96664196 0.71594203]
Recall is: [0.95228822 0.78537361]


## Exercise: Using bigram matching instead of unigram matching

In [12]:
# Same stop-word filtering as the unigram vectoriser, but ngram_range=(2, 2)
# restricts the extracted features to two-word sequences only.
vc_doc_bigram = CountVectorizer(
    stop_words=stopwords,
    ngram_range=(2, 2),
)
# Echo the estimator so its active configuration shows up in the cell output.
print(vc_doc_bigram)

CountVectorizer(ngram_range=(2, 2),
                stop_words=['a', 'about', 'above', 'after', 'again', 'against',
                            'all', 'am', 'an', 'and', 'any', 'are', 'as', 'at',
                            'be', 'because', 'been', 'before', 'being', 'below',
                            'between', 'both', 'but', 'by', 'could', 'did',
                            'do', 'does', 'doing', 'down', ...])


In [13]:
# Learn the bigram vocabulary from every message and build the
# document-term matrix.
Y = vc_doc_bigram.fit_transform(dataset['v2'])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installations;
# list() keeps the printed form identical to the old API's plain list.
if hasattr(vc_doc_bigram, 'get_feature_names_out'):
    feature_names_bigram = list(vc_doc_bigram.get_feature_names_out())
else:
    feature_names_bigram = vc_doc_bigram.get_feature_names()
print(feature_names_bigram)
# toarray() materialises the full dense matrix just for this preview.
print(Y.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]




In [14]:
from sklearn.model_selection import train_test_split

# Class labels (ham / spam) that the classifiers will learn to predict.
newtarget = dataset['v1']

# Partition the bigram count matrix and the labels with a fixed seed.
# NOTE(review): test_size = 0.85 holds out 85% of the rows for testing and
# trains on the remaining 15% -- confirm this ratio is intended.
# NOTE(review): Y was fitted on the whole corpus before this split, so the
# vocabulary has already seen the test messages.
data_train1, data_test1, target_train1, target_test1 = train_test_split(
    Y,
    newtarget,
    test_size=0.85,
    random_state=109,
)

1). Using Multinomial Naive Bayes Classifier

In [15]:
import numpy as np

# Train a multinomial Naive Bayes model on the bigram counts. This rebinds
# `mnb`, replacing whatever model the name referred to before. fit() returns
# the fitted estimator, so construction and training are chained.
mnb = MultinomialNB().fit(data_train1, target_train1)

# Predict a label for every held-out message and preview the result.
target_pred1 = mnb.predict(data_test1)
print(target_pred1)

['ham' 'ham' 'ham' ... 'ham' 'ham' 'spam']


In [16]:
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score

# Score the bigram Naive Bayes predictions against the held-out labels.
# average=None reports precision/recall per class rather than one aggregate.
accuracy = metrics.accuracy_score(y_true=target_test1, y_pred=target_pred1)
precision = precision_score(
    y_true=target_test1, y_pred=target_pred1, average=None
)
recall = recall_score(
    y_true=target_test1, y_pred=target_pred1, average=None
)

print('Accuracy is:', accuracy)
print('Precision is:', precision)
print('Recall is:', recall)

Accuracy is: 0.9525015832805573
Precision is: [0.95650129 0.91735537]
Recall is: [0.9902629  0.70588235]


2). Using Decision Tree Classifier

In [17]:
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier

# Gini-impurity decision tree trained on the bigram counts.
# The tree is seeded because scikit-learn permutes the candidate features at
# every split; without random_state the fitted tree, and therefore the scores
# reported below, can change from one run to the next. 109 matches the seed
# used for the train/test split.
clf1 = DecisionTreeClassifier(criterion='gini', random_state=109)
clf1 = clf1.fit(data_train1, target_train1)

# Predict a label for every held-out message and preview the result.
y_pred1 = clf1.predict(data_test1)
print(y_pred1)

['ham' 'ham' 'ham' ... 'ham' 'ham' 'spam']


In [18]:
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score

# Score the bigram decision-tree predictions against the held-out labels.
# average=None reports precision/recall per class rather than one aggregate.
accuracy = metrics.accuracy_score(y_true=target_test1, y_pred=y_pred1)
precision = precision_score(
    y_true=target_test1, y_pred=y_pred1, average=None
)
recall = recall_score(
    y_true=target_test1, y_pred=y_pred1, average=None
)

print('Accuracy is:', accuracy)
print('Precision is:', precision)
print('Recall is:', recall)

Accuracy is: 0.9161916824994722
Precision is: [0.91445164 0.94615385]
Recall is: [0.99659202 0.39109698]
