In [264]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
import sklearn
nltk.download('stopwords')
nltk.download('wordnet')
import gensim
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import VotingClassifier

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
data = pd.read_table('SMSSpamCollection', header=None, encoding='utf-8')

In [229]:
data

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [230]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5572 non-null   object
 1   1       5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [231]:
#Check the number of ham and spam messages
no_of_ham=0
no_of_spam=0
for i in range(len(data)):
  if(data[0][i] == 'ham'):
    no_of_ham+=1
  else:
    no_of_spam+=1
print('ham', no_of_ham)
print('spam', no_of_spam)

ham 4825
spam 747


In [232]:
from sklearn.preprocessing import LabelEncoder

# convert ham to 0 and spam to 1
encoder = LabelEncoder()
Y = encoder.fit_transform(data[0])

Y

array([0, 0, 1, ..., 0, 0, 0])

In [233]:
messages = data[1]
messages

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: 1, Length: 5572, dtype: object

In [0]:
def preprocess_regex(text):
  """Replace some fixed patterns with single word. """

  # Replacing the email addresses with string 'email'
  text = text.str.replace('^([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})$', 'email')

  # Replacing the URLs with 'url'
  text = text.str.replace('^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'url')
      
  # Replacing the phone numbers with 'phone'
  text = text.str.replace('^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phone')
      
  # Replacing the numbers with 'number'
  text = text.str.replace(r'\d+(\.\d+)?', 'number')

  return text

In [0]:
messages = preprocess_regex(messages)

In [0]:
#Data Cleaning round 1
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('\w*\f\w*', '', text)
    text = re.sub('\(.*?\)', '', text)
    text = re.sub('\[.*]\)', '', text)
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [0]:
messages = messages.apply(clean_text_round1)

In [0]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\t', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [0]:
messages = messages.apply(clean_text_round2)

In [240]:
messages

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in number a wkly comp to win fa cup...
3             u dun say so early hor u c already then say
4       nah i dont think he goes to usf he lives aroun...
                              ...                        
5567    this is the numbernd time we have tried number...
5568                  will ü b going to esplanade fr home
5569    pity  was in mood for that soany other suggest...
5570    the guy did some bitching but i acted like id ...
5571                            rofl its true to its name
Name: 1, Length: 5572, dtype: object

In [0]:
from nltk.corpus import stopwords
def remove_stopwords(text):
  stop_words = set(stopwords.words('english'))

  text = text.apply(lambda x: ' '.join(
      term for term in x.split() if term not in stop_words))
  
  return text

In [0]:
messages = remove_stopwords(messages)

In [0]:
from nltk.corpus import wordnet
#lemmatizing using wordnet lemmatizer
lemmatizer = WordNetLemmatizer()
messages = messages.apply(lambda x: ' '.join(lemmatizer.lemmatize(term) for term in x.split()))

In [0]:
from collections import Counter
#Creating Bag of Words for the messages data
def count_words(text):
  for word in text:
    words = word_tokenize(word)
    for w in words:
        all_words.append(w)
  word_counts = Counter(all_words)
  return word_counts

In [0]:
word_counts = count_words(messages)

In [246]:
# printing 10 most common words
word_counts.most_common(10)

[('number', 14118),
 ('u', 7173),
 ('call', 3612),
 ('im', 2820),
 ('get', 2370),
 ('ur', 2340),
 ('go', 1842),
 ('dont', 1752),
 ('ok', 1668),
 ('free', 1650)]

In [0]:
#Using the 2000 most common words as features.
w_features = list(word_counts.keys())[:2000]

In [0]:
# The find_features function will determine which of the 1500 word features are contained in the review.
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in w_features:
        features[word] = (word in words)

    return features

In [0]:
features = list(zip(messages, Y))
seed = 1
np.random.seed = seed
np.random.shuffle(features)
feature_set=[]
for (x,y) in features:
  feature_set.append((find_features(x), y))

In [0]:
# spliting feature set into training data and testing data using scikit-learn
from sklearn import model_selection
train_data, test_data = model_selection.train_test_split(feature_set, test_size = 0.25, random_state=seed)

In [258]:
len(train_data)

4179

In [259]:
len(test_data)

1393

In [267]:
# Defining all the models to train
model_name = ["K Nearest Neighbors", "Decision Tree", "SGD Classifier", "Naive Bayes", "SVM Linear"]

model_classifier = [ KNeighborsClassifier(), DecisionTreeClassifier(), SGDClassifier(max_iter = 100), MultinomialNB(), SVC(kernel = 'linear')]

all_models = list(zip(model_name, model_classifier))

for model_name, model_classifier in all_models:
    nltk_model = SklearnClassifier(model_classifier)
    nltk_model.train(train_data)
    accuracy = nltk.classify.accuracy(nltk_model, test_data)*100
    print("{} Accuracy: {}".format(model_name, accuracy))

K Nearest Neighbors Accuracy: 94.25699928212491
Decision Tree Accuracy: 96.91313711414213
SGD Classifier Accuracy: 98.06173725771716
Naive Bayes Accuracy: 97.70279971284997
SVM Linear Accuracy: 98.49246231155779


In [269]:
# Voting classifier to find the best of all models.

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = all_models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(train_data)
accuracy = nltk.classify.accuracy(nltk_model, test_data)*100
print("Voting Classifier Accuracy: {}".format(accuracy))



Voting Classifier Accuracy: 98.49246231155779


In [0]:
# make class label prediction for testing set
txtfeatures, labels = zip(*test_data)
prediction = nltk_ensemble.classify_many(txtfeatures)

In [272]:
# print a classification report
print(classification_report(labels, prediction))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1221
           1       0.99      0.92      0.95       172

    accuracy                           0.99      1393
   macro avg       0.99      0.96      0.97      1393
weighted avg       0.99      0.99      0.99      1393



In [273]:
# print a confusion matrix
confusionmatrix= pd.DataFrame(confusion_matrix(labels, prediction), index = [['actual', 'actual'], ['ham', 'spam']], columns = [['predicted', 'predicted'], ['ham', 'spam']])
confusionmatrix

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1219,2
actual,spam,14,158
