In [4]:
import sys
import pandas as pd
import numpy as np
import nltk
import sklearn
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\purvi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\purvi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\purvi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
#Loading the sms dataset
df = pd.read_csv('SMSSPamCollection.tsv.txt', sep='\t', names=["Type", "sms"])
print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
Type    5572 non-null object
sms     5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB
None
   Type                                                sms
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [6]:
#Checking class distribution
#print(df.describe())
print(df.describe(include='all'))
print('--------------')
type = df['Type']
print(df['Type'].value_counts())


        Type                     sms
count   5572                    5572
unique     2                    5169
top      ham  Sorry, I'll call later
freq    4825                      30
--------------
ham     4825
spam     747
Name: Type, dtype: int64


In [7]:
#Preprocessing data

from sklearn.preprocessing import LabelEncoder
#Converting ham and spam values to 0 and 1 
encoder = LabelEncoder()
Y = encoder.fit_transform(type)
print(Y[:10])


[0 0 1 0 0 1 0 0 1 1]


In [8]:
# Storing sms data in another dataframe

messages = df['sms']
print(messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: sms, dtype: object


In [9]:
#PREPROCESSING

# Need to replace email addresses, URLs, numbers, etc in the text 
# so it can have some meaning rather than being a separate instance
# of itself

#will do this regular expressions
# http://regexlib.com

#Replacing email ids with 'emailid'
processed = messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailid')

#Replacing URLs with 'webaddr'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddr')

#Replacing money symbols with 'moneysymbol' 
processed = processed.str.replace(r'£|\$|\₹', 'moneysymb')

#Replacing 10digit phone numbers 
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phonenum')

#Replacing any numbers (digits)
processed = processed.str.replace(r'\d+(\.\d+)?', 'num')

In [10]:
#Removing punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

#Removing leading and trailing whitespace in a line of sms
processed = processed.str.replace(r'^\s+|\s+?$', '')

#Replacing whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

In [11]:
#Change all words to lower case 
processed = processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in num a wkly comp to win fa cup fi...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
5       freemsg hey there darling it s been num week s...
6       even my brother is not like to speak with me t...
7       as per your request melle melle oru minnaminun...
8       winner as a valued network customer you have b...
9       had your mobile num months or more u r entitle...
10      i m gonna be home soon and i don t want to tal...
11      six chances to win cash from num to num num po...
12      urgent you have won a num week free membership...
13      i ve been searching for the right words to tha...
14                      i have a date on sunday with will
15      xxxmobilemovieclub to use your credit click th...
16                                 oh k i m watching here
17      eh u r

In [12]:
#Removing stop words from the corpus data

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = stopwords.words('english')


processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

# #tokenizing words, easier to remove stop words
# words = []
# cleaned = []  #cleaned after removing stop words 
# for m in processed:
#     k = word_tokenize(m)
#     #print(k)
#     for w in k:
#         words.append(w)
        
# for word in words:
#     if word not in stop_words:
#         cleaned.append(word)

# print(cleaned)

In [13]:
#Stemming the words to further utilize this data to the fullest

ps = nltk.PorterStemmer()
processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))
# stemmed = []

# for word in cleaned:
#     word = ps.stem(word)
#     stemmed.append(word)
    
print(processed)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri num wkli comp win fa cup final tkt ...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
5       freemsg hey darl num week word back like fun s...
6           even brother like speak treat like aid patent
7       per request mell mell oru minnaminungint nurun...
8       winner valu network custom select receivea mon...
9       mobil num month u r entitl updat latest colour...
10      gonna home soon want talk stuff anymor tonight...
11      six chanc win cash num num num pound txt cshnu...
12      urgent num week free membership moneysymbnum n...
13      search right word thank breather promis wont t...
14                                            date sunday
15      xxxmobilemovieclub use credit click wap link n...
16                                             oh k watch
17      eh u r

In [14]:
words = []
for m in processed:
     k = word_tokenize(m)
     #print(k)
     for w in k:
        words.append(w)
words = nltk.FreqDist(words) #to get the freq distribution of words
print(words) 

print('Number of words:' , len(words))  #length of words
print('Most frequent words:',words.most_common(10)) #most frequently occuring words (top 10)
      

<FreqDist with 6577 samples and 53376 outcomes>
Number of words: 6577
Most frequent words: [('num', 2654), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbnum', 303)]


In [15]:
#How to do feature selection here
#How to determine which words will be useful for differentiating betweem Spam and Ham
#Can use 1000 most frequent words as features


feature_words = list(words.keys())[:1000]

In [16]:
#This method will find which of the 1000 word features are contained in messages

def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in feature_words:
        features[word] = (word in words)
    return features


features = find_features(processed[0])
# print(features)
# for key, value in features.items():
#     if value == True:
#         print(key)

messages = list(zip(processed, Y))
#Each sms text message will have the 1000 features to itself which will 
#tell if those 1000 features occured in the message or not (true or false)


In [17]:
# #Defining seed for reproducibility
seed = 1
np.random.seed = seed
np.random.shuffle(messages)
print(messages)

[('mm kanji dont eat anyth heavi ok', 0), ('ah well confus thing', 0), ('prob hon u doinat mo', 0), ('gift tri get throw cliff someth', 0), ('prize go anoth custom c www c biz num nump min polo ltd suit num london wnumj numhl pleas call back busi', 1), ('eh ur laptop got stock lei say mon muz come take look c got', 0), ('shall call dear food', 0), ('much get', 0), ('chang window logoff sound', 0), ('hen night go swing', 0), ('wonder other took', 0), ('u dogbreath sound like jan c al', 0), ('pl make note expos also find school anyon els vomit dog cat hous let know later', 0), ('probabl coupl hour top', 0), ('mean u could keep ur word', 0), ('see knew give break time woul lead alway want miss curfew gonna gibe til one midnight movi gonna get til num need come home need getsleep anyth need b studdi ear train', 0), ('pleas call custom servic repres freephon num num num numam numpm guarante moneysymbnum cash moneysymbnum prize', 1), ('ye think offic lap room think that last day didnt shut',

In [18]:
#Calling find_features function for each SMS message
featuresets = [(find_features(text), label) for (text, label) in messages]   
# print(featuresets)

# print(featuresets)





In [19]:
#Splitting featuresets into training and testing datasetwsw using sklearn

from sklearn import model_selection
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)


In [20]:
print('Training set:',len(training))
print('Testing set:', len(testing))

Training set: 4179
Testing set: 1393


In [21]:
#Using sklearn algorithms 
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC


model = SklearnClassifier(SVC(kernel = 'linear'))

#Training the model on the training data
model.train(training)

#Testing on the testing data
accuracy = nltk.classify.accuracy(model, testing)*100
print("SVC Accuracy: {}".format(accuracy))

SVC Accuracy: 98.34888729361091


In [33]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

#Defining models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier","Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))
# print(models)

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    print(nltk_model)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
#     print("{} Accuracy: {}".format(name, accuracy))

<SklearnClassifier(KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))>
<SklearnClassifier(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))>




<SklearnClassifier(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))>




<SklearnClassifier(LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))>




<SklearnClassifier(SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=100,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))>
<SklearnClassifier(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))>
<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))>


Ensemble method is a machine learning technique that combines several base models in order to produce one optimal predictive model.

In [34]:
#Not satisfied with the accuracy produced by the classifiers used, will be using voting classifiers to further decide on the best classifier
#Ensemble methods - Voting classifier

from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier","Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting='hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("Voting Classifier's accuracy:", accuracy)

Voting Classifier's accuracy: 98.34888729361091


In [36]:
#Making class label predictions for testing dataset

text_features, labels = list(zip(*testing))
prediction = nltk_ensemble.classify_many(text_features)

In [38]:
#Printing a confusion matrix and a classification report

print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['ham', 'spam']],
    columns = [['predicted', 'predicted'], ['ham', 'spam']])

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1208
           1       0.99      0.89      0.94       185

   micro avg       0.98      0.98      0.98      1393
   macro avg       0.99      0.94      0.96      1393
weighted avg       0.98      0.98      0.98      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1207,1
actual,spam,21,164


In [None]:
#THE END ------------ 