In [None]:
from google.colab import drive;drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import sys
import nltk
import sklearn
import pandas as pd 
import numpy as np
import re

nltk.download('stopwords');nltk.download('punkt')
print('Python: {}'.format(sys.version))
print('NLTK: {}'.format(nltk.__version__))
print('Scikit-learn: {}'.format(sklearn.__version__))
print('Pandas: {}'.format(pd.__version__))
print('Numpy: {}'.format(np.__version__))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Python: 3.6.9 (default, Jul 17 2020, 12:50:27) 
[GCC 8.4.0]
NLTK: 3.2.5
Scikit-learn: 0.22.2.post1
Pandas: 1.1.3
Numpy: 1.18.5


In [None]:
df = pd.read_table("/content/drive/My Drive/SMSSpamCollection", header = None , encoding = "utf-8")
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
df.columns = ["label","sentence"]

In [None]:
df.shape , (df['label'] == "ham").sum()

((5572, 2), 4825)

In [None]:
df["label"].replace({"ham":0,
                 "spam":1},inplace = True)
df.head(n=10)

Unnamed: 0,label,sentence
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


In [None]:
def clean(text_messages):

    processed = re.sub(r'^.+@[^\.].*\.[a-z]{2,}$',
                                    'emailaddress',text_messages)

    # Replace URLs with 'webaddress'
    processed = re.sub(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                      'webaddress',processed)

    # Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
    processed = re.sub(r'£|\$', 'moneysymb',processed)
        
    # Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
    processed = re.sub(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                      'phonenumbr',processed)
        
    # Replace numbers with 'numbr'
    processed = re.sub(r'\d+(\.\d+)?', 'numbr',processed)




    # Remove punctuation
    processed = re.sub(r'[^\w\d\s]', ' ',processed)

    # Replace whitespace between terms with a single space
    processed = re.sub(r'\s+', ' ',processed)

    # Remove leading and trailing whitespace
    processed = re.sub(r'^\s+|\s+?$', '',processed)

    processed = processed.lower()

    return processed

In [None]:
df["sentence"] = df["sentence"].apply(clean)
df["sentence"][:10]

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in numbr a wkly comp to win fa cup ...
3          u dun say so early hor u c already then say
4    nah i don t think he goes to usf he lives arou...
5    freemsg hey there darling it s been numbr week...
6    even my brother is not like to speak with me t...
7    as per your request melle melle oru minnaminun...
8    winner as a valued network customer you have b...
9    had your mobile numbr months or more u r entit...
Name: sentence, dtype: object

In [None]:
from nltk.corpus import stopwords

stopwordsObj = set(stopwords.words("english"))

df["sentence"] = df["sentence"].apply(lambda sentence: " ".join(word for word in sentence.split(" ") if word not in stopwordsObj))
df["sentence"].head()

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry numbr wkly comp win fa cup final tk...
3                  u dun say early hor u c already say
4               nah think goes usf lives around though
Name: sentence, dtype: object

In [None]:
porterStemmer = nltk.PorterStemmer()
df["sentence"] = df["sentence"].apply(lambda sentence: " ".join(porterStemmer.stem(word) for word in sentence.split(" ")) )

df["sentence"].head()

0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri numbr wkli comp win fa cup final tk...
3                  u dun say earli hor u c alreadi say
4                 nah think goe usf live around though
Name: sentence, dtype: object

In [None]:
from nltk.tokenize import word_tokenize

corpus = []

for message in df["sentence"]:
  words = word_tokenize(message)
  for word in words:
    corpus.append(word)

corpus = nltk.FreqDist(corpus)
corpus

FreqDist({'go': 456,
          'jurong': 1,
          'point': 33,
          'crazi': 15,
          'avail': 18,
          'bugi': 7,
          'n': 155,
          'great': 116,
          'world': 40,
          'la': 7,
          'e': 96,
          'buffet': 2,
          'cine': 7,
          'got': 252,
          'amor': 1,
          'wat': 113,
          'ok': 293,
          'lar': 38,
          'joke': 17,
          'wif': 27,
          'u': 1207,
          'oni': 4,
          'free': 284,
          'entri': 26,
          'numbr': 2648,
          'wkli': 14,
          'comp': 12,
          'win': 84,
          'fa': 4,
          'cup': 9,
          'final': 33,
          'tkt': 4,
          'numbrst': 41,
          'may': 52,
          'text': 231,
          'receiv': 46,
          'question': 36,
          'std': 11,
          'txt': 190,
          'rate': 42,
          'c': 121,
          'appli': 34,
          'numbrovernumbr': 2,
          'dun': 55,
          'say': 140,
       

In [None]:
print(len(corpus))

6579


In [None]:
word_features = list(corpus.keys())[:1500]

In [None]:
def find_features(message):
  words = word_tokenize(message)
  features = {}
  for word in word_features:
    features[word] = word in words
  
  return features

features = find_features(df["sentence"].loc[5])
for key, value in features.items():
    if value == True:
        print (key)

ok
numbr
std
freemsg
hey
darl
week
word
back
like
fun
still
tb
xxx
chg
send
moneysymbnumbr
rcv


In [None]:
messages = zip(df["sentence"],df["label"])
np.random.seed(1)

featuresets = [(find_features(text),label) for (text,label) in messages] 
featuresets[1]

({'go': False,
  'jurong': False,
  'point': False,
  'crazi': False,
  'avail': False,
  'bugi': False,
  'n': False,
  'great': False,
  'world': False,
  'la': False,
  'e': False,
  'buffet': False,
  'cine': False,
  'got': False,
  'amor': False,
  'wat': False,
  'ok': True,
  'lar': True,
  'joke': True,
  'wif': True,
  'u': True,
  'oni': True,
  'free': False,
  'entri': False,
  'numbr': False,
  'wkli': False,
  'comp': False,
  'win': False,
  'fa': False,
  'cup': False,
  'final': False,
  'tkt': False,
  'numbrst': False,
  'may': False,
  'text': False,
  'receiv': False,
  'question': False,
  'std': False,
  'txt': False,
  'rate': False,
  'c': False,
  'appli': False,
  'numbrovernumbr': False,
  'dun': False,
  'say': False,
  'earli': False,
  'hor': False,
  'alreadi': False,
  'nah': False,
  'think': False,
  'goe': False,
  'usf': False,
  'live': False,
  'around': False,
  'though': False,
  'freemsg': False,
  'hey': False,
  'darl': False,
  'week': Fals

In [None]:
from sklearn.model_selection import train_test_split
training,testing = train_test_split(featuresets,test_size = 0.05,random_state = 1)

print(len(training), len(testing))

5293 279


In [113]:
from nltk import SklearnClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))

K Nearest Neighbors Accuracy: 95.3405017921147
Decision Tree Accuracy: 97.1326164874552
Random Forest Accuracy: 97.1326164874552
Logistic Regression Accuracy: 98.2078853046595
SGD Classifier Accuracy: 98.56630824372759
Naive Bayes Accuracy: 98.2078853046595
SVM Linear Accuracy: 97.84946236559139


In [None]:
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))
nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

Voting Classifier: Accuracy: 97.84946236559139


In [None]:
txt_features, labels = zip(*testing)

prediction = nltk_ensemble.classify_many(txt_features)

print(classification_report(labels, prediction))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       243
           1       0.89      0.92      0.90        36

    accuracy                           0.97       279
   macro avg       0.94      0.95      0.94       279
weighted avg       0.98      0.97      0.98       279

