# SPAM-HAM Detection

In [5]:
#importing libraries

import nltk
import pandas as pd
import random
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

import warnings
warnings.filterwarnings('ignore')

In [6]:
## Reading the given dataset: a tab-separated text file; the two column names
## ('label', 'message') are supplied explicitly via `names`
# NOTE(review): pandas' default quote handling can merge lines when a message
# contains an unbalanced double quote -- confirm the row count against the raw
# file (passing quoting=csv.QUOTE_NONE would disable quote processing).

spam = pd.read_csv("SMSSpamCollection.txt", sep = '\t', names = ['label', 'message'])

In [7]:
print(spam.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [12]:
## Converting the dataset into a list of tuples, one per row, each containing the message and its label
# Pair the two columns directly instead of going through DataFrame.iterrows(),
# which constructs a Series object for every row just to read two fields.
data_set = list(zip(spam['message'], spam['label']))

In [13]:
print(data_set[:5])

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'ham'), ('Ok lar... Joking wif u oni...', 'ham'), ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 'spam'), ('U dun say so early hor... U c already then say...', 'ham'), ("Nah I don't think he goes to usf, he lives around here though", 'ham')]


In [14]:
print(len(data_set))

5572


## Preprocessing

In [15]:
## initialise the inbuilt Stemmer and the Lemmatizer

stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

In [16]:
#changes document to lower case, removes stopwords and lemmatizes/stems the remainder of the sentence

# Stop-word sets already loaded, keyed by language name.
_STOPWORDS_BY_LANGUAGE = {}


def _stopword_set(language="english"):
    """Return the NLTK stop words for `language` as a frozenset, loading them once."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def preprocess(document, stem=True):
    """Normalise one message into a space-joined string of processed tokens.

    The text is lower-cased, tokenised with ``word_tokenize``, stripped of
    English stop words, and each remaining token is either stemmed or
    lemmatised.

    Parameters
    ----------
    document : str
        Raw message text.
    stem : bool, default True
        If True, apply the Porter stemmer to every token; otherwise lemmatise
        every token as a verb (``pos='v'``) with the WordNet lemmatizer.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    # Fetch the stop words once as a set: the previous version called
    # stopwords.words("english") again for every token and then did a linear
    # search through the resulting list.
    stop_words = _stopword_set("english")

    # lower-case, tokenise, and drop stop words
    tokens = [token for token in word_tokenize(document.lower()) if token not in stop_words]

    if stem:
        tokens = [stemmer.stem(token) for token in tokens]
    else:
        tokens = [wordnet_lemmatizer.lemmatize(token, pos='v') for token in tokens]

    # join the tokens back into a single string
    return " ".join(tokens)

In [20]:
## - Performing the preprocessing steps on all messages

# One (tokens, label) pair per message: the message is preprocessed with
# lemmatisation (stem=False), split on whitespace, and only tokens longer
# than three characters are kept.
messages_set = [
    (
        [token.lower() for token in preprocess(text, stem = False).split() if len(token) > 3],
        tag,
    )
    for (text, tag) in data_set
]

In [21]:
print(messages_set[:5])

[(['jurong', 'point', 'crazy..', 'available', 'bugis', 'great', 'world', 'buffet', 'cine', 'amore'], 'ham'), (['joke'], 'ham'), (['free', 'entry', 'wkly', 'comp', 'final', 'tkts', '21st', '2005.', 'text', '87121', 'receive', 'entry', 'question', 'rate', 'apply', '08452810075over18'], 'spam'), (['early', 'already'], 'ham'), (['think', 'live', 'around', 'though'], 'ham')]


## Preparing to create features

In [22]:
## - creating a single list of all words in the entire dataset for feature list creation

def get_words_in_messages(messages):
    """Flatten the token lists of all (tokens, label) pairs into one list.

    Labels are ignored; tokens keep their original order and duplicates
    are preserved.
    """
    return [token for (tokens, _label) in messages for token in tokens]

In [25]:
## - creating a final feature list using an intuitive FreqDist, to eliminate all the duplicate words

def get_word_features(wordlist):
    """Return the distinct words of `wordlist`.

    The words are counted with ``nltk.FreqDist`` and the keys view of that
    frequency table is returned, so every word appears exactly once.
    """
    frequency_table = nltk.FreqDist(wordlist)
    return frequency_table.keys()

In [27]:
## - creating the word features (the vocabulary) for the entire dataset
# NOTE(review): messages_set still holds every message at this point, so the
# vocabulary also contains words that occur only in what later becomes the
# test split. Consider building it from the training messages only.

word_features = get_word_features(get_words_in_messages(messages_set))
print(len(word_features))

7483


## Preparing to create a train and test set

In [31]:
## - index at which the dataset is cut: the first 80% of messages go to training

sliceIndex = int(0.8 * len(messages_set))

In [29]:
## - shuffle the messages to get a random and unbiased split of the dataset

# Seed the generator first so the shuffle -- and with it the train/test split
# and every accuracy figure computed from it -- is identical on each
# Restart & Run All.
random.seed(42)
random.shuffle(messages_set)

In [32]:
# Everything before sliceIndex is used for training, the remainder for testing.
train_messages = messages_set[:sliceIndex]
test_messages = messages_set[sliceIndex:]

In [33]:
len(train_messages)

4457

In [34]:
len(test_messages)

1115

## Preparing to create feature maps for train and test data

In [35]:
## mapping one tokenised message to a dict of boolean "contains(<word>)" flags,
## one entry for each word in the global word_features vocabulary

def extract_features(document):
    """Build the feature dict for a single message.

    `document` is an iterable of tokens. The result has one key
    ``'contains(<word>)'`` per word in the module-level ``word_features``,
    whose value is True when that word occurs in `document` and False
    otherwise.
    """
    present = set(document)
    return {'contains(%s)' % term: (term in present) for term in word_features}

In [36]:
## - creating the feature map of train and test data

training_set = nltk.classify.apply_features(extract_features, train_messages)
testing_set = nltk.classify.apply_features(extract_features, test_messages)

In [37]:
print('Training set size : ', len(training_set))
print('Test set size : ', len(testing_set))

Training set size :  4457
Test set size :  1115


## Training

In [39]:
## Training the classifier with NaiveBayes algorithm

spamclassifier = nltk.NaiveBayesClassifier.train(training_set)

## Evaluation

In [42]:
## - Analyzing the accuracy of the train set

print(nltk.classify.accuracy(spamclassifier, training_set))

0.9914740857078752


In [44]:
## Analyzing the accuracy of the test set

print(nltk.classify.accuracy(spamclassifier, testing_set))

0.9811659192825112


In [46]:
## Printing the most informative features in the classifier
# show_most_informative_features() writes the table to stdout itself and
# returns None, so wrapping it in print() only appends a stray "None" line.
spamclassifier.show_most_informative_features(50)

Most Informative Features
         contains(award) = True             spam : ham    =    207.7 : 1.0
      contains(landline) = True             spam : ham    =    137.7 : 1.0
          contains(code) = True             spam : ham    =    102.7 : 1.0
        contains(camera) = True             spam : ham    =     98.4 : 1.0
        contains(urgent) = True             spam : ham    =     94.6 : 1.0
       contains(service) = True             spam : ham    =     92.5 : 1.0
         contains(await) = True             spam : ham    =     89.6 : 1.0
          contains(rate) = True             spam : ham    =     85.2 : 1.0
         contains(video) = True             spam : ham    =     85.2 : 1.0
         contains(nokia) = True             spam : ham    =     68.4 : 1.0
       contains(private) = True             spam : ham    =     67.8 : 1.0
          contains(draw) = True             spam : ham    =     64.6 : 1.0
       contains(voucher) = True             spam : ham    =     63.4 : 1.0

In [48]:
input_msg = input()
# Run the typed text through the same pipeline that produced messages_set
# (lower-casing, stop-word removal, verb lemmatisation, dropping tokens of
# three characters or fewer). A bare .split() leaves upper-case, inflected and
# punctuation-attached words that can never match a trained "contains(...)"
# feature.
input_tokens = [token.lower() for token in preprocess(input_msg, stem = False).split() if len(token) > 3]
print('Classification result : ', spamclassifier.classify(extract_features(input_tokens)))

'CONGRATULATIONS!! As a valued account holder you have been selected to receive a £900 prize reward! Valid 12 hours only.'
Classification result :  spam
