In [None]:
import sys
import pandas as pd
import numpy as np
import nltk
import sklearn
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')



In [None]:
#Loading the sms dataset
df = pd.read_csv('SMSSPamCollection.tsv.txt', sep='\t', names=["Type", "sms"])
print(df.info())
print(df.head())


In [None]:
#Checking class distribution
#print(df.describe())
print(df.describe(include='all'))
print('--------------')
type = df['Type']
print(df['Type'].value_counts())


In [195]:
#Preprocessing data

from sklearn.preprocessing import LabelEncoder
#Converting ham and spam values to 0 and 1 
encoder = LabelEncoder()
Y = encoder.fit_transform(type)
print(Y[:10])


[0 0 1 0 0 1 0 0 1 1]


In [196]:
# Storing sms data in another dataframe

messages = df['sms']
print(messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: sms, dtype: object


In [None]:
#PREPROCESSING

# Need to replace email addresses, URLs, numbers, etc in the text 
# so it can have some meaning rather than being a separate instance
# of itself

#will do this regular expressions
# http://regexlib.com

#Replacing email ids with 'emailid'
processed = messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailid')

#Replacing URLs with 'webaddr'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddr')

#Replacing money symbols with 'moneysymbol' 
processed = processed.str.replace(r'£|\$|\₹', 'moneysymb')

#Replacing 10digit phone numbers 
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phonenum')

#Replacing any numbers (digits)
processed = processed.str.replace(r'\d+(\.\d+)?', 'num')

In [None]:
#Removing punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

#Removing leading and trailing whitespace in a line of sms
processed = processed.str.replace(r'^\s+|\s+?$', '')

#Replacing whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

In [197]:
#Change all words to lower case 
processed = processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in num a wkly comp to win fa cup fi...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
5       freemsg hey there darling it s been num week s...
6       even my brother is not like to speak with me t...
7       as per your request melle melle oru minnaminun...
8       winner as a valued network customer you have b...
9       had your mobile num months or more u r entitle...
10      i m gonna be home soon and i don t want to tal...
11      six chances to win cash from num to num num po...
12      urgent you have won a num week free membership...
13      i ve been searching for the right words to tha...
14                      i have a date on sunday with will
15      xxxmobilemovieclub to use your credit click th...
16                                 oh k i m watching here
17      eh u r

In [201]:
#Removing stop words from the corpus data

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = stopwords.words('english')


processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

# #tokenizing words, easier to remove stop words
# words = []
# cleaned = []  #cleaned after removing stop words 
# for m in processed:
#     k = word_tokenize(m)
#     #print(k)
#     for w in k:
#         words.append(w)
        
# for word in words:
#     if word not in stop_words:
#         cleaned.append(word)

# print(cleaned)

In [206]:
#Stemming the words to further utilize this data to the fullest

ps = nltk.PorterStemmer()
processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))
# stemmed = []

# for word in cleaned:
#     word = ps.stem(word)
#     stemmed.append(word)
    
print(processed)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri num wkli comp win fa cup final tkt ...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
5       freemsg hey darl num week word back like fun s...
6           even brother like speak treat like aid patent
7       per request mell mell oru minnaminungint nurun...
8       winner valu network custom select receivea mon...
9       mobil num month u r entitl updat latest colour...
10      gonna home soon want talk stuff anymor tonight...
11      six chanc win cash num num num pound txt cshnu...
12      urgent num week free membership moneysymbnum n...
13      search right word thank breather promi wont ta...
14                                            date sunday
15      xxxmobilemovieclub use credit click wap link n...
16                                             oh k watch
17      eh u r

In [208]:
words = []
for m in processed:
     k = word_tokenize(m)
     #print(k)
     for w in k:
        words.append(w)
words = nltk.FreqDist(words) #to get the freq distribution of words
print(words) 

print('Number of words:' , len(words))  #length of words
print('Most frequent words:',words.most_common(10)) #most frequently occuring words (top 10)
      

<FreqDist with 6552 samples and 53376 outcomes>
Number of words: 6552
Most frequent words: [('num', 2654), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbnum', 303)]


In [None]:
#How to do feature selection here
#How to determine which words will be useful for differentiating betweem Spam and Ham
#Can use 1000 most frequent words as features


feature_words = list(words.keys())[:1000]

In [233]:
#This method will find which of the 1000 word features are contained in messages

def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in feature_words:
        features[word] = (word in words)
    return features


features = find_features(processed[0])
# print(features)
# for key, value in features.items():
#     if value == True:
#         print(key)

messages = list(zip(processed, Y))
#Each sms text message will have the 1000 features to itself which will 
#tell if those 1000 features occured in the message or not (true or false)


In [237]:
# #Defining seed for reproducibility
seed = 1
np.random.seed = seed
np.random.shuffle(messages)
print(messages)

[('probabl eat today think gonna pop weekend u miss', 0), ('em olowoyey usc edu great time argentina sad secretari everyth bless', 0), ('ok ok take care understand', 0), ('ugh fuck resub eve', 0), ('oh k think wi nz player unsold', 0), ('take lt gt min', 0), ('nutter cutter ctter cttergg cttargg ctargg ctagg ie', 0), ('u call alter num ok', 0), ('know god creat gap finger one made come amp fill gap hold hand love', 0), ('call germani num penc per minut call fix line via access number num num num num prepay direct access', 1), ('u attend ur drive lesson mani time wk n day', 0), ('horribl u eat mac eat u forgot abt alreadi rite u take long num repli thk toot bnum b prepar wat shall eat', 0), ('lol know hey someon great inperson flea forum love', 0), ('ye fine', 0), ('mani time told stage use laugh listen aha', 0), ('understand loss gain work school', 0), ('haha kid papa need drug', 0), ('free msg rington http tm widel com index wml id numbnumanumecefnumffnum num first truenum num jul num

In [240]:
#Calling find_features function for each SMS message
featuresets = [(find_features(text), label) for (text, label) in messages]   
# print(featuresets)

# print(featuresets)





In [241]:
#Splitting featuresets into training and testing datasetwsw using sklearn

from sklearn import model_selection
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)


In [242]:
print(len(training))
print(len(testing))

4179
1393
