In [101]:
import sys
import pandas as pd
import numpy as np
import nltk
import sklearn
nltk.download('stopwords')
nltk.download('punkt')



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\purvi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\purvi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [100]:
#Loading the sms dataset
df = pd.read_csv('SMSSPamCollection.tsv.txt', sep='\t', names=["Type", "sms"])
print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
Type    5572 non-null object
sms     5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB
None
   Type                                                sms
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [82]:
#Checking class distribution
#print(df.describe())
print(df.describe(include='all'))
print('--------------')
type = df['Type']
print(df['Type'].value_counts())


        Type                     sms
count   5572                    5572
unique     2                    5169
top      ham  Sorry, I'll call later
freq    4825                      30
--------------
ham     4825
spam     747
Name: Type, dtype: int64


In [83]:
#Preprocessing data

from sklearn.preprocessing import LabelEncoder
#Converting ham and spam values to 0 and 1 
encoder = LabelEncoder()
Y = encoder.fit_transform(type)
print(Y[:10])


[0 0 1 0 0 1 0 0 1 1]


In [84]:
# Storing sms data in another dataframe

messages = df['sms']
print(messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: sms, dtype: object


In [85]:
#PREPROCESSING

# Need to replace email addresses, URLs, numbers, etc in the text 
# so it can have some meaning rather than being a separate instance
# of itself

#will do this regular expressions
# http://regexlib.com

#Replacing email ids with 'emailid'
processed = messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailid')

#Replacing URLs with 'webaddr'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddr')

#Replacing money symbols with 'moneysymbol' 
processed = processed.str.replace(r'£|\$|\₹', 'moneysymb')

#Replacing 10digit phone numbers 
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phonenum')

#Replacing any numbers (digits)
processed = processed.str.replace(r'\d+(\.\d+)?', 'num')

In [86]:
#Removing punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

#Removing leading and trailing whitespace in a line of sms
processed = processed.str.replace(r'^\s+|\s+?$', '')

#Replacing whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

In [87]:
#Change all words to lower case 
processed = processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in num a wkly comp to win fa cup fi...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
5       freemsg hey there darling it s been num week s...
6       even my brother is not like to speak with me t...
7       as per your request melle melle oru minnaminun...
8       winner as a valued network customer you have b...
9       had your mobile num months or more u r entitle...
10      i m gonna be home soon and i don t want to tal...
11      six chances to win cash from num to num num po...
12      urgent you have won a num week free membership...
13      i ve been searching for the right words to tha...
14                      i have a date on sunday with will
15      xxxmobilemovieclub to use your credit click th...
16                                 oh k i m watching here
17      eh u r

In [108]:
#Removing stop words from the corpus data

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = stopwords.words('english')

#tokenizing words, easier to remove stop words
words = []
cleaned = []  #cleaned after removing stop words 
for m in processed:
    k = word_tokenize(m)
    #print(k)
    for w in k:
        words.append(w)
        
for word in words:
    if word not in stop_words:
        cleaned.append(word)

#print(cleaned)

In [111]:
#Stemming the words to further utilize this data to the fullest

ps = nltk.PorterStemmer()

stemmed = []

for word in cleaned:
    word = ps.stem(word)
    stemmed.append(word)
    
#print(stemmed)

['go', 'jurong', 'point', 'crazi', 'avail', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amor', 'wat', 'ok', 'lar', 'joke', 'wif', 'u', 'oni', 'free', 'entri', 'num', 'wkli', 'comp', 'win', 'fa', 'cup', 'final', 'tkt', 'numst', 'may', 'num', 'text', 'fa', 'num', 'receiv', 'entri', 'question', 'std', 'txt', 'rate', 'c', 'appli', 'numovernum', 'u', 'dun', 'say', 'earli', 'hor', 'u', 'c', 'alreadi', 'say', 'nah', 'think', 'goe', 'usf', 'live', 'around', 'though', 'freemsg', 'hey', 'darl', 'num', 'week', 'word', 'back', 'like', 'fun', 'still', 'tb', 'ok', 'xxx', 'std', 'chg', 'send', 'moneysymbnum', 'rcv', 'even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent', 'per', 'request', 'mell', 'mell', 'oru', 'minnaminungint', 'nurungu', 'vettam', 'set', 'callertun', 'caller', 'press', 'num', 'copi', 'friend', 'callertun', 'winner', 'valu', 'network', 'custom', 'select', 'receivea', 'moneysymbnum', 'prize', 'reward', 'claim', 'call', 'num', 'claim', 'code', 'kl

In [122]:
words = []
words = nltk.FreqDist(cleaned) #to get the freq distribution of words
print(words) #words with their frequency 
print('Number of words:' , len(words))  #length of words
print('Most frequent words:',words.most_common(10)) #most frequently occuring words (top 10)
      

<FreqDist with 7909 samples and 53366 outcomes>
Number of words: 7909
Most frequent words: [('num', 2648), ('u', 1207), ('call', 593), ('ur', 391), ('get', 390), ('gt', 318), ('lt', 316), ('moneysymbnum', 303), ('ok', 293), ('free', 284)]


In [None]:
#How to do feature selection here
#How to determine which words will be useful for differentiating betweem Spam and Ham
#Can use 1000 most frequent words as features
