# SMS Spam Detection using NLP <br><br>

## Step 1: Reading Data

In [1]:
import csv
from itertools import islice 

with open('spam.csv', 'r') as f: #reading csv file
    reader = csv.reader(f)
    spam = list(reader)

spam = spam[1:]
    
for row in islice(spam, 10):
    print(row)

['ham', 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', '', '', '']
['ham', 'Ok lar... Joking wif u oni...', '', '', '']
['spam', "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", '', '', '']
['ham', 'U dun say so early hor... U c already then say...', '', '', '']
['ham', "Nah I don't think he goes to usf, he lives around here though", '', '', '']
['spam', "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv", '', '', '']
['ham', 'Even my brother is not like to speak with me. They treat me like aids patent.', '', '', '']
['ham', "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune", '', '', '']
['spam', 'WINNER

## Step 2: Cleaning up Data

In [2]:
truncatedSpam = [row[0:2] for row in spam]
for row in islice(truncatedSpam, 10):
    print(row)

['ham', 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...']
['ham', 'Ok lar... Joking wif u oni...']
['spam', "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]
['ham', 'U dun say so early hor... U c already then say...']
['ham', "Nah I don't think he goes to usf, he lives around here though"]
['spam', "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv"]
['ham', 'Even my brother is not like to speak with me. They treat me like aids patent.']
['ham', "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune"]
['spam', 'WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim 

In [3]:
import re
import string

def remove_punctuation_and_lowercase(string1):
    string1 = "".join([char for char in string1 if char not in string.punctuation])
    return(string1.lower())

for i in range(1, len(truncatedSpam)):
    truncatedSpam[i][1] = truncatedSpam[i][1].split()
    
for row in range(1, len(truncatedSpam)):
    for word in range(0, len(truncatedSpam[row][1])):
        truncatedSpam[row][1][word] = remove_punctuation_and_lowercase(truncatedSpam[row][1][word])

for i in islice(truncatedSpam, 10):
    print(i)

['ham', 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...']
['ham', ['ok', 'lar', 'joking', 'wif', 'u', 'oni']]
['spam', ['free', 'entry', 'in', '2', 'a', 'wkly', 'comp', 'to', 'win', 'fa', 'cup', 'final', 'tkts', '21st', 'may', '2005', 'text', 'fa', 'to', '87121', 'to', 'receive', 'entry', 'questionstd', 'txt', 'ratetcs', 'apply', '08452810075over18s']]
['ham', ['u', 'dun', 'say', 'so', 'early', 'hor', 'u', 'c', 'already', 'then', 'say']]
['ham', ['nah', 'i', 'dont', 'think', 'he', 'goes', 'to', 'usf', 'he', 'lives', 'around', 'here', 'though']]
['spam', ['freemsg', 'hey', 'there', 'darling', 'its', 'been', '3', 'weeks', 'now', 'and', 'no', 'word', 'back', 'id', 'like', 'some', 'fun', 'you', 'up', 'for', 'it', 'still', 'tb', 'ok', 'xxx', 'std', 'chgs', 'to', 'send', 'å£150', 'to', 'rcv']]
['ham', ['even', 'my', 'brother', 'is', 'not', 'like', 'to', 'speak', 'with', 'me', 'they', 'treat', 'me', 'like', 'aids', 'patent']]
['

In [4]:
import nltk

stopwords = nltk.corpus.stopwords.words("english") #stopwords in english from nltk package
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [5]:
def remove_stopwords(textList):
    temp = [word for word in textList if word not in stopwords]
    return temp

for row in range(1, len(truncatedSpam)):
    truncatedSpam[row][1] = remove_stopwords(truncatedSpam[row][1])

for i in islice(truncatedSpam, 10):
    print(i)

['ham', 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...']
['ham', ['ok', 'lar', 'joking', 'wif', 'u', 'oni']]
['spam', ['free', 'entry', '2', 'wkly', 'comp', 'win', 'fa', 'cup', 'final', 'tkts', '21st', 'may', '2005', 'text', 'fa', '87121', 'receive', 'entry', 'questionstd', 'txt', 'ratetcs', 'apply', '08452810075over18s']]
['ham', ['u', 'dun', 'say', 'early', 'hor', 'u', 'c', 'already', 'say']]
['ham', ['nah', 'dont', 'think', 'goes', 'usf', 'lives', 'around', 'though']]
['spam', ['freemsg', 'hey', 'darling', '3', 'weeks', 'word', 'back', 'id', 'like', 'fun', 'still', 'tb', 'ok', 'xxx', 'std', 'chgs', 'send', 'å£150', 'rcv']]
['ham', ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aids', 'patent']]
['ham', ['per', 'request', 'melle', 'melle', 'oru', 'minnaminunginte', 'nurungu', 'vettam', 'set', 'callertune', 'callers', 'press', '9', 'copy', 'friends', 'callertune']]
['spam', ['winner', 'valued', 'network', 'custo

In [6]:
wn = nltk.WordNetLemmatizer()

for row in range(1, len(truncatedSpam)):
    for word in range(0, len(truncatedSpam[row][1])):
        truncatedSpam[row][1][word] = wn.lemmatize(truncatedSpam[row][1][word])

for i in islice(truncatedSpam, 10):
    print(i)

['ham', 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...']
['ham', ['ok', 'lar', 'joking', 'wif', 'u', 'oni']]
['spam', ['free', 'entry', '2', 'wkly', 'comp', 'win', 'fa', 'cup', 'final', 'tkts', '21st', 'may', '2005', 'text', 'fa', '87121', 'receive', 'entry', 'questionstd', 'txt', 'ratetcs', 'apply', '08452810075over18s']]
['ham', ['u', 'dun', 'say', 'early', 'hor', 'u', 'c', 'already', 'say']]
['ham', ['nah', 'dont', 'think', 'go', 'usf', 'life', 'around', 'though']]
['spam', ['freemsg', 'hey', 'darling', '3', 'week', 'word', 'back', 'id', 'like', 'fun', 'still', 'tb', 'ok', 'xxx', 'std', 'chgs', 'send', 'å£150', 'rcv']]
['ham', ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent']]
['ham', ['per', 'request', 'melle', 'melle', 'oru', 'minnaminunginte', 'nurungu', 'vettam', 'set', 'callertune', 'caller', 'press', '9', 'copy', 'friend', 'callertune']]
['spam', ['winner', 'valued', 'network', 'customer', '

## Step 3: Vectorizing Data

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

#truncatedSpam = truncatedSpam[1:]

tfidf_vect = TfidfVectorizer([i[1] for i in truncatedSpam])
X_counts = tfidf_vect.fit_transform([i[1] for i in spam])
print(X_counts)

  (0, 3550)	0.1481298737377147
  (0, 8030)	0.22998520738984352
  (0, 4350)	0.3264252905795869
  (0, 5920)	0.2553151503985779
  (0, 2327)	0.25279391746019725
  (0, 1303)	0.24415547176756056
  (0, 5537)	0.15618023117358304
  (0, 4087)	0.10720385321563428
  (0, 1751)	0.2757654045621182
  (0, 3634)	0.1803175103691124
  (0, 8489)	0.22080132794235655
  (0, 4476)	0.2757654045621182
  (0, 1749)	0.3116082237740733
  (0, 2048)	0.2757654045621182
  (0, 7645)	0.15566431601878158
  (0, 3594)	0.15318864840197105
  (0, 1069)	0.3264252905795869
  (0, 8267)	0.18238655630689804
  (1, 5504)	0.27211951321382544
  (1, 4512)	0.4082988561907181
  (1, 4318)	0.5236458071582338
  (1, 8392)	0.4316010362639011
  (1, 5533)	0.5465881710238072
  (2, 4087)	0.07917128722158312
  (2, 3358)	0.11301399735581102
  :	:
  (5570, 4218)	0.12246610191126918
  (5570, 8313)	0.18723687600522523
  (5570, 1084)	0.11225268140936363
  (5570, 4615)	0.1596552981734164
  (5570, 7039)	0.18426763178390446
  (5570, 3308)	0.1217217261863451

tfidf vectorizer makes a vector based on the frequency of each possible word with a weighted value(weighting is based on values that seem to be a determining factor)

There are other alternatives like countVectorizing

## Step 4: Building the Classifier

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
import pandas as pd

X_features = pd.DataFrame(X_counts.toarray()) #create dataframe for random forest
#print(X_features.head(1))

rf = RandomForestClassifier(n_jobs=-1)
k_fold1 = KFold(n_splits=5)

k_fold2 = KFold(n_splits=10)

print(cross_val_score(rf, X_features, [i[0] for i in truncatedSpam], cv=k_fold1, scoring='accuracy', n_jobs=-1))
print('LINE BREAK'.center(100,'-'))
print(cross_val_score(rf, X_features, [i[0] for i in truncatedSpam], cv=k_fold2, scoring='accuracy', n_jobs=-1))

[0.97399103 0.97488789 0.96947935 0.96499102 0.96678636]
---------------------------------------------LINE BREAK---------------------------------------------
[0.97670251 0.96236559 0.97127469 0.97666068 0.97127469 0.97307002
 0.96768402 0.96947935 0.97127469 0.97486535]


## Holdout Sets

In [9]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_features, [i[0] for i in truncatedSpam], test_size=0.2)

temp = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)

temp.fit(X_features, [i[0] for i in truncatedSpam]) #fit by data and their corresponding tags

print(temp.predict_proba(X_features.head())) #head gives the first 5 rows, so prediction happens for the first five rows

print(X_features.head())

print([truncatedSpam[i][0] for i in range(0, 5)])

#temp_model = temp.fit(X_train, Y_train)
#sorted(zip(temp_model.feature_importances_, X_train.columns), reverse=True)[0:10]

[[0.95979754 0.04020246]
 [0.95887734 0.04112266]
 [0.29674749 0.70325251]
 [0.95887734 0.04112266]
 [0.94026273 0.05973727]]
   0     1     2     3     4     5     6     7     8     9     ...  8663  \
0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
1   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
2   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
3   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
4   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   

   8664  8665  8666  8667  8668  8669  8670  8671  8672  
0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
1   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
2   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
3   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
4   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  

[5 rows x 8673 columns]
['ham', 'ham', 'spam', 'ham', 'ham']


# Conclusion

## The two columns above represent the probability of being either ham/spam
In this case, the first one has probability of being ham by 95.54 percent (as well as the second, fourth and fifth).
The third value has 75 percent probability of being spam and 25 percent being ham.
Below the result i have shown the data and the tags for it (to verify that it is correct).

Feature Engineering (adding length of messages, amount of punctuation) 

Testing can be very much improvement with external data

Many better methods of doing this, using different values or methods all together

Better/More data cleaning methods could be used 

Holdout sets are never an indicator of quality (well, maybe a little, but negligible)