# **SMS Spam Detection**

## *Import Dataset*

First we have to import the dataset. The raw dataset is composed of 5574 SMS messages.

In [1]:
import os
import nltk
import pandas as pd
import string
import matplotlib.pyplot as plt

In [2]:
# Fetch the NLTK data packages this notebook depends on.
# 'stopwords' backs nltk.corpus.stopwords.words('english'), read a few cells below.
# 'punkt' is presumably the tokenizer data behind nltk.word_tokenize /
# nltk.sent_tokenize used further down — TODO confirm for the installed NLTK version.
# NOTE(review): 'words' is not referenced in the visible cells — confirm it is needed later.
nltk.download('stopwords')
nltk.download('words')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\39349\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\39349\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\39349\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Text-cleaning resources: punctuation characters, English stop words
# and a Porter stemmer.
punctuations = [*string.punctuation]
stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = nltk.PorterStemmer()

# Echo every resource next to its name, leaving a blank line after each one.
names = ['punctuations', 'stopwords', 'stemmer']
objs = [punctuations, stopwords, stemmer]
for idx in range(len(names)):
    print(names[idx], objs[idx], end='\n\n')

punctuations ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']

stopwords {"needn't", "shan't", 'they', 'ain', 'before', 'below', 'why', 'was', 'other', "don't", 'me', "aren't", "weren't", 'did', 'on', 'his', 'down', "it's", 'only', 'you', 'needn', 'won', 'the', 'such', 'because', 'under', 'it', 'hadn', 'haven', 'so', 't', 'its', 've', 'has', "hasn't", 'same', 'couldn', 'are', 'what', 'herself', 'when', "mightn't", 'isn', 'and', 'have', 'do', 'hers', 'some', 'i', 'with', 'yourself', 'further', 'd', 'or', 'above', 'your', 'as', 'where', 'y', 'ma', 'each', 'doesn', 'against', "should've", 'my', "that'll", 'were', 'ourselves', 'shouldn', 'if', 'itself', 'between', 'should', 'this', 'too', "didn't", 'any', 'who', 'whom', 'for', 'after', "you're", "wasn't", 'from', 'can', 'over', 'shan', 'had', 'these', 'am', 'ours', 'most', 'but', 'nor', 'very', 'will', 'hasn', 'once', 'yours', '

In [13]:
# Read the CSV file into a DataFrame. The file has no header row, so the
# two column names are supplied explicitly.
df = pd.read_csv(
    './dataset/sms_spam_no_header.csv',
    header=None,
    names=['Labels', 'SMS'],
)

In [14]:
# Preview the first five rows of the loaded frame
df.head(5)

Unnamed: 0,Labels,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
# Preview the last five rows of the loaded frame
df.tail(5)

Unnamed: 0,Labels,SMS
5569,spam,This is the 2nd time we have tried 2 contact u...
5570,ham,Will ü b going to esplanade fr home?
5571,ham,"Pity, * was in mood for that. So...any other s..."
5572,ham,The guy did some bitching but I acted like i'd...
5573,ham,Rofl. Its true to its name


In [16]:
# Summarise the index range, column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5574 entries, 0 to 5573
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Labels  5574 non-null   object
 1   SMS     5574 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [17]:
# Encode the categorical labels as integers: 'spam' -> 1, 'ham' -> 0.
# Series.map turns every value that is missing from the dict into NaN, so
# mapping the already-encoded 0/1 column again would wipe all labels.
# The guard makes the cell safe to re-run: it only maps while the column
# still holds the raw text labels.
label_codes = {'spam': 1, 'ham': 0}
if not pd.api.types.is_numeric_dtype(df['Labels']):
    df['Labels'] = df['Labels'].map(label_codes)
df.head()

Unnamed: 0,Labels,SMS
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [18]:
# Count the missing values in each column
df.isna().sum()

Labels    0
SMS       0
dtype: int64

In [20]:
# (rows, columns) of the frame at this point
df.shape

(5574, 2)

In [21]:
# Drop duplicated rows in place, then show the remaining (rows, columns).
# No subset is passed, so rows are compared on all columns; with pandas'
# defaults the first occurrence is kept and the index labels are not renumbered.
df.drop_duplicates(inplace = True)
df.shape

(5160, 2)

In [46]:
def word_token(text):
    """Count the word-level tokens in a message.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    int
        Number of tokens produced by ``nltk.word_tokenize``; 0 when
        ``text`` is empty or None.
    """
    # An empty message has no tokens. Returning 0 rather than None keeps a
    # column built from this function free of missing values.
    if not text:
        return 0
    return len(nltk.word_tokenize(text))

def sentence_token(text):
    """Count the sentences in a message.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    int
        Number of sentences produced by ``nltk.sent_tokenize``; 0 when
        ``text`` is empty or None.
    """
    # An empty message has no sentences. Returning 0 rather than None keeps
    # a column built from this function free of missing values.
    if not text:
        return 0
    return len(nltk.sent_tokenize(text))

def check_numbers(text):
    """Flag whether a message contains at least one digit character.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    int
        1 if any character of ``text`` satisfies ``str.isdigit``, otherwise 0.
        Empty or None input yields 0, so the flag is always 0 or 1.
    """
    if not text:
        return 0
    # any() stops at the first digit found, like the early return it replaces.
    return int(any(char.isdigit() for char in text))


In [47]:
# Derive one feature column per extractor, each computed from the SMS text:
# character count, word-token count, sentence count and a has-digit flag.
feature_extractors = {
    'Length': len,
    'Words': word_token,
    'Sentences': sentence_token,
    'Numbers': check_numbers,
}
for column, extractor in feature_extractors.items():
    df[column] = df['SMS'].apply(extractor)
df.head(9)

Unnamed: 0,Labels,SMS,Length,Words,Sentences,Numbers
0,0,"Go until jurong point, crazy.. Available only ...",111,24,2,0
1,0,Ok lar... Joking wif u oni...,29,8,2,0
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,37,2,1
3,0,U dun say so early hor... U c already then say...,49,13,1,0
4,0,"Nah I don't think he goes to usf, he lives aro...",61,15,1,0
5,1,FreeMsg Hey there darling it's been 3 week's n...,147,39,4,1
6,0,Even my brother is not like to speak with me. ...,77,18,2,0
7,0,As per your request 'Melle Melle (Oru Minnamin...,160,31,2,1
8,1,WINNER!! As a valued network customer you have...,157,32,5,1


In [45]:
# Show the full, untruncated text of the message whose index label is 2
b = df['SMS']
b.loc[2]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"