###Spam Detection Using Naive-Bayes




In [5]:
import string
import nltk
import pandas as pd

In [3]:
#Reading the csv dataset
df=pd.read_csv("spam.csv")
df.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [6]:
#Trying to understand the data,using Grouby on Category column
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


###Pre-Processing Data

In [9]:
#Downloading NLTK libraries
nltk.download('stopwords')
nltk.download('punkt')

stopwords=nltk.corpus.stopwords.words('english')
punctuation=string.punctuation

print(stopwords[:10])
print(punctuation)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [14]:
#Pre-Processing the Message Data

def pre_process(Message):
  lowercase="".join([char.lower() for char in Message if char not in punctuation])
  tokenize=nltk.tokenize.word_tokenize(lowercase)
  remove_stopwords=[word for word in tokenize if word not in stopwords]
  return remove_stopwords

df['Processed']=df['Message'].apply(lambda x:pre_process(x))
df['Spam']=df['Category'].apply(lambda x:1 if x=='spam' else 0)
df.head(10)


Unnamed: 0,Category,Message,Processed,Spam
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n...",0
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]",0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...",1
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]",0
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, goes, usf, lives, around, t...",0
5,spam,FreeMsg Hey there darling it's been 3 week's n...,"[freemsg, hey, darling, 3, weeks, word, back, ...",1
6,ham,Even my brother is not like to speak with me. ...,"[even, brother, like, speak, treat, like, aids...",0
7,ham,As per your request 'Melle Melle (Oru Minnamin...,"[per, request, melle, melle, oru, minnaminungi...",0
8,spam,WINNER!! As a valued network customer you have...,"[winner, valued, network, customer, selected, ...",1
9,spam,Had your mobile 11 months or more? U R entitle...,"[mobile, 11, months, u, r, entitled, update, l...",1


In [21]:
#Categorizing ham/spam word's in different list's
def categorize_words():
  spam_words=[]
  ham_words=[]

  #Spam Associated Words
  for Message in df['Processed'][df['Spam']==1]:
    for word in Message:
      spam_words.append(word)
  
  #Ham Associated Words
  for Message in df['Processed'][df['Spam']==0]:
    for word in Message:
      ham_words.append(word)

  return spam_words,ham_words

spam_words,ham_words=categorize_words()

print(spam_words[:10])
print(ham_words[:10])


['free', 'entry', '2', 'wkly', 'comp', 'win', 'fa', 'cup', 'final', 'tkts']
['go', 'jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la']


In [28]:
#Predicting the user text input and classifying them as spam or not

def predict(user_input):
  spam_counter=0
  ham_counter=0

  for word in user_input:
    spam_counter+=spam_words.count(word)
    ham_counter+=ham_words.count(word)

  if ham_counter>spam_counter:
    accuracy=(ham_counter/(spam_counter+ham_counter))*100
    print("User Text is Not Spam, with Accuracy of {}%".format(accuracy))

  elif spam_counter>ham_counter:
    accuracy=(spam_counter/(spam_counter+ham_counter))*100
    print("User Text is Spam, with Accuracy of {}%".format(accuracy))

  else:
    print("Could be spam, with 50% accuracy")

In [31]:
user_input=input("Please Enter Text :")


Please Enter Text :I am going for a swim


In [32]:
processed_input=pre_process(user_input)
predict(processed_input)

User Text is Not Spam, with Accuracy of 97.6608187134503%
