In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package)
# so the dataset file referenced below can be read.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Imports
import pandas as pd

In [None]:
# Path of the spam CSV on the mounted Drive; edit this if the file lives elsewhere.
data_file = "/content/drive/MyDrive/Colab Notebooks/Simple Spam Filter/spam.csv"

In [None]:
# The file is decoded as latin-1. header=None makes pandas treat every line,
# including the first one, as a data row; that first row is sliced off in the next cell.
df = pd.read_csv(data_file, encoding='latin-1', header = None)


In [None]:
# Keep only the first two columns and name them 'label' and 'message'.
# The first row is skipped (presumably the CSV's own header line, which was
# read as data because of header=None -- confirm against the file).
#
# The column check makes this cell safe to re-run: without it, every re-run
# would slice off one more data row. .copy() detaches the result from the raw
# frame so that adding a column later cannot trigger pandas' view-vs-copy
# (SettingWithCopy) warning.
if list(df.columns) != ['label', 'message']:
  df = df.iloc[1:, :2].copy()
  df.columns = ['label', 'message']

In [None]:
# Preview the first rows to check the 'label' and 'message' columns.
df.head()

Unnamed: 0,label,message
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...
5,ham,"Nah I don't think he goes to usf, he lives aro..."


### Data Pre-processing

In [None]:
# Load the English stopword list and the punctuation characters that
# pre_process uses to clean each message.

import string
import nltk
# Download the NLTK resources: the stopword corpus and the punkt tokenizer
# data (both 'punkt' and 'punkt_tab' are requested; presumably needed by
# word_tokenize depending on the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')


# English stopwords from NLTK and the punctuation string from the stdlib.
stopwords = nltk.corpus.stopwords.words('english')
punctuation = string.punctuation
# Quick sanity check of what was loaded.
print(stopwords[:5])
print(punctuation)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['a', 'about', 'above', 'after', 'again']
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
#pre-process sms content

# Function to process SMS
def pre_process(SMS):
  """Turn one message into a list of cleaned, lowercase tokens.

  Steps, in order:
    1. drop every character that appears in `punctuation`;
    2. lowercase the remaining characters one by one;
    3. split the text with nltk.tokenize.word_tokenize;
    4. discard tokens found in `stopwords`.

  Args:
    SMS: the message text (iterated character by character).

  Returns:
    List of the tokens that survived the stopword filter.
  """
  # NOTE(review): punctuation is removed before the stopword check, so a
  # contraction such as "don't" becomes "dont" and is kept (see the sample
  # output, where "dont" appears in the Processed column).
  kept_chars = []
  for ch in SMS:
    if ch in punctuation:
      continue
    kept_chars.append(ch.lower())
  cleaned_text = "".join(kept_chars)

  tokens = nltk.tokenize.word_tokenize(cleaned_text)
  return list(filter(lambda token: token not in stopwords, tokens))


# Run pre_process on every text in the 'message' column and store the
# resulting token lists in a new 'Processed' column.
# pre_process is passed directly; wrapping it in a lambda adds nothing.
df['Processed'] = df['message'].apply(pre_process)


In [None]:
# Preview the frame again, now with the 'Processed' token lists.
df.head()

Unnamed: 0,label,message,Processed
1,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
2,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
4,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
5,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, goes, usf, lives, around, t..."


In [None]:
#categorizing ham/spam associated words

def categorize_words(data=None):
  """Collect every processed word of spam messages and of ham messages.

  Args:
    data: optional DataFrame with a 'label' column ('spam' / 'ham') and a
      'Processed' column holding a list of tokens per row. When omitted, the
      notebook-level `df` is used, so the zero-argument call keeps working.

  Returns:
    (spam_words, ham_words): two flat lists, in row order, with duplicates
    kept so that word frequencies can be counted later.
  """
  # Resolve the default at call time rather than at definition time, so the
  # function picks up whatever `df` currently is in the notebook.
  frame = df if data is None else data

  def words_for(label):
    # Flatten the per-message token lists of one label into a single list.
    token_lists = frame['Processed'][frame['label'] == label]
    return [word for sms in token_lists for word in sms]

  return words_for('spam'), words_for('ham')

# Build the two word lists once; predict reads them as notebook-level globals.
spam_words, ham_words = categorize_words()

# Peek at the first few words of each list.
print(spam_words[:5])
print(ham_words[:5])

['free', 'entry', '2', 'wkly', 'comp']
['go', 'jurong', 'point', 'crazy', 'available']


### Predict user input

In [None]:
def predict(user_input, spam_vocab=None, ham_vocab=None):
  """Classify a tokenized message as spam or ham and print the verdict.

  Every token's occurrences are counted in the spam word list and in the ham
  word list; the larger total decides the printed label. The percentage that
  is printed as "accuracy" is the winning total's share of both totals, not a
  measured classifier accuracy.

  Args:
    user_input: iterable of tokens, e.g. the list returned by pre_process.
    spam_vocab: list of words taken from spam messages. Defaults to the
      notebook-level `spam_words`.
    ham_vocab: list of words taken from ham messages. Defaults to the
      notebook-level `ham_words`.

  Returns:
    None. The result is only printed.
  """
  # Resolve defaults at call time so existing predict(tokens) calls keep
  # reading the notebook globals.
  if spam_vocab is None:
    spam_vocab = spam_words
  if ham_vocab is None:
    ham_vocab = ham_words

  # Materialize once because the tokens are walked twice below.
  tokens = list(user_input)
  spam_counter = sum(spam_vocab.count(word) for word in tokens)
  ham_counter = sum(ham_vocab.count(word) for word in tokens)

  print("---------------Results---------------")
  if ham_counter == spam_counter:
    # Tie; this also covers the case where no token occurs in either list.
    print("Message might be spam")
    return

  # The counters differ and are non-negative, so the total is positive here.
  total = ham_counter + spam_counter
  if ham_counter > spam_counter:
    accuracy = round((ham_counter / total) * 100, 2)
    print("Message is not spam, with {}% accuracy".format(accuracy))
  else:
    accuracy = round((spam_counter / total) * 100, 2)
    print("Message is Spam with {}% accuracy".format(accuracy))



In [None]:
# Read a message interactively, clean it with the same pre_process step that
# was applied to the dataset, then print the spam/ham verdict.

user_input = input("Please type a spam or ham message: ")

process_input = pre_process(user_input)
predict(process_input)





Please type a spam or ham message: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
---------------Results---------------
Message is not spam, with 93.05% accuracy
