#                                      

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [None]:
# Absolute path of the spam dataset CSV that is loaded below.
data_file_path='/content/spam.csv'

In [None]:
# Load the CSV into a DataFrame, decoding the file as ISO-8859-1 (Latin-1).
df = pd.read_csv(data_file_path, encoding='ISO-8859-1')

In [None]:
# Drop the last three columns; the preview below shows that only the
# label column (v1) and the text column (v2) remain.
df = df.iloc[:, :-3]
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Give the two remaining columns descriptive names.
df.columns = ['category', 'message']
df.head()

Unnamed: 0,category,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Count the rows per label to see how the two classes are balanced.
df['category'].value_counts()

ham     4825
spam     747
Name: category, dtype: int64

In [None]:
# Fetch the NLTK stopword corpus needed by stopwords.words() below.
nltk.download('stopwords')

# Raw string: '\w' is not a valid Python string escape, so the non-raw form
# triggers an invalid-escape warning. The regex itself is unchanged and
# matches runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')
# A set gives constant-time membership tests when filtering tokens.
stop_words_set = set(stopwords.words('english'))
porter_stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def preprocess(raw_text):
    """Normalize a message into a space-separated string of stemmed tokens.

    The text is lower-cased, split into tokens with the module-level
    ``tokenizer``, filtered against ``stop_words_set`` and stemmed with
    ``porter_stemmer``.

    Args:
        raw_text: the message as a string.

    Returns:
        The surviving stems joined by single spaces.
    """
    lowered_tokens = tokenizer.tokenize(raw_text.lower())

    stems = []
    for token in lowered_tokens:
        # Stop words are dropped before stemming.
        if token in stop_words_set:
            continue
        stems.append(porter_stemmer.stem(token))

    return ' '.join(stems)

In [None]:
# Quick sanity check of preprocess() on a sample sentence.
preprocess("Hi! I am XYZ. Don't talked too much.")

'hi xyz talk much'

In [None]:
# Overwrite the message column with the preprocessed form of every message.
df['message'] = df['message'].map(preprocess)

df.head()

Unnamed: 0,category,message
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah think goe usf live around though


In [None]:
# Features are the preprocessed message strings; targets are the labels.
X = df['message']
y = df['category']
X.shape, y.shape

((5572,), (5572,))

In [None]:
# Hold out 10% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
# Word -> probability tables, one per class; filled in by fit() and read
# by predict().
spam_words_probability = {}
ham_words_probability = {}

def fit(X_train, y_train):
  """Estimate per-word probabilities for spam and ham from the training data.

  Every word seen in a training message starts with a count of 1 in both
  classes (add-one smoothing), so no word ends up with a zero probability in
  either class. The results are written into the module-level
  ``spam_words_probability`` and ``ham_words_probability`` dicts, which are
  emptied first so a repeated call does not keep words from an earlier fit.

  Args:
    X_train: iterable of message strings; each is split with ``tokenizer``.
    y_train: iterable of labels aligned with ``X_train``. ``'spam'`` marks a
      spam message; any other value is counted as ham.

  Returns:
    None. The two module-level probability dicts are updated in place.
  """
  spam_dict = {}
  ham_dict = {}

  for sentence, label in zip(X_train, y_train):
    if label == 'spam':
      own_counts, other_counts = spam_dict, ham_dict
    else:
      own_counts, other_counts = ham_dict, spam_dict

    for word in tokenizer.tokenize(sentence):
      # A first sighting starts from the smoothing count of 1, then the
      # occurrence itself is added on top.
      own_counts[word] = own_counts.get(word, 1) + 1
      # The other class must know the word too, with at least the
      # smoothing count, so both tables share one vocabulary.
      other_counts.setdefault(word, 1)

  # Total (smoothed) word mass per class, used as the denominator.
  ham_words_count = sum(ham_dict.values())
  spam_words_count = sum(spam_dict.values())

  # Reset the shared tables so stale words from a previous fit cannot leak
  # into predictions.
  ham_words_probability.clear()
  spam_words_probability.clear()

  for word, count in ham_dict.items():
    ham_words_probability[word] = count / ham_words_count
  for word, count in spam_dict.items():
    spam_words_probability[word] = count / spam_words_count

# Fill the module-level word-probability tables from the training split.
fit(X_train,y_train)

In [None]:
# Number of training messages per class, used by predict() for the class
# priors. Any label other than 'spam' is counted as ham. The loop variable
# is deliberately not called `type`: at module level that would rebind the
# builtin for every later cell.
spam_count = sum(1 for label in y_train if label == 'spam')
ham_count = len(y_train) - spam_count


def predict(message):
  """Classify a raw message as ``'ham'`` or ``'spam'``.

  The message is run through ``preprocess`` and tokenized, then each class is
  scored as the log of its prior plus the sum of the log word probabilities
  produced by ``fit``. Scores are accumulated in log space because the product
  of many small probabilities underflows to 0.0 on long messages, which made
  both classes tie and the message default to ``'spam'``.

  Args:
    message: the unprocessed message text.

  Returns:
    ``'ham'`` when the ham score is strictly greater than the spam score,
    otherwise ``'spam'`` (ties go to ``'spam'``).

  Raises:
    ZeroDivisionError: if ``spam_count + ham_count`` is zero.
  """
  import math  # function-scope import so this cell needs no extra import cell

  tokens = tokenizer.tokenize(preprocess(message))

  total_messages = spam_count + ham_count
  spam_prior = spam_count / total_messages
  ham_prior = ham_count / total_messages

  # A class with no training examples has prior 0; log(0) is undefined, so
  # use -inf to make that class lose against any finite score.
  spam_score = math.log(spam_prior) if spam_prior > 0 else float('-inf')
  ham_score = math.log(ham_prior) if ham_prior > 0 else float('-inf')

  for word in tokens:
    # Words absent from the fitted vocabulary carry no evidence and are
    # skipped.
    if word in spam_words_probability:
      spam_score += math.log(spam_words_probability[word])
      ham_score += math.log(ham_words_probability[word])

  return 'ham' if ham_score > spam_score else 'spam'

# Try the classifier on a single raw message.
predict('Yes... I trust u to buy new stuff ASAP so I can try it out	')

'ham'

In [None]:
def evaluate(X_test, y_test):
  """Print how many test messages ``predict`` labels correctly.

  Prints the number of correct predictions, the number of test labels, and
  the accuracy as a percentage. An empty test set reports an accuracy of 0.0
  instead of raising ZeroDivisionError.

  Args:
    X_test: iterable of messages, each passed to ``predict``.
    y_test: sized collection of expected labels aligned with ``X_test``.

  Returns:
    None. Results are printed.
  """
  # `label` rather than `type`, so the builtin is not shadowed.
  correct = sum(1 for msg, label in zip(X_test, y_test) if predict(msg) == label)
  total = len(y_test)

  print("Accurate :")
  print(correct)
  print("Message :")
  print(total)

  # The percentage gets its own name instead of overwriting the raw count.
  accuracy = (correct / total) * 100 if total else 0.0
  print("Accuracy :")
  print(accuracy)

# Report accuracy on the held-out split.
# NOTE(review): X_test comes from df['message'], which was already run through
# preprocess(), and predict() calls preprocess() again on each message.
# Confirm that the second pass does not change the tokens.
evaluate(X_test, y_test)

Accurate :
552
Message :
558
Accuracy :
98.9247311827957
