<a href="https://colab.research.google.com/github/Timure228/Naive-Bayes-Classifiers/blob/main/multinominal_naive_bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Naive Multinomial Bayes spam classifier from scratch

In [1]:
normal = [["Dear", "Uzbek", "how", "are", "you", "doing", "?"], ["Hi", "Uzbek", "want", "to", "go", "out", "?"]]
spam = [["You", "won", "1000", "$", "congratulations", "!"]]

In [3]:
import pandas as pd
import csv
import requests

In [None]:
pd.read_csv("spam.csv")[:50]

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [62]:
with open("/content/spam.csv", "wb") as f:
  request = requests.get("https://raw.githubusercontent.com/Timure228/Naive-Bayes-Classifiers/refs/heads/main/SPAM%20text%20message%2020170820%20-%20Data.csv")
  f.write(request.content)

with open("/content/spam.csv", "r") as f:
  data = list(csv.reader(f))[1:]

In [42]:
normal = []
spam = []

for i in range(len(data)):
  if data[i][0] == "ham":
    normal.append(data[i][1].split())
  else:
    spam.append(data[i][1].split())

In [43]:
normal[1]

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

In [44]:
# Label probabilities
normal_p = len(normal)/len(normal) + len(spam)
spam_p = len(spam)/len(normal) + len(spam)

In [45]:
# Likelihoods
def tokenize(arr):
  normal_tokens = []
  for i in arr:
    for j in i:
        normal_tokens.append(j.lower())
  return normal_tokens

In [46]:
normal_tokens = tokenize(normal)
spam_tokens = tokenize(spam)
tokens = normal_tokens + spam_tokens

tokens[:5]

['go', 'until', 'jurong', 'point,', 'crazy..']

In [47]:
# Count total amount of words
n_tokens = len(tokens)

n_tokens

86836

In [48]:
# Count each word
normal_words = dict(map(lambda word: (word, normal_tokens.count(word)), set(tokens)))
spam_words = dict(map(lambda word: (word, spam_tokens.count(word)), set(tokens)))

In [49]:
# Add black box
BLACK_BOX = 1

normal_words = {word: i + BLACK_BOX for word, i in normal_words.items()}
spam_words = {word: i + BLACK_BOX for word, i in spam_words.items()}

In [50]:
# Set the likelihoods
normal_likelihoods = {word: i / n_tokens for word, i in normal_words.items()}
spam_likelihoods = {word: i / n_tokens for word, i in spam_words.items()}

In [319]:
import math

def predict(message):
  message = " ".join(message).lower()
  arr = message.split()
  normal_p = []
  spam_p = []
  for i in arr:
    normal_p.append(normal_likelihoods[i])
    spam_p.append(spam_likelihoods[i])

  if math.prod(normal_p) > math.prod(spam_p):
    return "The message is normal"

  return "The message is spam"

In [52]:
" ".join(spam[3])

'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'

In [53]:
predict(" ".join(normal[3]).lower())

'The message is normal'

# Multinominal Naive Bayes with Sklearn

In [65]:
len(data)

5572

In [261]:
train_data = data[:int(len(data)*0.8)]
test_data = data[int(len(data)*0.8):]

In [262]:
X_train, y_train = [], []
X_test, y_test = [], []

def create_sequence(X, y, data):
  for i in range(len(data)):
    X.append([data[i][1]])
    if data[i][0] == "ham":
      y.append(0)
    else:
      y.append(1)
  return X, y

X_train, y_train = create_sequence(X_train, y_train, train_data)
X_test, y_test = create_sequence(X_test, y_test, test_data)

In [263]:
# Convert data into numerical
X_train_numerical = [[normal_words[j.lower()] for j in i[0].split()] for i in X_train]
X_test_numerical = [[normal_words[j.lower()] for j in i[0].split()] for i in X_test]

In [264]:
# Pad the sequences
maxlen = len(max(X_train_numerical, key=len))
from tqdm.auto import tqdm

def pad(X, maxlen):
  for i in tqdm(X):
    while len(i) < maxlen:
      i.append(0)
  return X

X_train_numerical = pad(X_train_numerical, maxlen)
X_test_numerical = pad(X_test_numerical, maxlen)

  0%|          | 0/4457 [00:00<?, ?it/s]

  0%|          | 0/1115 [00:00<?, ?it/s]

In [265]:
# Convert into np.array
X_train_numerical = np.array(X_train_numerical)
X_test_numerical = np.array(X_test_numerical)

In [267]:
import sklearn as sk
from sklearn.naive_bayes import MultinomialNB

multinominal_nb = MultinomialNB()
multinominal_nb.fit(X_train_numerical, y_train)

In [274]:
# Do some metrics
from sklearn.metrics import f1_score, classification_report

y_preds = multinominal_nb.predict(X_test_numerical)

print(classification_report(y_test, y_preds, target_names=["ham", "spam"]))

              precision    recall  f1-score   support

         ham       0.94      0.77      0.84       970
        spam       0.29      0.65      0.41       145

    accuracy                           0.75      1115
   macro avg       0.62      0.71      0.62      1115
weighted avg       0.85      0.75      0.79      1115



In [292]:
multinominal_nb.predict([X_train_numerical[140]])

array([1])

In [314]:
def decode(X):
  normal_words_rev = {i: word for i, word in enumerate(normal_words)}
  return [normal_words_rev[i] for i in X[0]]

In [358]:
example = X_train_numerical[543]
example_list = decode([example])

In [365]:
def sklearn_predict(X, y):
  return f"The message is spam {y == 1}" if multinominal_nb.predict([X]) == 1 else f"The message is normal {y == 0}"

In [367]:
sklearn_predict(example, y_train[543])

'The message is normal True'

In [368]:
predict(example_list)

'The message is spam'