In [3]:
import numpy as np
import sklearn
import pandas as pd

In [2]:
df = pd.read_csv('spam.csv', encoding='latin-1')
df.head(3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,


In [4]:
data = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
data = df[['v2', 'v1']].rename(columns={'v2': 'text', 'v1': 'label'})
data.head()

Unnamed: 0,text,label
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [5]:
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
data['label'] = enc.fit_transform(data['label'])
data.head()

Unnamed: 0,text,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [24]:
X = data.drop(columns=['label'])["text"].apply(lambda x: x.lower())
Y = data['label']

In [25]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [26]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((4457,), (1115,), (4457,), (1115,))

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(encoding='latin-1', ngram_range =(1, 3), stop_words='english')
vec.fit(x_train)
x_train = vec.transform(x_train).toarray()
x_test = vec.transform(x_test).toarray()

In [28]:
x_train.shape, x_test.shape

((4457, 54824), (1115, 54824))

In [29]:
vec.get_feature_names_out()

array(['00', '00 easter', '00 easter prize', ..., 'ûówell', 'ûówell û_',
       'ûówell û_ yes'], dtype=object)

# **Naive Bayes**

In [30]:
class GaussianNaiveBayes:
  def fit(self, X, y):
    n_samples, n_features = X.shape
    self.classes = np.unique(y)
    n_classes = len(self.classes)

    self._mean = np.zeros((n_classes, n_features))
    self._var = np.zeros((n_classes, n_features))
    self._class_prob = np.zeros(n_classes)

    for idx, cls in enumerate(self.classes):
      X_cls = X[y == cls]
      self._mean[idx, :] = X_cls.mean(axis=0)
      self._var[idx, :] = np.clip(X_cls.var(axis=0), 1e-8, None)
      self._class_prob[idx] = X_cls.shape[0] / float(n_samples)

  def predict(self, X):
    y_pred = [self._predict(x) for x in X]
    return np.array(y_pred)

  def _predict(self, x):
    _prob = np.zeros(len(self.classes))

    for idx, cls in enumerate(self.classes):
      conds = np.sum(np.log(self._pdf(x, idx)))
      c_prob = self._class_prob[idx]
      prob = np.log(c_prob) + conds

      _prob[idx] = prob

    return self.classes[np.argmax(_prob)]

  def _pdf(self, x, idx):
    mean = self._mean[idx]
    var = self._var[idx]
    numerator = np.exp(-(x - mean)**2 / (2 * var))
    denominator = np.sqrt(2 * np.pi * var)
    return numerator / denominator

  def score(self, X, y):
    y_pred = self.predict(X)
    return y_pred[y_pred == y].shape[0] / len(y)

In [31]:
gb = GaussianNaiveBayes()
gb.fit(x_train, y_train)

In [32]:
preds = gb.predict(x_test)

  conds = np.sum(np.log(self._pdf(x, idx)))


In [33]:
np.unique(preds)

array([0, 1])

In [34]:
gb.score(x_test, y_test)

  conds = np.sum(np.log(self._pdf(x, idx)))


0.8860986547085202

In [41]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB

gnb = GaussianNB()
y_pred = gnb.fit(x_train, y_train).predict(x_test)

In [43]:
acc = len(y_test[y_test == y_pred]) / len(y_test)
print(f"Accuracy = {acc}")

Accuracy = 0.9174887892376682


In [44]:
mnb = MultinomialNB()
y_pred = mnb.fit(x_train, y_train).predict(x_test)

In [45]:
acc = len(y_test[y_test == y_pred]) / len(y_test)
print(f"Accuracy = {acc}")

Accuracy = 0.9856502242152466


# **Model Pipeline**

In [46]:
class SpamClassifier:
  def __init__(self):
    self.model = MultinomialNB()
    self.vec = CountVectorizer(encoding='latin-1', stop_words='english')
    df = pd.read_csv('spam.csv', encoding='latin-1')
    data = df[['v2', 'v1']].rename(columns={'v2': 'text', 'v1': 'label'})
    enc = LabelEncoder()
    data['label'] = enc.fit_transform(data['label'])
    X = data.drop(columns=['label'])["text"].apply(lambda x: x.lower())
    Y = data['label']

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
    self.vec.fit(x_train)
    x_train = self.vec.transform(x_train).toarray()
    x_test = self.vec.transform(x_test).toarray()
    self.model.fit(x_train, y_train)

  def predict(self, message):
    X = self.vec.transform(message).toarray()
    prediction = self.model.predict(X)
    return prediction

In [49]:
word = "Pantaloons is offering buy 2 get 1 offer. Valid till 27th June"
pipeline = SpamClassifier()
print(pipeline.predict([word.lower()]))

[1]
