In [10]:
#importing the data and preprocessing it for calculating prediciton probability

import os
import wget
import zipfile

import pandas as pd
from absl import logging

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

def download_and_prepare(name, path):
    if name == "sms_spam":
        logging.info(f"Preparing dataset {name}...")
        # Check if data has been extracted and if not download extract it
        if (os.path.exists(os.path.join(path, "SMSSpamCollection"))):
            logging.info(f"Dataset {name} already extracted.")
        else:
            logging.info(f"Downloading dataset {name}...")
            url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
            wget.download(url, path)
            logging.info(f"Extracting dataset {name}...")
            with zipfile.ZipFile(os.path.join(path, "smsspamcollection.zip"), 'r') as zip_ref:
                zip_ref.extractall(os.path.join(path, "SMSSpamCollection"))

        # Read dataset with pandas
        dataset = pd.read_csv(os.path.join(path, "SMSSpamCollection", "SMSSpamCollection"), delimiter="\t", encoding="latin-1", header=None)
        logging.debug(f"{len(dataset)} entries read.")

        # Preprocessing
        dataset[2] = dataset[0].map({'ham': 0, 'spam': 1})
        X, y = dataset[1].values, dataset[2].values
        vect = CountVectorizer(stop_words='english')
        vect.fit(X)
        #print("Vocabulary: ", len(vect.vocabulary_))
        X = vect.transform(X).toarray()
        #print(dataset)
        #print(vect)
        # Train-test-split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        return X_train, X_test, y_train, y_test

    else:
        raise ValueError

In [11]:
#testing the above dataset and the diamention of the data set

X_train, X_test, y_train, y_test = download_and_prepare("sms_spam", "D:/SEM 2/DPR/Programming/")

In [13]:
import numpy as np
X_train, X_test, y_train, y_test = download_and_prepare("sms_spam", "D:/SEM 2/DPR/Programming/")
alpha = 0.01 #smoothing factor to avoid dividing by zero
n_alpha = 10
# if alpha is chosen to be 1 then it is called as laplace smoothing
# multiply the alpha with n to compensate the addition that has happened in in the numerator  
classess = np.unique(y_train)
priorir = np.ones(len(classess))
#calculation of priori 
for i in range(len(classess)):
    priorir[i] = len(y_train[y_train==classess[i]])/len(y_train)

#calculation of likelihood




P_t_H = X_train[y_train==0]
P_t_S = X_train[y_train==1]
P_t_H = (np.sum(P_t_H,axis = 0)+0.01)/len(y_train[y_train==0])
P_t_S = (np.sum(P_t_S,axis = 0)+0.01)/len(y_train[y_train==1])


P_t_H = np.log(P_t_H)
P_t_S = np.log(P_t_S)
#trying out the priori
print(priorir)

[0.86717523 0.13282477]


In [59]:
#defineing the classifier type and implementing the naive bayes method for prediction 

from absl import logging
import time
import matplotlib.pyplot as plt
import numpy as np

class SpamClassifier(object):
    """
    Spam classifier using (multinomial) Naive Bayes

    Parameters:
        alpha (float): Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
    """
    def __init__(self, alpha=1.0):
        super(SpamClassifier, self).__init__()
        self.alpha = alpha

    def train(self, X, y):
        """
        Training method

        Estimates the log-likelihoods and priors for both classes ham and spam.

        Parameters:
            X (ndarray): Feature matrix with shape (num_samples, num_features)
            y (ndarray): Label vector with shape (num_samples,)
        """
        logging.info(f"Starting training...")
        start_time = time.time()

        n_samples, n_features = X.shape
        print(X.shape)
        self.classes = np.unique(y)
        self.n_classes = len(self.classes)

        self.priors = np.ones((self.n_classes,)) / self.n_classes # This is just a placeholder
        
        # X_train, X_test, y_train, y_test = download_and_prepare("sms_spam", "/home/dearadhp/")
        alpha = 0.01 #smoothing factor to avoid dividing by zero
        n_alpha = self.n_classes
        # if alpha is chosen to be 1 then it is called as laplace smoothing
        # multiply the alpha with n to compensate the addition that has happened in in the numerator  
        X_train = X
        y_train = y
        #calculation of priori 
        for i in range(self.n_classes):
            self.priors[i] = len(y_train[y_train==self.classes[i]])/len(y_train)
        print(self.priors)

        
        # TODO: Estimate priors
        self.log_priors = np.log(self.priors)

        self.log_probs = np.zeros((self.n_classes, n_features))
        temp = np.zeros((1, n_features))
        print(self.log_probs.shape)
        for i in range(self.n_classes):
            #calculation of likelihood
            #calculation of log likelihood  
            P_t_H = X_train[y_train==self.classes[i]]
            P_t_H = (np.sum(P_t_H,axis = 0)+alpha)/(len(y_train[y_train==self.classes[i]])+n_alpha)
            P_t_H = np.log(P_t_H)
            temp = np.vstack((temp,P_t_H))
        self.log_probs = temp[(1,self.n_classes),:]
            #print(np.unique(P_t_H))
        #print(np.unique(self.log_probs))
        
        
        
        # TODO: Estimate log-likelihoods

        logging.debug(f"Training took {int(time.time() - start_time)} seconds.")


    def predict(self, X):
        """
        Prediction method

        Uses Bayes rule to compute un-normalized posteriors
        
        

        Parameters:
            X (ndarray): Feature matrix with shape (num_samples, num_features)

        Returns:
            (ndarray): Prediction vector with shape (num_samples,)
        """
        # TODO: Implement MAP decision for multinomial Naive Bayes
        pri = np.matmul(self.log_probs, np.transpose(X))
        esti = pri.max(axis = 0)
        """for i in range(len(esti)):
            print(pri[:,i])
            print(esti[i]) """
        
        esti_class = np.argmax(pri, axis=0)
        '''for i in range(len(esti_class)):
            print(esti_class[i])
        print(len(esti_class))
        #print()'''
        #np.zeros(X.shape[0]) # This is just a placeholder in the return
        return esti_class
        





In [60]:
#loading the data and applying the classifier
X_train, X_test, y_train, y_test = download_and_prepare("sms_spam", "D:/SEM 2/DPR/Programming/")

spam_classifier = SpamClassifier()
spam_classifier.train(X_train, y_train)
spam_classifier.predict(X_test)

INFO:absl:Preparing dataset sms_spam...
INFO:absl:Dataset sms_spam already extracted.
DEBUG:absl:5572 entries read.
INFO:absl:Starting training...
DEBUG:absl:Training took 0 seconds.


(4457, 8480)
[0.86717523 0.13282477]
(2, 8480)


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [61]:
for i in range(len(y_test)):
            print(y_test[i])

0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
1
0
0
0
1
0
0
0
0
0
0
1
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
1
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0
1
0
0
1
0
0
1
1
1
0
0
0
0
0
0
1
0
1
1
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
1
0
1
0
1
1
0
0
0
0
1
0
0
1
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
1
0
1
0
0
0
0
0
0
0
0
0
0
1
0
0
1
0
0
0
0
1
0
0
0
0
0
0
0
1
0
1
1
0
0
0
0
1
0
0
0
0
0
0
1
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
1
0
1
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
1
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
1
0
0
0
1
0
1
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0
0
0
0
0


In [62]:
# %load main.py
from absl import logging

from sklearn.metrics import accuracy_score

from datasets import download_and_prepare
from classifier import SpamClassifier


def main():
    X_train, X_test, y_train, y_test = download_and_prepare("sms_spam", "D:/SEM 2/DPR/Programming/")

    spam_classifier = SpamClassifier()
    spam_classifier.train(X_train, y_train)

    logging.info(f"Train Accuracy: {accuracy_score(y_train, spam_classifier.predict(X_train))}")
    logging.info(f"Test Accuracy: {accuracy_score(y_test, spam_classifier.predict(X_test))}")
    

if __name__ == "__main__":
    logging.set_verbosity(logging.DEBUG)
    main()


INFO:absl:Preparing dataset sms_spam...
INFO:absl:Dataset sms_spam already extracted.
DEBUG:absl:5572 entries read.
INFO:absl:Starting training...
DEBUG:absl:Training took 0 seconds.
INFO:absl:Train Accuracy: 0.8658290329818263
INFO:absl:Test Accuracy: 0.8663677130044843
