**Spam Filtering using Multinomial Naive Bayes**

For spam filtering, we use another variation of Naive Bayes i.e. Multinomial Naive Bayes Clssifier where the likelihoods are computed as-

$$
P(w_i|\lambda) = \dfrac{N_{w_i|\lambda} + \alpha}{N_{\lambda} + \alpha N_{vocabulary}}
$$
where $w_i$ is the word whose likelihood is to be calculated and $\lambda$ represents a class.




In [3]:
# Multinomial Naive Bayes Model
class MyMultinomialNB:

    def __init__(self, filepath):
        np.random.seed(1234)
        self.data = pd.read_csv(filepath, sep='\t', header=None, names=['labels', 'sms'])   # read data
        print(self.data.shape)
        print(self.data.head(), end='\n\n')
        self.class_probabs = dict()
        self.vocab = None
        self.word_count = None
        self.num_spam = 0
        self.num_ham = 0
        self.vocab_size = 0
        self.alpha = 1

    def preprocess(self, split=0.8):
        self.data['sms'] = self.data['sms'].str.replace('\W+', ' ')   # replace non-word characters with a space
        self.data['sms'] = self.data['sms'].str.replace('\s+', ' ')   # relace multiple spaces with single space
        self.data['sms'] = self.data['sms'].str.strip()   # remove leading and trailing whitespaces
        self.data['sms'] = self.data['sms'].str.lower()   # make all characters lower case
        self.data['sms'] = self.data['sms'].str.split()   # break words on spaces to form a list
        # print(self.data.head())

        indices = np.arange(self.data.shape[0])
        np.random.shuffle(indices)
        self.train_data, self.test_data = self.data.iloc[indices[:int(split * len(indices))], :].reset_index(drop=True), self.data.iloc[indices[int(split * len(indices)):], :].reset_index(drop=True)  # split train and test data
        
        print(f'Training data: {self.train_data.shape[0]}')
        print(f'Test data: {self.test_data.shape[0]}')

        # split message and labels
        self.X_train, self.y_train = self.train_data.iloc[:, 1], self.train_data.iloc[:, 0] 
        self.X_test, self.y_test = self.test_data.iloc[:, 1], self.test_data.iloc[:, 0]

        return self.X_train, self.y_train, self.X_test, self.y_test
    
    def fit(self):
        self.class_probabs = self.y_train.value_counts(normalize=True)  # calculate class prior probabilities
        print(f'Class Probabilities:\n{self.class_probabs}')
        self.vocab = list(set(self.X_train.sum()))  # create vocabulary
        self.vocab_size = len(self.vocab)
        print(f'Vocabulary length: {self.vocab_size}')
        self.word_count = pd.DataFrame([[self.X_train.iloc[i].count(word) for word in self.vocab] for i in range(self.X_train.shape[0])], columns=self.vocab)   # count the number of times each word has appeare in a message
        self.X_train = pd.concat([self.X_train, self.word_count], axis=1).iloc[:, 1:]   # update train features
        # print(self.X_train.head())
    
    # calculate likelihood of a word for spam class
    def p_w_spam(self, word):
        if word in self.vocab:
            return (self.X_train.loc[self.y_train == 'spam', word].sum() + self.alpha) / ((self.y_train == 'spam').sum() + self.alpha * self.vocab_size)
        else:
            return 1

    # calculate likelihood of a word for ham class
    def p_w_ham(self, word):
        if word in self.vocab:
            return (self.X_train.loc[self.y_train == 'ham', word].sum() + self.alpha) / ((self.y_train == 'ham').sum() + self.alpha * self.vocab_size)
        else:
            return 1
    
    def predict(self, text):
        p_spam = self.class_probabs['spam']
        p_ham = self.class_probabs['ham']
        for word in text:
            p_spam *= self.p_w_spam(word)   # compute probability of message belonging to spam
            p_ham *= self.p_w_ham(word) # compute probability of message belonging to ham

        # predict
        if p_spam > p_ham:
            return 'spam'
        elif p_ham > p_spam:
            return 'ham'
        else:
            return 'can not classify'
        
    def test(self, test_data=None):
        # test
        predictions = []
        if test_data == None:
            for i in range(self.X_test.shape[0]):
                predictions.append(self.predict(self.X_test.iloc[i]))
            print(f'Accuracy = {(self.y_test == np.array(predictions)).sum()/len(predictions)}')
    
    def get_params(self):
        #return parameters
        return self.class_probabs, self.vocab, self.X_train, self.y_train

Processing data and fitting model¶


In [4]:
mnb = MyMultinomialNB('/content/SMSSpamCollection')
X_train, y_train, X_test, y_test = mnb.preprocess()

(5572, 2)
  labels                                                sms
0    ham  Go until jurong point, crazy.. Available only ...
1    ham                      Ok lar... Joking wif u oni...
2   spam  Free entry in 2 a wkly comp to win FA Cup fina...
3    ham  U dun say so early hor... U c already then say...
4    ham  Nah I don't think he goes to usf, he lives aro...

Training data: 4457
Test data: 1115


In [5]:
mnb.fit()

Class Probabilities:
ham     0.869419
spam    0.130581
Name: labels, dtype: float64
Vocabulary length: 7823


In [6]:
mnb.test()

Accuracy = 0.9632286995515695
