In [12]:
# Importing common packages

import os, sys, random
import csv, sklearn
import numpy as np

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix

Loading the train, val and test datasets

In [13]:
def _read_lines(path):
    """Read a text file and return its lines without the trailing newline.

    Parameters
    ----------
    path : str
        Location of the file to read.

    Returns
    -------
    list of str
        One entry per line of the file, newline terminator removed.
    """
    # The context manager closes the handle as soon as reading is done,
    # rather than leaving an open file behind for the garbage collector.
    with open(path) as handle:
        return [line.split('\n')[0] for line in handle]


train_data = _read_lines("../data/train_data.csv")
val_data = _read_lines("../data/val_data.csv")
test_data = _read_lines("../data/test_data.csv")

Separating label and list of words from the combined string

In [14]:
def get_string_from_list_words(word_list):
    """Concatenate words into one string, each word followed by a single space.

    The result therefore ends with a trailing space whenever ``word_list`` is
    non-empty, and is the empty string when it is empty.
    """
    return "".join(word + " " for word in word_list)

# Each row is a comma-separated string: the first field is the label and the
# remaining fields are the words of the message.
train_fields = [row.split(',') for row in train_data]
val_fields = [row.split(',') for row in val_data]
test_fields = [row.split(',') for row in test_data]

# Features: the words re-joined into one space-separated string per row.
x_train = [get_string_from_list_words(fields[1:]) for fields in train_fields]
x_val = [get_string_from_list_words(fields[1:]) for fields in val_fields]
x_test = [get_string_from_list_words(fields[1:]) for fields in test_fields]

# Targets: the leading field of every row.
y_train = [fields[0] for fields in train_fields]
y_val = [fields[0] for fields in val_fields]
y_test = [fields[0] for fields in test_fields]


Converting the data to bag-of-words to be used in 3 models, viz. Multinomial Naive Bayes, Logistic Regression, Semi-supervised Clustering

Defining the Vectorizer

In [15]:
# Count-based bag-of-words vectorizer with default settings.
word_vec = CountVectorizer()
# Fit on the training texts only, so the vocabulary is learned from the
# training split; the validation and test texts are not seen here.
word_vec.fit(x_train)

Creating a Bag-of-words corpus

In [16]:
# Vocabulary learned by the vectorizer when it was fitted on the training texts.
train_vocab = word_vec.vocabulary_

# Encode every split with the same fitted vectorizer so that all three
# matrices share one column layout.
x_train_vec, x_val_vec, x_test_vec = [
    word_vec.transform(documents) for documents in (x_train, x_val, x_test)
]

# Sanity check: vocabulary size and the shape of each count matrix.
(len(train_vocab), x_train_vec.shape, x_val_vec.shape, x_test_vec.shape)

(5835, (3901, 5835), (836, 5835), (837, 5835))

Converting Bag-of-Words corpus to tf-idf corpus

In [17]:
# Fit the tf-idf weighting on the training counts only.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(x_train_vec)

# Re-weight every split with the same fitted transformer.
x_train_tf, x_val_tf, x_test_tf = [
    tfidf_transformer.transform(counts)
    for counts in (x_train_vec, x_val_vec, x_test_vec)
]

# Sanity check: the shapes must match those of the count matrices.
(x_train_tf.shape, x_val_tf.shape, x_test_tf.shape)

((3901, 5835), (836, 5835), (837, 5835))

### Model 1: Multinomial Naive Bayes

In [19]:
# Sweep the smoothing parameter alpha over 0.25, 0.50, ..., 2.00 and report
# validation metrics for each setting. Every report is preceded by the alpha
# that produced it, so the scores can be attributed to a specific value when
# choosing the final model.
for alpha in np.arange(0.25, 2.25, 0.25):
    multi_nb_spam_model = MultinomialNB(alpha=alpha).fit(x_train_tf, y_train)
    y_pred = multi_nb_spam_model.predict(x_val_tf)
    print("alpha = {:.2f}".format(alpha))
    print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       736
        spam       0.99      0.89      0.94       100

    accuracy                           0.99       836
   macro avg       0.99      0.94      0.96       836
weighted avg       0.99      0.99      0.99       836

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       736
        spam       1.00      0.85      0.92       100

    accuracy                           0.98       836
   macro avg       0.99      0.93      0.95       836
weighted avg       0.98      0.98      0.98       836

              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       736
        spam       1.00      0.80      0.89       100

    accuracy                           0.98       836
   macro avg       0.99      0.90      0.94       836
weighted avg       0.98      0.98      0.97       836

              preci

#### Comparing the validation scores of the models trained with different alpha values, alpha = 0.25 gives the best results.

In [21]:
# Refit with the alpha selected from the validation sweep, then report the
# metrics on the held-out test split.
best_alpha = 0.25
multi_nb_spam_model = MultinomialNB(alpha=best_alpha)
multi_nb_spam_model.fit(x_train_tf, y_train)

y_test_pred = multi_nb_spam_model.predict(x_test_tf)
test_report = classification_report(y_test, y_test_pred)
print(test_report)

              precision    recall  f1-score   support

         ham       0.98      0.99      0.99       720
        spam       0.95      0.88      0.92       117

    accuracy                           0.98       837
   macro avg       0.97      0.94      0.95       837
weighted avg       0.98      0.98      0.98       837

