### Naive Bayes (Using Multinomial Event Model)

In [303]:
import json
import pandas as pd
import pickle
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
#from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.metrics import f1_score

In [249]:
np.random.seed(0)

#### LOAD AND PRE-PROCESS DATA

In [250]:
def read_data(train_file_name, test_file_name):    
    train_data = pd.read_json(train_file_name, lines=True)
    test_data = pd.read_json(test_file_name, lines=True)

    #final data
    #TRAIN DATA
    x_train = train_data['reviewText']
    y_train = train_data['overall']
    #TEST DATA
    x_test = test_data['reviewText']
    y_test = test_data['overall']
    
    return x_train, y_train, x_test, y_test

#### CLEAN DATA

In [251]:
def remove_punctution(tokens):
    words = []
    for word in tokens:
            if word.isalpha():
                words.append(word.lower()) #to not treat uppercase words differently
    return words

In [304]:
def clean_data(tokens):
    words = []
    stop_words = stopwords.words('english')
    #print(stop_words)
    lemmatizer = WordNetLemmatizer()
    #porter = PorterStemmer()
    for word in tokens:
            if word not in stop_words: #removing stop words
                words.append(lemmatizer.lemmatize(word))
                #words.append(porter.stem(word))    
                                        
    # Stemming refers to the process of reducing each word to its root or base.
    # I will be doing lemmatization rather than stemming, because lemmatization of words is based on linguistics and words are more meaningful.
    return words

#### PREPARE REQUIRED DICTIONARIES

In [253]:
def dictionary_prepare(x_train, y_train, saved = True, clean=False, part='1_a'):
    if(saved):
        #load dict from pickle file
        f1 = open('class_words_dict_'+part+'.pickle', 'rb')
        class_words_dict = pickle.load(f1)
        f1.close()

        #load vocab from pickle file
        f2 = open('vocabulary_'+part+'.pickle', 'rb')
        vocab = pickle.load(f2)
        f2.close()
        
        #load vocab from pickle file
        f3 = open('class_num_words_dict_'+part+'.pickle', 'rb')
        class_num_words_dict = pickle.load(f3)
        f3.close()
        
        return vocab, class_words_dict, class_num_words_dict
    
    #set of all disctinct words in the training data
    vocab = set()
    #number of examples in training data
    m = len(y_train)
    #making dictionary of words per class: key=class, val=dict(key=word, val=frequency)
    class_words_dict = {}
    #total words in class key: key=class, val=sum of total number of words in all examples of class key
    class_num_words_dict = {}
    
    for i in range(m):
        doc = x_train[i]
        cls = y_train[i] #class
        
        #split doc into list of individual words such that punctuations are kept separate from word
        tokens = word_tokenize(doc)
        #removing punctuations to get final list of words
        tokens = remove_punctution(tokens)
        #further do stemming, removing stopwords etc. for part (d)
        if(clean):
            tokens = clean_data(tokens)
        #calculating total number of words of class cls
        if(cls in class_num_words_dict.keys()):
            class_num_words_dict[cls] += len(tokens)
        else:
            class_num_words_dict[cls] = len(tokens)
            
        for word in tokens:
            vocab.add(word)
            if(cls in class_words_dict.keys()):
                if(word in class_words_dict[cls].keys()):
                    class_words_dict[cls][word] += 1 #if word is present, increase frequency by 1
                else: #make frequency of word 1 since word encountered for the first time for class 'cls'
                    class_words_dict[cls][word] = 1             
            else: #class is encountered for the first time
                class_words_dict[cls] = {} #initialize dictionary at class 'cls' as key
                #since dictionary is newly initialized, word can't possibly exist in it, therefore no need to check
                #set frequency to 1
                class_words_dict[cls][word] = 1
    #save dict to pickle file
    fp = open('class_words_dict_'+part+'.pickle', 'wb')
    pickle.dump(class_words_dict, fp)
    fp.close()
    
    #save vocab to pickle file
    f = open('vocabulary_'+part+'.pickle', 'wb')
    pickle.dump(vocab, f)
    f.close()
    
    #save class_num_words_dict to pickle file
    nf = open('class_num_words_dict_'+part+'.pickle', 'wb')
    pickle.dump(class_num_words_dict, nf)
    nf.close()
    
    return vocab, class_words_dict, class_num_words_dict


#### TRAIN

In [254]:
def train(x_train, y_train, vocab, class_words_dict, class_num_words_dict, alpha=1.0, saved=True, part='1_a'):
    if(saved):#if parameters have already been saved, load them
        #load dict from pickle file
        f4 = open('phi_'+ part +'.pickle', 'rb')
        phi = pickle.load(f4)
        f4.close()

        #load vocab from pickle file
        f5 = open('word_probs_per_class_' + part + '.pickle', 'rb')
        word_probs_per_class = pickle.load(f5)
        f5.close()
        
        return phi, word_probs_per_class
        
    # number of examples in training data
    m = len(y_train)
    
    #parameters
    phi = {} #key:class, val:prob of class = num of examples with class key/total number of examples
    word_probs_per_class = {} #key:class, val:dict(key=word, val=prob of word occuring in class key) 
    
    #calculating phi params
    for i in range(m):
        if(y_train[i] in phi.keys()):
            phi[y_train[i]] += 1 #increase frequency of class
        else:
            phi[y_train[i]] = 1
    # change frequency to probability
    for key in phi.keys():
        phi[key] /= m
        phi[key] = np.log(phi[key]) #taking log to prevent underflow
        
    # calculating word_probs_per_class
    mod_v = len(vocab)
    
    for cls in class_num_words_dict.keys():
        total_words = class_num_words_dict[cls]
        words_freqs = class_words_dict[cls]
        word_probs_per_class[cls] = {} 
        for word in vocab:  # also use laplace smoothing with alpha hyperparameter
            numerator = alpha  
            if(word in words_freqs.keys()): #word occured in class cls
                numerator += words_freqs[word]
            denominator = (mod_v*alpha) + total_words
            prob_word = numerator/denominator
            #update probability in word_probs_per_class
            word_probs_per_class[cls][word] = np.log(prob_word)#taking log to prevent underflow
    
    #save parameters
    f6 = open('phi_'+ part + '.pickle', 'wb')
    pickle.dump(phi, f6)
    f6.close()
    
    #save vocab to pickle file
    f7 = open('word_probs_per_class_' + part + '.pickle', 'wb')
    pickle.dump(word_probs_per_class, f7)
    f7.close()
    
    return phi, word_probs_per_class
    

#### TEST

In [296]:
def predict(x_test, y_test, phi, word_probs_per_class, mod_v, clean=False):
    m = len(y_test)
    preds = []
    num_classes = len(phi.keys())
    classes = list(phi.keys())
    classes.sort()
    
    for i in range(m):
        doc = x_test[i]
        #split doc into list of individual words such that punctuations are kept separate from word
        tokens = word_tokenize(doc)
        #removing punctuations to get final list of words
        tokens = remove_punctution(tokens)
        #removing stop words and doing lemmatization of tokens
        if(clean):
            tokens = clean_data(tokens)
            
        #class_log_probs = sum_log_feature_prob + log_class_prior
        class_log_probs = np.zeros(num_classes)
        for cls in phi.keys():    
            log_class_prior = phi[cls] #values already stored in log form
            sum_log_feature_prob = 0 #np.sum(word_probs_per_class[cls])
            for word in tokens:
                if(word in word_probs_per_class[cls].keys()):
                    sum_log_feature_prob += word_probs_per_class[cls][word]
                
            class_log_probs[cls-1] = sum_log_feature_prob + log_class_prior
#         print(class_log_probs)  
        final_pred = np.argmax(class_log_probs)+1 #due to zero based indexing adding +1
        preds.append(final_pred)
         
    return preds
    

#### BASELINES

In [256]:
def random_baseline(x_test, y_test, phi):
    preds = np.random.randint(1, 6, len(y_test)) #upper limit exclusive
    return preds

In [257]:
def majority_baseline(x_test, y_test, phi):
    probs = []
    for key in phi.keys():
        probs.append(phi[key])
    majority_pred = np.argmax(np.array(probs))+1
    preds = np.full(len(y_test), majority_pred)
    return preds

#### UTILITY FUNCTIONS

In [258]:
def accuracy(pred, y):
    m = len(y)
    acc = 0
    for i in range(m):
        if(pred[i] == y[i]):
            acc+=1
    acc /= m
    return acc

In [259]:
def plot_confusion_matrix(preds, y_test):
    y_test = list(y_test)
    conf_mat = confusion_matrix(y_test, preds)
    df = pd.DataFrame(conf_mat,
                     index = [1, 2, 3, 4, 5], 
                     columns = [1, 2, 3, 4, 5])
    #Plotting the confusion matrix
    plt.figure(figsize=(8,6))
    sns.heatmap(df, annot=True)
    plt.title('Confusion Matrix')
    plt.ylabel('Actual Labels')
    plt.xlabel('Predicted Labels')
    plt.show()

#### MAIN

In [297]:
def main():
    ### read data
    train_file_name = './reviews_Digital_Music_5.json/Music_Review_train.json'
    test_file_name = './reviews_Digital_Music_5.json/Music_Review_test.json'
    x_train, y_train, x_test, y_test = read_data(train_file_name, test_file_name)
    
    vocab, class_words_dict, class_num_words_dict = dictionary_prepare(x_train, y_train, saved=True)
    mod_v = len(vocab)
    print("Length of vocabulary on original text data (without punctuations) : ", mod_v)
    ### Part (a) change saved=True, after calculating params on train data once, to skip recomputing again
    phi,  word_probs_per_class = train(x_train, y_train,  vocab, class_words_dict, class_num_words_dict, saved=True)
    
    ### Part (b)
    # random baseline
    print('\n----------------- PART (b) ---------------------------\n')
    preds = random_baseline(x_test, y_test, phi)
    test_acc = accuracy(preds, y_test)
    print("Accuracy on test data using Random baseline: ", test_acc)
    
    #majority baseline
    preds = majority_baseline(x_test, y_test, phi)
    test_acc = accuracy(preds, y_test)
    print("Accuracy on test data using Majority baseline: ", test_acc)
    
    #Multinomial Naive Bayes
    preds = predict(x_test, y_test, phi, word_probs_per_class, mod_v)
    test_acc = accuracy(preds, y_test)
    print("Accuracy on test data using Multinomial Naive Bayes: ", test_acc)
    
    ### Part (c) - draw confusion matrix of test data
    print('\n----------------- PART (c) ---------------------------\n')
    #plot_confusion_matrix(preds, y_test)
    ### Part (d) - text cleaning
    print('\n----------------- PART (d) ---------------------------\n')
    vocab, class_words_dict, class_num_words_dict = dictionary_prepare(x_train, y_train, saved=True, clean=True, part='1_d')
    mod_v = len(vocab)
    print("Length of vocabulary after cleaning text data: ", mod_v)
    #retrain
    phi,  word_probs_per_class = train(x_train, y_train,  vocab, class_words_dict, class_num_words_dict, saved=True, part='1_d')
    #test
    preds_1d = predict(x_test, y_test, phi, word_probs_per_class, mod_v, clean=True)
    test_acc = accuracy(preds_1d, y_test)
    print("Accuracy on test data using Multinomial Naive Bayes after cleaning text: ", test_acc)  
    ### Part (e)
    #print('\n----------------- PART (e) ---------------------------\n')
    ### Part (f)
    #print('\n----------------- PART (f) ---------------------------\n')
#     print("F1 score: ", f1_score(list(y_test), pred, average=None))
#     print("F1 score (macro-averaged): ", f1_score(list(y_test), pred, average='macro'))
    ### Part (g)
    #print('\n----------------- PART (g) ---------------------------\n')

In [305]:
main()

Length of vocabulary on original text data (without punctuations) :  90062

----------------- PART (b) ---------------------------

Accuracy on test data using Random baseline:  0.2035
Accuracy on test data using Majority baseline:  0.07757142857142857
Accuracy on test data using Multinomial Naive Bayes:  0.6646428571428571

----------------- PART (c) ---------------------------


----------------- PART (d) ---------------------------

Length of vocabulary after cleaning text data:  83277
Accuracy on test data using Multinomial Naive Bayes after cleaning text:  0.6607142857142857


### ROUGH

In [159]:
A = np.array([1, 2, 3, 4, 5])

In [234]:
print(np.full(3,5))

[5 5 5]


### REFERENCES

1. https://machinelearningmastery.com/clean-text-machine-learning-python/
2. https://www.analyticsvidhya.com/blog/2021/06/confusion-matrix-for-multi-class-classification/
3. https://www.geeksforgeeks.org/python-lemmatization-with-nltk/
4. https://www.researchgate.net/publication/337321725_The_Effect_of_Stemming_and_Removal_of_Stopwords_on_the_Accuracy_of_Sentiment_Analysis_on_Indonesian-language_Texts
5. https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html