William Sigala
id: 1001730022

In [1]:
import pandas as pd
import re

In [2]:
class NaiveBayesClassifier():
    def __init__(self, train_data) -> None:
        self.alpha = 1
        self.re= re.compile('[^a-zA-Z ]')
        self.class_tables = self.separate_by_class(train_data)
        self.priors = self.calc_priors(train_data)
        self.n_classes = len(self.priors.keys())
        self.con_probs = self.calc_probs()

    def separate_by_class(self, train_data):
        class_tables = {}

        for label in train_data.label:
            if label not in class_tables:
                class_tables[label] = train_data.loc[train_data.label == label]
        
        return class_tables

    def calc_priors(self, train_data):
        priors = {}

        for label, class_data in self.class_tables.items():
            priors[label] = class_data.shape[0] / train_data.shape[0]
        
        return priors
    
    def calc_probs(self):
        vocab_class = {}

        for label, class_data in self.class_tables.items():
            vocab_class[label] = {}
            for sentence in class_data.review:
                sentence = sentence.lower()
                sentence = self.re.sub(' ', sentence)
                for word in sentence.split():
                    if word in vocab_class[label]:
                        vocab_class[label][word] += 1
                    else:
                        vocab_class[label][word] = 1
            omitted_vocab = {
                key : 0 for key, val in vocab_class[label].items() if val >= 5
            }
            vocab_class[label] = omitted_vocab

        for label, class_data in self.class_tables.items():
            class_size = len(class_data.review)
            vocab = vocab_class[label]

            for sentence in class_data.review:
                added = set()
                sentence = sentence.lower()
                sentence = self.re.sub(' ', sentence)
                for word in sentence.split():
                    if word in vocab.keys() and word not in added:
                        vocab[word] += 1        
                    added.add(word)
            for word in vocab.keys():
                vocab[word] += self.alpha
                vocab[word] /= (self.alpha * self.n_classes + class_size)
            vocab_class[label] = {
                k: v for k, v in sorted(
                    vocab.items(), key=lambda item: item[1], reverse=True
                )
            } 

        return vocab_class

    def __call__(self, sentence):
        max_posterior = (None, 0)

        for label, prior in self.priors.items():
            sentence = sentence.lower()
            sentence = self.re.sub(' ', sentence)
            likelihood = prior
            class_size = len(self.con_probs[label].keys())

            for w in sentence.split():
                if w in self.con_probs[label]:
                    likelihood *= self.con_probs[label][w] 
                else:
                    likelihood *= self.alpha / \
                    (self.alpha * self.n_classes + class_size)

            if likelihood > max_posterior[1]:
                max_posterior = (label, likelihood)
        
        return max_posterior


In [3]:
datapath = "customerReviews.txt"

def parse_data(path):
    data = []
    with open(path, "r") as f:
        for line in f.readlines():
            sep_idx = line.rfind(",")
            data.append([line[:sep_idx], line[sep_idx + 1:].strip()])

    return data

In [4]:
#preprocessing
data = parse_data(datapath)
df=pd.DataFrame(data, columns=['review', 'label'])
df = df.dropna(how="any")

size = df.shape[0]
train_data = df.loc[:int(0.7 * size)]
test_data = df.loc[int(0.7 * size):]

In [5]:
def evaluate_model(model, data):
    score = 0

    for i in range(data.shape[0]):
        pred, _ = model(data.iloc[i].review)
        score += pred == data.iloc[i].label

    return score / data.shape[0]

ignore = ["the", "is", "it"]
model = NaiveBayesClassifier(train_data)
for k in ["positive", "negative"]:
    for word in ignore:
        model.con_probs[k][word] = 1

print(f"Train Accuracy: {evaluate_model(model, train_data)} ")
print(f"Test Accuracy: {evaluate_model(model, test_data)} ")

Train Accuracy: 0.8585858585858586 
Test Accuracy: 0.813953488372093 


In [6]:
print(model("I had a terrible experience with this company"))
print(model("This is a great company with excellent customer service"))
print(model("I was really disappointed with this product"))
print(model("The service is too expensive for what it offers"))

('positive', 1.6301554284060483e-07)
('positive', 1.1923422562055663e-07)
('positive', 6.706925191156312e-06)
('positive', 1.4942322634630333e-07)
