In [265]:
import pandas as pd
import re
datapath = "~/Documents/datasets/ford_sentence/data.csv"
data = pd.read_csv(datapath, skip_blank_lines=True)
data = data.dropna(how="any")
size = data.shape[0]
train_data = data.loc[:int(0.6 * size)]
val_data = data.loc[int(0.6 * size):int(0.8 * size)]
test_data = data.loc[int(0.8 * size):]

In [266]:
#df = pd.Series(vocab_class['Responsibility'])
#print("P(w | Responsibility)")
#df

In [267]:
class NBC():
    def __init__(self, train_data, smoothing=True) -> None:
        self.regex = re.compile('[^a-zA-Z ]')
        self.class_tables = self.separate_by_class(train_data)
        self.priors = self.compute_priors(train_data)
        self.occurences = self.compute_occurences()

    def separate_by_class(self, train_data):
        class_tables = {}

        for type in train_data.Type:
            if type not in class_tables:
                class_tables[type] = train_data.loc[train_data.Type == type]
        
        return class_tables

    def compute_priors(self, train_data):
        priors = {}

        for class_type, class_data in self.class_tables.items():
            priors[class_type] = class_data.shape[0] / train_data.shape[0]
        
        return priors
    
    def compute_occurences(self):
        vocab_class = {}

        for class_type, class_data in self.class_tables.items():
            vocab_class[class_type] = {}
            for sentence in class_data.New_Sentence:
                sentence = sentence.lower()
                sentence = self.regex.sub(' ', sentence)
                for word in sentence.split():
                    if word in vocab_class[class_type]:
                        vocab_class[class_type][word] += 1
                    else:
                        vocab_class[class_type][word] = 1
            omitted_vocab = {key : 0 for key, val in vocab_class[class_type].items() if val >= 5}
            vocab_class[class_type] = omitted_vocab

        for class_type, class_data in self.class_tables.items():
            class_size = len(class_data.New_Sentence)
            vocab = vocab_class[class_type]

            for sentence in class_data.New_Sentence:
                added = set()
                sentence = sentence.lower()
                sentence = self.regex.sub(' ', sentence)
                for word in sentence.split():
                    if word in vocab.keys() and word not in added:
                        vocab[word] += 1        
                    added.add(word)
            for word in vocab.keys():
                vocab[word] /= class_size
        
        return vocab_class

    def __call__(self, sentence):
        max_a_posteriori = (None, 0)

        for class_type, prior in self.priors.items():
            sentence = sentence.lower()
            sentence = self.regex.sub(' ', sentence)
            likelihood = prior
            for word in sentence.split():
                if word in self.occurences[class_type]:
                    likelihood *= self.occurences[class_type][word] 
                else:
                    likelihood *= 1 / len(self.occurences[class_type].keys())

#            print(f"{class_type}: {likelihood}")
            if likelihood > max_a_posteriori[1]:
                max_a_posteriori = (class_type, likelihood)
        
        return max_a_posteriori

In [269]:
model = NBC(train_data, smoothing=False)

In [270]:
def eval(model, data):
    score = 0

    for i in range(data.shape[0]):
        pred, _ = model(data.iloc[i].New_Sentence)
        score += pred == data.iloc[i].Type

    return score / data.shape[0]

In [271]:
eval(model, val_data)

0.6530788496415926