In [125]:
import pandas as pd

datapath = "~/Documents/datasets/ford_sentence/"
train_data = pd.read_csv(datapath + "train_data.csv", skip_blank_lines=True)
#train_data = train_data[train_data.New_Sentence.notna()]
train_data = train_data.dropna(how="any")

num_sentences = len(train_data.iloc[:])

### Split by class

In [109]:
class_tables = {}

for type in train_data.Type:
    if type not in class_tables:
        class_tables[type] = train_data.loc[train_data.Type == type]

### Tracking number of occurences of each word for every class

In [105]:
import re
vocab_class = {}
regex = re.compile('[^a-zA-Z ]')

for class_type, class_data in class_tables.items():
    vocab_class[class_type] = {}
    for sentence in class_data.New_Sentence:
        sentence = sentence.lower()
        sentence = regex.sub(' ', sentence)
        for word in sentence.split():
            if word in vocab_class[class_type]:
                vocab_class[class_type][word] += 1
            else:
                vocab_class[class_type][word] = 1
    omitted_vocab = {key : 0 for key, val in vocab_class[class_type].items() if val >= 5}
    vocab_class[class_type] = omitted_vocab

In [106]:
for class_type, class_data in class_tables.items():
    class_size = len(class_data.New_Sentence)
    vocab = vocab_class[class_type]

    for sentence in class_data.New_Sentence:
        added = set()
        sentence = sentence.lower()
        sentence = regex.sub(' ', sentence)
        for word in sentence.split():
            if word in vocab.keys() and word not in added:
                vocab[word] += 1        
            added.add(word)
    for word in vocab.keys():
        vocab[word] /= class_size

In [107]:
df = pd.Series(vocab_class['Responsibility'])
print("P(w | Responsibility)")
df

P(w | Responsibility)


and               0.696926
or                0.050534
review            0.021892
architecture      0.012453
design            0.059514
                    ...   
categorization    0.000393
approximately     0.000328
rs                0.000262
hedges            0.000131
consignment       0.000066
Length: 3427, dtype: float64

In [133]:
class NBC():
    def __init__(self, train_data) -> None:
        self.regex = re.compile('[^a-zA-Z ]')
        self.class_tables = self.separate_by_class(train_data)
        self.priors = self.compute_priors(train_data)
        self.occurences = self.compute_likelihoods()

    def separate_by_class(self, train_data):
        class_tables = {}

        for type in train_data.Type:
            if type not in class_tables:
                class_tables[type] = train_data.loc[train_data.Type == type]
        
        return class_tables

    def compute_priors(self, train_data):
        priors = {}

        for class_type, class_data in self.class_tables.items():
            priors[class_type] = len(class_data[:]) / len(train_data[:])
        
        return priors
    
    def compute_likelihoods(self):
        vocab_class = {}

        for class_type, class_data in self.class_tables.items():
            vocab_class[class_type] = {}
            for sentence in class_data.New_Sentence:
                sentence = sentence.lower()
                sentence = self.regex.sub(' ', sentence)
                for word in sentence.split():
                    if word in vocab_class[class_type]:
                        vocab_class[class_type][word] += 1
                    else:
                        vocab_class[class_type][word] = 1
            omitted_vocab = {key : 0 for key, val in vocab_class[class_type].items() if val >= 5}
            vocab_class[class_type] = omitted_vocab

        for class_type, class_data in self.class_tables.items():
            class_size = len(class_data.New_Sentence)
            vocab = vocab_class[class_type]

            for sentence in class_data.New_Sentence:
                added = set()
                sentence = sentence.lower()
                sentence = self.regex.sub(' ', sentence)
                for word in sentence.split():
                    if word in vocab.keys() and word not in added:
                        vocab[word] += 1        
                    added.add(word)
            for word in vocab.keys():
                vocab[word] /= class_size
        
        return vocab_class

    def __call__(self, sentence):
        max_a_posteriori = (None, 0)

        for class_type, prior in self.priors.items():
            sentence = sentence.lower()
            sentence = self.regex.sub(' ', sentence)
            likelihood = prior
            for word in sentence.split():
                if word in self.occurences[class_type]:
                    likelihood *= self.occurences[class_type][word] 

            print(f"{class_type}: {likelihood}")
            if likelihood > max_a_posteriori[1]:
                max_a_posteriori = (class_type, likelihood)
        
        return max_a_posteriori

In [138]:
model = NBC(train_data)
print(model.priors)
model("Productivity solutions include a mix of products, from rugged mobile computers, voice enabled softwa...")

{'Responsibility': 0.2585844547642453, 'Requirement': 0.23521236568251924, 'Skill': 0.11580963357174333, 'SoftSkill': 0.1595878105826921, 'Education': 0.07694654418494289, 'Experience': 0.15385919121385716}
Responsibility: 8.668542287165756e-27
Requirement: 3.6976258720270046e-26
Skill: 8.387046472103041e-29
SoftSkill: 2.911311176851286e-22
Education: 1.442102453747357e-17
Experience: 3.841821441254866e-28


('Education', 1.442102453747357e-17)