### Load data

In [532]:
import pandas as pd
import re
import math

datapath = "~/Documents/datasets/ford_sentence/data.csv"
data = pd.read_csv(datapath, skip_blank_lines=True)
data = data.dropna(how="any")

size = data.shape[0]
train_data = data.loc[:int(0.6 * size)]
val_data = data.loc[int(0.6 * size):int(0.8 * size)]
test_data = data.loc[int(0.8 * size):]

### Building Naive Bayes Classifier

In [546]:
class NBC():
    def __init__(self, train_data, smoothing=True, alpha=1) -> None:
        assert alpha > 0, "Alpha must be greater than 0"

        self.regex = re.compile('[^a-zA-Z ]')
        self.alpha = alpha
        self.smoothing = smoothing
        self.class_tables = self.separate_by_class(train_data)
        self.priors = self.compute_priors(train_data)
        self.n_classes = len(self.priors.keys())
        self.occurences = self.compute_occurences()

    def separate_by_class(self, train_data):
        class_tables = {}

        for type in train_data.Type:
            if type not in class_tables:
                class_tables[type] = train_data.loc[train_data.Type == type]
        
        return class_tables

    def compute_priors(self, train_data):
        priors = {}

        for class_type, class_data in self.class_tables.items():
            priors[class_type] = class_data.shape[0] / train_data.shape[0]
        
        return priors
    
    def compute_occurences(self):
        vocab_class = {}

        for class_type, class_data in self.class_tables.items():
            vocab_class[class_type] = {}
            for sentence in class_data.New_Sentence:
                sentence = sentence.lower()
                sentence = self.regex.sub(' ', sentence)
                for word in sentence.split():
                    if word in vocab_class[class_type]:
                        vocab_class[class_type][word] += 1
                    else:
                        vocab_class[class_type][word] = 1
            omitted_vocab = {
                key : 0 for key, val in vocab_class[class_type].items() if val >= 5
            }
            vocab_class[class_type] = omitted_vocab

        for class_type, class_data in self.class_tables.items():
            class_size = len(class_data.New_Sentence)
            vocab = vocab_class[class_type]

            for sentence in class_data.New_Sentence:
                added = set()
                sentence = sentence.lower()
                sentence = self.regex.sub(' ', sentence)
                for word in sentence.split():
                    if word in vocab.keys() and word not in added:
                        vocab[word] += 1        
                    added.add(word)
            for word in vocab.keys():
                vocab[word] += self.alpha
                vocab[word] /= (self.alpha * self.n_classes + class_size)
            vocab_class[class_type] = {
                k: v for k, v in sorted(vocab.items(), key=lambda item: item[1], reverse=True)
            } 

        return vocab_class

    def __call__(self, sentence):
        max_a_posteriori = (None, 0)

        for class_type, prior in self.priors.items():
            sentence = sentence.lower()
            sentence = self.regex.sub(' ', sentence)
            likelihood = prior
            class_size = len(self.occurences[class_type].keys())

            for word in sentence.split():
                if word in self.occurences[class_type]:
                    likelihood *= self.occurences[class_type][word] 
                elif self.smoothing:
                    likelihood *= self.alpha / (self.alpha * self.n_classes + class_size)
                else:
                    likelihood *= 0

            if likelihood > max_a_posteriori[1]:
                max_a_posteriori = (class_type, likelihood)
        
        return max_a_posteriori

### Model Evaluation

In [547]:
def eval(model, data):
    score = 0

    for i in range(data.shape[0]):
        pred, _ = model(data.iloc[i].New_Sentence)
        score += pred == data.iloc[i].Type

    return score / data.shape[0]

In [515]:
model = NBC(train_data, smoothing=False)
print(f"Accuracy: {eval(model, val_data)} ")

Accuracy: 0.3841437084376889 


In [548]:
model = NBC(train_data, smoothing=True)
print(f"Accuracy: {eval(model, val_data)} ")

Accuracy: 0.6546333880300544 


In [549]:
def class_perf(model, data):
    class_performance = {}
    for class_type in model.priors.keys():
        class_performance[class_type] = eval(model, val_data[val_data.Type == class_type])

    df_perf = pd.Series(class_performance)
    print("Accuracy by class:")
    print(df_perf)

In [550]:
class_perf(model, val_data)

Accuracy by class:
Responsibility    0.882784
Requirement       0.320044
Skill             0.383171
SoftSkill         0.642973
Education         0.860045
Experience        0.898190
dtype: float64


In [551]:
top_words = {}
for class_type, class_occ in model.occurences.items():
    top_words[class_type] = [*class_occ.keys()][:10]

top_class_words = pd.DataFrame(top_words)
top_class_words.index += 1
top_class_words

Unnamed: 0,Responsibility,Requirement,Skill,SoftSkill,Education,Experience
1,and,and,and,and,in,experience
2,to,to,experience,to,degree,years
3,the,in,in,skills,or,of
4,of,experience,of,ability,engineering,in
5,with,of,with,with,bachelors,and
6,for,with,knowledge,in,science,or
7,in,ability,a,a,computer,with
8,a,a,to,communication,a,a
9,business,the,or,strong,and,minimum
10,on,or,management,of,related,management


In [562]:
import copy

model_copy = copy.deepcopy(model)
req = model_copy.occurences["Requirement"]

ignore = ["and", "with", "in", "of", "or", "experience"]
for word in ignore:
    req[word] = 1

In [563]:
print(f"Accuracy: {eval(model_copy, val_data)} ")
class_perf(model_copy, val_data)

Accuracy: 0.6897832282580534 
Accuracy by class:
Responsibility    0.856810
Requirement       0.607169
Skill             0.311044
SoftSkill         0.572429
Education         0.838600
Experience        0.867647
dtype: float64


In [564]:
print(f"Accuracy: {eval(model, test_data)} ")
print(f"Accuracy: {eval(model_copy, test_data)} ")

Accuracy: 0.650967996839194 
Accuracy: 0.6809956538917424 
