In [40]:
import csv
import math
import os
from collections import Counter

### Data Class

In [41]:
class Data:
    def __init__(self, id: int, class_: str | None, abstract: str) -> None:
        self.id: int = id
        self.class_: str = class_
        self.abstract: str = abstract

    def get_type(self) -> str:
        '''Returns: "test" | "train"'''
        return "train" if self.class_ else "test"

### DataModel Class

In [42]:
class DataModel:
    """In-memory list of ``Data`` records loaded from a CSV file.

    The first line of the file is treated as a header and skipped. Every
    following non-blank row must have either three fields
    (id, class, abstract) or two fields (id, abstract); two-field rows are
    stored with ``class_`` set to ``None``.
    """

    def __init__(self, data_path: str) -> None:
        """Parse the CSV file at ``data_path`` into ``self.data``.

        Args:
            data_path: Path of the CSV file to read.

        Raises:
            ValueError: If a non-blank row has neither two nor three fields.
        """
        self.data: list[Data] = []
        # newline="" is what the csv module expects so that it can handle
        # line endings (including newlines inside quoted fields) itself.
        with open(data_path, "r", newline="") as f:
            # csv.reader understands quoting, so a quoted abstract that
            # contains commas stays one field instead of being split apart.
            reader = csv.reader(f)
            next(reader, None)  # Skip the header row; tolerate an empty file.
            for row in reader:
                fields = [field.strip() for field in row]
                if not any(fields):
                    continue  # Blank line: nothing to load.
                if len(fields) == 3:
                    record_id, class_, abstract = fields
                elif len(fields) == 2:
                    record_id, abstract = fields
                    class_ = None
                else:
                    raise ValueError(
                        f"{data_path}, line {reader.line_num}: expected 2 or 3 "
                        f"fields, got {len(fields)}"
                    )
                # NOTE(review): the id is passed through as the raw CSV text
                # (a str), although Data annotates it as int — confirm which
                # one callers expect before converting.
                self.data.append(Data(record_id, class_, abstract))

### Basic Naive Bayes Classifier

In [43]:
class NaiveBayesianClassifier:
    """Multinomial Naive Bayes classifier over whitespace-separated words.

    Training statistics are kept in parallel lists indexed like
    ``self.classes``: ``class_counts[i]``, ``class_probabilities[i]`` and
    ``word_counts[i]`` all describe ``classes[i]``.
    """

    def __init__(
        self,
        training_path: str = os.path.join("data", "trg.csv"),
        testing_path: str = os.path.join("data", "tst.csv"),
    ) -> None:
        """Load the training and testing CSV files and fit the model.

        Args:
            training_path: CSV file with labelled records.
            testing_path: CSV file with the records to classify.
        """
        self.training_data: DataModel = DataModel(data_path=training_path)
        self.testing_data: DataModel = DataModel(data_path=testing_path)
        self._train()

    def _train(self) -> None:
        """Compute every per-class statistic from ``self.training_data``."""
        # Sorted so the class order (and therefore tie-breaking) is the same
        # on every run; plain set iteration order varies with string hashing.
        # key=str keeps the sort well-defined even if a label is None.
        self.classes = sorted(
            {data.class_ for data in self.training_data.data}, key=str
        )
        self.class_counts = self.get_class_counts()
        document_total = len(self.training_data.data)
        self.class_probabilities = [
            count / document_total for count in self.class_counts
        ]
        self.word_counts = self.get_word_counts()
        # Cached once here so classification does not recompute them per call.
        self._class_word_totals = [
            sum(counts.values()) for counts in self.word_counts
        ]
        self._vocabulary = set().union(*self.word_counts)

    def run_test_data(self, fileout: str) -> None:
        """Classify every testing record and write ``id,class`` lines to ``fileout``."""
        with open(fileout, 'w') as f:
            f.write("id,class\n")
            for data in self.testing_data.data:
                f.write(f"{data.id},{self.classify_abstract(data.abstract)}\n")

    def classify_abstract(self, abstract: str) -> str:
        """Return the most probable class for ``abstract``.

        Scores are sums of log-probabilities, so long abstracts do not
        underflow to zero. Word likelihoods use add-one (Laplace) smoothing:
        ``(count of word in class + 1) / (words in class + vocabulary size)``.
        Words that never occur in the training data are ignored. Ties go to
        the class that comes first in ``self.classes``.

        Raises:
            ValueError: If the model has no classes (empty training data).
        """
        if not self.classes:
            raise ValueError("cannot classify: the training data has no classes")

        # Collapse repeated words so each distinct word costs one log().
        known_words = Counter(
            word for word in abstract.split() if word in self._vocabulary
        )
        vocabulary_size = len(self._vocabulary)

        scores = []
        for i in range(len(self.classes)):
            counts = self.word_counts[i]
            denominator = self._class_word_totals[i] + vocabulary_size
            score = math.log(self.class_probabilities[i])
            for word, occurrences in known_words.items():
                likelihood = (counts.get(word, 0) + 1) / denominator
                score += occurrences * math.log(likelihood)
            scores.append(score)

        # max() returns the first maximal index, i.e. the earliest class.
        best_index = max(range(len(scores)), key=scores.__getitem__)
        return self.classes[best_index]

    def get_class_counts(self) -> list[int]:
        """Return the number of training records per class.

        The result is ordered like ``self.classes``
        (``classes[i]`` -> ``class_counts[i]``).
        """
        tally = Counter(data.class_ for data in self.training_data.data)
        return [tally[label] for label in self.classes]

    def get_word_counts(self) -> list[dict[str, int]]:
        """Return, per class, how often each word occurs in its abstracts.

        The result is ordered like ``self.classes``
        (``classes[i]`` -> ``word_counts[i]``).
        """
        # One dict lookup per record instead of a linear list.index() scan.
        index_of = {label: i for i, label in enumerate(self.classes)}
        word_counts = [Counter() for _ in self.classes]
        for data in self.training_data.data:
            word_counts[index_of[data.class_]].update(data.abstract.split())
        return word_counts

### Main Routine

In [44]:
if __name__ == "__main__":
    # Build the classifier, then write one "id,class" prediction line per
    # testing record to output.csv.
    classifier = NaiveBayesianClassifier()
    classifier.run_test_data(fileout="output.csv")
    