In [92]:
import math
import os
import random

### Data Class

In [93]:
class Data:
    def __init__(self, id: int, class_: str | None, abstract: str) -> None:
        self.id: int = id
        self.class_: str = class_
        self.abstract: str = abstract

    def get_type(self) -> str:
        '''Returns: "test" | "train"'''
        return "train" if self.class_ else "test"

### DataModel Class

In [113]:
class DataModel:
    '''
    In-memory collection of `Data` records loaded from a CSV file.

    The first line of the file is treated as a header and never becomes a record.
    Rows with three fields are read as (id, class, abstract) and rows with two
    fields as (id, abstract, no class). When `data_path` does not exist the model
    starts empty, which is how `DataModel('')` is used to build a blank container.
    '''

    def __init__(self, data_path: str, do_shuffle: bool = False) -> None:
        '''
        data_path:  CSV file to load; a missing path yields an empty model.
        do_shuffle: shuffle the data rows (never the header) before parsing.
        '''
        self.data: list[Data] = []
        self.vocabulary_size: int = 0
        if os.path.exists(data_path):
            with open(data_path, 'r') as f:
                lines = f.readlines()
            self.data = self._parse_lines(lines, do_shuffle, data_path)
        self.vocabulary_size = self.get_vocabulary_size()


    @staticmethod
    def _parse_lines(lines: 'list[str]', do_shuffle: bool, source: str) -> 'list[Data]':
        '''Turns raw file lines (header first) into `Data` records.'''
        if not lines:
            return []
        # The header tells us how many columns to expect, so commas inside the
        # last column (the abstract) are kept instead of producing extra fields.
        n_columns = len(lines[0].strip().split(','))
        max_splits = max(n_columns - 1, 1)
        # Set the header aside BEFORE shuffling; shuffling it together with the
        # rows would drop a random record and turn the header into data.
        rows = lines[1:]
        if do_shuffle:
            random.shuffle(rows) # shouldn't be used for the test data just in case Kaggle expects the data in order
        records = []
        for raw_line in rows:
            line = raw_line.strip().replace('"', '')
            if not line:
                continue  # tolerate blank lines (e.g. a trailing newline)
            fields = line.split(',', max_splits)
            if len(fields) == 3:
                row_id, class_, abstract = fields
                records.append(Data(row_id, class_, abstract))
            elif len(fields) == 2:
                row_id, abstract = fields
                records.append(Data(row_id, None, abstract))
            else:
                raise ValueError(f"Malformed row in {source!r}: {line!r}")
        return records


    def set_data(self, data: 'list[Data]') -> None:
        '''Replaces the records and refreshes `vocabulary_size`.'''
        self.data = data
        self.vocabulary_size = self.get_vocabulary_size()


    def get_vocabulary_size(self) -> int:
        '''Returns the number of unique words in the dataset (also stored in `vocabulary_size`).'''
        vocabulary = {word for data in self.data for word in data.abstract.split()}
        self.vocabulary_size = len(vocabulary)
        return self.vocabulary_size


    def split_model(self, proportion: float) -> 'DataModel':
        '''
        Splits this dataset in two. The first `int(len(data) * proportion)` records
        move to the returned model; this model keeps the rest.
        '''
        split_data = DataModel('')
        split_index = int(len(self.data) * proportion)
        split_data.set_data(self.data[:split_index])
        self.set_data(self.data[split_index:])
        return split_data

    def eliminate_stop_words(self, stop_word_proportion) -> None:
        '''
        Removes stop words from every abstract in place. Stop words include:
        - Any word that is a single character
        - Any word that is a number
        - The most frequent `stop_word_proportion` fraction of the remaining
          distinct words (e.g. 0.02 removes the top 2%)
        '''
        # first, get the single char / number words out
        word_count: dict[str, int] = {}
        num_eliminated = 0
        for data in self.data:
            kept_words = []
            for word in data.abstract.split():
                if len(word) > 1 and not word.isdigit():
                    word_count[word] = word_count.get(word, 0) + 1
                    kept_words.append(word)
                else:
                    num_eliminated += 1
            data.abstract = ' '.join(kept_words)
        # now eliminate the top (stop_word_proportion)% of words
        # A single stable sort picks the same words as repeatedly taking max()
        # (ties keep first-seen order) without rescanning the dict every time.
        num_words_to_eliminate = max(int(len(word_count) * stop_word_proportion), 0)
        ranked_words = sorted(word_count, key=word_count.get, reverse=True)
        stop_words = set(ranked_words[:num_words_to_eliminate])
        num_eliminated += len(stop_words)
        for data in self.data:
            data.abstract = ' '.join(word for word in data.abstract.split() if word not in stop_words)
        self.vocabulary_size = self.get_vocabulary_size()
        print(f'Eliminated {num_eliminated} words. Including the {len(stop_words)} most common and {num_eliminated - len(stop_words)} single char / number words.')
        
        

### Basic Naive Bayes Classifier

In [138]:
class StandardNaiveBayes:
    '''
    Baseline classifier. Training records come from data/trg.csv and the records
    to label from data/tst.csv.

    Each class is scored by combining, over the words of an abstract, the
    per-word value returned by `get_word_probability`; the best-scoring class wins.
    '''

    def __init__(self) -> None:
        self.training_data: DataModel = DataModel(
            data_path=os.path.join("data", "trg.csv"),
            do_shuffle=False
        )
        self.testing_data: DataModel = DataModel(
            data_path=os.path.join("data", "tst.csv"),
            do_shuffle=False
        )
        # self.training_data.eliminate_stop_words(0.02)
        # First-appearance order keeps the class list (and therefore tie-breaking)
        # identical between runs; iterating a set of strings does not.
        self.classes = list(dict.fromkeys(data.class_ for data in self.training_data.data))
        self.class_counts = self.get_class_counts()
        self.class_probabilities = [count / len(self.training_data.data) for count in self.class_counts]
        self.word_counts = self.get_word_counts()


    def run_test_data(self, fileout: str, type_: str = "test") -> None:
        '''
        Writes an "id,class" CSV with one prediction per record.

        fileout: path of the CSV to create (overwritten if it exists).
        type_:   "test" to label the testing data, "train" to label the training data.
        Raises ValueError for any other `type_`, before the output file is touched.
        '''
        if type_ == "test":
            records = self.testing_data.data
        elif type_ == "train":
            records = self.training_data.data
        else:
            raise ValueError("Invalid type_ argument. Must be 'test' or 'train'.")
        with open(fileout, 'w') as f:
            f.write("id,class\n")
            for data in records:
                f.write(f"{data.id},{self.classify_abstract(data.abstract)}\n")


    def get_word_probability(self, word: str, class_index: int) -> float:
        '''
        p(class|word) = p(word|class) * p(class) / p(word)

        Returns 0 when the word never occurs in the training data.
        '''
        word_count = self.word_counts[class_index].get(word, 0)
        class_count = self.class_counts[class_index]
        word_in_class = word_count / class_count
        class_probability = self.class_probabilities[class_index]
        occurrences = sum(counts.get(word, 0) for counts in self.word_counts)
        word_in_data = occurrences / len(self.training_data.data)
        if word_in_data == 0:
            return 0
        return word_in_class * class_probability / word_in_data


    def classify_abstract(self, abstract: str) -> str:
        '''
        Classifies the abstract into one of the classes and returns that class.

        Words whose probability for a class is 0 are skipped for that class.
        Scores are accumulated as sums of logarithms: multiplying hundreds of
        values below 1 underflows to 0.0, which made every class tie and the
        first class win regardless of the evidence.
        Raises ValueError when there are no classes to choose from.
        '''
        if not self.classes:
            raise ValueError("Cannot classify: the classifier has no training classes.")
        abstract_words = abstract.split()
        class_scores = []
        for class_index in range(len(self.classes)):
            # The class prior is already part of every per-word term.
            log_score = 0.0
            for word in abstract_words:
                word_probability = self.get_word_probability(word, class_index)
                if word_probability <= 0:
                    continue
                log_score += math.log(word_probability)
            class_scores.append(log_score)
        # index(max(...)) keeps the original tie-breaking: the earliest class wins.
        return self.classes[class_scores.index(max(class_scores))]


    def get_class_counts(self) -> list[int]:
        '''Returns the count of each class in the training data. Match the order of the classes with the order of self.classes (classes[i] -> class_counts[i])'''
        index_of = {class_: i for i, class_ in enumerate(self.classes)}
        class_counts = [0] * len(self.classes)
        for data in self.training_data.data:
            class_counts[index_of[data.class_]] += 1
        return class_counts


    def get_word_counts(self) -> list[dict[str, int]]:
        '''Returns the count of each word in each class. Match the order of the classes with the order of self.classes (classes[i] -> word_counts[i])'''
        index_of = {class_: i for i, class_ in enumerate(self.classes)}
        word_counts: list[dict[str, int]] = [{} for _ in self.classes]
        for data in self.training_data.data:
            counts = word_counts[index_of[data.class_]]
            for word in data.abstract.split():
                counts[word] = counts.get(word, 0) + 1
        return word_counts

### Main Routine

In [139]:
if __name__ == "__main__":
    # Build the baseline classifier and write its predictions for the test set.
    classifier = StandardNaiveBayes()
    classifier.run_test_data(fileout="standard_output.csv", type_="test")
    # -->    Accuracy Obtained: 0.800 (Kaggle)

Eliminated 41175 words. Including the 602 most common and 40573 single char / number words.


# Discussion of Standard Naive Bayesian Classifier
...

# Improvements 
...
- Add-1 Laplace Smoothing
- Multinomial Naive Bayes
- Vocab Cleaning

In [157]:
class MN_DataModel:
    '''
    Variant of DataModel whose features are pairs of consecutive words
    (see `split_abstract`) instead of single words.

    The first line of the file is treated as a header and never becomes a record.
    Rows with three fields are read as (id, class, abstract) and rows with two
    fields as (id, abstract, no class). A missing `data_path` yields an empty model.
    '''

    def __init__(self, data_path: str, do_shuffle: bool = False) -> None:
        '''
        Difference between MN_DataModel and DataModel is that it will combine multiple words as features.

        data_path:  CSV file to load; a missing path yields an empty model.
        do_shuffle: shuffle the data rows (never the header) before parsing.
        '''
        self.data: list[Data] = []
        self.vocabulary_size: int = 0
        if os.path.exists(data_path):
            with open(data_path, 'r') as f:
                lines = f.readlines()
            self.data = self._parse_lines(lines, do_shuffle, data_path)
        self.vocabulary_size = self.get_vocabulary_size()


    @staticmethod
    def _parse_lines(lines: 'list[str]', do_shuffle: bool, source: str) -> 'list[Data]':
        '''Turns raw file lines (header first) into `Data` records.'''
        if not lines:
            return []
        # The header tells us how many columns to expect, so commas inside the
        # last column (the abstract) are kept instead of producing extra fields.
        n_columns = len(lines[0].strip().split(','))
        max_splits = max(n_columns - 1, 1)
        # Set the header aside BEFORE shuffling; shuffling it together with the
        # rows would drop a random record and turn the header into data.
        rows = lines[1:]
        if do_shuffle:
            random.shuffle(rows) # shouldn't be used for the test data just in case Kaggle expects the data in order
        records = []
        for raw_line in rows:
            line = raw_line.strip().replace('"', '')
            if not line:
                continue  # tolerate blank lines (e.g. a trailing newline)
            fields = line.split(',', max_splits)
            if len(fields) == 3:
                row_id, class_, abstract = fields
                records.append(Data(row_id, class_, abstract))
            elif len(fields) == 2:
                row_id, abstract = fields
                records.append(Data(row_id, None, abstract))
            else:
                raise ValueError(f"Malformed row in {source!r}: {line!r}")
        return records


    def set_data(self, data: 'list[Data]') -> None:
        '''Replaces the records and refreshes `vocabulary_size`.'''
        self.data = data
        self.vocabulary_size = self.get_vocabulary_size()


    def get_vocabulary_size(self) -> int:
        '''Returns the number of unique paired-word features in the dataset (also stored in `vocabulary_size`).'''
        vocabulary = {feature for data in self.data for feature in self.split_abstract(data.abstract)}
        self.vocabulary_size = len(vocabulary)
        return self.vocabulary_size


    def split_model(self, proportion: float) -> 'MN_DataModel':
        '''
        Splits this dataset in two. The first `int(len(data) * proportion)` records
        move to the returned model; this model keeps the rest.

        The returned model is an MN_DataModel so the split keeps paired-word features.
        '''
        split_data = MN_DataModel('')
        split_index = int(len(self.data) * proportion)
        split_data.set_data(self.data[:split_index])
        self.set_data(self.data[split_index:])
        return split_data


    def split_abstract(self, abstract: str) -> list[str]:
        '''
        Splits the abstract into a list of paired words.

        Words are paired without overlap (1st+2nd, 3rd+4th, ...); with an odd
        number of words the last one is kept on its own.
        '''
        words = abstract.split()
        return [' '.join(words[i:i + 2]) for i in range(0, len(words), 2)]

    def eliminate_stop_words(self, stop_word_proportion) -> None:
        '''
        Removes stop features from every abstract in place. Features are the
        word pairs produced by `split_abstract`. Removed are:
        - Any feature that is a single character or a number; because pairs
          contain a space, this only ever matches a trailing unpaired word
        - The most frequent `stop_word_proportion` fraction of the remaining
          distinct features (e.g. 0.02 removes the top 2%)
        '''
        # first, get the single char / number words out
        feature_count: dict[str, int] = {}
        num_eliminated = 0
        for data in self.data:
            kept_features = []
            for feature in self.split_abstract(data.abstract):
                if len(feature) > 1 and not feature.isdigit():
                    feature_count[feature] = feature_count.get(feature, 0) + 1
                    kept_features.append(feature)
                else:
                    num_eliminated += 1
            data.abstract = ' '.join(kept_features)
        # now eliminate the top (stop_word_proportion)% of words
        # A single stable sort picks the same features as repeatedly taking max()
        # (ties keep first-seen order) without rescanning the dict every time.
        num_words_to_eliminate = max(int(len(feature_count) * stop_word_proportion), 0)
        ranked_features = sorted(feature_count, key=feature_count.get, reverse=True)
        stop_words = set(ranked_features[:num_words_to_eliminate])
        num_eliminated += len(stop_words)
        for data in self.data:
            data.abstract = ' '.join(
                feature for feature in self.split_abstract(data.abstract) if feature not in stop_words
            )
        self.vocabulary_size = self.get_vocabulary_size()
        print(f'Eliminated {num_eliminated} words. Including the {len(stop_words)} most common and {num_eliminated - len(stop_words)} single char / number words.')
        
        

In [229]:
class ImprovedNaiveBayes:
    '''
    Naive Bayes classifier with additive (alpha) smoothing, optional stop-word
    elimination and an optional validation split.

    NOTE(review): the smoothed ratios divide word occurrence counts by document
    counts, so individual terms can exceed 1; only their ranking across classes
    is used. Confirm this is the intended model before changing the maths.
    '''

    def __init__(self, validation_data_split: float, alpha: float, stop_word_proportion: float, auto_load_data: bool = True) -> None:
        '''
        validation_data_split: fraction of the training data moved to a validation
                               set (no validation set when <= 0).
        alpha:                 smoothing constant used by `get_word_probability`.
        stop_word_proportion:  passed to `DataModel.eliminate_stop_words`.
        auto_load_data:        when True, load data/trg.csv (shuffled) and data/tst.csv;
                               when False, start empty and expect `set_data` later.
        '''
        self.alpha = alpha
        self.validation_data = None
        if auto_load_data:
            self.training_data: DataModel = DataModel(
                data_path=os.path.join("data", "trg.csv"),
                do_shuffle=True
            )
            self.testing_data: DataModel = DataModel(
                data_path=os.path.join("data", "tst.csv"),
                do_shuffle=False
            )
            self.training_data.eliminate_stop_words(stop_word_proportion)
            if validation_data_split > 0.0:
                self.validation_data = self.training_data.split_model(validation_data_split)
        else:
            self.training_data = DataModel('')
            self.testing_data = DataModel('')
        self._refresh_statistics()


    def _refresh_statistics(self) -> None:
        '''Recomputes everything that is derived from the current training data.'''
        self.vocab_size = self.training_data.vocabulary_size
        # First-appearance order keeps the class list (and therefore tie-breaking)
        # identical for identical data; iterating a set of strings does not.
        self.classes = list(dict.fromkeys(data.class_ for data in self.training_data.data))
        self.class_counts = self.get_class_counts()
        self.class_probabilities = [count / len(self.training_data.data) for count in self.class_counts]
        self.word_counts = self.get_word_counts()


    def set_data(self, training_data: 'DataModel | None' = None, testing_data: 'DataModel | None' = None, validation_data: 'DataModel | None' = None) -> None:
        '''
        Replaces any of the three datasets (arguments left as None are kept) and
        recomputes the statistics derived from the training data. Empty training
        data is accepted and simply results in an untrained classifier.
        '''
        if training_data is not None:
            self.training_data.set_data(training_data.data)
        if testing_data is not None:
            self.testing_data.set_data(testing_data.data)
        if validation_data is not None:
            if self.validation_data is None:
                self.validation_data = DataModel('')
            self.validation_data.set_data(validation_data.data)
        self._refresh_statistics()


    def get_validation_accuracy(self) -> float:
        '''Returns the fraction of validation records classified correctly, or 0 when there is no validation data.'''
        if self.validation_data is None or not self.validation_data.data:
            print(f"Model has no validation set.")
            return 0.0
        correct = sum(
            1 for data in self.validation_data.data
            if self.classify_abstract(data.abstract) == data.class_
        )
        return correct / len(self.validation_data.data)


    def run_test_data(self, fileout: str, type_: str = "test") -> None:
        '''
        Writes an "id,class" CSV with one prediction per record.

        fileout: path of the CSV to create (overwritten if it exists).
        type_:   "test", "train" or "validation" - which dataset to label.
        Raises ValueError for any other `type_`, or when "validation" is requested
        without a validation set; in both cases the output file is not touched.
        '''
        if type_ == "test":
            records = self.testing_data.data
        elif type_ == "train":
            records = self.training_data.data
        elif type_ == "validation":
            if self.validation_data is None:
                raise ValueError("Model has no validation set.")
            records = self.validation_data.data
        else:
            raise ValueError("Invalid type_ argument. Must be 'test', 'validation', or 'train'.")
        with open(fileout, 'w') as f:
            f.write("id,class\n")
            for data in records:
                f.write(f"{data.id},{self.classify_abstract(data.abstract)}\n")


    def get_word_probability(self, word: str, class_index: int) -> float:
        '''
        p(class|word) = p(word|class) * p(class) / p(word) -- NEW (Dirichlet)

        Both p(word|class) and p(word) are smoothed with `alpha`. Returns 0 when
        the smoothed p(word) is 0 (alpha of 0 and a word never seen in training).
        '''
        smoothing_mass = self.alpha * self.vocab_size
        word_count = self.word_counts[class_index].get(word, 0)
        class_count = self.class_counts[class_index]
        word_in_class = (word_count + self.alpha) / (class_count + smoothing_mass)
        class_probability = self.class_probabilities[class_index]
        occurrences = sum(counts.get(word, 0) for counts in self.word_counts)
        word_in_data = (occurrences + self.alpha) / (len(self.training_data.data) + smoothing_mass)
        if word_in_data == 0:
            return 0.0
        return word_in_class * class_probability / word_in_data


    def classify_abstract(self, abstract: str) -> str:
        '''
        Classifies the abstract into one of the classes and returns that class.

        Words whose probability for a class is 0 are skipped for that class.
        Scores are accumulated as sums of logarithms: multiplying hundreds of
        small values underflows to 0.0, which made every class tie and the first
        class win regardless of the evidence.
        Raises ValueError when there are no classes to choose from.
        '''
        if not self.classes:
            raise ValueError("Cannot classify: the classifier has no training classes.")
        # testing with mn words
        # abstract_words = self.training_data.split_abstract(abstract)
        abstract_words = abstract.split()

        class_scores = []
        for class_index in range(len(self.classes)):
            # The class prior is already part of every per-word term.
            log_score = 0.0
            for word in abstract_words:
                word_probability = self.get_word_probability(word, class_index)
                if word_probability <= 0:
                    continue
                log_score += math.log(word_probability)
            class_scores.append(log_score)
        # index(max(...)) keeps the original tie-breaking: the earliest class wins.
        return self.classes[class_scores.index(max(class_scores))]


    def get_class_counts(self) -> list[int]:
        '''Returns the count of each class in the training data. Match the order of the classes with the order of self.classes (classes[i] -> class_counts[i])'''
        index_of = {class_: i for i, class_ in enumerate(self.classes)}
        class_counts = [0] * len(self.classes)
        for data in self.training_data.data:
            class_counts[index_of[data.class_]] += 1
        return class_counts


    def get_word_counts(self) -> list[dict[str, int]]:
        '''Returns the count of each word in each class. Match the order of the classes with the order of self.classes (classes[i] -> word_counts[i])'''
        index_of = {class_: i for i, class_ in enumerate(self.classes)}
        word_counts: list[dict[str, int]] = [{} for _ in self.classes]
        for data in self.training_data.data:
            # testing with mn words
            # words = self.training_data.split_abstract(data.abstract)
            counts = word_counts[index_of[data.class_]]
            for word in data.abstract.split():
                counts[word] = counts.get(word, 0) + 1
        return word_counts


    def save(self, path: str = "word_counts.txt") -> None:
        '''Saves the per-class counts, priors and word counts to a txt file (default: word_counts.txt).'''
        with open(path, 'w') as f:
            for i, class_ in enumerate(self.classes):
                f.write('-'*100 + '\n')
                f.write(f"Class: {class_}\n")
                f.write(f"Class Count: {self.class_counts[i]}\n")
                f.write(f"Class Probability: {self.class_probabilities[i]}\n\n")
                for word, count in self.word_counts[i].items():
                    f.write(f"{word}: {count}\n")


In [230]:
def create_ensemble(num_models: int, alpha: float, stop_word_proportion: float, validation_proportion: float) -> 'list[ImprovedNaiveBayes]':
    '''
    Builds `num_models` classifiers that each train on a different slice of the
    same training data.

    Only the first classifier loads data from disk (and holds the validation
    split, if any). Its training records are then cut into `num_models`
    contiguous slices whose sizes differ by at most one, so no record is
    discarded. Every classifier shares the first one's testing data.

    Raises ValueError when `num_models` is smaller than 1.
    '''
    if num_models < 1:
        raise ValueError("num_models must be at least 1.")
    ensemble: list[ImprovedNaiveBayes] = []
    for j in range(num_models):
        is_loader = j == 0
        ensemble.append(ImprovedNaiveBayes(
            validation_data_split=validation_proportion if is_loader else 0.0,
            alpha=alpha,
            stop_word_proportion=stop_word_proportion,
            auto_load_data=is_loader
        ))
    # Slice before any set_data call below replaces the loader's own training data.
    all_records = ensemble[0].training_data.data
    total = len(all_records)
    # Boundaries i*total//num_models spread the remainder over the slices
    # instead of dropping the last `total % num_models` records.
    bounds = [i * total // num_models for i in range(num_models + 1)]
    split_data = [all_records[bounds[i]:bounds[i + 1]] for i in range(num_models)]
    for i, classifier in enumerate(ensemble):
        new_datamodel = DataModel('')
        new_datamodel.set_data(split_data[i])
        if i == 0:
            classifier.set_data(training_data=new_datamodel)
        else:
            classifier.set_data(training_data=new_datamodel, testing_data=ensemble[0].testing_data)
    return ensemble


def test_ensemble(ensemble: list[ImprovedNaiveBayes], fileout: str, moderator_model = None) -> None:
    with open(fileout, 'w') as f:
        f.write("id,class\n")
        for data in ensemble[0].testing_data.data:
            class_counts = {}
            for classifier in ensemble:
                predicted_class = classifier.classify_abstract(data.abstract)
                if predicted_class in class_counts:
                    class_counts[predicted_class] += 1
                else:
                    class_counts[predicted_class] = 1
            print(f"{data.id}: {class_counts}")
            if len(class_counts) >= 3 and moderator_model:
                max_class = moderator_model.classify_abstract(data.abstract)
                print(f"Moderator Intervention: {max_class}")
            else:
                max_class = max(class_counts, key=class_counts.get)
            f.write(f"{data.id},{max_class}\n")


def validate_ensemble(ensemble: 'list[ImprovedNaiveBayes]', validation_set: 'DataModel | None', moderator = None) -> None:
    '''
    Prints the majority-vote accuracy of the ensemble on `validation_set`.

    ensemble:       classifiers that vote on every validation record.
    validation_set: labelled records to score; None or an empty set is reported
                    instead of raising.
    moderator:      optional classifier that decides alone whenever the votes are
                    spread over three or more classes.

    Ties are won by the class that received its first vote earliest.
    '''
    if validation_set is None or not validation_set.data:
        print("Validation Accuracy: n/a (no validation data)")
        return
    correct = 0
    for data in validation_set.data:
        class_counts = {}
        for classifier in ensemble:
            predicted_class = classifier.classify_abstract(data.abstract)
            class_counts[predicted_class] = class_counts.get(predicted_class, 0) + 1
        if len(class_counts) >= 3 and moderator:
            max_class = moderator.classify_abstract(data.abstract)
        else:
            max_class = max(class_counts, key=class_counts.get)
        if max_class == data.class_:
            correct += 1
    print(f"Validation Accuracy: {correct / len(validation_set.data)}")
    


if __name__ == "__main__":
    # Single-model experiment, kept for reference (disabled):
    #     total_accuracy = 0
    #     num_trials = 1
    #     for _ in range(num_trials):
    #         classifier = ImprovedNaiveBayes(validation_data_split=0.0, alpha=0.001, stop_word_proportion=0.01)
    #         total_accuracy += classifier.get_validation_accuracy()
    #     print(f"Average Validation Accuracy: {total_accuracy / num_trials}")
    #     classifier.save()
    #     classifier.run_test_data("new_output.csv", type_="test")

    # The moderator is a stand-alone classifier that settles three-way disagreements.
    moderator = ImprovedNaiveBayes(
        validation_data_split=0.5,
        alpha=0.0001,
        stop_word_proportion=0.02,
    )

    # Five-member ensemble; the first member also holds the validation split.
    ensemble: list[ImprovedNaiveBayes] = create_ensemble(
        num_models=5,
        alpha=0.0001,
        stop_word_proportion=0.02,
        validation_proportion=0.2,
    )

    test_ensemble(ensemble=ensemble, fileout="ensemble_output.csv", moderator_model=moderator)
    validate_ensemble(
        ensemble=ensemble,
        validation_set=ensemble[0].validation_data,
        moderator=moderator,
    )
    ensemble[0].save()

    

Eliminated 2345 words. Including the 2309 most common and 36 single char / number words.
Eliminated 2345 words. Including the 2309 most common and 36 single char / number words.
1: {'E': 4, 'B': 1}
2: {'E': 5}
3: {'E': 5}
4: {'E': 5}
5: {'E': 5}
6: {'E': 5}
7: {'E': 5}
8: {'E': 1, 'B': 4}
9: {'E': 5}
10: {'E': 5}
11: {'B': 2, 'E': 3}
12: {'E': 5}
13: {'B': 5}
14: {'B': 5}
15: {'E': 5}
16: {'E': 5}
17: {'E': 5}
18: {'E': 4, 'B': 1}
19: {'E': 5}
20: {'E': 5}
21: {'B': 4, 'E': 1}
22: {'E': 5}
23: {'B': 5}
24: {'B': 4, 'E': 1}
25: {'E': 5}
26: {'E': 4, 'B': 1}
27: {'E': 5}
28: {'E': 3, 'B': 2}
29: {'B': 4, 'E': 1}
30: {'E': 5}
31: {'E': 5}
32: {'B': 5}
33: {'E': 4, 'B': 1}
34: {'E': 5}
35: {'B': 4, 'E': 1}
36: {'E': 5}
37: {'E': 5}
38: {'E': 5}
39: {'B': 5}
40: {'E': 5}
41: {'E': 5}
42: {'E': 5}
43: {'E': 4, 'B': 1}
44: {'E': 3, 'B': 2}
45: {'E': 5}
46: {'E': 4, 'B': 1}
47: {'E': 5}
48: {'B': 5}
49: {'B': 5}
50: {'E': 5}
51: {'E': 4, 'B': 1}
52: {'E': 5}
53: {'E': 5}
54: {'B': 5}
55: {'E':