### Figure 1. Imports

In [26]:
import random # Random is the ONLY module I am allowing myself to use for this challenge.

### Figure 2. Simple Data Class

In [27]:
class Data:
    def __init__(self, id: int, class_: str | None, abstract: str) -> None:
        self.id: int = id
        self.class_: str = class_
        self.abstract: str = abstract

    def get_type(self) -> str:
        '''Returns: "test" | "train"'''
        return "train" if self.class_ else "test"

### Figure 3. DataModel Class

In [28]:
class DataModel:
    def __init__(self, data_path: str, do_shuffle: bool = False) -> None:
        '''
        The DataModel class is a container for the data in the dataset.
        This will be greatly helpful for the model to access the data in a structured way, and minimise risk of errors.
        The data is stored in a list of Data objects, which contain the id, class, and abstract of the data as attributes.
        The vocabulary size is also stored as an attribute, which is the number of unique words in the dataset.
        This DataModel will be able to split data for validation sets and ensembles.
        '''
        self.data: list[Data] = []
        self.vocabulary_size: int = 0
        if not data_path or data_path.strip() == "":
            self.data = []
        else:
            with open(data_path, 'r', encoding='utf-8') as f:
                lines = f.readlines()
            if do_shuffle:
                random.shuffle(lines) # shouldn't be used for the test data just in case Kaggle expects the data in order
            for line in lines[1:]:
                line_split = line.strip().replace('"', '').split(',')
                if len(line_split) == 3:
                    id, class_, abstract = line_split
                    data = Data(id, class_, abstract)
                    self.data.append(data)
                else:
                    id, abstract = line_split
                    data = Data(id, None, abstract)
                    self.data.append(data)
        self.vocabulary_size = self.get_vocabulary_size()


    def set_data(self, data: list[Data]) -> None:
        '''Sets the data of this model to the given data and updates the vocab size.'''
        self.data = data
        self.vocabulary_size = self.get_vocabulary_size()


    def get_vocabulary_size(self) -> int:
        '''Returns the number of unique words in the dataset.'''
        vocabulary = set()
        for data in self.data:
            words = data.abstract.split()
            for word in words:
                vocabulary.add(word)
        self.vocabulary_size = len(vocabulary)
        return self.vocabulary_size


    def split_model(self, proportion: float) -> 'DataModel':
        '''Splits this dataset into two data sets based on the proportion of the current data to be in the new split.'''
        split_data = DataModel('')
        split_index = int(len(self.data) * proportion)
        split_data.set_data(self.data[:split_index])
        self.set_data(self.data[split_index:])
        return split_data
    
    
    def eliminate_stop_words(self, stop_word_proportion) -> None:
        '''
        Stopwords include:
        - Any word that is a single character
        - Any word that is a number
        - The top 10% of words that appear in all classes
        '''
        word_count = {}
        num_eliminated = 0
        for data in self.data:
            words = data.abstract.split()
            new_data = []
            for word in words:
                if len(word) > 1 and not word.isdigit():
                    if word in word_count:
                        word_count[word] += 1
                    else:
                        word_count[word] = 1
                    new_data.append(word)
                else:
                    num_eliminated += 1
            data.abstract = ' '.join(new_data)
        stop_words = set()
        num_words_to_eliminate = int(len(word_count) * stop_word_proportion)
        for _ in range(num_words_to_eliminate):
            max_word = max(word_count, key=word_count.get)
            stop_words.add(max_word)
            del word_count[max_word]
            num_eliminated += 1
        for data in self.data:
            words = data.abstract.split()
            new_data = []
            for word in words:
                if word not in stop_words:
                    new_data.append(word)
            data.abstract = ' '.join(new_data)
        self.vocabulary_size = self.get_vocabulary_size()
        print(f'Eliminated {num_eliminated} words. Including the {len(stop_words)} most common and {num_eliminated - len(stop_words)} single char / number words.')

    
    def bootstrap_sample(self, proportion, num_models) -> list['DataModel']:
        '''Returns a list of num_models DataModels, each with a random sample of the data with proportion of the original data.'''
        num_per_model = int(len(self.data) * proportion)
        models = []
        for _ in range(num_models):
            new_data = []
            for _ in range(num_per_model):
                new_data.append(random.choice(self.data))
            new_model = DataModel('')
            new_model.set_data(new_data)
            models.append(new_model)
        return models        
        

### Figure 4. Standard Naive Bayes Classifier

In [29]:
class StandardNaiveBayes:
    def __init__(self) -> None:
        ''''
        This is the standard Naive Bayes classifier. 
        It uses the training data to calculate the probabilities of each word in each class and then uses these probabilities to classify the test data.
        The standard Naive Bayes has absolutely no added features or optimizations but still managed to get a 0.80 accuracy on the test data.
        '''
        self.training_data: DataModel = DataModel(
            data_path=r"data\trg.csv",
            do_shuffle=False # this counts as data pre-processing and won't be used for the standard model
        )
        self.testing_data: DataModel = DataModel(
            data_path=r"data\tst.csv",
            do_shuffle=False # ditto above
        )
        self.classes = [c for c in set([data.class_ for data in self.training_data.data])]
        self.class_counts = self.get_class_counts()
        self.class_probabilities = [count / len(self.training_data.data) for count in self.class_counts]
        self.word_counts = self.get_word_counts()


    def run_test_data(self, fileout: str) -> None:
        with open(fileout, 'w') as f:
            f.write("id,class\n")
            for data in self.testing_data.data:
                f.write(f"{data.id},{self.classify_abstract(data.abstract)}\n")


    def get_word_probability(self, word: str, class_index: int) -> float:
        '''
        p(class|word) = p(word|class) * p(class) / p(word)
        '''
        word_count = self.word_counts[class_index].get(word, 0)
        class_count = self.class_counts[class_index]
        word_in_class = word_count / class_count
        class_probability = self.class_probabilities[class_index]
        word_in_data = sum([self.word_counts[i].get(word, 0) for i in range(len(self.classes))]) / len(self.training_data.data)
        if word_in_data == 0:
            return 0
        return word_in_class * class_probability / word_in_data


    def classify_abstract(self, abstract: str) -> str:
        '''Classifies the abstract into one of the classes. Returns the class. Uses the Naive Bayesian Classifier algorithm.'''
        abstract_words = abstract.split()
        class_probabilities = []
        for i in range(len(self.classes)):
            cur_class_probability = self.class_probabilities[i]
            cur_class_probability = 1
            for word in abstract_words:
                cur_word_probability = self.get_word_probability(word, i)
                if cur_word_probability == 0:
                    continue
                cur_class_probability *= cur_word_probability
            class_probabilities.append(cur_class_probability)
        max_class = self.classes[class_probabilities.index(max(class_probabilities))]
        return max_class


    def get_class_counts(self) -> list[int]:
        '''Returns the count of each class in the training data. Match the order of the classes with the order of self.classes (classes[i] -> class_counts[i])'''
        class_counts = [0] * len(self.classes)
        for data in self.training_data.data:
            class_counts[self.classes.index(data.class_)] += 1
        return class_counts
    

    def get_word_counts(self) -> list[dict[str, int]]:
        '''Returns the count of each word in each class. Match the order of the classes with the order of self.classes (classes[i] -> word_counts[i])'''
        word_counts = [{} for _ in range(len(self.classes))]
        for data in self.training_data.data:
            for word in data.abstract.split():
                if word not in word_counts[self.classes.index(data.class_)]:
                    word_counts[self.classes.index(data.class_)][word] = 1
                else:
                    word_counts[self.classes.index(data.class_)][word] += 1
        return word_counts

### Figure 5. Main Routine for the Standard Naive Bayes

In [30]:
if __name__ == "__main__":
    # 10-fold repeated Cross-Validation for the Evaluation metric.
    total_accuracy = 0
    for _ in range(10):
        accuracy = 0
        model = StandardNaiveBayes()
        validation_data = model.training_data.split_model(0.1)
        for data in validation_data.data:
            classification = model.classify_abstract(data.abstract)
            if classification == data.class_:
                accuracy += 1
        total_accuracy += accuracy / len(validation_data.data)
    print(f"Average accuracy: {total_accuracy / 10}")
        
    # Simple run of the model on test data
    classifier = StandardNaiveBayes()
    classifier.run_test_data(fileout="standard_output.csv")
    '''
    -->    Accuracy Obtained: 0.800 (Kaggle)
    '''

Average accuracy: 0.5585284280936456


In [31]:
class ImprovedNaiveBayes:
    def __init__(self, validation_data_split: float, alpha: float, stop_word_proportion: float, auto_load_data: bool = True) -> None:
        '''
        This model is an improvement on the standard naive bayes model. It includes the following improvements:
        - Supporting validation data splits
        - Dirichlet smoothing
        - Stop word elimination
        - Ability to combine words as features
        - Supporting manual data loading for ensembles / superstructures of models        
        '''
        if auto_load_data:
            self.training_data: DataModel = DataModel(
                data_path=r"data\trg.csv",
                do_shuffle=True                         # Data stochasiticity parameter Figure 7.1
            )
            self.testing_data: DataModel = DataModel(
                data_path=r"data\tst.csv",
                do_shuffle=False
            )
            self.training_data.eliminate_stop_words(stop_word_proportion) # stop word elimination parameter Figure 7.2
            if validation_data_split > 0.0:        # validation data split parameter Figure 7.3
                self.validation_data: DataModel = self.training_data.split_model(validation_data_split)
            else:
                self.validation_data = None
            self.alpha = alpha
            self.vocab_size = self.training_data.vocabulary_size
            self.classes = [c for c in set([data.class_ for data in self.training_data.data])]
            self.class_counts = self.get_class_counts()
            self.class_probabilities = [count / len(self.training_data.data) for count in self.class_counts]
            self.word_counts = self.get_word_counts()
        else:
            self.training_data = DataModel('')
            self.testing_data = DataModel('')
            self.validation_data = None
            self.alpha = alpha                      # Dirichlet smoothing hyperparameter Figure 7.4
            self.vocab_size = 0                     # Dirichlet smoothing hyperparameter Figure 7.5
            self.classes = []
            self.class_counts = []
            self.class_probabilities = []
            self.word_counts = []


    def set_data(self, training_data: DataModel = None, testing_data: DataModel = None, validation_data: DataModel = None) -> None:
        '''Method for manually setting the data. Needs to update several attributes of the model.'''
        if training_data:
            self.training_data.set_data(training_data.data)
        if testing_data:
            self.testing_data.set_data(testing_data.data)
        if validation_data:
            if self.validation_data:
                self.validation_data.set_data(validation_data.data)
            else:
                self.validation_data = DataModel('')
                self.validation_data.set_data(validation_data.data)
        self.vocab_size = self.training_data.vocabulary_size
        self.classes = [c for c in set([data.class_ for data in self.training_data.data])]
        self.class_counts = self.get_class_counts()
        self.class_probabilities = [count / len(self.training_data.data) for count in self.class_counts]
        self.word_counts = self.get_word_counts()


    def get_validation_accuracy(self, print_prediciton: bool = False) -> float:
        '''Returns the accuracy of the model on the validation data.'''
        if not self.validation_data:
            print(f"Model has no validation set.")
            return 0
        correct = 0
        for data in self.validation_data.data:
            predicted_class = self.classify_abstract(data.abstract)
            if print_prediciton:
                print(f"Predicted: {predicted_class} | Actual: {data.class_}")
            if predicted_class == data.class_:
                correct += 1
        return correct / len(self.validation_data.data)


    def run_test_data(self, fileout: str, type_: str = "test") -> None:
        '''Runs the model on either the testing or training data and writes the results to a file.'''
        with open(fileout, 'w') as f:
            f.write("id,class\n")
            if type_ == "test":
                for data in self.testing_data.data:
                    f.write(f"{data.id},{self.classify_abstract(data.abstract)}\n")
            elif type_ == "train":
                for data in self.training_data.data:
                    f.write(f"{data.id},{self.classify_abstract(data.abstract)}\n")
            else:
                raise ValueError("Invalid type_ argument. Must be 'test', 'validation', or 'train'.")
    
    
    def get_word_probability(self, word: str, class_index: int) -> float:
        '''p(class|word) = p(word|class) * p(class) / p(word) ++ Dirichlet smoothing'''
        word_count = self.word_counts[class_index].get(word, 0)
        class_count = self.class_counts[class_index]
        word_in_class = (word_count + self.alpha) / (class_count + self.alpha * self.vocab_size)
        class_probability = self.class_probabilities[class_index]
        word_in_data = (sum([self.word_counts[i].get(word, 0) for i in range(len(self.classes))]) + self.alpha) / (len(self.training_data.data) + self.alpha * self.vocab_size)
        return word_in_class * class_probability / word_in_data


    def classify_abstract(self, abstract: str) -> str:
        '''Classifies the abstract into one of the classes. Returns the class. Uses the Naive Bayesian Classifier algorithm.'''
        abstract_words = abstract.split()
        class_probabilities = []
        for i in range(len(self.classes)):
            cur_class_probability = self.class_probabilities[i]
            cur_class_probability = 1
            for word in abstract_words:
                cur_word_probability = self.get_word_probability(word, i)
                if cur_word_probability == 0:
                    continue
                cur_class_probability *= cur_word_probability
            class_probabilities.append(cur_class_probability)
        max_class = self.classes[class_probabilities.index(max(class_probabilities))]
        return max_class
    


    def get_class_counts(self) -> list[int]:
        '''Returns the count of each class in the training data. Match the order of the classes with the order of self.classes (classes[i] -> class_counts[i])'''
        class_counts = [0] * len(self.classes)
        for data in self.training_data.data:
            class_counts[self.classes.index(data.class_)] += 1
        return class_counts
    

    def get_word_counts(self) -> list[dict[str, int]]:
        '''Returns the count of each word in each class. Matches the order of the classes with the order of self.classes (classes[i] -> word_counts[i])'''
        word_counts = [{} for _ in range(len(self.classes))]
        for data in self.training_data.data:
            words = data.abstract.split()
            for word in words:
                if word not in word_counts[self.classes.index(data.class_)]:
                    word_counts[self.classes.index(data.class_)][word] = 1
                else:
                    word_counts[self.classes.index(data.class_)][word] += 1
        return word_counts
    

    def save(self):
        '''Saves the word counts to a txt file'''
        with open("word_counts.txt", 'w', encoding='utf-8') as f:
            for i in range(len(self.classes)):
                f.write('-'*100 + '\n')
                f.write(f"Class: {self.classes[i]}\n")
                f.write(f"Class Count: {self.class_counts[i]}\n")
                f.write(f"Class Probability: {self.class_probabilities[i]}\n\n")
                for word, count in self.word_counts[i].items():
                    f.write(f"{word}: {count}\n")
    

    def tune_hyper_param(self, min_value: float, max_value: float, step: float, param: str) -> float: # Tuning Method Figure 7.6
        '''Tunes the hyperparameter of the model and returns the best value.'''
        if not self.validation_data:
            print("Model has no validation set.")
            return 0
        multiplier = 1
        if isinstance(min_value, float):
            multiplier = 10 ** len(str(min_value).split('.')[1])
        min_value = int(min_value * multiplier)
        max_value = int(max_value * multiplier)
        step = int(step * multiplier)
        best_value = min_value
        best_accuracy = 0
        for value in range(min_value, max_value, step):
            setattr(self, param, value)
            accuracy = self.get_validation_accuracy()
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_value = value
        return best_value / multiplier

In [50]:
def create_divided_ensemble(num_models: int, alpha: float, stop_word_proportion: float, validation_proportion: float) -> list[ImprovedNaiveBayes]:
    '''Creates num_models models and splits the training data into num_models parts. Each model is trained on a different part of the data.'''
    ensemble: list[ImprovedNaiveBayes] = []
    for j in range(num_models):
        if j == 0:
            auto_load = True
            valid_prop = validation_proportion
        else:
            auto_load = False
            valid_prop = 0.0
        classifier = ImprovedNaiveBayes(
            validation_data_split=valid_prop,
            alpha=alpha,
            stop_word_proportion=stop_word_proportion,
            auto_load_data=auto_load
        )
        ensemble.append(classifier)
    data_to_split = ensemble[0].training_data
    split_size = len(data_to_split.data) // num_models
    split_data = [data_to_split.data[i * split_size: (i + 1) * split_size] for i in range(num_models)]
    for i in range(num_models):
        new_datamodel = DataModel('')
        new_datamodel.set_data(split_data[i])
        if i == 0:
            ensemble[i].set_data(training_data=new_datamodel)
        else:
            ensemble[i].set_data(training_data=new_datamodel, testing_data=ensemble[0].testing_data)
    return ensemble


def create_bootstrap_ensemble(num_models: int, alpha: float, stop_word_proportion: float, validation_proportion: float) -> list[ImprovedNaiveBayes]:
    '''Creates num_models models and splits the training data into num_models parts. Each model is trained on a different part of the data.'''
    ensemble: list[ImprovedNaiveBayes] = []
    base_classifier = ImprovedNaiveBayes(
        validation_data_split=validation_proportion,
        alpha=alpha,
        stop_word_proportion=stop_word_proportion
    )
    data_to_split = base_classifier.training_data
    split_data = data_to_split.bootstrap_sample(1/num_models, num_models)
    for i in range(num_models):
        new_datamodel = DataModel('')
        new_datamodel.set_data(split_data[i].data)
        new_classifier = ImprovedNaiveBayes(
            validation_data_split=0.0,
            alpha=alpha,
            stop_word_proportion=stop_word_proportion,
            auto_load_data=False
        )
        new_classifier.set_data(training_data=new_datamodel, testing_data=base_classifier.testing_data)
        ensemble.append(new_classifier)
    return ensemble


def test_ensemble(ensemble: list[ImprovedNaiveBayes], fileout: str, moderator_model = None) -> None:
    '''Tests the ensemble on the testing data and writes the results to a file.'''
    with open(fileout, 'w') as f:
        f.write("id,class\n")
        for data in ensemble[0].testing_data.data:
            class_counts = {}
            for classifier in ensemble:
                predicted_class = classifier.classify_abstract(data.abstract)
                if predicted_class in class_counts:
                    class_counts[predicted_class] += 1
                else:
                    class_counts[predicted_class] = 1
            print(f"{data.id}: {class_counts}")
            if len(class_counts) >= 3 and moderator_model:
                max_class = moderator_model.classify_abstract(data.abstract)
                print(f"Moderator Intervention: {max_class}")
            else:
                max_class = max(class_counts, key=class_counts.get)
            f.write(f"{data.id},{max_class}\n")


def validate_ensemble(ensemble: list[ImprovedNaiveBayes], validation_set: DataModel, moderator: ImprovedNaiveBayes | None = None) -> None:
    '''Validates the ensemble on the validation data and prints the accuracy. Uses a moderator model if provided.'''
    true_positives = 0
    for data in validation_set.data:
        class_counts = {}
        for classifier in ensemble:
            predicted_class = classifier.classify_abstract(data.abstract)
            if predicted_class in class_counts:
                class_counts[predicted_class] += 1
            else:
                class_counts[predicted_class] = 1
        if len(class_counts) >= 3 and moderator:
            max_class = moderator.classify_abstract(data.abstract)
        else:
            max_class = max(class_counts, key=class_counts.get)
        if max_class == data.class_:
            true_positives += 1
    return true_positives / len(validation_set.data)


if __name__ == "__main__":
    # ensemble: list[ImprovedNaiveBayes] = create_divided_ensemble(
    #     num_models=5,
    #     alpha=0.0001,
    #     stop_word_proportion=0.02,
    #     validation_proportion=0.1
    # )
    # ensemble[0].save()
    # print(f"Accuracy: {validate_ensemble(ensemble, ensemble[0].validation_data)}")

    classifier = ImprovedNaiveBayes(
        validation_data_split=0.2,
        alpha=0.01,
        stop_word_proportion=0.01
    )
    print(f"Validation Accuracy: {classifier.get_validation_accuracy()}")
    classifier.save()
    print(classifier.classify_abstract(
        "trump trump trump"
    ))
    

Eliminated 1847 words. Including the 95 most common and 1752 single char / number words.
Validation Accuracy: 0.5267558528428093
U
