# Assignment 2: Naive Bayesian Classification (acom884)
## Code:
### Figure 1. Imports

In [45]:
import random # Random is the ONLY module I am allowing myself to use for this challenge.

### Figure 2. Simple Data Class

In [46]:
class Data:
    def __init__(self, id: int, class_: str | None, abstract: str) -> None:
        self.id: int = id
        self.class_: str = class_
        self.abstract: str = abstract

    def get_type(self) -> str:
        '''Returns: "test" | "train"'''
        return "train" if self.class_ else "test"

### Figure 3. DataModel Class

In [47]:
class DataModel:
    def __init__(self, data_path: str, do_shuffle: bool = False) -> None:
        '''
        The DataModel class is a container for the data in the dataset.
        This will be greatly helpful for the model to access the data in a structured way, and minimise risk of errors.
        The data is stored in a list of Data objects, which contain the id, class, and abstract of the data as attributes.
        The vocabulary size is also stored as an attribute, which is the number of unique words in the dataset.
        This DataModel will be able to split data for validation sets and ensembles.
        '''
        self.data: list[Data] = []
        self.vocabulary_size: int = 0
        if not data_path or data_path.strip() == "":
            self.data = []
        else:
            with open(data_path, 'r') as f:
                lines = f.readlines()
            if do_shuffle:
                random.shuffle(lines) # shouldn't be used for the test data just in case Kaggle expects the data in order
            for line in lines[1:]:
                line_split = line.strip().replace('"', '').split(',')
                if len(line_split) == 3:
                    id, class_, abstract = line_split
                    data = Data(id, class_, abstract)
                    self.data.append(data)
                else:
                    id, abstract = line_split
                    data = Data(id, None, abstract)
                    self.data.append(data)
        self.vocabulary_size = self.get_vocabulary_size()


    def set_data(self, data: list[Data]) -> None:
        '''Sets the data of this model to the given data and updates the vocab size.'''
        self.data = data
        self.vocabulary_size = self.get_vocabulary_size()


    def get_vocabulary_size(self) -> int:
        '''Returns the number of unique words in the dataset.'''
        vocabulary = set()
        for data in self.data:
            words = data.abstract.split()
            for word in words:
                vocabulary.add(word)
        self.vocabulary_size = len(vocabulary)
        return self.vocabulary_size


    def split_model(self, proportion: float) -> 'DataModel':
        '''Splits this dataset into two data sets based on the proportion of the current data to be in the new split.'''
        split_data = DataModel('')
        split_index = int(len(self.data) * proportion)
        split_data.set_data(self.data[:split_index])
        self.set_data(self.data[split_index:])
        return split_data
    
    
    def eliminate_stop_words(self, stop_word_proportion) -> None:
        '''
        Stopwords include:
        - Any word that is a single character
        - Any word that is a number
        - The top 10% of words that appear in all classes
        '''
        word_count = {}
        num_eliminated = 0
        for data in self.data:
            words = data.abstract.split()
            new_data = []
            for word in words:
                if len(word) > 1 and not word.isdigit():
                    if word in word_count:
                        word_count[word] += 1
                    else:
                        word_count[word] = 1
                    new_data.append(word)
                else:
                    num_eliminated += 1
            data.abstract = ' '.join(new_data)
        stop_words = set()
        num_words_to_eliminate = int(len(word_count) * stop_word_proportion)
        for _ in range(num_words_to_eliminate):
            max_word = max(word_count, key=word_count.get)
            stop_words.add(max_word)
            del word_count[max_word]
            num_eliminated += 1
        for data in self.data:
            words = data.abstract.split()
            new_data = []
            for word in words:
                if word not in stop_words:
                    new_data.append(word)
            data.abstract = ' '.join(new_data)
        self.vocabulary_size = self.get_vocabulary_size()
        print(f'Eliminated {num_eliminated} words. Including the {len(stop_words)} most common and {num_eliminated - len(stop_words)} single char / number words.')

    
    def bootstrap_sample(self, proportion, num_models) -> list['DataModel']:
        '''Returns a list of num_models DataModels, each with a random sample of the data with proportion of the original data.'''
        num_per_model = int(len(self.data) * proportion)
        models = []
        for _ in range(num_models):
            new_data = []
            for _ in range(num_per_model):
                new_data.append(random.choice(self.data))
            new_model = DataModel('')
            new_model.set_data(new_data)
            models.append(new_model)
        return models        
        

### Figure 4. Standard Naive Bayes Classifier

In [48]:
class StandardNaiveBayes:
    def __init__(self) -> None:
        ''''
        This is the standard Naive Bayes classifier. 
        It uses the training data to calculate the probabilities of each word in each class and then uses these probabilities to classify the test data.
        The standard Naive Bayes has absolutely no added features or optimizations but still managed to get a 0.80 accuracy on the test data.
        '''
        self.training_data: DataModel = DataModel(
            data_path=r"data\trg.csv",
            do_shuffle=False # this counts as data pre-processing and won't be used for the standard model
        )
        self.testing_data: DataModel = DataModel(
            data_path=r"data\tst.csv",
            do_shuffle=False # ditto above
        )
        self.classes = [c for c in set([data.class_ for data in self.training_data.data])]
        self.class_counts = self.get_class_counts()
        self.class_probabilities = [count / len(self.training_data.data) for count in self.class_counts]
        self.word_counts = self.get_word_counts()


    def run_test_data(self, fileout: str) -> None:
        with open(fileout, 'w') as f:
            f.write("id,class\n")
            for data in self.testing_data.data:
                f.write(f"{data.id},{self.classify_abstract(data.abstract)}\n")


    def get_word_probability(self, word: str, class_index: int) -> float:
        '''
        p(class|word) = p(word|class) * p(class) / p(word)
        '''
        word_count = self.word_counts[class_index].get(word, 0)
        class_count = self.class_counts[class_index]
        word_in_class = word_count / class_count
        class_probability = self.class_probabilities[class_index]
        word_in_data = sum([self.word_counts[i].get(word, 0) for i in range(len(self.classes))]) / len(self.training_data.data)
        if word_in_data == 0:
            return 0
        return word_in_class * class_probability / word_in_data


    def classify_abstract(self, abstract: str) -> str:
        '''Classifies the abstract into one of the classes. Returns the class. Uses the Naive Bayesian Classifier algorithm.'''
        abstract_words = abstract.split()
        class_probabilities = []
        for i in range(len(self.classes)):
            cur_class_probability = self.class_probabilities[i]
            cur_class_probability = 1
            for word in abstract_words:
                cur_word_probability = self.get_word_probability(word, i)
                if cur_word_probability == 0:
                    continue
                cur_class_probability *= cur_word_probability
            class_probabilities.append(cur_class_probability)
        max_class = self.classes[class_probabilities.index(max(class_probabilities))]
        return max_class


    def get_class_counts(self) -> list[int]:
        '''Returns the count of each class in the training data. Match the order of the classes with the order of self.classes (classes[i] -> class_counts[i])'''
        class_counts = [0] * len(self.classes)
        for data in self.training_data.data:
            class_counts[self.classes.index(data.class_)] += 1
        return class_counts
    

    def get_word_counts(self) -> list[dict[str, int]]:
        '''Returns the count of each word in each class. Match the order of the classes with the order of self.classes (classes[i] -> word_counts[i])'''
        word_counts = [{} for _ in range(len(self.classes))]
        for data in self.training_data.data:
            for word in data.abstract.split():
                if word not in word_counts[self.classes.index(data.class_)]:
                    word_counts[self.classes.index(data.class_)][word] = 1
                else:
                    word_counts[self.classes.index(data.class_)][word] += 1
        return word_counts

### Figure 5. Main Routine for the Standard Naive Bayes

In [49]:
if __name__ == "__main__":
    # 10-fold repeated Cross-Validation for the Evaluation metric.
    total_accuracy = 0
    for _ in range(10):
        accuracy = 0
        model = StandardNaiveBayes()
        validation_data = model.training_data.split_model(0.1)
        for data in validation_data.data:
            classification = model.classify_abstract(data.abstract)
            if classification == data.class_:
                accuracy += 1
        total_accuracy += accuracy / len(validation_data.data)
    print(f"Average accuracy: {total_accuracy / 10}")
        
    # Simple run of the model on test data
    classifier = StandardNaiveBayes()
    classifier.run_test_data(fileout="standard_output.csv")
    '''
    -->    Accuracy Obtained: 0.800 (Kaggle)
    '''

### Figure 6. N-Gram Feature Representation

In [50]:
class N_Gram_DataModel:
    def __init__(self, data_path: str, do_shuffle: bool = False) -> None:
        '''
        Difference between MN_DataModel and DataModel is that it will combine multiple words as features.
        This means that the vocabulary size will be smaller as words are paired together.
        This could capture the meaning of the words better, but also results in less features and potentially overfitting.
        '''
        self.data: list[Data] = []
        self.vocabulary_size: int = 0
        if not data_path or data_path.strip() == "":
            self.data = []
        else:
            with open(data_path, 'r') as f:
                lines = f.readlines()
            if do_shuffle:
                random.shuffle(lines) # shouldn't be used for the test data just in case Kaggle expects the data in order
            for line in lines[1:]:
                line_split = line.strip().replace('"', '').split(',')
                if len(line_split) == 3:
                    id, class_, abstract = line_split
                    data = Data(id, class_, abstract)
                    self.data.append(data)
                else:
                    id, abstract = line_split
                    data = Data(id, None, abstract)
                    self.data.append(data)
        self.vocabulary_size = self.get_vocabulary_size()


    def set_data(self, data: list[Data]) -> None:
        '''Simplt sets the data and ensures to update the vocab size attribute.'''
        self.data = data
        self.vocabulary_size = self.get_vocabulary_size()


    def get_vocabulary_size(self) -> int:
        '''Returns the number of unique words in the dataset.'''
        vocabulary = set()
        for data in self.data:
            words = self.split_abstract(data.abstract)
            for word in words:
                vocabulary.add(word)
        self.vocabulary_size = len(vocabulary)
        return self.vocabulary_size


    def split_model(self, proportion: float) -> 'N_Gram_DataModel':
        '''Splits this dataset into two data sets based on the proportion of the current data to be in the new split.'''
        split_data = N_Gram_DataModel('')
        split_index = int(len(self.data) * proportion)
        split_data.set_data(self.data[:split_index])
        self.set_data(self.data[split_index:])
        return split_data
    

    def split_abstract(self, abstract: str) -> list[str]:
        '''Splits the abstract into a list of paired words.'''
        words = abstract.split()
        combined_words = []
        for i in range(0, len(words), 2):
            if i + 1 < len(words):
                combined_words.append(words[i] + ' ' + words[i + 1])
            else:
                combined_words.append(words[i])
        return combined_words


    def eliminate_stop_words(self, stop_word_proportion) -> None:
        '''
        Stopwords include:
        - Any word that is a single character
        - Any word that is a number
        - The top n% of words to appear
        '''
        # first, get the single char / number words out
        word_count = {}
        num_eliminated = 0
        for data in self.data:
            words = self.split_abstract(data.abstract)
            new_data = []
            for word in words:
                if len(word) > 1 and not word.isdigit():
                    if word in word_count:
                        word_count[word] += 1
                    else:
                        word_count[word] = 1
                    new_data.append(word)
                else:
                    num_eliminated += 1
            data.abstract = ' '.join(new_data)
        # now eliminate the top (stop_word_proportion)% of words
        stop_words = set()
        num_words_to_eliminate = int(len(word_count) * stop_word_proportion)
        for _ in range(num_words_to_eliminate):
            max_word = max(word_count, key=word_count.get)
            stop_words.add(max_word)
            del word_count[max_word]
            num_eliminated += 1
        for data in self.data:
            words = self.split_abstract(data.abstract)
            new_data = []
            for word in words:
                if word not in stop_words:
                    new_data.append(word)
            data.abstract = ' '.join(new_data)
        self.vocabulary_size = self.get_vocabulary_size()
        print(f'Eliminated {num_eliminated} words. Including the {len(stop_words)} most common and {num_eliminated - len(stop_words)} single char / number words.')
        
        

### Figure 7. Improved Naive Bayesian Classifier

In [51]:
class ImprovedNaiveBayes:
    def __init__(self, validation_data_split: float, alpha: float, stop_word_proportion: float, auto_load_data: bool = True) -> None:
        '''
        This model is an improvement on the standard naive bayes model. It includes the following improvements:
        - Supporting validation data splits
        - Dirichlet smoothing
        - Stop word elimination
        - Ability to combine words as features
        - Supporting manual data loading for ensembles / superstructures of models        
        '''
        if auto_load_data:
            self.training_data: DataModel = DataModel(
                data_path=r"data\trg.csv",
                do_shuffle=True                         # Data stochasiticity parameter Figure 7.1
            )
            self.testing_data: DataModel = DataModel(
                data_path=r"data\tst.csv",
                do_shuffle=False
            )
            self.training_data.eliminate_stop_words(stop_word_proportion) # stop word elimination parameter Figure 7.2
            if validation_data_split > 0.0:        # validation data split parameter Figure 7.3
                self.validation_data: DataModel = self.training_data.split_model(validation_data_split)
            else:
                self.validation_data = None
            self.alpha = alpha
            self.vocab_size = self.training_data.vocabulary_size
            self.classes = [c for c in set([data.class_ for data in self.training_data.data])]
            self.class_counts = self.get_class_counts()
            self.class_probabilities = [count / len(self.training_data.data) for count in self.class_counts]
            self.word_counts = self.get_word_counts()
        else:
            self.training_data = DataModel('')
            self.testing_data = DataModel('')
            self.validation_data = None
            self.alpha = alpha                      # Dirichlet smoothing hyperparameter Figure 7.4
            self.vocab_size = 0                     # Dirichlet smoothing hyperparameter Figure 7.5
            self.classes = []
            self.class_counts = []
            self.class_probabilities = []
            self.word_counts = []


    def set_data(self, training_data: DataModel = None, testing_data: DataModel = None, validation_data: DataModel = None) -> None:
        '''Method for manually setting the data. Needs to update several attributes of the model.'''
        if training_data:
            self.training_data.set_data(training_data.data)
        if testing_data:
            self.testing_data.set_data(testing_data.data)
        if validation_data:
            if self.validation_data:
                self.validation_data.set_data(validation_data.data)
            else:
                self.validation_data = DataModel('')
                self.validation_data.set_data(validation_data.data)
        self.vocab_size = self.training_data.vocabulary_size
        self.classes = [c for c in set([data.class_ for data in self.training_data.data])]
        self.class_counts = self.get_class_counts()
        self.class_probabilities = [count / len(self.training_data.data) for count in self.class_counts]
        self.word_counts = self.get_word_counts()


    def get_validation_accuracy(self, print_prediciton: bool = False) -> float:
        '''Returns the accuracy of the model on the validation data.'''
        if not self.validation_data:
            print(f"Model has no validation set.")
            return 0
        correct = 0
        for data in self.validation_data.data:
            predicted_class = self.classify_abstract(data.abstract)
            if print_prediciton:
                print(f"Predicted: {predicted_class} | Actual: {data.class_}")
            if predicted_class == data.class_:
                correct += 1
        return correct / len(self.validation_data.data)


    def run_test_data(self, fileout: str, type_: str = "test") -> None:
        '''Runs the model on either the testing or training data and writes the results to a file.'''
        with open(fileout, 'w') as f:
            f.write("id,class\n")
            if type_ == "test":
                for data in self.testing_data.data:
                    f.write(f"{data.id},{self.classify_abstract(data.abstract)}\n")
            elif type_ == "train":
                for data in self.training_data.data:
                    f.write(f"{data.id},{self.classify_abstract(data.abstract)}\n")
            else:
                raise ValueError("Invalid type_ argument. Must be 'test', 'validation', or 'train'.")
    
    
    def get_word_probability(self, word: str, class_index: int) -> float:
        '''p(class|word) = p(word|class) * p(class) / p(word) ++ Dirichlet smoothing'''
        word_count = self.word_counts[class_index].get(word, 0)
        class_count = self.class_counts[class_index]
        word_in_class = (word_count + self.alpha) / (class_count + self.alpha * self.vocab_size)
        class_probability = self.class_probabilities[class_index]
        word_in_data = (sum([self.word_counts[i].get(word, 0) for i in range(len(self.classes))]) + self.alpha) / (len(self.training_data.data) + self.alpha * self.vocab_size)
        return word_in_class * class_probability / word_in_data


    def classify_abstract(self, abstract: str) -> str:
        '''Classifies the abstract into one of the classes. Returns the class. Uses the Naive Bayesian Classifier algorithm.'''
        abstract_words = abstract.split()
        class_probabilities = []
        for i in range(len(self.classes)):
            cur_class_probability = self.class_probabilities[i]
            cur_class_probability = 1
            for word in abstract_words:
                cur_word_probability = self.get_word_probability(word, i)
                if cur_word_probability == 0:
                    continue
                cur_class_probability *= cur_word_probability
            class_probabilities.append(cur_class_probability)
        max_class = self.classes[class_probabilities.index(max(class_probabilities))]
        return max_class
    


    def get_class_counts(self) -> list[int]:
        '''Returns the count of each class in the training data. Match the order of the classes with the order of self.classes (classes[i] -> class_counts[i])'''
        class_counts = [0] * len(self.classes)
        for data in self.training_data.data:
            class_counts[self.classes.index(data.class_)] += 1
        return class_counts
    

    def get_word_counts(self) -> list[dict[str, int]]:
        '''Returns the count of each word in each class. Matches the order of the classes with the order of self.classes (classes[i] -> word_counts[i])'''
        word_counts = [{} for _ in range(len(self.classes))]
        for data in self.training_data.data:
            words = data.abstract.split()
            for word in words:
                if word not in word_counts[self.classes.index(data.class_)]:
                    word_counts[self.classes.index(data.class_)][word] = 1
                else:
                    word_counts[self.classes.index(data.class_)][word] += 1
        return word_counts
    

    def save(self):
        '''Saves the word counts to a txt file'''
        with open("word_counts.txt", 'w') as f:
            for i in range(len(self.classes)):
                f.write('-'*100 + '\n')
                f.write(f"Class: {self.classes[i]}\n")
                f.write(f"Class Count: {self.class_counts[i]}\n")
                f.write(f"Class Probability: {self.class_probabilities[i]}\n\n")
                for word, count in self.word_counts[i].items():
                    f.write(f"{word}: {count}\n")
    

    def tune_hyper_param(self, min_value: float, max_value: float, step: float, param: str) -> float: # Tuning Method Figure 7.6
        '''Tunes the hyperparameter of the model and returns the best value.'''
        if not self.validation_data:
            print("Model has no validation set.")
            return 0
        multiplier = 1
        if isinstance(min_value, float):
            multiplier = 10 ** len(str(min_value).split('.')[1])
        min_value = int(min_value * multiplier)
        max_value = int(max_value * multiplier)
        step = int(step * multiplier)
        best_value = min_value
        best_accuracy = 0
        for value in range(min_value, max_value, step):
            setattr(self, param, value)
            accuracy = self.get_validation_accuracy()
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_value = value
        return best_value / multiplier

### Figure 8. Main Routine for Improved Naive Bayesian Classifier

In [66]:
def create_divided_ensemble(num_models: int, alpha: float, stop_word_proportion: float, validation_proportion: float) -> list[ImprovedNaiveBayes]:
    '''Creates num_models models and splits the training data into num_models parts. Each model is trained on a different part of the data.'''
    ensemble: list[ImprovedNaiveBayes] = []
    for j in range(num_models):
        if j == 0:
            auto_load = True
            valid_prop = validation_proportion
        else:
            auto_load = False
            valid_prop = 0.0
        classifier = ImprovedNaiveBayes(
            validation_data_split=valid_prop,
            alpha=alpha,
            stop_word_proportion=stop_word_proportion,
            auto_load_data=auto_load
        )
        ensemble.append(classifier)
    data_to_split = ensemble[0].training_data
    split_size = len(data_to_split.data) // num_models
    split_data = [data_to_split.data[i * split_size: (i + 1) * split_size] for i in range(num_models)]
    for i in range(num_models):
        new_datamodel = DataModel('')
        new_datamodel.set_data(split_data[i])
        if i == 0:
            ensemble[i].set_data(training_data=new_datamodel)
        else:
            ensemble[i].set_data(training_data=new_datamodel, testing_data=ensemble[0].testing_data)
    return ensemble


def create_bootstrap_ensemble(num_models: int, alpha: float, stop_word_proportion: float, validation_proportion: float) -> list[ImprovedNaiveBayes]:
    '''Creates num_models models and splits the training data into num_models parts. Each model is trained on a different part of the data.'''
    ensemble: list[ImprovedNaiveBayes] = []
    base_classifier = ImprovedNaiveBayes(
        validation_data_split=validation_proportion,
        alpha=alpha,
        stop_word_proportion=stop_word_proportion
    )
    data_to_split = base_classifier.training_data
    split_data = data_to_split.bootstrap_sample(1/num_models, num_models)
    for i in range(num_models):
        new_datamodel = DataModel('')
        new_datamodel.set_data(split_data[i].data)
        new_classifier = ImprovedNaiveBayes(
            validation_data_split=0.0,
            alpha=alpha,
            stop_word_proportion=stop_word_proportion,
            auto_load_data=False
        )
        new_classifier.set_data(training_data=new_datamodel, testing_data=base_classifier.testing_data)
        ensemble.append(new_classifier)
    return ensemble


def test_ensemble(ensemble: list[ImprovedNaiveBayes], fileout: str, moderator_model = None) -> None:
    '''Tests the ensemble on the testing data and writes the results to a file.'''
    with open(fileout, 'w') as f:
        f.write("id,class\n")
        for data in ensemble[0].testing_data.data:
            class_counts = {}
            for classifier in ensemble:
                predicted_class = classifier.classify_abstract(data.abstract)
                if predicted_class in class_counts:
                    class_counts[predicted_class] += 1
                else:
                    class_counts[predicted_class] = 1
            print(f"{data.id}: {class_counts}")
            if len(class_counts) >= 3 and moderator_model:
                max_class = moderator_model.classify_abstract(data.abstract)
                print(f"Moderator Intervention: {max_class}")
            else:
                max_class = max(class_counts, key=class_counts.get)
            f.write(f"{data.id},{max_class}\n")


def validate_ensemble(ensemble: list[ImprovedNaiveBayes], validation_set: DataModel, moderator: ImprovedNaiveBayes | None = None) -> None:
    '''Validates the ensemble on the validation data and prints the accuracy. Uses a moderator model if provided.'''
    true_positives = 0
    for data in validation_set.data:
        class_counts = {}
        for classifier in ensemble:
            predicted_class = classifier.classify_abstract(data.abstract)
            if predicted_class in class_counts:
                class_counts[predicted_class] += 1
            else:
                class_counts[predicted_class] = 1
        if len(class_counts) >= 3 and moderator:
            max_class = moderator.classify_abstract(data.abstract)
        else:
            max_class = max(class_counts, key=class_counts.get)
        if max_class == data.class_:
            true_positives += 1
    return true_positives / len(validation_set.data)


if __name__ == "__main__":
    ensemble: list[ImprovedNaiveBayes] = create_divided_ensemble(
        num_models=5,
        alpha=0.0001,
        stop_word_proportion=0.02,
        validation_proportion=0.1
    )
    print(f"Accuracy: {validate_ensemble(ensemble, ensemble[0].validation_data)}")
    '''
    -->    Accuracy Obtained: 0.920 (Kaggle)
    '''
    

Eliminated 41171 words. Including the 602 most common and 40569 single char / number words.
Accuracy: 0.9125


## Discussion
#### 1. Data Representation
I chose to write in object-oriented and type explicit python. This served two purposes; for one, the assignment would be marked on readability and for another, debugging OOP code can be much simpler. With this in mind, I created a ‘class hierarchy’ which would take advantage of this type of programming. 3 tiers of class would be at play:
1.	A Data class, which represents a single line of data with an id, class and abstract.
2.	A DataModel class which acts as a container for the Data classes and enables safer extraction and manipulation of the datasets.
3.	The Model class which holds the DataModels as its sets (train, test, valid) and holds the word frequency tables.

The SNB I was building was non-time constrained and would require plenty of testing, data splitting and code reuse and this highly modular approach seemed perfect for the task.</br>
I decided to completely minimalize my initial Standard Naïve Bayes Classifier and not include any data pre-processing or optimisations, as this would allow me to include these as extensions and see the accuracy differences when creating my improved model. My final model pre-processes with randomisation of the training data. This is because my final model was a majority-choice ensemble and I wanted to randomise the splits of training data between the ensembled models for more accurate validation results.


#### 2. My Standard Naive Bayesian Classifier
My SNB, the basic Naïve Bayes was intended to be a minimum-viable product of a Naïve Bayesian Classifier, so that the effects of improvements were fully captured. My SNB *[Figure 4]* works by immediately identifying the word and class counts and frequencies once the class object is initialised. Then, the model can be run with the run_test_data method and save the output to a csv.
Each classification is a simplistic implementation of Bayes Formula and this approach achieved a 0.800 accuracy on the Kaggle test set.

### 3. Implemented Extensions
For my improved classifier, I made and tested the following improvements. Not all of these improvements remained however, as will be discussed.</br>

1.	Smoothing</br>
In the training data given, classes ‘E’ and ‘B’ made up 94% of the training data, meaning identifying meaningful features for ‘A’ and ‘V’ was going to be difficult. With this in mind I implemented smoothing with a dirichlet distribution. The idea was that this would make more accurate classifications for rare  / unseen words by using prior knowledge of word frequencies. The assumption was that rarer words were more likely to be associated with the classes for which less training data exists. </br>

2.	Stop-word analysis</br>
Another issue was the existence of words which held no actual value to unseen data. This included numbers, single character words and words that appeared so ubiquitously across the classes that they just biased the model based on class frequency. To fix this I implemented a stop-word system whereby these features (class ubiquity, numbers and single characters) were removed. I also wrote a hyperparameter tuning function for the model and had it iterate over values between (0.01 and 10), before settling on an optimisation of the top 2% of words being removed (retested for the ensemble later). *[Figure 7.2]*</br>

3.	N-gram words</br>
I experimented with n-gram words for this classifier. The idea was that combining words together would better capture their meaning in the data and better identify classes with lower training frequency. However, because of the limited size of the training data I actually found this to make my model overfit the training data, and I removed it from my final model. It’s implementation can still be read in *[Figure 6]*.</br>

4.	Validation Splits / Data splits in general</br>
As I excluded any data pre-processing in the SNB, I implemented data splitting into the DataModel class so that improved models could split of proportions of their data for validation set testing or for more complex ensemble data distribution methods *[Figure 7.2]*. Additionally, I also implemented randomness into the data splitting, so that techniques such as bagging might be used. *[Figure 7.1]*</br>

5.	Tuning of hyperparameters (range-iter averaging)</br>
As mentioned in (b), the final improved classifier has a hyperparameter tuning method for which hyperparameters of the models were derived. These figures, such as for alpha, stop_word elimination proportion and so on, proved their effectiveness with big upticks in accuracy. *[Figure 7.6]*</br>

6.	Ensemble Structure</br>
Finally, I wanted to test an ensemble structure. My reasoning here was that the size of the training data would inevitably lead to some form of overfitting. A way to mitigate this would be to create an ensemble of classifiers, each with distinct training data. This way, overfitting would increase the difference between the models in the ensemble and help mitigate overall overfitting of the model. The distinct training data and tendency to overfit lead to a strong ensemble structure as the independent models were distinct.</br>

### 4. Evaluation
My final model consisted of an ensemble of 5 Improved Naïve Bayesian Classifiers, each containing a bag-of-words approach trained on distinct subsets of the training data. Each has had the top 2% of most common words removed as well as has a smoothing alpha of 0.0001 for Dirichlet smoothing as per the best outcomes of my hyperparameter tuning method. This led to a Kaggle accuracy of 0.92, with a 10-iteration validation test result of 0.91.

Comparing the improved Bayesian classifier with the original SNB classifier and with a non-ensembled bag-of-words Improved Classifier standing along:
<table>
  <thead>
    <tr>
      <th> </th>
      <th>Standard Naïve Bayes</th>
      <th>Improved Single</th>
      <th>Improved Ensemble</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>10-fold repeat Cross-Validation Accuracy</td>
      <td>0.785</td>
      <td>0.93</td>
      <td>0.910</td>
    </tr>
    <tr>
      <td>Test Accuracy</td>
      <td>0.800</td>
      <td>0.853</td>
      <td>0.920</td>
    </tr>
  </tbody>
</table>
** In all cases of Cross-Validation 10% of the total data was used for validation

As evidenced by these results on the test set, there was a steady improvement in accuracy on unseen data at each major stage of my model. However, the improved single model showed significant potential overfitting when run on the test data. This suggests that this individual model is highly overfit to the training data and was the reason I decided to test an ensemble structure.

Additionally, I also noted down the average accuracy of models before and after certain landmark implementations:
<table>
  <thead>
    <tr>
      <th>Improvement</th>
      <th>Accuracy difference (10-fold)</th>
      <th>Notes</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>Smoothing</td>
      <td>-0.014 [0.786]</td>
      <td>This was because my alpha parameter was far too large.</td>
    </tr>
    <tr>
      <td>Hyperparameter Tuning (alpha tuning)</td>
      <td>+0.067 [0.853]</td>
      <td>This rapid boost was the effect of tuning on alpha for smoothing.</td>
    </tr>
    <tr>
      <td>Stop-words</td>
      <td>+0.040 [0.893]</td>
      <td>Tuned to 2%</td>
    </tr>
    <tr>
      <td>N-Grams</td>
      <td>-0.013 [~0.880]</td>
      <td>This idea was scratched **</td>
    </tr>
    <tr>
      <td>Ensemble Structure</td>
      <td>+0.027 [0.920]</td>
      <td>Final Test Accuracy</td>
    </tr>
  </tbody>
</table>
** As discussed earlier, the N-Grams failure was likely due to massive overfitting as my method involved reducing the size of the resultant training set to around half the size.

__Conclusions__:
- Each individual improvement included in the final classifier made improvements to the accuracy in both the validation and test sets
- The initial model was being greatly affected by words that held no meaningful classification value like 'a', 'if' or '42'.
- Improvements such as smoothing can appear to be decreasing performance if hyperparameters haven't been properly optimised
- The individual improved model tends to overfit with a large difference between the validation data and the test data but ... 
- Using this overfitting characteristic as individual members of an ensemble greatly increased the overall accuracy while ensembles can do a good job of reducing overfitting