## Imports & Constants

In [1]:
import numpy as np
import pandas as pd
import numpy.typing as npt
from abc import ABC, abstractmethod
from sklearn.naive_bayes import MultinomialNB, GaussianNB

from dataset_types import ReviewDataSet
from feature_generation import FeatureSetGenerator
from feature_normalization import FeatureSetNormalizer

POSITIVE_REVIEWS_DIR = "./data/pos/"
NEGATIVE_REVIEWS_DIR = "./data/neg/"

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wij21\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wij21\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\wij21\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\wij21\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Data

In [2]:
dataset = ReviewDataSet([POSITIVE_REVIEWS_DIR, NEGATIVE_REVIEWS_DIR]).load()

In [12]:
feature_set = FeatureSetGenerator(dataset)\
    .lemmatize()\
    .remove_stopwords()\
    .remove_punctuation()\
    .create_n_grams(1)

In [15]:
normalizer = FeatureSetNormalizer(feature_set)
normalized_feature_set = normalizer.perform_tf_idf_norm()

In [16]:
len(normalized_feature_set.first().contents)

47771

In [17]:
X_train, y_train, X_dev, y_dev, X_test, y_test = normalized_feature_set.as_train_dev_test_arrays("polarity", 0.3)

## Naive Bayes Classifier

Naive Bayes uses a 'bag of words approach', where individual words constitute its features, and the other of the words is ignored.
- The adjective 'Naive' connotes the asssumption that the occurence of features in a dataset are mutually independent.
    - In reality, this is a performance-damaging assumption

[Variants of Naive Bayes Classifiers](https://en.wikipedia.org/wiki/Naive_Bayes_classifier)
- With a `mutlinomial` event model, samples (feature vectors) the frequencies with which certain events
have been generated by a `multinomial` (p_1, ..., p_n) where p_i is the probability that event i occurs.
    - Or K such multinomials in the _multiclass_ case
    - This is the event model typically used for document classification (e.g., sentiment analysis)
        - Events can represent the occurence of a word in a single document
        - In our case, events represent the TF-IDF of words in a document
- `Guassian` distributions are typically used when dealing with continuous data
    - I gather `mutlinomial` is much better for NLP applications

### Helpful Definitions
**Prior Probability**: The probability of an event (e.g., a spam email) before the collection of new data.
- I.e., if prior observations are used to calculate the probability, we call it the prior probability

**Conditional Likelihoods**:

**Maximum a Posteriori**:

In [7]:
class IClassifier(ABC):

    @abstractmethod
    def fit(self):
        raise NotImplementedError

    @abstractmethod
    def predict(self):
        raise NotImplementedError


class NaiveBayesClassifier(IClassifier):

    def __init__(self, k: int = 2, alpha: float = 1):
        """
        
        :param k: The number of classes in the classification problem.
        :param alpha: A laplace smoothing parameter.
        """
        self.k = k
        self.alpha = alpha

    def fit(self, data: npt.NDArray, labels: npt.NDArray):
        num_samples, num_features = data.shape

        # Allocate memory for the class prior probabilities and class conditional feature likelihoods.
        self.class_prior_probas = np.zeros(shape=(self.k,))
        self.class_conditional_feature_likelihoods = np.zeros(shape=(self.k, num_features))

        for cls in range(self.k):
            samples_of_class = data[labels == cls]
            class_sample_count = samples_of_class.shape[0]

            # Probability class cls occurs. P(Y)
            self.class_prior_probas[cls] = class_sample_count / num_samples

            # Term frequencies for class `cls`.
            cls_global_feature_totals = samples_of_class.sum(axis=0) # shape = (num_features, )

            # Compute the conditional probability for each feature given the class
            # (including Laplace smoothing). P(X | Y)
            self.class_conditional_feature_likelihoods[cls, :] = (
                    (cls_global_feature_totals + self.alpha) /
                    (np.sum(cls_global_feature_totals) + self.alpha * num_features)
                )

    def predict(self, data: npt.NDArray) -> npt.NDArray:
        # Calculate the log of the priors.
        log_class_priors = np.log(self.class_prior_probas)

        # Calculate the log probabilities of each class for each sample.
        logits = log_class_priors + (data @ np.log(self.class_conditional_feature_likelihoods).T)

        # For each sample, get the index of the maximum logit (i.e., the predicted class)
        return np.argmax(logits, axis=1)

    def legacy_predict(self, data: npt.NDArray):
        num_samples, _ = data.shape

        # Allocate memory for our predictions.
        predictions = np.zeros(shape=(num_samples,))

        # Calculate the log of class prior probabilities.
        log_class_priors = np.log(self.class_prior_probas)

        for idx, sample in enumerate(data):
            # Log probabilities of each class for the current sample.
            logits = np.zeros(shape=(self.k,))

            for cls in range(self.k):
                # The log probability of each feature given the class.
                feature_log_probabilities = sample * np.log(self.class_conditional_feature_likelihoods[cls, :])

                # Sum the feature log probabilties to get the total log probability of the sample
                # given the class.
                log_prob_features_given_class = np.sum(feature_log_probabilities)

                # Add the prior probability of the class itself to get the final log probability
                # for the class.
                logits[cls] = log_class_priors[cls] + log_prob_features_given_class

            predictions[idx] = np.argmax(logits)

        return predictions

## Evaluation & Comparison Classes

In [8]:
class ClassifierPerformanceSummary:

    def __init__(self, accuracy: float, precision: float, recall: float, f1: float):
        self.accuracy = accuracy
        self.precision = precision
        self.recall = recall
        self.f1 = f1

    def __repr__(self) -> str:
        return f"""
        Accuracy:  {self.accuracy*100:.4f}%
        Precision: {self.precision*100:.4f}%
        Recall:    {self.recall*100:.4f}%
        F1:        {self.f1*100:.4f}%
        """
    
    def as_dict(self) -> dict:
        return {
            'accuracy': self.accuracy,
            'precision': self.precision,
            'recall': self.recall,
            'f1': self.f1
        }


class BinaryClassifierEvaluator:

    def __init__(self, ground_truth: npt.NDArray, predictions: npt.NDArray):
        self.predictions = predictions
        self.ground_truth = ground_truth
        self.confusion_matrix = self.produce_confusion_matrix()
        """
        [[ TP, FP ],
         [ FN, TN ]]
        """

    def produce_confusion_matrix(self) -> npt.NDArray:
        confusion_matrix = np.zeros(shape=(2, 2))

        # True positives: where both prediction and ground truth are 1.
        confusion_matrix[0, 0] = np.sum((self.predictions == 1) & (self.ground_truth == 1))

        # False positives: where prediction is 1, but ground truth is 0.
        confusion_matrix[0, 1] = np.sum((self.predictions == 1) & (self.ground_truth == 0))
        
        # False negatives: where prediction is 0, but ground truth is 1.
        confusion_matrix[1, 0] = np.sum((self.predictions == 0) & (self.ground_truth == 1))
        
        # True negatives: where both prediction and ground truth are 0.
        confusion_matrix[1, 1] = np.sum((self.predictions == 0) & (self.ground_truth == 0))

        return confusion_matrix.astype(int)

    def calculate_accuracy(self) -> float:
        """
        Calculates (TP + TN) / Num Predictions.
        """
        return (self.confusion_matrix[0, 0] + self.confusion_matrix[1, 1]) / self.confusion_matrix.sum()

    def calculate_recall(self) -> float:
        """
        Calculates TP / (TP + FN)
        """
        return self.confusion_matrix[0, 0] / self.confusion_matrix[:, 0].sum()

    def calculate_precision(self) -> float:
        """
        Calculates TP / (TP + FP).
        """
        return self.confusion_matrix[0, 0] / self.confusion_matrix[0, :].sum()
    
    def calculate_f1(self) -> float:
        """
        Calculates the harmonic mean of precision and recall.
        """
        precision = self.calculate_precision()
        recall = self.calculate_recall()

        return 2 * (precision * recall) / (precision + recall)
    
    def get_summary(self) -> ClassifierPerformanceSummary:
        return ClassifierPerformanceSummary(
            self.calculate_accuracy(),
            self.calculate_precision(),
            self.calculate_recall(),
            self.calculate_f1()
        )
    

class BaseComparator:

    def __init__(self, X_trains: list[npt.NDArray], y_trains: list[npt.NDArray]):
        self.X_trains = X_trains
        self.y_trains = y_trains


class FeatureSetsComparator(BaseComparator):

    def train_and_evaluate(
            self, 
            cls: IClassifier, 
            X_train: npt.NDArray, 
            y_train: npt.NDArray, 
            X_test: npt.NDArray, 
            y_test: npt.NDArray, 
            hyperparams: dict
            ) -> pd.DataFrame:
        
        classifier: IClassifier = cls(**hyperparams)
        classifier.fit(X_train, y_train)

        predictions = classifier.predict(X_test)
        return BinaryClassifierEvaluator(y_test, predictions).get_summary().as_dict()
    
    def compare(
            self, 
            classifier_cls: IClassifier, 
            X_tests: list[npt.NDArray], 
            y_tests: list[npt.NDArray], 
            hyperparams: dict
            ) -> pd.DataFrame:
        
        performance_data = pd.DataFrame()
        for i in range(len(self.X_trains)):
            performance_data[f"{classifier_cls.__name__} - Set {i}"] = self.train_and_evaluate(
                classifier_cls,
                self.X_trains[i],
                self.y_trains[i],
                X_tests[i],
                y_tests[i],
                hyperparams
            )

        return performance_data.T


class ClassifiersComparator(BaseComparator):

    def compare(
            self, 
            classifier_classes: list[IClassifier], 
            X_tests: list[npt.NDArray], 
            y_tests: list[npt.NDArray]
            ) -> pd.DataFrame:
        
        classifier_data = pd.DataFrame()
        for cls in classifier_classes:
            performance = FeatureSetsComparator(self.X_trains, self.y_trains)\
                .compare(cls, X_tests, y_tests, {})
            
            classifier_data = pd.concat([classifier_data, performance], axis=0)

        return classifier_data

## Evaluating our Naive Bayes Classifier

In [18]:
classifier = NaiveBayesClassifier(2, 1)
classifier.fit(X_train, y_train)

In [19]:
cdfl = classifier.class_conditional_feature_likelihoods
priors = classifier.class_prior_probas

In [20]:
dev_set_predictions = classifier.predict(X_dev)
BinaryClassifierEvaluator(y_dev, dev_set_predictions).get_summary()


        Accuracy:  84.3333%
        Precision: 84.1060%
        Recall:    84.6667%
        F1:        84.3854%
        

## Evaluating Sklearn's Multinomial Naive Bayes Classifier

In [35]:
mnb_classifier = MultinomialNB(alpha=1, fit_prior=True, ) # force_alpha=True

mnb_classifier.fit(X_train, y_train)
sklearn_dev_set_predictions = mnb_classifier.predict(X_dev)

BinaryClassifierEvaluator(y_dev, sklearn_dev_set_predictions).get_summary()


        Accuracy:  77.5000%
        Precision: 83.1325%
        Recall:    69.0000%
        F1:        75.4098%
        

In [15]:
clf_perform = ClassifiersComparator(
    [X_train, X_train],
    [y_train, y_train]
).compare([MultinomialNB, GaussianNB, NaiveBayesClassifier], [X_dev, X_dev], [y_dev, y_dev])

In [16]:
clf_perform.sort_values(by="f1")

Unnamed: 0,accuracy,precision,recall,f1
MultinomialNB - Set 0,accuracy,precision,recall,f1
MultinomialNB - Set 1,accuracy,precision,recall,f1
GaussianNB - Set 0,accuracy,precision,recall,f1
GaussianNB - Set 1,accuracy,precision,recall,f1
NaiveBayesClassifier - Set 0,accuracy,precision,recall,f1
NaiveBayesClassifier - Set 1,accuracy,precision,recall,f1
