## Imports & Constants

In [43]:
import numpy as np
import numpy.typing as npt
import pandas as pd

from dataset_types import ReviewDataSet
from feature_generation import FeatureSetGenerator
from feature_normalization import FeatureSetNormalizer

# Relative paths handed to ReviewDataSet when the dataset is loaded.
# Presumably each directory holds the review files of one polarity — TODO confirm layout.
POSITIVE_REVIEWS_DIR = "./data/pos/"
NEGATIVE_REVIEWS_DIR = "./data/neg/"

## Data

In [5]:
# Build the dataset from both review directories and load it.
dataset = ReviewDataSet(
    [POSITIVE_REVIEWS_DIR, NEGATIVE_REVIEWS_DIR],
).load()

In [6]:
# Feature pipeline, one step per line: punctuation removal, stemming,
# then n-gram creation with n = 1.
feature_set = (
    FeatureSetGenerator(dataset)
    .remove_punctuation()
    .stem()
    .create_n_grams(1)
)

In [7]:
# Wrap the generated feature set in a normalizer (kept as its own variable),
# then call its TF-IDF step to obtain the normalized feature set.
normalizer = FeatureSetNormalizer(feature_set)
normalized_feature_set = normalizer.perform_tf_idf()

In [8]:
# "polarity" is presumably the label column and 0.3 the held-out
# fraction — TODO confirm against split_into_train_dev_test_sets.
(
    X_train, y_train,
    X_dev, y_dev,
    X_test, y_test,
) = normalized_feature_set.split_into_train_dev_test_sets("polarity", 0.3)

## Naive Bayes Classifier

Naive Bayes uses a 'bag of words' approach, where individual words constitute its features and the order of the words is ignored.
- The adjective 'Naive' refers to the assumption that the occurrences of features in a dataset are mutually independent given the class.
    - In reality, this is a performance-damaging assumption

[Variants of Naive Bayes Classifiers](https://en.wikipedia.org/wiki/Naive_Bayes_classifier)
- With a `multinomial` event model, samples (feature vectors) represent the frequencies with which certain events
have been generated by a `multinomial` $(p_1, \dots, p_n)$, where $p_i$ is the probability that event $i$ occurs.
    - Or K such multinomials in the _multiclass_ case
    - This is the event model typically used for document classification (e.g., sentiment analysis)
        - Events can represent the occurrence of a word in a single document
        - In our case, events represent the TF-IDF of words in a document
- `Gaussian` distributions are typically used when dealing with continuous data
    - I gather `multinomial` is much better for NLP applications

### Helpful Definitions
**Prior Probability**: The probability of an event (e.g., a spam email) before the collection of new data.
- I.e., if prior observations are used to calculate the probability, we call it the prior probability

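**Conditional Likelihoods**: The probability of observing a feature given the class, $P(x_i \mid y)$ (e.g., how likely a particular word is to appear in a positive review).
- Naive Bayes estimates one such likelihood per feature per class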

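**Maximum a Posteriori**: The decision rule that predicts the class with the highest posterior probability, $\hat{y} = \arg\max_y P(y) \prod_i P(x_i \mid y)$.
- In practice the product is computed as a sum of logarithms to avoid numerical underflow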

In [75]:
class NaiveBayesClassifier:
    """Multinomial Naive Bayes classifier with Laplace (additive) smoothing.

    Classes are expected to be encoded as the integers ``0 .. k - 1``; the
    training loop selects the samples of each class with ``labels == cls``.

    After :meth:`train` has run, two attributes are available:

    - ``class_prior_probas``: array of shape ``(k,)`` holding P(Y).
    - ``class_conditional_feature_likelihoods``: array of shape
      ``(k, num_features)`` holding P(X | Y); each row sums to 1.
    """

    def __init__(self, k: int, alpha: float):
        """
        Store the hyper-parameters; no fitting happens here.

        :param k: The number of classes in the classification problem.
        :param alpha: A laplace smoothing parameter.
        """
        self.k = k
        self.alpha = alpha

    def train(self, data: npt.NDArray, labels: npt.NDArray):
        """
        Estimate the class priors and the smoothed per-class feature likelihoods.

        :param data: 2-D array of shape (num_samples, num_features) holding
            non-negative feature weights (e.g. counts or TF-IDF values).
        :param labels: 1-D array of length num_samples with class ids in
            ``0 .. k - 1``.
        """
        num_samples, num_features = data.shape

        # Allocate memory for the class prior probabilities and the class-conditioned feature likelihoods.
        self.class_prior_probas = np.zeros(shape=(self.k,))
        self.class_conditional_feature_likelihoods = np.zeros(shape=(self.k, num_features))

        for cls in range(self.k):
            samples_of_class = data[labels == cls]
            class_sample_count = samples_of_class.shape[0]

            # Probability class cls occurs. P(Y)
            self.class_prior_probas[cls] = class_sample_count / num_samples

            # Per-feature totals over all samples of class `cls`.
            cls_global_feature_totals = samples_of_class.sum(axis=0) # shape = (num_features, )

            # Compute the conditional probability for each feature given the class
            # (including Laplace smoothing). P(X | Y)
            # Adding alpha to every feature and alpha * num_features to the
            # denominator keeps each row a valid distribution (sums to 1).
            self.class_conditional_feature_likelihoods[cls, :] = (
                    (cls_global_feature_totals + self.alpha) /
                    (np.sum(cls_global_feature_totals) + self.alpha * num_features)
                )

    def predict(self, data: npt.NDArray) -> npt.NDArray:
        """
        Predict the most probable class for every sample (MAP decision rule).

        For each sample x the score of class y is computed in log space as
        ``log P(y) + sum_i x_i * log P(feature_i | y)`` and the class with the
        highest score is returned. Working with logs avoids the numerical
        underflow of multiplying many small probabilities.

        :param data: 2-D array of shape (num_samples, num_features) with the
            same feature layout that was used for training.
        :return: Integer array of shape (num_samples,) with the predicted
            class id of each sample.
        :raises RuntimeError: If called before :meth:`train`.

        Note: with ``alpha == 0`` a likelihood can be exactly 0, whose log is
        ``-inf``; scores may then become ``nan``. Use ``alpha > 0``.
        """
        if not hasattr(self, "class_prior_probas"):
            raise RuntimeError("train() must be called before predict().")

        # log(0) legitimately yields -inf here (e.g. a class that never
        # occurred in training), so silence the divide warning.
        with np.errstate(divide="ignore"):
            log_class_priors = np.log(self.class_prior_probas)
            log_feature_likelihoods = np.log(self.class_conditional_feature_likelihoods)

        # (num_samples, num_features) @ (num_features, k) -> (num_samples, k).
        # Broadcasting then adds the log prior of each class to every row.
        log_joint = np.asarray(data @ log_feature_likelihoods.T, dtype=float) + log_class_priors

        return np.argmax(log_joint, axis=1)

In [76]:
# Two classes, Laplace smoothing parameter of 1.
classifier = NaiveBayesClassifier(k=2, alpha=1)

In [77]:
# Fit the priors and smoothed feature likelihoods on the training split.
classifier.train(data=X_train, labels=y_train)

In [84]:
# Sanity check: the likelihoods of each class (one row per class) should sum to 1.
np.sum(classifier.class_conditional_feature_likelihoods, axis=1)

array([1., 1.])