# Plan
- [x] Load all reviews into a data structure in memory
- [x] Apply tokenization to each review (base step)
- [x] Choose and apply three pathways through the feature generation process
    - _Remember that you will need to justify your choices with reference to the accuracy that is achieved with each feature set_

A simple idea for a datastructure that holds the dataset in memory is:
```Python
dataset = [
    {
        "id": 1,
        "rating": 7,
        "polarity": 1,
        "contents:": "Lorem Ipsum"
    },
    ...
]
```

Later, this should be a `pandas` dataframe.

## Imports & Constants

In [1]:
import os
import copy
import utils
import string
import numpy as np
import pandas as pd
import numpy.typing as npt
from typing_extensions import Self
# from typing import TypeAlias, Tuple
TypeAlias = None
from typing import Tuple
from functools import cached_property
from collections import Counter, defaultdict
from sklearn.model_selection import StratifiedShuffleSplit

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.util import ngrams, everygrams

POSITIVE_REVIEWS_DIR = "./data/pos/"
NEGATIVE_REVIEWS_DIR = "./data/neg/"

[nltk_data] Downloading package punkt to /home/sowell/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sowell/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sowell/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/sowell/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Dataset Types

In [2]:
class TextualDataPoint:

    def __init__(self, file_path: str):
        self.file_path = file_path
        self.contents = self._extract_contents()

    @cached_property
    def basename(self) -> str:
        return os.path.basename(self.file_path)

    @cached_property
    def filename(self) -> str:
        return os.path.splitext(self.basename)[0]
    
    def _extract_contents(self) -> str:
        with open(self.file_path, "r") as file:
            return file.read()
        
    def as_dict(self) -> dict:
        return {
            'contents': self.contents,
        }


class Review(TextualDataPoint):

    def __init__(self, file_path: str):
        super().__init__(file_path)
        self.id = self._parse_id()
        self.rating = self._parse_rating()
        self.polarity = self._determine_polarity()

    def _parse_id(self) -> int:
        return int(self.filename.split("_")[0])

    def _parse_rating(self) -> int:
        return int(self.filename.split("_")[1])
    
    def _determine_polarity(self) -> int:
        if self.rating is None:
            self.rating = self._parse_rating()

        if self.rating <= 4:
            return 0
        elif self.rating >= 7:
            return 1
        else:
            raise ValueError(f"unexpected rating: {self.rating}")

    def as_dict(self) -> dict:
        return {
            'id': self.id,
            'rating': self.rating,
            'polarity': self.polarity,
        } | super().as_dict()

In [3]:
class IterableSet:

    datapoint_class: TypeAlias = TextualDataPoint

    def __init__(self, datapoints: list[datapoint_class]):
        self.datapoints = datapoints

        # To keep track of the current iteration position.
        self.index = 0

    def first(self) -> datapoint_class:
        return self.datapoints[0]

    def last(self) -> datapoint_class:
        return self.datapoints[-1]
    
    def __len__(self) -> int:
        return len(self.datapoints)
    
    def __iter__(self) -> Self:
        # Reset the index whenever starting a new iteration.
        self.index = 0
        return self
        
    def __next__(self) -> datapoint_class:
        # Make sure there are more datapoints to yield.
        if self.index < len(self.datapoints):
            result = self.datapoints[self.index]
            self.index += 1
            return result
        else:
            # No more datapoints -> raise StopIteration exception.
            raise StopIteration

    def as_lower_representation(self) -> list[dict]:
        return [
            datapoint.as_dict()
            for datapoint in self.datapoints
        ]
    
    def as_df(self) -> pd.DataFrame:
        return pd.DataFrame(self.as_lower_representation())

    def to_csv(self, file_path: str) -> None:
        return self.as_df().to_csv(file_path)
    

class SplitableSet(IterableSet):

    def to_csv_as_train_dev_test_sets(
            self, 
            output_dir: str, 
            target_variable_name: str, 
            dev_test_size: float = 0.3, 
            random_state: int = 42
        ) -> None:
        train, dev, test = self.as_train_dev_test_dfs(
            target_variable_name, dev_test_size, random_state)
        
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)

        train.to_csv(output_dir+"train.csv")
        dev.to_csv(output_dir+"dev.csv")
        test.to_csv(output_dir+"test.csv")

    def as_training_dataframe(self, target_variable_name: str) -> pd.DataFrame:
        salient_columns = ["contents", target_variable_name]
        column_rename_map = {"contents": "X", target_variable_name: "y"}

        training_dataframe = self.as_df()[salient_columns]
        training_dataframe.rename(columns=column_rename_map, inplace=True)

        return training_dataframe
    
    def split_into_train_dev_test_dfs(
            self, 
            target_variable_name: str, 
            dev_and_test_size: float = 0.3, 
            random_state: int = 42
        ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        entire_df = self.as_training_dataframe(target_variable_name)

        # Split the data into train and dev+test sets in a ratio of:
        #  -> (1-dev_and_test_size):(dev_and_test_size)
        initial_splitter = StratifiedShuffleSplit(
            n_splits=1, test_size=dev_and_test_size, random_state=random_state)
        train_indexes, test_indexes = next(
            initial_splitter.split(entire_df.X, entire_df.y))

        train_df = entire_df.iloc[train_indexes]
        dev_and_test_df = entire_df.iloc[test_indexes]

        # Split the dev + test set into dev and test sets in a 50:50 ratio.
        final_splitter = StratifiedShuffleSplit(
            n_splits=1, test_size=0.5, random_state=random_state)
        dev_indexes, test_indexes = next(
            final_splitter.split(dev_and_test_df.X, dev_and_test_df.y))

        dev_df = dev_and_test_df.iloc[dev_indexes]
        test_df = dev_and_test_df.iloc[test_indexes]

        return train_df, dev_df, test_df
    
    def split_into_train_dev_test_arrays(
            self, 
            target_variable_name: str, 
            dev_test_size: float = 0.3, 
            random_state: int = 42
        ) -> Tuple[npt.NDArray, npt.NDArray, npt.NDArray]:
        train, dev, test = self.split_into_train_dev_test_dfs(
            target_variable_name,
            dev_test_size,
            random_state
        )

        return (
            utils.convert_to_nd_array(train.X),
            train.y.values,
            utils.convert_to_nd_array(dev.X),
            dev.y.values,
            utils.convert_to_nd_array(test.X),
            test.y.values,
        )


class DataSet(SplitableSet):

    def __init__(self, dirs: list[str]):
        super().__init__(None)
        self.dirs = dirs
        
    def load(self) -> Self:
        self.datapoints = [
            self.datapoint_class(directory + file)
            for directory in self.dirs
            for file in os.listdir(directory)
        ]

        return self
    
    def as_lower_representation(self) -> list[dict]:
        # Ensure the dataset has been loaded.
        if self.datapoints is None:
            self.load()

        return super().as_lower_representation()

    def __iter__(self) -> Self:
        # Ensure the dataset has been loaded.
        if self.datapoints is None:
            self.load()

        return super().__iter__()
    

class ReviewDataSet(DataSet):

    datapoint_class: TypeAlias = Review

## Preprocessing

In [4]:
class Preprocessor:

    def __init__(self, dataset: DataSet):
        # We don't want to modify the original dataset.
        self.dataset = copy.deepcopy(dataset)
        # Tokenization is the first preprocessing step of most NLP applications.
        self.tokenize()

    def tokenize(self) -> Self:
        for datapoint in self.dataset:
            if isinstance(datapoint.contents, list):
                # This datapoint has already been tokenized.
                continue

            datapoint.contents = nltk.word_tokenize(datapoint.contents)
        
        return self

## Feature Set Generation

This section pertains to N-gram Generation and Feature Selection.

#### Research
[Stemming (NTLK)](https://www.nltk.org/howto/stem.html)  
[A comparison of Stemming Algorithms & Lemmatization Algorithms](https://stackoverflow.com/questions/24647400/what-is-the-best-stemming-method-in-python)
- `PorterStemmer` is apparently one of the most aggresive `nltk` stemmers
    - It appears the choice of stemmer has a significant impact on performance
- `SnowballStemmer` appears to be a lighter middle-ground
- Lemmatizers are usually "lighter" than stemmers, but they cannot handle unknown words

In [5]:
class FeatureSet(SplitableSet):

    def __init__(self, dataset: DataSet):
        super().__init__(dataset.datapoints)

    def compare_with(self, other_set: Self):
        set1_dp1 = self.first().contents
        set2_dp1 = other_set.first().contents
        max_length_set1 = len(max(set1_dp1, key=len))

        print("Comparing feature sets Self and Other:")
        for token1, token2 in zip(set1_dp1, set2_dp1):
            empty_space = " " * (max_length_set1 - len(token1))
            print(f"Set A: {token1} {empty_space}| Set B: {token2}")

In [6]:
class FeatureSetGenerator(Preprocessor):

    def create_n_grams(self, n: int) -> FeatureSet:
        for datapoint in self.dataset:
            datapoint.contents = list(ngrams(datapoint.contents, n))

        return FeatureSet(self.dataset)
    
    def create_everygrams(self, max_n: int) -> FeatureSet:
        for datapoint in self.dataset:
            datapoint.contents = list(everygrams(datapoint.contents, max_len=max_n))

        return FeatureSet(self.dataset)
    
    def to_lowercase(self) -> Self:
        for datapoint in self.dataset:
            datapoint.contents = [token.lower() for token in datapoint.contents]

        return self
    
    def remove_stopwords(self) -> Self:
        distinct_stopwords = set(stopwords.words('english'))

        for datapoint in self.dataset:
            datapoint.contents = [
                # Calling `lower` on `token` because all stopwords are in lowercase.
                # This thus removes all stopwords irrespective of their capitalisation.
                token for token in datapoint.contents if token.lower() not in distinct_stopwords
            ]

        return self
    
    def remove_punctuation(self) -> Self: 
        for datapoint in self.dataset:
            datapoint.contents = [token for token in datapoint.contents if token not in string.punctuation]

        return self

    def lemmatize(self) -> Self:
        lmtzr = WordNetLemmatizer()

        for datapoint in self.dataset:
            datapoint.contents = [lmtzr.lemmatize(token) for token in datapoint.contents]

        return self

    def stem(self) -> Self:
        # Making the assumption that all datapoints are in English.
        stmr = SnowballStemmer("english")

        for datapoint in self.dataset:
            datapoint.contents = [stmr.stem(token) for token in datapoint.contents]

        return self

## Feature Set Normalisation

In [7]:
class FeatureSetNormalizer:

    def __init__(self, feature_set: FeatureSet):
        # We don't want to modify the original feature set.
        self.feature_set = copy.deepcopy(feature_set)

        self.normalized = False
        self.shared_vocabulary = self._collect_shared_vocabulary()

        self.num_samples = len(self.feature_set)
        self.num_features = len(self.shared_vocabulary)

    def perform_tf_norm(self, drop_percentile: float = 0) -> FeatureSet:
        _, tf_matrix = self._calculate_tf_idf_scores(drop_percentile)

        for doc_idx, datapoint in enumerate(self.feature_set):
            datapoint.contents = tf_matrix[doc_idx, :]

        return self.feature_set

    def perform_tf_idf_norm(self, drop_percentile: float = 0) -> FeatureSet:
        tfidf_matrix, _ = self._calculate_tf_idf_scores(drop_percentile)

        # Update datapoint contents with tf-idf values
        for doc_idx, datapoint in enumerate(self.feature_set):
            datapoint.contents = tfidf_matrix[doc_idx, :]

        return self.feature_set

    def peform_ppmi(self) -> FeatureSet:
        raise NotImplementedError

    def _collect_shared_vocabulary(self) -> set:
        return {
            token
            for datapoint in self.feature_set
            for token in datapoint.contents
        }
    
    def _remove_rare_features(self, tfidf_matrix: npt.NDArray, tf_matrix: npt.NDArray, drop_percentile: float) -> Tuple[npt.NDArray, npt.NDArray]:
        total_tfidf_per_feature = np.sum(tfidf_matrix, axis=0)
        total_tfidf = np.sum(total_tfidf_per_feature)

        # Get the indices of total_tfidf_per_feature if it were sorted.
        sorted_indices = np.argsort(total_tfidf_per_feature)
        # Calculate the cumulative sum along the sorted features.
        sorted_cumulative_tfidf = np.cumsum(total_tfidf_per_feature[sorted_indices])

        # Determine the cut-off index where the cumulative sum reaches the threshold
        # percentage.
        threshold_index = np.searchsorted(sorted_cumulative_tfidf, drop_percentile * total_tfidf)

        # Use the threshold_index to determine the indices of features to keep.
        features_to_keep_indices = sorted_indices[threshold_index:]

        # Keep only the columns for features we want to retain.
        tfidf_matrix = tfidf_matrix[:, features_to_keep_indices]
        tf_matrix = tf_matrix[:, features_to_keep_indices]

        return tfidf_matrix, tf_matrix
    
    def _calculate_tf_idf_scores(self, drop_percentile: float) -> Tuple[npt.NDArray, npt.NDArray]:
        vocab_to_index = {word: idx for idx, word in enumerate(self.shared_vocabulary)}

        # Term frequency matrix. Each row corresponds to a document and each column to a term.
        tf_matrix = np.zeros(shape=(self.num_samples, self.num_features), dtype=float)
        # This will store a count of how many documents each term appears in, defaulting to 0.
        df_counter = defaultdict(int)

        # Populate tf_matrix and df_counter
        for doc_idx, datapoint in enumerate(self.feature_set):
            # Count term occurences in this document.
            term_occurences = Counter(datapoint.contents)
            for term, count in term_occurences.items():
                if term in vocab_to_index:
                    index = vocab_to_index[term]
                    # Raw count for TF (to be normalised later)
                    tf_matrix[doc_idx, index] = count
                    # Increment the df counter.
                    df_counter[term] += 1

        # Normalise the term frequency matrix row-wise (divide by the number of terms in each document).
        doc_lengths = np.array([len(datapoint.contents) for datapoint in self.feature_set])
        tf_matrix = tf_matrix / doc_lengths[:, None]

        # Transform document frequencies into inverse-document frequencies.
        idf_array = np.log(
            (self.num_samples) / (1 + np.array([df_counter[term] for term in self.shared_vocabulary]))
        )

        # Multily the TF matrix by the IDF values to obtain the TF-IDF matrix.
        tfidf_matrix = tf_matrix * idf_array

        if drop_percentile > 0:
            # Remove features that appear most infrequently.
            tfidf_matrix, tf_matrix = self._remove_rare_features(tfidf_matrix, tf_matrix, drop_percentile)

        return tfidf_matrix, tf_matrix

## Data

In [8]:
dataset = ReviewDataSet([POSITIVE_REVIEWS_DIR, NEGATIVE_REVIEWS_DIR]).load()

In [9]:
feature_set_a = FeatureSetGenerator(dataset)\
    .stem()\
    .remove_punctuation()\
    .remove_stopwords()\
    .create_n_grams(1)

feature_set_b = FeatureSetGenerator(dataset)\
    .stem()\
    .remove_punctuation()\
    .remove_stopwords()\
    .create_n_grams(1)

feature_set_c = FeatureSetGenerator(dataset)\
    .stem()\
    .remove_punctuation()\
    .remove_stopwords()\
    .create_everygrams(2)

norm_feature_set_a = FeatureSetNormalizer(feature_set_a).perform_tf_norm()
norm_feature_set_b = FeatureSetNormalizer(feature_set_b).perform_tf_idf_norm()
norm_feature_set_c = FeatureSetNormalizer(feature_set_c).perform_tf_idf_norm()

# Freeing some memory.
del feature_set_a
del feature_set_b
del feature_set_c

In [12]:
len(norm_feature_set_c.first().contents)

395824

## Train, Dev, Test Splits

In [18]:
X_train, y_train, X_dev, y_dev, X_test, y_test = normalized_feature_set.split_into_train_dev_test_arrays("polarity", 0.3)

In [24]:
train, dev, test = normalized_feature_set.split_into_train_dev_test_dfs("polarity", 0.3)

## Save Original Dataset as CSV File

In [20]:
dataset.to_csv_as_train_dev_test_sets("./data/bert/", "polarity", 0.3)

## N-Gram Comparison

In [12]:
from classifiers import NaiveBayesClassifier
from evaluation import FeatureSetComparator

N = 3

In [13]:
feature_sets = []

for i in range(1, N+1):
    feature_set = FeatureSetGenerator(dataset)\
        .remove_punctuation()\
        .remove_stopwords()\
        .create_n_grams(i)
    
    norm_feature_set = FeatureSetNormalizer(feature_set).perform_tf_idf_norm(0)
    feature_sets.append(norm_feature_set)

In [14]:
X_trains = []
y_trains = []
X_devs = []
y_devs = []

for i in range(N):
    X_train, y_train, X_dev, y_dev, _, _ = feature_sets[i].split_into_train_dev_test_arrays("polarity", 0.3)

    X_trains.append(X_train)
    y_trains.append(y_train)
    X_devs.append(X_dev)
    y_devs.append(y_dev)

In [15]:
comparator = FeatureSetComparator(X_trains, y_trains)

n_gram_performance = comparator.compare(
    NaiveBayesClassifier,
    X_devs,
    y_devs,
    {}
)

n_gram_performance["shared_vocab_size"] = [len(fs.first().contents) for fs in feature_sets]
n_gram_performance.index = [f"N-grams (N = {i+1})" for i, idx in enumerate(n_gram_performance.index)]

In [16]:
n_gram_performance

Unnamed: 0,accuracy,precision,recall,f1,shared_vocab_size
N-grams (N = 1),0.846667,0.848993,0.843333,0.846154,50874
N-grams (N = 2),0.79,0.868644,0.683333,0.764925,399350
N-grams (N = 3),0.67,0.72973,0.54,0.62069,516967
