# Plan
- [x] Load all reviews into a data structure in memory
- [x] Apply tokenization to each review (base step)
- [ ] Choose and apply three pathways through the feature generation process
    - _Remember that you will need to justify your choices with reference to the accuracy that is achieved with each feature set_

A simple idea for a datastructure that holds the dataset in memory is:
```Python
dataset = [
    {
        "id": 1,
        "rating": 7,
        "polarity": 1,
        "contents:": "Lorem Ipsum"
    },
    ...
]
```

Later, this should be a `pandas` dataframe.

## Imports & Constants

In [1]:
import os
import copy
import math
import string
import numpy as np
import pandas as pd
from collections import Counter
from typing_extensions import Self
from functools import cached_property
from sklearn.model_selection import StratifiedShuffleSplit

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.util import ngrams, everygrams

POSITIVE_REVIEWS_DIR = "./data/pos/"
NEGATIVE_REVIEWS_DIR = "./data/neg/"

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wij21\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wij21\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\wij21\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\wij21\AppData\Roaming\nltk_data...


## Dataset Types

In [4]:
class TextualDataPoint:

    def __init__(self, file_path: str):
        self.file_path = file_path
        self.contents = self._extract_contents()

    @cached_property
    def basename(self) -> str:
        return os.path.basename(self.file_path)

    @cached_property
    def filename(self) -> str:
        return os.path.splitext(self.basename)[0]
    
    def _extract_contents(self) -> str:
        with open(self.file_path, "r") as file:
            return file.read()
        
    def as_dict(self) -> dict:
        return {
            'contents': self.contents,
        }


class Review(TextualDataPoint):

    def __init__(self, file_path: str):
        super().__init__(file_path)
        self.id = self._parse_id()
        self.rating = self._parse_rating()
        self.polarity = self._determine_polarity()

    def _parse_id(self) -> int:
        return self.filename.split("_")[0]

    def _parse_rating(self) -> int:
        return int(self.filename.split("_")[1])
    
    def _determine_polarity(self) -> int:
        if self.rating is None:
            self.rating = self._parse_rating()

        if self.rating <= 4:
            return 0
        elif self.rating >= 7:
            return 1
        else:
            raise ValueError(f"unexpected rating: {self.rating}")

    def as_dict(self) -> dict:
        return {
            'id': self.id,
            'rating': self.rating,
            'polarity': self.polarity,
        } | super().as_dict()

In [5]:
class IterableSet:

    datapoint_class = TextualDataPoint

    def __init__(self, datapoints: list[datapoint_class]):
        self.datapoints = datapoints

        # To keep track of the current iteration position.
        self.index = 0

    def first(self) -> datapoint_class:
        return self.datapoints[0]

    def last(self) -> datapoint_class:
        return self.datapoints[-1]
    
    def __len__(self) -> int:
        return len(self.datapoints)
    
    def __iter__(self) -> Self:
        # Reset the index whenever starting a new iteration.
        self.index = 0
        return self
        
    def __next__(self) -> datapoint_class:
        # Make sure there are more datapoints to yield.
        if self.index < len(self.datapoints):
            result = self.datapoints[self.index]
            self.index += 1
            return result
        else:
            # No more datapoints -> raise StopIteration exception.
            raise StopIteration

    def as_lower_representation(self) -> list[dict]:
        return [
            datapoint.as_dict()
            for datapoint in self.datapoints
        ]


class DataSet(IterableSet):

    def __init__(self, dirs: list[str]):
        super().__init__(None)
        self.dirs = dirs
        
    def load(self) -> Self:
        self.datapoints = [
            self.datapoint_class(directory + file)
            for directory in self.dirs
            for file in os.listdir(directory)
        ]

        return self
    
    def as_lower_representation(self) -> list[dict]:
        # Ensure the dataset has been loaded.
        if self.datapoints is None:
            self.load()

        return super().as_lower_representation()

    def __iter__(self) -> Self:
        # Ensure the dataset has been loaded.
        if self.datapoints is None:
            self.load()

        return super().__iter__()
    

class ReviewDataSet(DataSet):

    datapoint_class = Review

## Preprocessing

In [6]:
class Preprocessor:

    def __init__(self, dataset: DataSet):
        # We don't want to modify the original dataset.
        self.dataset = copy.deepcopy(dataset)
        # Tokenization is the first preprocessing step of most NLP applications.
        self.tokenize()

    def tokenize(self) -> Self:
        for datapoint in self.dataset:
            if isinstance(datapoint.contents, list):
                # This datapoint has already been tokenized.
                continue

            datapoint.contents = nltk.word_tokenize(datapoint.contents)
        
        return self

## Feature Set Generation

[Stemming (NTLK)](https://www.nltk.org/howto/stem.html)  
[A comparison of Stemming Algorithms & Lemmatization Algorithms](https://stackoverflow.com/questions/24647400/what-is-the-best-stemming-method-in-python)
- `PorterStemmer` is apparently one of the most aggresive `nltk` stemmers
    - It appears the choice of stemmer has a significant impact on performance
- `SnowballStemmer` appears to be a lighter middle-ground
- Lemmatizers are usually "lighter" than stemmers, but they cannot handle unknown words

In [7]:
class FeatureSet(IterableSet):

    def __init__(self, dataset: DataSet):
        super().__init__(dataset.datapoints)

    def compare_with(self, other_set: Self):
        set1_dp1 = self.first().contents
        set2_dp1 = other_set.first().contents
        max_length_set1 = len(max(set1_dp1, key=len))

        print("Comparing the first datapoint in feature sets A and B respectively:")
        for token1, token2 in zip(set1_dp1, set2_dp1):
            empty_space = " " * (max_length_set1 - len(token1))
            print(f"Set A: {token1} {empty_space}| Set B: {token2}")

    def as_inputs_and_targets(self, target_variable_name: str):
        inputs = [datapoint.contents for datapoint in self.datapoints]
        targets = [getattr(datapoint, target_variable_name)
                   for datapoint in self.datapoints]

        return np.array(inputs), np.array(targets)

    def split_into_train_dev_test_sets(self, target_variable_name: str, dev_test_size: float, random_state: int = 42):
        inputs, targets = self.as_inputs_and_targets(target_variable_name)

        # Split the data into train and dev+test sets in a ratio of (1-dev_test_size):(dev_test_size).
        initial_splitter = StratifiedShuffleSplit(
            n_splits=1, test_size=dev_test_size, random_state=random_state)
        train_indexes, test_indexes = next(
            initial_splitter.split(inputs, targets))

        X_train, y_train = inputs[train_indexes], targets[train_indexes]
        X_test_dev, y_test_dev = inputs[test_indexes], targets[test_indexes]

        # Split the dev+test set into dev and test sets in a 50:50 ratio.
        final_splitter = StratifiedShuffleSplit(
            n_splits=1, test_size=0.5, random_state=random_state)
        dev_indexes, test_indexes = next(
            final_splitter.split(X_test_dev, y_test_dev))

        X_dev, y_dev = X_test_dev[dev_indexes], y_test_dev[dev_indexes]
        X_test, y_test = X_test_dev[test_indexes], y_test_dev[test_indexes]

        return X_train, y_train, X_dev, y_dev, X_test, y_test

In [8]:
class FeatureSetGenerator(Preprocessor):

    def create_n_grams(self, n: int) -> FeatureSet:
        for datapoint in self.dataset:
            datapoint.contents = list(ngrams(datapoint.contents, n))

        return FeatureSet(self.dataset)
    
    def create_everygrams(self, max_n: int) -> FeatureSet:
        for datapoint in self.dataset:
            datapoint.contents = list(everygrams(datapoint.contents, max_len=max_n))

        return FeatureSet(self.dataset)
    
    def to_lowercase(self) -> Self:
        for datapoint in self.dataset:
            datapoint.contents = [token.lower() for token in datapoint.contents]

        return self
    
    def remove_stopwords(self) -> Self:
        distinct_stopwords = set(stopwords.words('english'))

        for datapoint in self.dataset:
            datapoint.contents = [token for token in datapoint.contents if token not in distinct_stopwords]

        return self
    
    def remove_punctuation(self) -> Self: 
        for datapoint in self.dataset:
            datapoint.contents = [token for token in datapoint.contents if token not in string.punctuation]

        return self

    def lemmatize(self) -> Self:
        lmtzr = WordNetLemmatizer()

        for datapoint in self.dataset:
            datapoint.contents = [lmtzr.lemmatize(token) for token in datapoint.contents]

        return self

    def stem(self) -> Self:
        # Making the assumption that all datapoints are in English.
        stmr = SnowballStemmer("english")

        for datapoint in self.dataset:
            datapoint.contents = [stmr.stem(token) for token in datapoint.contents]

        return self

## Feature Set Normalisation

- This needs to be much faster.
- Are we actually calcuating word frequencies or just word occurences (are these different?)

In [20]:
class FeatureSetNormalizer:

    def __init__(self, feature_set: FeatureSet):
        # We don't want to modify the original feature set.
        self.feature_set = copy.deepcopy(feature_set)

        self.normalized = False
        self.shared_vocabulary = self._collect_shared_vocabulary()

    @cached_property
    def num_datapoints(self) -> int:
        return len(self.feature_set)

    def perform_tf(self) -> FeatureSet:
        self._calculate_term_frequencies()

        for datapoint in self.feature_set:
            datapoint.contents = [
                datapoint.term_frequencies.get(token, 0)
                for token in self.shared_vocabulary
            ]

        return self.feature_set

    def perform_tf_idf(self) -> FeatureSet:
        self._calculate_term_frequencies()
        self.idfs = self._calculate_idfs()

        for datapoint in self.feature_set:
            datapoint.contents = [
                (datapoint.term_frequencies.get(token, 0) * self.idfs.get(token))
                for token in self.shared_vocabulary
            ]

        return self.feature_set

    def peform_ppmi(self) -> FeatureSet:
        raise NotImplementedError

    def _collect_shared_vocabulary(self) -> set:
        return set(sorted({
            token
            for datapoint in self.feature_set
            for token in datapoint.contents
        }))
    
    def _calculate_idfs(self) -> dict:
        # We calculate the document frequencies by creating a unique set of tokens for
        # each datapoint (i.e., for each document in the set, counting each token once
        # per document regardless of its frequency within the document itself). The Counter
        # then aggregates these sets across all datapoints, counting the number of documents
        # in which each distinct token appears. This gives us the document frequency for
        # each term in the `shared_vocabulary`.
        document_frequencies = Counter(
            token
            for datapoint in self.feature_set
            for token in set(datapoint.contents)
        )

        return {
            token: math.log(self.num_datapoints / (doc_frequency + 1))
            for token, doc_frequency in document_frequencies.items()
        }
    
    def _calculate_term_frequencies(self) -> None:
        for datapoint in self.feature_set:
            num_tokens = len(datapoint.contents)
            term_occurences = Counter(datapoint.contents)
            datapoint.term_frequencies = {
                token: count / num_tokens
                for token, count in term_occurences.items()
            }

In [10]:
dataset = ReviewDataSet([POSITIVE_REVIEWS_DIR, NEGATIVE_REVIEWS_DIR]).load()

In [18]:
feature_set_b = FeatureSetGenerator(dataset).remove_punctuation().lemmatize().create_n_grams(1)
feature_set_c = FeatureSetGenerator(dataset).remove_punctuation().stem().create_n_grams(1)

In [None]:
feature_set_b.compare_with(feature_set_c)

In [27]:
normalizer = FeatureSetNormalizer(feature_set_c)

In [28]:
normalized_feature_set = normalizer.perform_tf_idf()

In [29]:
len(normalized_feature_set.first().contents)

32205

In [31]:
import json

filename = "no_punc_snowball_stem_2_grams_tf_idf.json"

with open("./data/feature_sets/"+filename, "w") as file:
    json.dump(normalized_feature_set.as_lower_representation(), file)

## Train, Dev, Test Splits

In [107]:
X_train, y_train, X_dev, y_dev, X_test, y_test = normalized_feature_set.split_into_train_dev_test_sets("polarity", 0.3)