# Plan
- [x] Load all reviews into a data structure in memory
- [x] Apply tokenization to each review (base step)
- [ ] Choose and apply three pathways through the feature generation process
    - _Remember that you will need to justify your choices with reference to the accuracy that is achieved with each feature set_

A simple idea for a datastructure that holds the dataset in memory is:
```Python
dataset = [
    {
        "id": 1,
        "rating": 7,
        "polarity": 1,
        "contents:": "Lorem Ipsum"
    },
    ...
]
```

Later, this should be a `pandas` dataframe.

## Imports & Constants

In [132]:
import os
import copy
import string
from functools import cached_property

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

POSITIVE_REVIEWS_DIR = "./data/pos/"
NEGATIVE_REVIEWS_DIR = "./data/neg/"

[nltk_data] Downloading package punkt to /home/sowell/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sowell/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Dataset Types

In [133]:
class TextualDataPoint:

    def __init__(self, file_path: str):
        self.file_path = file_path
        self.contents = self._extract_contents()

    @cached_property
    def basename(self) -> str:
        return os.path.basename(self.file_path)

    @cached_property
    def filename(self) -> str:
        return os.path.splitext(self.basename)[0]
    
    def _extract_contents(self) -> str:
        with open(self.file_path, "r") as file:
            return file.read()
        
    def as_dict(self) -> dict:
        return {
            'contents': self.contents,
        }


class Review(TextualDataPoint):

    def __init__(self, file_path: str):
        super().__init__(file_path)
        self.id = self._parse_id()
        self.rating = self._parse_rating()
        self.polarity = self._determine_polarity()

    def _parse_id(self) -> int:
        return self.filename.split("_")[0]

    def _parse_rating(self) -> int:
        return int(self.filename.split("_")[1])
    
    def _determine_polarity(self) -> int:
        if self.rating is None:
            self.rating = self._parse_rating()

        if self.rating <= 4:
            return 0
        elif self.rating >= 7:
            return 1
        else:
            raise ValueError(f"unexpected rating: {self.rating}")

    def as_dict(self) -> dict:
        return {
            'id': self.id,
            'rating': self.rating,
            'polarity': self.polarity,
        } | super().as_dict()

In [134]:
class DataSet:

    datapoint_class = TextualDataPoint

    def __init__(self, dirs: list[str]):
        self.dirs = dirs
        self.datapoints = None

        # To keep track of the current iteration position.
        self.index = 0

    def load(self):
        self.datapoints = [
            self.datapoint_class(directory + file)
            for directory in self.dirs
            for file in os.listdir(directory)
        ]

        return self
    
    def as_lower_representation(self) -> list[dict]:
        # Ensure the dataset has been loaded.
        if self.datapoints is None:
            self.load()

        return [
            datapoint.as_dict()
            for datapoint in self.datapoints
        ]
    
    def __iter__(self):
        # Ensure the dataset has been loaded.
        if self.datapoints is None:
            self.load()

        # Reset the index whenever starting a new iteration.
        self.index = 0
        return self
        
    def __next__(self) -> Review:
        # Make sure there are more datapoints to yield.
        if self.index < len(self.datapoints):
            result = self.datapoints[self.index]
            self.index += 1
            return result
        else:
            # No more datapoints -> raise StopIteration exception.
            raise StopIteration
    

class ReviewDataSet(DataSet):

    datapoint_class = Review

## Preprocessing & Feature Set Generation

In [135]:
class Preprocessor:

    def __init__(self, dataset: DataSet):
        # We don't want to modify the original dataset.
        self.dataset = copy.deepcopy(dataset)
        # Tokenization is the first preprocessing step of most NLP applications.
        self.tokenize()

    def tokenize(self):
        for datapoint in self.dataset:
            if isinstance(datapoint.contents, list):
                # This review has already been tokenized.
                continue

            datapoint.contents = nltk.word_tokenize(datapoint.contents)
        
        return self


class FeatureSetGenerator(Preprocessor):

    def get_feature_set(self) -> DataSet:
        return self.dataset
    
    def to_lowercase(self):
        for datapoint in self.dataset:
            datapoint.contents = [word.lower() for word in datapoint.contents]

        return self
    
    def remove_stopwords(self):
        distinct_stopwords = set(stopwords.words('english'))

        for datapoint in self.dataset:
            datapoint.contents = [word for word in datapoint.contents if word not in distinct_stopwords]

        return self
    
    def remove_punctuation(self): 
        for datapoint in self.dataset:
            datapoint.contents = [word for word in datapoint.contents if word not in string.punctuation]

        return self

In [136]:
dataset = ReviewDataSet([POSITIVE_REVIEWS_DIR, NEGATIVE_REVIEWS_DIR]).load()

In [137]:
feature_set_b = FeatureSetGenerator(dataset).to_lowercase().remove_stopwords().get_feature_set()
feature_set_c = FeatureSetGenerator(dataset).to_lowercase().remove_stopwords().remove_punctuation().get_feature_set()

In [138]:
feature_set_b.as_lower_representation()

[{'id': '872',
  'rating': 9,
  'polarity': 1,
  'contents': ['carla',
   'works',
   'property',
   'developer',
   "'s",
   'excels',
   'unattractive',
   ',',
   'unappreciated',
   'desperate',
   '.',
   'also',
   'deaf.',
   '<',
   'br',
   '/',
   '>',
   '<',
   'br',
   '/',
   '>',
   'boss',
   'offers',
   'hire',
   'somebody',
   'alleviate',
   'heavy',
   'workload',
   'uses',
   'opportunity',
   'secure',
   'male',
   'company',
   '.',
   'help',
   'arrives',
   'form',
   'paul',
   ',',
   'tattooed',
   'hoodlum',
   'fresh',
   'prison',
   'clearly',
   'unsuited',
   'mannered',
   'routine',
   'office',
   'environment.',
   '<',
   'br',
   '/',
   '>',
   '<',
   'br',
   '/',
   '>',
   'implicit',
   'sexual',
   'tension',
   'develops',
   'two',
   'carla',
   'determined',
   'keep',
   'despite',
   'reluctance',
   'embrace',
   'working',
   'week',
   '.',
   'carla',
   'edged',
   'important',
   'contract',
   'negotiating',
   'slimy',
 

In [139]:
feature_set_c.as_lower_representation()

[{'id': '872',
  'rating': 9,
  'polarity': 1,
  'contents': ['carla',
   'works',
   'property',
   'developer',
   "'s",
   'excels',
   'unattractive',
   'unappreciated',
   'desperate',
   'also',
   'deaf.',
   'br',
   'br',
   'boss',
   'offers',
   'hire',
   'somebody',
   'alleviate',
   'heavy',
   'workload',
   'uses',
   'opportunity',
   'secure',
   'male',
   'company',
   'help',
   'arrives',
   'form',
   'paul',
   'tattooed',
   'hoodlum',
   'fresh',
   'prison',
   'clearly',
   'unsuited',
   'mannered',
   'routine',
   'office',
   'environment.',
   'br',
   'br',
   'implicit',
   'sexual',
   'tension',
   'develops',
   'two',
   'carla',
   'determined',
   'keep',
   'despite',
   'reluctance',
   'embrace',
   'working',
   'week',
   'carla',
   'edged',
   'important',
   'contract',
   'negotiating',
   'slimy',
   'colleague',
   'exploits',
   'paul',
   "'s",
   'criminality',
   'steal',
   'contract',
   'back',
   'colleague',
   'quickly',


## Splitting of sets