In [None]:
# Load IPython's autoreload extension; mode 2 re-imports all modules before
# each cell executes, so edits to project modules are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import sys
import os
import warnings

# Suppress every warning for the rest of the session to keep cell output clean.
warnings.filterwarnings('ignore')

# Make the parent directory importable (presumably where the `dataset` and
# `utils` modules imported below live -- confirm). The membership check keeps
# the cell idempotent: re-running it no longer appends duplicate entries.
PARENT_DIR = os.path.abspath("..")
if PARENT_DIR not in sys.path:
    sys.path.append(PARENT_DIR)

In [None]:
import logging

from pathlib import Path

import pandas as pd

import nltk

import dataset as nlp_dataset

from utils import io_utils, log_utils

import importlib
# Re-execute the stdlib `logging` module so it starts from a fresh module state.
# NOTE(review): presumably done so that handlers already installed in the
# kernel do not interfere with setup_logging() below -- confirm.
importlib.reload(logging)

# getLogger() without a name returns the root logger.
logger = logging.getLogger()
# Project helper from utils.log_utils; presumably installs handlers/formatting
# (implementation not visible here -- confirm).
log_utils.setup_logging()

# CONSTANTS & HYPERPARAMETERS

In [None]:
# Filesystem locations, relative to the current working directory.
DATA_DIR = Path("../data")
MODELS_DIR = Path("../model")

# NLTK components handed to the dataset pre-processing step.
SKLEARN_STEMMER = nltk.stem.PorterStemmer()
SKLEARN_TOKENIZER = nltk.tokenize.WordPunctTokenizer()

# Flag forwarded to pre-processing as `do_remove_stopwords`.
DO_REMOVE_STOPWORDS = False

# DATA

## LOAD THE DATASET + CLEAN

## SET ASIDE VALIDATION/TEST DATASET

In [None]:
# Set LOAD_DFS to False to rebuild the dataframes from the raw "imdb" dataset
# instead of reading the previously pickled copies.
LOAD_DFS        = True
DF_TRAIN_PATH   = DATA_DIR/"huggingface_imdb_sentiment_df_train.p"
DF_TEST_PATH    = DATA_DIR/"huggingface_imdb_sentiment_df_test.p"


def _ensure_tokened_len(df):
    """Make sure ``df`` has a ``tokened_len`` column right after ``tokened``.

    The column holds ``len()`` of every entry in ``df.tokened``. If the column
    is already present the frame is returned untouched, so the helper is safe
    to call on both freshly built and previously pickled frames, and safe to
    re-run.

    Args:
        df: DataFrame with a ``tokened`` column of sized objects
            (presumably token lists -- confirm against ``preprocess_dataframe``).

    Returns:
        The same DataFrame object, with ``tokened_len`` present.
    """
    if "tokened_len" not in df.columns:
        tokened_text_len = [len(tokens) for tokens in df["tokened"]]
        df.insert(
            df.columns.get_loc("tokened") + 1,
            column="tokened_len",
            value=tokened_text_len,
        )
    return df


if LOAD_DFS:
    # NOTE: unpickling can execute arbitrary code -- only load pickle files
    # that were produced by this project.
    df_train    = pd.read_pickle(DF_TRAIN_PATH)
    df_test     = pd.read_pickle(DF_TEST_PATH)
else:
    # Create train & test datasets
    dfs = {}
    for split in ("train", "test"):
        df = nlp_dataset.create_dataframe(dataset_name="imdb", dataset_split=split)
        df = nlp_dataset.preprocess_dataframe(
            df,
            model_type            = "sklearn",
            tokenizer             = SKLEARN_TOKENIZER,
            stemmer               = SKLEARN_STEMMER,
            do_remove_stopwords   = DO_REMOVE_STOPWORDS,
            save_path             = DF_TRAIN_PATH if split == "train" else DF_TEST_PATH
        )
        dfs[split] = df

    df_train, df_test = dfs["train"], dfs["test"]

# Add the pre-processed token length column in BOTH branches. Previously it was
# only added when rebuilding, after `save_path` had already been handed to
# preprocess_dataframe, so frames loaded from the pickles could lack the
# `tokened_len` column that later cells select.
df_train = _ensure_tokened_len(df_train)
df_test  = _ensure_tokened_len(df_test)

## Analyze data with descriptive statistics

In [None]:
# Peek at the first three rows of the training split.
df_train.head(n=3)

In [None]:
# Number of rows per label name in the training split.
df_train["label_name"].value_counts()

In [None]:
# Summary statistics for the text, token-length and label columns (training split).
df_train.loc[:, ["text", "tokened_len", "label"]].describe(include="all")

In [None]:
# Peek at the first three rows of the test split.
df_test.head(n=3)

In [None]:
# Number of rows per label name in the test split.
df_test["label_name"].value_counts()

In [None]:
# Summary statistics for the text, token-length and label columns of the TEST
# split. This cell previously summarised df_train a second time.
df_test[["text", "tokened_len", "label"]].describe(include="all")

In [None]:
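# # User ratings (exploratory, currently disabled; not sure where to go with this yet)
# # NOTE: needs `import re` before enabling; the `.review` column used below may
# # correspond to the `text` column used elsewhere in this notebook -- confirm.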
# user_ratings_train = [re.search(r"\d*\.*\d+/10", r) for r in df_train.review]
# user_ratings_test  = [re.search(r"\d*\.*\d+/10", r) for r in df_test.review]

# # Making sure regex works for finding user rating (X/10)
# [(i, re.match("(.*)/10", ur.group()).groups()[0]) for i, ur in enumerate(user_ratings_train) if ur is not None]
# # Checking user rating (found in text) against label
# [(r, l) for r, l in zip(user_ratings_train, df_train.label_name) if r is not None]

## Visualize data

In [None]:
# TODO