In [1]:
import torch
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe

In [2]:
from torch.utils.data.dataset import Dataset

In [3]:
import re
import logging

import numpy as np
import pandas as pd
import spacy
import torch
from joblib import Memory
from torchtext import data
from sklearn.model_selection import KFold

In [4]:
# spaCy English pipeline; in the cells shown here only its tokenizer is used
# (see `tokenizer`).
NLP = spacy.load('en')
# Reviews longer than this many characters are truncated before tokenizing.
MAX_CHARS = 20000
LOGGER = logging.getLogger("imdb_dataset")
# joblib on-disk memoization rooted at cache/, applied via @MEMORY.cache.
# NOTE(review): newer joblib releases take `location=` instead of `cachedir=`
# -- confirm against the installed version.
MEMORY = Memory(cachedir="cache/", verbose=1)

In [5]:
def tokenizer(review):
    """Clean a raw review and split it into a list of token strings.

    The input is coerced with ``str()``, run through a fixed sequence of
    regex substitutions, truncated to ``MAX_CHARS`` characters and then
    tokenized with the module-level spaCy tokenizer ``NLP.tokenizer``.

    Args:
        review: The review text (any object; it is converted with ``str``).

    Returns:
        list[str]: Token texts, excluding tokens that are exactly ``" "``.
    """
    # Applied strictly in this order; each entry is (pattern, replacement).
    substitutions = (
        # Replace a set of punctuation/symbol characters (and newlines)
        # with a single space.
        (r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " "),
        # Collapse runs of spaces.
        (r"[ ]+", " "),
        # NOTE(review): "!" is already replaced by a space in the first
        # pattern, so this substitution can never match at this point.
        (r"\!+", "!"),
        # Collapse repeated commas / question marks.
        (r"\,+", ","),
        (r"\?+", "?"),
    )
    text = str(review)
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Slicing is a no-op for strings that are already short enough.
    text = text[:MAX_CHARS]
    return [tok.text for tok in NLP.tokenizer(text) if tok.text != " "]

In [6]:
# Quick check of the tokenizer on a sentence containing a contraction,
# digits glued to a word, and punctuation that the cleaning step strips.
tokenizer("This is an amazing review, but can't 54help!it")

['This',
 'is',
 'an',
 'amazing',
 'review',
 ',',
 'but',
 'ca',
 "n't",
 '54help',
 'it']

In [7]:
def prepare_csv(train_csv_path="data/train.csv", test_csv_path="data/test.csv", VAL_RATIO = 0.2, seed=37, cache_dir="cache"):
    """Write train/validation/test CSV files into ``cache_dir``.

    The training CSV is read, newlines inside the ``text`` column are
    replaced by spaces, and the rows are split by a seeded random
    permutation: the first ``int(n_rows * VAL_RATIO)`` shuffled rows go to
    ``dataset_val.csv`` and the rest to ``dataset_train.csv``. The test CSV
    gets the same newline clean-up and is written to ``dataset_test.csv``.

    Args:
        train_csv_path: Path of the training CSV; must have a ``text`` column.
        test_csv_path: Path of the test CSV; must have a ``text`` column.
        VAL_RATIO: Fraction of training rows written to the validation file.
        seed: Seed for the row permutation.
        cache_dir: Output directory; created if it does not exist.

    Returns:
        None. The three CSV files are the only result.
    """
    # Local import: the notebook's import cells do not bring in ``os``.
    import os

    # The original wrote into "cache/" without ensuring it existed.
    os.makedirs(cache_dir, exist_ok=True)

    df_train = pd.read_csv(train_csv_path)
    df_train["text"] = df_train.text.str.replace("\n", " ")
    idx = np.arange(df_train.shape[0])
    # A private RandomState yields the same permutation as
    # np.random.seed(seed) + np.random.shuffle(idx) but leaves the global
    # NumPy RNG state untouched for the rest of the notebook.
    np.random.RandomState(seed).shuffle(idx)
    val_size = int(len(idx) * VAL_RATIO)
    df_train.iloc[idx[val_size:], :].to_csv(
        os.path.join(cache_dir, "dataset_train.csv"), index=False)
    df_train.iloc[idx[:val_size], :].to_csv(
        os.path.join(cache_dir, "dataset_val.csv"), index=False)

    # Same newline clean-up for the test set (no splitting).
    df_test = pd.read_csv(test_csv_path)
    df_test["text"] = df_test.text.str.replace("\n", " ")
    df_test.to_csv(os.path.join(cache_dir, "dataset_test.csv"), index=False)

# Write the cache/ CSV splits using the default paths, ratio and seed.
prepare_csv() 

In [8]:
@MEMORY.cache
def read_files(fix_length=100, lower=False, vectors=None):
    """Build the torchtext examples and the text field for the review data.

    Regenerates the cached CSV files via ``prepare_csv()``, reads
    ``cache/dataset_train.csv`` and ``cache/dataset_test.csv`` as
    ``TabularDataset`` objects, and builds the vocabulary of the text field
    from both of them. Results are memoized on disk by ``MEMORY.cache``.

    Args:
        fix_length: Passed to ``data.Field(fix_length=...)`` for the text field.
        lower: Whether the text field lower-cases its input. Forced to
            ``True`` whenever ``vectors`` is given.
        vectors: Optional pretrained vectors forwarded to ``build_vocab``.

    Returns:
        tuple: ``(train.examples, test.examples, review)`` where ``review``
        is the text ``data.Field`` with its vocabulary built.
    """
    if vectors is not None:
        # pretrain vectors only support all lower case
        lower = True
    LOGGER.debug("Preparing CSV files...")
    prepare_csv()
    # Fix: this field used to be bound to the name `comment`, while every
    # later reference (fields lists, build_vocab, return) used `review`,
    # which raised NameError on the first call.
    review = data.Field(
        sequential=True,
        fix_length=fix_length,
        tokenize=tokenizer,
        pad_first=True,
        tensor_type=torch.cuda.LongTensor,
        lower=lower
    )
    # Labels are taken as-is from the CSV (no vocabulary, not sequential).
    label = data.Field(
        use_vocab=False, sequential=False, tensor_type=torch.cuda.ByteTensor)
    LOGGER.debug("Reading train csv file...")
    train = data.TabularDataset(
        path='cache/dataset_train.csv', format='csv', skip_header=True,
        fields=[
#             ('id', None),
            ('text', review),
            ('label', label),
        ])
    LOGGER.debug("Reading test csv file...")
    test = data.TabularDataset(
        path='cache/dataset_test.csv', format='csv', skip_header=True,
        fields=[
#             ('id', None),
            ('text', review)
        ])
    LOGGER.debug("Building vocabulary...")
    # Vocabulary is built over train and test text together.
    review.build_vocab(
        train, test,
        max_size=20000,
        min_freq=50,
        vectors=vectors
    )
    LOGGER.debug("Done preparing the datasets")

    return train.examples, test.examples, review

In [9]:
def get_dataset(fix_length=100, lower=False, vectors=None, n_folds=3, seed=37):
    """Return K-fold train/validation datasets plus the test dataset.

    Args:
        fix_length: Forwarded to ``read_files``.
        lower: Forwarded to ``read_files``.
        vectors: Forwarded to ``read_files``.
        n_folds: Number of cross-validation folds.
        seed: Seed for the shuffled fold assignment.

    Returns:
        tuple: ``(folds, test)`` where ``folds`` is a generator yielding
        ``(train_dataset, val_dataset)`` pairs of ``data.Dataset`` (one pair
        per fold) and ``test`` is a ``data.Dataset`` with only the ``text``
        field.
    """
    train_exs, test_exs, review = read_files(
        fix_length=fix_length, lower=lower, vectors=vectors)

    # shuffle=True is required for random_state to have any effect; recent
    # scikit-learn versions raise ValueError when random_state is set while
    # shuffle is left at False.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)

    fields = [
#         ('id', None),
        ('text', review),
        ('label', data.Field(
            use_vocab=False, sequential=False, tensor_type=torch.cuda.ByteTensor)),
   ]

    def iter_folds():
        # Object array so the fold index arrays can be used for selection.
        train_exs_arr = np.array(train_exs)
        for train_idx, val_idx in kf.split(train_exs_arr):
            yield (
                data.Dataset(train_exs_arr[train_idx], fields),
                data.Dataset(train_exs_arr[val_idx], fields),
            )

    # Test examples carry no label, so expose only the text field. The old
    # slice `fields[:2]` dates from when ('id', None) was the first entry
    # and now wrongly included the label field.
    test = data.Dataset(test_exs, fields[:1])
    return iter_folds(), test

In [10]:
def get_iterator(dataset, batch_size, train=True, shuffle=True, repeat=False, device=0):
    """Create a ``data.Iterator`` over ``dataset`` with sorting disabled.

    Args:
        dataset: The torchtext dataset to batch.
        batch_size: Number of examples per batch.
        train: Forwarded to ``data.Iterator(train=...)``.
        shuffle: Forwarded to ``data.Iterator(shuffle=...)``.
        repeat: Forwarded to ``data.Iterator(repeat=...)``.
        device: Forwarded to ``data.Iterator(device=...)``. Defaults to ``0``,
            the value that was previously hard-coded.
            NOTE(review): the accepted values depend on the installed
            torchtext version -- check its Iterator docs before passing
            anything other than the default.

    Returns:
        data.Iterator: The configured iterator.
    """
    return data.Iterator(
        dataset, batch_size=batch_size, device=device,
        train=train, shuffle=shuffle, repeat=repeat,
        sort=False
    )

In [11]:
# Demo: iterate over training batches of the first cross-validation fold.
# The previous version of this cell referenced `self`, an undefined
# `batch_size`, and batch attributes (comment_text, toxic, ...) that do not
# exist for the fields defined in this notebook ('text' and 'label').
BATCH_SIZE = 32  # arbitrary demo value; tune as needed

fold_datasets, test_dataset = get_dataset()
train_dataset, val_dataset = next(fold_datasets)

for examples in get_iterator(
            train_dataset, BATCH_SIZE, train=True,
            shuffle=True, repeat=False
        ):
    # Batch attributes are named after the dataset fields.
    x = examples.text  # presumably (fix_length, batch_size) -- TODO confirm
    y = examples.label