In [1]:
import pandas as pd
import numpy as np
# Fraction of the shuffled training rows that prepare_csv() uses to size the validation split.
VAL_RATIO = 0.2

def prepare_csv(seed=999):
    """Write cleaned train/validation/test CSV files into ``cache/``.

    Reads ``data/train.csv`` and ``data/test.csv``, replaces every newline in
    the ``comment_text`` column with a space, and writes:

    * ``cache/dataset_train.csv`` -- the training rows left after the
      validation rows are removed,
    * ``cache/dataset_val.csv``   -- ``int(n_rows * VAL_RATIO)`` randomly
      chosen training rows,
    * ``cache/dataset_test.csv``  -- the cleaned test set.

    Args:
        seed: Seed passed to ``np.random.seed`` before shuffling, so the
            train/validation split is reproducible. Note that this reseeds
            NumPy's global random state as a side effect.

    Returns:
        None. The results are the three files written to ``cache/``.
    """
    import os

    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs('cache', exist_ok=True)

    df_train = pd.read_csv('data/train.csv')
    df_train['comment_text'] = df_train.comment_text.str.replace('\n', ' ')

    # Shuffle row positions, then carve the first `val_size` of them off as
    # the validation set; the remainder is the training set.
    idx = np.arange(df_train.shape[0])
    np.random.seed(seed)
    np.random.shuffle(idx)
    val_size = int(len(idx) * VAL_RATIO)
    df_train.iloc[idx[val_size:], :].to_csv('cache/dataset_train.csv', index=False)
    df_train.iloc[idx[:val_size], :].to_csv('cache/dataset_val.csv', index=False)

    df_test = pd.read_csv('data/test.csv')
    df_test['comment_text'] = df_test.comment_text.str.replace('\n', ' ')
    df_test.to_csv('cache/dataset_test.csv', index=False)

In [2]:
import re
import spacy

# spaCy pipeline loaded once when this cell runs; tokenizer() only calls its .tokenizer.
# NOTE(review): the 'en' shortcut name is not accepted by spaCy 3.x -- confirm the installed version.
NLP = spacy.load('en')
# Comments longer than this many characters are cut off by tokenizer() before tokenization.
MAX_CHARS=20000

def tokenizer(comment):
    """Clean a raw comment and split it into a list of token strings.

    The input is coerced to ``str``, a fixed sequence of regex substitutions
    is applied, the result is truncated to ``MAX_CHARS`` characters, and the
    text is run through ``NLP.tokenizer``. Tokens whose text is exactly a
    single space are dropped.

    Args:
        comment: The raw comment; any object accepted by ``str()``.

    Returns:
        list of str: The text of each remaining token, in order.
    """
    substitutions = (
        # Punctuation / symbol characters are replaced by a space.
        (r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " "),
        # Collapse runs of spaces.
        (r"[ ]+", " "),
        # NOTE(review): '!' is already replaced by the first pattern, so this
        # pass leaves the text unchanged; kept to preserve behavior.
        (r"\!+", "!"),
        # Collapse repeated commas / question marks.
        (r"\,+", ","),
        (r"\?+", "?"),
    )
    text = str(comment)
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Slicing is a no-op for text that is already short enough.
    text = text[:MAX_CHARS]

    tokens = []
    for token in NLP.tokenizer(text):
        if token.text != ' ':
            tokens.append(token.text)
    return tokens

In [None]:
import logging
import torch
from torchtext import data

# Logger named 'toxic_dataset'; get_dataset() emits debug-level progress messages through it.
LOGGER = logging.getLogger('toxic_dataset')

def get_dataset(fix_length=100, lower=False, vectors=None):
    """Build the train, validation and test torchtext datasets.

    Calls ``prepare_csv()`` to (re)generate the cached CSV files, reads them
    with ``data.TabularDataset`` and builds the vocabulary of the shared
    ``comment_text`` field from all three datasets.

    Args:
        fix_length: Passed to the comment ``data.Field`` as ``fix_length``.
        lower: Passed to the comment ``data.Field`` as ``lower``. Forced to
            ``True`` whenever ``vectors`` is given.
        vectors: Passed to ``build_vocab`` as ``vectors``; ``None`` means no
            pretrained vectors.

    Returns:
        tuple: ``(train, val, test)`` datasets. ``train`` and ``val`` carry
        the comment text plus six label fields; ``test`` carries only the
        comment text. The ``id`` column is ignored in all of them.
    """
    if vectors is not None:
        # pretrain vectors only supports all lower case
        lower = True

    LOGGER.debug('Preparing CSV files...')
    prepare_csv()

    # NOTE(review): `tensor_type` is the argument name used by early torchtext
    # releases; newer releases appear to use `dtype` -- confirm against the
    # installed version.
    comment = data.Field(
        sequential=True, fix_length=fix_length, tokenize=tokenizer,
        pad_first=True, tensor_type=torch.cuda.LongTensor, lower=lower
    )

    # One independent numeric (no-vocab) field per label column, listed in the
    # order they are mapped onto the CSV columns after id and comment_text.
    label_names = (
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    )
    label_fields = [
        (name, data.Field(use_vocab=False, sequential=False,
                          tensor_type=torch.cuda.ByteTensor))
        for name in label_names
    ]

    LOGGER.debug('Reading train csv file...')
    train, val = data.TabularDataset.splits(
        path='cache/', format='csv', skip_header=True,
        train='dataset_train.csv', validation='dataset_val.csv',
        fields=[('id', None), ('comment_text', comment)] + label_fields
    )

    LOGGER.debug('Reading test csv file...')
    test = data.TabularDataset(
        path='cache/dataset_test.csv', format='csv', skip_header=True,
        fields=[
            ('id', None),
            ('comment_text', comment)
        ]
    )

    LOGGER.debug('Building vocabulary...')
    comment.build_vocab(
        train, val, test,
        max_size=20000,
        min_freq=50,
        vectors=vectors
    )
    LOGGER.debug('Done preparing the datasets')
    return train, val, test
    