In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
import argparse
import json
import math
import os
import sys
import unidecode
import random
import re
import time
import yaml
from abc import ABCMeta, abstractmethod
from collections import defaultdict, Counter
from copy import deepcopy
from functools import partial
from multiprocessing import Pool
from pathlib import Path

import nltk
import gensim
import sklearn
import torch
import torch.nn.functional as F
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import KeyedVectors
from gensim.models import Word2Vec, Doc2Vec, FastText
from sklearn import metrics
from torch import nn
from torch.utils.data import DataLoader, WeightedRandomSampler
from tqdm import tqdm
%load_ext Cython

In [None]:
class QIQCDataset(object):
    """Wrap a question DataFrame and turn it into torch tensors.

    ``build`` reads ``self.tids`` (and ``self._X2`` when present); this class
    never assigns either of them, so callers have to set them on the
    instance before calling ``build``.
    """

    def __init__(self, df):
        self.df = df

    @property
    def tokens(self):
        """Values of the ``tokens`` column as a NumPy array."""
        return self.df.tokens.values

    @tokens.setter
    def tokens(self, tokens):
        self.df['tokens'] = tokens

    @property
    def positives(self):
        """Rows of the wrapped frame whose ``target`` equals 1."""
        return self.df[self.df.target == 1]

    @property
    def negatives(self):
        """Rows of the wrapped frame whose ``target`` equals 0."""
        return self.df[self.df.target == 0]

    @staticmethod
    def _to_tensor(values, dtype, device):
        """Convert ``values`` to a tensor of ``dtype`` placed on ``device``."""
        return torch.Tensor(values).type(dtype).to(device)

    def build(self, device):
        """Materialise ``X``, ``X2`` and (for labelled data) ``t`` and ``W``.

        Args:
            device: anything accepted by ``torch.Tensor.to``.
        """
        self._X = self.tids
        self.X = self._to_tensor(self._X, torch.long, device)
        if 'target' in self.df:
            # Index the underlying ndarray: multi-dimensional indexing on a
            # Series (``series[:, None]``) is no longer supported by pandas.
            self._t = self.df.target.values[:, None]
            self._W = self.df.weights.values
            self.t = self._to_tensor(self._t, torch.float, device)
            self.W = self._to_tensor(self._W, torch.float, device)
        if not hasattr(self, '_X2'):
            # No sentence-level features were attached: use one zero column.
            self._X2 = np.zeros((self._X.shape[0], 1), 'f')
        self.X2 = self._to_tensor(self._X2, torch.float, device)

    def build_labeled_dataset(self, indices):
        """Return a ``TensorDataset`` of (X, X2, t, W) restricted to ``indices``."""
        return torch.utils.data.TensorDataset(
            self.X[indices], self.X2[indices],
            self.t[indices], self.W[indices])

In [None]:
def load_qiqc(n_rows=None, input_dir="../input"):
    """Load the training and submission CSV files.

    Args:
        n_rows: optional row limit forwarded to ``pd.read_csv`` for both files.
        input_dir: directory holding ``train.csv`` and ``test.csv``; defaults
            to the location this notebook originally hard-coded.

    Returns:
        ``(train_df, submit_df)``. ``train_df.target`` is cast to float32 and
        a ``weights`` column holding ``1 / count(label)`` per row is added.
    """
    input_dir = Path(input_dir)
    train_df = pd.read_csv(input_dir / "train.csv", nrows=n_rows)
    submit_df = pd.read_csv(input_dir / "test.csv", nrows=n_rows)
    # Count labels before the float cast; the float labels 0.0 / 1.0 hash and
    # compare equal to the integer keys below, so the lookup still works.
    n_labels = {
        0: (train_df.target == 0).sum(),
        1: (train_df.target == 1).sum(),
    }
    train_df['target'] = train_df.target.astype('f')
    train_df['weights'] = train_df.target.apply(lambda t: 1 / n_labels[t])

    return train_df, submit_df


def build_datasets(train_df, submit_df, holdout=False, seed=0):
    """Wrap the frames in ``QIQCDataset`` objects, optionally holding out 10%.

    Args:
        train_df: labelled frame with a ``target`` column.
        submit_df: unlabelled submission frame.
        holdout: when true, a stratified 90/10 split of ``train_df`` is used
            as train/test; otherwise the test dataset is empty.
        seed: random state of the stratified split.

    Returns:
        ``(train_dataset, test_dataset, submit_dataset)``.
    """
    submit_dataset = QIQCDataset(submit_df)
    if holdout:
        # Import the submodule explicitly: a bare ``import sklearn`` does not
        # guarantee that ``sklearn.model_selection`` has been loaded.
        from sklearn.model_selection import StratifiedShuffleSplit

        # Train : Test split for holdout training
        splitter = StratifiedShuffleSplit(
            n_splits=1, test_size=0.1, random_state=seed)
        train_indices, test_indices = next(splitter.split(
            train_df, train_df.target))
        # Restore the original row order inside each partition.
        train_indices.sort()
        test_indices.sort()
        train_dataset = QIQCDataset(
            train_df.iloc[train_indices].reset_index(drop=True))
        test_dataset = QIQCDataset(
            train_df.iloc[test_indices].reset_index(drop=True))
    else:
        train_dataset = QIQCDataset(train_df)
        # Empty frame with the same columns as the training data.
        test_dataset = QIQCDataset(train_df.head(0))

    return train_dataset, test_dataset, submit_dataset

In [None]:
%%cython
import re

# Registries for preprocessing
NORMALIZER_REGISTRY = {}
#TOKENIZER_REGISTRY = {}
#WORD_EMBEDDING_FEATURIZER_REGISTRY = {}
WORD_EXTRA_FEATURIZER_REGISTRY = {}
SENTENCE_EXTRA_FEATURIZER_REGISTRY = {}

# Registries for training
ENCODER_REGISTRY = {}
AGGREGATOR_REGISTRY = {}
ATTENTION_REGISTRY = {}


def _registrar(registry, name):
    """Return a decorator storing its argument in ``registry[name]``.

    The decorated object is returned unchanged, so the decorator can also be
    called directly on an already-built instance or function.
    """
    def register_cls(cls):
        registry[name] = cls
        return cls
    return register_cls


def register_preprocessor(name):
    """Decorator factory registering a text normalizer under ``name``."""
    return _registrar(NORMALIZER_REGISTRY, name)


# The tokenizer registry and ``register_tokenizer`` are defined in a
# plain-Python cell of this notebook (the copy here was disabled).


def register_word_extra_features(name):
    """Decorator factory registering a word-level extra featurizer."""
    return _registrar(WORD_EXTRA_FEATURIZER_REGISTRY, name)


def register_sentence_extra_features(name):
    """Decorator factory registering a sentence-level extra featurizer."""
    return _registrar(SENTENCE_EXTRA_FEATURIZER_REGISTRY, name)


def register_encoder(name):
    """Decorator factory registering an encoder under ``name``."""
    return _registrar(ENCODER_REGISTRY, name)


def register_aggregator(name):
    """Decorator factory registering an aggregator under ``name``."""
    return _registrar(AGGREGATOR_REGISTRY, name)


def register_attention(name):
    """Decorator factory registering an attention module under ``name``."""
    return _registrar(ATTENTION_REGISTRY, name)



In [None]:
%%cython
import re



cpdef str cylower(str x):
    """Return ``x`` converted to lower case."""
    return x.lower()

cdef class StringReplacer:
    """Apply a table of literal substring replacements, in insertion order.

    ``rule`` maps each substring to its replacement. Rules are applied one
    after another, so a later rule sees the output of the earlier ones.
    """
    # Attributes of an extension type are declared with ``cdef``; ``cpdef``
    # is only meaningful for functions and is rejected by newer Cython.
    cdef public dict rule
    cdef list keys
    cdef list values
    cdef int n_rules

    def __init__(self, dict rule):
        self.rule = rule
        self.keys = list(rule.keys())
        self.values = list(rule.values())
        self.n_rules = len(rule)

    def __call__(self, str x):
        """Return ``x`` with every rule applied in order."""
        cdef int i
        for i in range(self.n_rules):
            # Cheap membership test first so untouched strings are not copied.
            if self.keys[i] in x:
                x = x.replace(self.keys[i], self.values[i])
        return x

    def __getstate__(self):
        # Explicit state so instances (and Python subclasses) can be pickled.
        return (self.rule, self.keys, self.values, self.n_rules)

    def __setstate__(self, state):
        self.rule, self.keys, self.values, self.n_rules = state
        
        
class PunctSpacer(StringReplacer):
    """Surround punctuation and symbol characters with spaces.

    With ``edge_only`` only occurrences that already touch a space on one
    side are padded; otherwise every occurrence is padded on both sides.
    """

    def __init__(self, edge_only=False):
        puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', '█', '½', '…', '“', '★', '”', '–', '●', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', '¯', '♦', '¤', '▲', '¸', '¾', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]  # NOQA
        if not edge_only:
            rule = {p: f' {p} ' for p in puncts}
        else:
            # Leading-space variants first, then trailing-space variants.
            rule = {f' {p}': f' {p} ' for p in puncts}
            rule.update({f'{p} ': f' {p} ' for p in puncts})
        super().__init__(rule)
        
        
        

In [None]:
%%cython
import re
# Per-section cache of unidecode data tables (None when a section is missing).
Cache = {}
is_alphabet = re.compile(r'[a-zA-Z]')

cpdef str unidecode_weak(str string):
    """Transliterate only symbol-like non-ASCII characters.

    ASCII characters are kept as they are. For any other character the
    ``unidecode.xNNN`` data table of its code-point section is consulted:

    * if the entry is unknown (``'[?]'`` or ``None``) or starts with an
      ASCII letter, the original character is kept, padded with one space
      on each side;
    * otherwise the table entry replaces the character.

    Characters above U+EFFFF, characters whose section table cannot be
    imported, and characters beyond the end of their table are dropped.
    """

    cdef list retval = []
    cdef int i = 0
    cdef int n = len(string)
    cdef str char

    for i in range(n):
        char = string[i]
        codepoint = ord(char)

        if codepoint < 0x80: # Basic ASCII
            retval.append(char)
            continue

        if codepoint > 0xeffff:
            continue  # Characters in Private Use Area and above are ignored

        section = codepoint >> 8   # Chop off the last two hex digits
        position = codepoint % 256 # Last two hex digits

        try:
            table = Cache[section]
        except KeyError:
            try:
                mod = __import__('unidecode.x%03x'%(section), [], [], ['data'])
            except ImportError:
                Cache[section] = None
                continue   # No match: ignore this character and carry on.

            Cache[section] = table = mod.data

        if table and len(table) > position:
            replacement = table[position]
            # Tables may mark "no transliteration" with None instead of the
            # '[?]' string; regex-matching None would raise TypeError.
            if (replacement is None or replacement == '[?]'
                    or is_alphabet.match(replacement)):
                retval.append(' ' + char + ' ')
            else:
                retval.append(replacement)

    return ''.join(retval)



In [None]:
%%cython
import re
cdef class RegExpReplacer:
    """Replace every regular-expression match according to a rule table.

    ``rule`` maps regex patterns to replacement strings. All patterns are
    joined into one alternation; each match is then rewritten either by a
    direct lookup (when the matched text is itself a key) or by applying
    every rule to the matched text in insertion order.
    """
    # ``readonly`` exposes ``rule`` to Python code without declaring a
    # property that shares the attribute's name.
    cdef readonly dict rule
    cdef list keys
    cdef list values
    cdef regexp
    cdef int n_rules

    def __init__(self, dict rule):
        self.rule = rule
        self.keys = list(rule.keys())
        self.values = list(rule.values())
        self.regexp = re.compile('(%s)' % '|'.join(self.keys))
        self.n_rules = len(rule)

    def __call__(self, str x):
        """Return ``x`` with every match of any rule pattern replaced."""
        def replace(match):
            matched = match.group(0)
            if matched in self.rule:
                # The matched text is literally one of the keys.
                return self.rule[matched]
            # Otherwise rewrite the matched text with each rule in turn.
            for i in range(self.n_rules):
                matched = re.sub(self.keys[i], self.values[i], matched)
            return matched
        return self.regexp.sub(replace, x)


In [None]:


class NumberReplacer(RegExpReplacer):
    """Mask runs of two or more digits with ``#`` placeholders.

    Runs of 5+ digits become ``#####``; runs of 4, 3 and 2 digits become the
    same number of ``#``. With ``with_underscore`` each placeholder is
    wrapped as ``' __' + placeholder + '__ '``.
    """

    def __init__(self, with_underscore=False):
        if with_underscore:
            prefix, suffix = ' __', '__ '
        else:
            prefix, suffix = '', ''

        def mask(width):
            return prefix + '#' * width + suffix

        # Longest pattern first: the order fixes the regex alternation.
        rule = {
            '[0-9]{5,}': mask(5),
            '[0-9]{4}': mask(4),
            '[0-9]{3}': mask(3),
            '[0-9]{2}': mask(2),
        }
        super().__init__(rule)

In [None]:
class MisspellReplacer(StringReplacer):
    """Expand contractions and rewrite selected spellings and typos.

    The table is handed to ``StringReplacer``, which applies the entries as
    plain substring replacements in insertion order. The order is therefore
    significant: longer forms are listed before the shorter forms they
    contain (for example ``"i'd've"`` before ``"i'd"``).
    """

    def __init__(self):
        # NOTE(review): matching is by substring, so short keys such as
        # "doi" also fire inside longer words (e.g. "doing") -- confirm
        # that this is intended.
        rule = {
            # Contractions
            "ain't": "is not",
            "aren't": "are not",
            "can't": "cannot",
            "'cause": "because",
            "could've": "could have",
            "couldn't": "could not",
            "didn't": "did not",
            "doesn't": "does not",
            "don't": "do not",
            "hadn't": "had not",
            "hasn't": "has not",
            "haven't": "have not",
            "he'd": "he would",
            "he'll": "he will",
            "he's": "he is",
            "how'd'y": "how do you",
            "how'd": "how did",
            "how'll": "how will",
            "how's": "how is",
            "i'd've": "i would have",
            "i'd": "i would",
            "i'll've": "i will have",
            "i'll": "i will",
            "i'm": "i am",
            "i've": "i have",
            "isn't": "is not",
            "it'd've": "it would have",
            "it'd": "it would",
            "it'll've": "it will have",
            "it'll": "it will",
            "it's": "it is",
            "let's": "let us",
            "ma'am": "madam",
            "mayn't": "may not",
            "might've": "might have",
            "mightn't've": "might not have",
            "mightn't": "might not",
            "must've": "must have",
            "mustn't've": "must not have",
            "mustn't": "must not",
            "needn't've": "need not have",
            "needn't": "need not",
            "o'clock": "of the clock",
            "oughtn't've": "ought not have",
            "oughtn't": "ought not",
            "shan't've": "shall not have",
            "shan't": "shall not",
            "sha'n't": "shall not",
            "she'd've": "she would have",
            "she'd": "she would",
            "she'll've": "she will have",
            "she'll": "she will",
            "she's": "she is",
            "should've": "should have",
            "shouldn't've": "should not have",
            "shouldn't": "should not",
            "so've": "so have",
            "so's": "so as",
            "this's": "this is",
            "that'd've": "that would have",
            "that'd": "that would",
            "that's": "that is",
            "there'd've": "there would have",
            "there'd": "there would",
            "there's": "there is",
            "here's": "here is",
            "they'd've": "they would have",
            "they'd": "they would",
            "they'll've": "they will have",
            "they'll": "they will",
            "they're": "they are",
            "they've": "they have",
            "to've": "to have",
            "wasn't": "was not",
            "we'd've": "we would have",
            "we'd": "we would",
            "we'll've": "we will have",
            "we'll": "we will",
            "we're": "we are",
            "we've": "we have",
            "weren't": "were not",
            "what'll've": "what will have",
            "what'll": "what will",
            "what're": "what are",
            "what's": "what is",
            "what've": "what have",
            "when's": "when is",
            "when've": "when have",
            "where'd": "where did",
            "where's": "where is",
            "where've": "where have",
            "who'll've": "who will have",
            "who'll": "who will",
            "who's": "who is",
            "who've": "who have",
            "why's": "why is",
            "why've": "why have",
            "will've": "will have",
            "won't've": "will not have",
            "won't": "will not",
            "would've": "would have",
            "wouldn't've": "would not have",
            "wouldn't": "would not",
            "y'all'd've": "you all would have",
            "y'all'd": "you all would",
            "y'all're": "you all are",
            "y'all've": "you all have",
            "y'all": "you all",
            "you'd've": "you would have",
            "you'd": "you would",
            "you'll've": "you will have",
            "you'll": "you will",
            "you're": "you are",
            "you've": "you have",
            # Spelling variants
            "colour": "color",
            "centre": "center",
            "favourite": "favorite",
            "travelling": "traveling",
            "counselling": "counseling",
            "theatre": "theater",
            "cancelled": "canceled",
            "labour": "labor",
            "organisation": "organization",
            # Abbreviations, typos and run-together words
            "wwii": "world war 2",
            "citicise": "criticize",
            "youtu ": "youtube ",
            "qoura": "quora",
            "sallary": "salary",
            "whta": "what",
            "narcisist": "narcissist",
            "howdo": "how do",
            "whatare": "what are",
            "howcan": "how can",
            "howmuch": "how much",
            "howmany": "how many",
            "whydo": "why do",
            "doi": "do i",
            "thebest": "the best",
            "howdoes": "how does",
            "mastrubation": "masturbation",
            "mastrubate": "masturbate",
            "mastrubating": "masturbating",
            "pennis": "penis",
            "etherium": "ethereum",
            "narcissit": "narcissist",
            "bigdata": "big data",
            "2k17": "2017",
            "2k18": "2018",
            "qouta": "quota",
            "exboyfriend": "ex boyfriend",
            "airhostess": "air hostess",
            "whst": "what",
            "watsapp": "whatsapp",
            "demonitisation": "demonetization",
            "demonitization": "demonetization",
            "demonetisation": "demonetization",
        }
        super().__init__(rule)
        

In [None]:
class KerasFilterReplacer(StringReplacer):
    """Replace each character of the filter string with a single space."""

    def __init__(self):
        filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
        # One rule per character, in the order the characters appear.
        super().__init__(dict.fromkeys(filters, ' '))

In [None]:
# Populate NORMALIZER_REGISTRY. Each entry is a ready-to-call object (a
# function or an already-constructed replacer), not a class.
register_preprocessor('lower')(cylower)
register_preprocessor('punct')(PunctSpacer())
# Disabled: full unidecode transliteration.
#register_preprocessor('unidecode')(unidecode)
register_preprocessor('unidecode_weak')(unidecode_weak)
register_preprocessor('number')(NumberReplacer())
register_preprocessor('number+underscore')(
    NumberReplacer(with_underscore=True))
register_preprocessor('misspell')(MisspellReplacer())
register_preprocessor('keras')(KerasFilterReplacer())

In [None]:
%%cython
cpdef list cysplit(str x):
    """Split ``x`` on runs of whitespace and return the pieces as a list."""
    return x.split()

In [None]:
# Maps a tokenizer name to the callable implementing it.
TOKENIZER_REGISTRY = dict()


def register_tokenizer(name):
    """Return a decorator that stores its argument under ``name``.

    The decorated object is handed back unchanged.
    """
    def _store(tokenizer):
        TOKENIZER_REGISTRY[name] = tokenizer
        return tokenizer

    return _store

In [None]:
# Populate TOKENIZER_REGISTRY with the whitespace splitter and NLTK's
# word tokenizer.
register_tokenizer('space')(cysplit)
register_tokenizer('word_tokenize')(nltk.word_tokenize)

In [None]:
class TextNormalizerWrapper(object):
    """Chain the normalizers named in ``config.normalizers``.

    Subclasses are expected to provide ``default_config`` as a dict before
    ``add_args`` is called.
    """

    registry = NORMALIZER_REGISTRY
    default_config = None

    def __init__(self, config):
        lookup = self.registry
        self.normalizers = [lookup[name] for name in config.normalizers]

    @classmethod
    def add_args(cls, parser):
        """Add ``--normalizers`` to ``parser`` and install the defaults."""
        defaults = cls.default_config
        assert isinstance(defaults, dict)
        parser.add_argument(
            '--normalizers', nargs='+', choices=cls.registry)
        parser.set_defaults(**defaults)

    def __call__(self, x):
        """Pass ``x`` through every normalizer in order."""
        text = x
        for step in self.normalizers:
            text = step(text)
        return text





class TextNormalizerPresets(TextNormalizerWrapper):
    """Default normalizer chain; the listed order is the execution order."""

    default_config = {
        'normalizers': [
            'lower',
            'misspell',
            'punct',
            'number+underscore',
        ],
    }




class TextNormalizer(TextNormalizerPresets):
    """Normalizer used by this notebook; inherits the presets unchanged."""
    pass



In [None]:
class ExperimentConfigBuilderBase(metaclass=ABCMeta):
    """Assemble the experiment configuration from argparse arguments.

    Subclasses provide ``default_config`` (a dict of argparse defaults) and
    a ``modules`` property listing classes that expose ``add_args(parser)``
    and, optionally, ``add_extra_args(parser, config)``.
    """

    default_config = None

    def add_args(self, parser):
        """Register the experiment-wide command-line options."""
        parser.add_argument('--modelfile', '-m', type=Path)
        parser.add_argument('--outdir-top', type=Path, default=Path('results'))
        parser.add_argument('--outdir-bottom', type=str, default='default')
        parser.add_argument('--device', '-g', type=int)
        parser.add_argument('--test', action='store_true')
        parser.add_argument('--logging', action='store_true')
        parser.add_argument('--n-rows', type=int)

        parser.add_argument('--seed', type=int, default=1029)
        parser.add_argument('--optuna-trials', type=int)
        parser.add_argument('--gridsearch', action='store_true')
        parser.add_argument('--holdout', action='store_true')
        parser.add_argument('--cv', type=int, default=5)
        parser.add_argument('--cv-part', type=int)
        parser.add_argument('--processes', type=int, default=2)

        parser.add_argument('--lr', type=float, default=1e-3)
        parser.add_argument('--batchsize', type=int, default=512)
        parser.add_argument('--batchsize-valid', type=int, default=1024)
        parser.add_argument('--scale-batchsize', type=int, nargs='+',
                            default=[])
        parser.add_argument('--epochs', type=int, default=5)
        parser.add_argument('--validate-from', type=int)
        parser.add_argument('--pos-weight', type=float, default=1.)
        # maxlen is used as an array dimension, so it has to parse as int.
        parser.add_argument('--maxlen', type=int, default=72)
        parser.add_argument('--vocab-mincount', type=float, default=5)
        parser.add_argument('--ensembler-n-snapshots', type=int, default=1)

    @property
    @abstractmethod
    def modules(self):
        """Classes contributing arguments; read as an attribute by ``build``."""
        raise NotImplementedError()

    def build(self, args=None):
        """Parse ``args`` (``sys.argv`` when None) into a config namespace.

        Returns:
            The parsed namespace with an extra ``outdir`` attribute:
            ``outdir_top / modelfile.stem / outdir_bottom`` when a model file
            is given, otherwise the current directory.
        """
        assert self.default_config is not None
        parser = argparse.ArgumentParser()
        self.add_args(parser)
        parser.set_defaults(**self.default_config)

        for module in self.modules:
            module.add_args(parser)
        # First pass tolerates unknown options: modules may still add
        # arguments that depend on the values parsed so far.
        config, _ = parser.parse_known_args(args)

        for module in self.modules:
            if hasattr(module, 'add_extra_args'):
                module.add_extra_args(parser, config)

        if config.test:
            # Small, fast settings for smoke tests; explicit options still win.
            parser.set_defaults(**dict(
                n_rows=500,
                batchsize=64,
                validate_from=0,
                epochs=3,
                cv_part=2,
                ensembler_test_size=1.,
            ))

        config = parser.parse_args(args)
        if config.modelfile is not None:
            config.outdir = config.outdir_top / config.modelfile.stem \
                / config.outdir_bottom
        else:
            config.outdir = Path('.')

        return config

In [None]:
class TextTokenizerWrapper(object):
    """Callable that delegates to the tokenizer named by ``config.tokenizer``."""

    registry = TOKENIZER_REGISTRY
    default_config = None

    def __init__(self, config):
        chosen = config.tokenizer
        self.tokenizer = self.registry[chosen]

    @classmethod
    def add_args(cls, parser):
        """Add ``--tokenizer`` to ``parser`` and install the defaults."""
        defaults = cls.default_config
        assert isinstance(defaults, dict)
        parser.add_argument('--tokenizer', choices=cls.registry)
        parser.set_defaults(**defaults)

    def __call__(self, x):
        tokenize = self.tokenizer
        return tokenize(x)

In [None]:
class TextTokenizerPresets(TextTokenizerWrapper):
    """Default tokenizer choice: the registry entry named ``'space'``."""

    default_config = {'tokenizer': 'space'}


In [None]:
class TextTokenizer(TextTokenizerPresets):
    """Tokenizer used by this notebook; inherits the presets unchanged."""
    pass

In [None]:
class WordExtraFeaturizerWrapper(object):
    """Instantiate and combine the featurizers in ``config.word_extra_features``."""

    registry = WORD_EXTRA_FEATURIZER_REGISTRY
    default_config = None

    def __init__(self, config, vocab):
        self.config = config
        self.vocab = vocab
        # Registry entries are called with no arguments to build each featurizer.
        self.featurizers = {}
        for key in config.word_extra_features:
            self.featurizers[key] = self.registry[key]()

    @classmethod
    def add_args(cls, parser):
        """Add ``--word-extra-features`` to ``parser`` and install the defaults."""
        parser.add_argument(
            '--word-extra-features', nargs='+', choices=cls.registry)
        parser.set_defaults(**cls.default_config)

    def __call__(self, vocab):
        """Concatenate every featurizer's output for ``vocab`` along axis 1."""
        # A zero-width block keeps the call valid with no featurizers.
        blocks = [np.empty([len(vocab), 0])]
        blocks.extend(
            featurize(vocab) for featurize in self.featurizers.values())
        return np.concatenate(blocks, axis=1)
    
    
class WordExtraFeaturizerPresets(WordExtraFeaturizerWrapper):
    """Preset with no word-level extra features."""

    default_config = {'word_extra_features': []}

    
    
class WordExtraFeaturizer(WordExtraFeaturizerPresets):
    """Word featurizer whose defaults select the ``idf`` and ``unk`` entries."""

    default_config = {'word_extra_features': ['idf', 'unk']}

In [None]:
class WordVocab(object):
    """Document-frequency vocabulary with a reserved ``<PAD>`` id of 0.

    ``add_documents`` counts each token at most once per document;
    ``build`` then assigns ids by descending count starting at 1.
    """

    def __init__(self, mincount=1):
        self.mincount = mincount
        self.counter = Counter()
        self.n_documents = 0
        self._counters = {}
        self._n_documents = defaultdict(int)

    def __len__(self):
        return len(self.token2id)

    def add_documents(self, documents, name):
        """Count ``documents`` globally and under the source label ``name``."""
        per_source = self._counters[name] = Counter()
        for document in documents:
            # One occurrence per distinct token, in first-seen order.
            unique_tokens = dict.fromkeys(document, 1)
            per_source.update(unique_tokens)
            self.counter.update(unique_tokens)
            self.n_documents += 1
            self._n_documents[name] += 1

    def build(self):
        """Create ``word_freq``, ``token2id`` and the ``lfq`` / ``hfq`` masks."""
        ranked = self.counter.most_common()

        self.word_freq = {'<PAD>': 0}
        self.word_freq.update(ranked)

        self.token2id = {'<PAD>': 0}
        for index, (word, _) in enumerate(ranked, start=1):
            self.token2id[word] = index

        frequencies = np.array(list(self.word_freq.values()))
        # lfq marks entries counted in fewer than `mincount` documents.
        self.lfq = frequencies < self.mincount
        self.hfq = ~self.lfq

In [None]:
class SentenceExtraFeaturizerWrapper(object):
    """Combine the featurizers in ``config.sentence_extra_features``.

    Each registry entry is instantiated with no arguments and must expose
    an ``n_dims`` attribute; ``n_dims`` of the wrapper is their sum.
    """

    registry = SENTENCE_EXTRA_FEATURIZER_REGISTRY
    default_config = None

    def __init__(self, config):
        self.config = config
        self.featurizers = {
            k: self.registry[k]() for k in config.sentence_extra_features}
        self.n_dims = sum(f.n_dims for f in self.featurizers.values())

    @classmethod
    def add_args(cls, parser):
        """Add ``--sentence-extra-features`` and install the defaults."""
        parser.add_argument(
            '--sentence-extra-features', nargs='+', choices=cls.registry)
        parser.set_defaults(**cls.default_config)

    def __call__(self, sentence):
        """Concatenate every featurizer's output for ``sentence`` along axis 0."""
        # The empty leading array keeps the call valid with no featurizers.
        empty = np.empty((0,))
        return np.concatenate([empty, *[
            f(sentence) for f in self.featurizers.values()]], axis=0)

    def fit_standardize(self, features):
        """Store per-column mean/std of ``features`` and return the z-scores.

        Columns with zero standard deviation use 1 as divisor so they map to
        zeros instead of dividing by zero.
        """
        assert features.ndim == 2
        self.mean = features.mean(axis=0)
        self.std = features.std(axis=0)
        self.std = np.where(self.std != 0, self.std, 1)
        return (features - self.mean) / self.std

    def standardize(self, features):
        """Standardize ``features`` with the statistics from ``fit_standardize``."""
        # Both statistics must exist; previously the second hasattr() was
        # passed as the assertion *message* and therefore never checked.
        assert hasattr(self, 'mean') and hasattr(self, 'std'), \
            'fit_standardize() must be called before standardize()'
        return (features - self.mean) / self.std




class SentenceExtraFeaturizerPresets(SentenceExtraFeaturizerWrapper):
    """Preset with no sentence-level extra features."""

    default_config = {'sentence_extra_features': []}
    
    
class SentenceExtraFeaturizer(SentenceExtraFeaturizerPresets):
    """Sentence featurizer whose defaults select the ``char`` and ``word`` entries."""

    default_config = {'sentence_extra_features': ['char', 'word']}


In [None]:

class ExperimentConfigBuilder(ExperimentConfigBuilderBase):
    """Config builder for this notebook: defaults plus the active modules."""

    default_config = {
        'test': False,
        'device': 0,
        'maxlen': 72,
        'vocab_mincount': 5,
        'scale_batchsize': [],
        'validate_from': 4,
    }

    @property
    def modules(self):
        """Classes whose ``add_args`` contribute options to the parser."""
        # Disabled in this notebook: WordEmbeddingFeaturizer, Embedding,
        # Encoder, Aggregator, MLP.
        active = [
            TextNormalizer,
            TextTokenizer,
            WordExtraFeaturizer,
            SentenceExtraFeaturizer,
        ]
        return active
       


In [None]:
# Build the configuration from defaults only: an empty argument list is
# passed so nothing is read from the kernel's command line.
config = ExperimentConfigBuilder().build(args=[])
print(config)

In [None]:
def set_seed(seed=0):
    """Seed Python, NumPy and torch (CPU and CUDA) with ``seed``.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for seeder in seeders:
        seeder(seed)
    torch.backends.cudnn.deterministic = True

In [None]:
# Record the wall-clock start time and seed all RNGs from the config.
start = time.time()
set_seed(config.seed)

In [None]:
# Load the CSV files (optionally truncated to config.n_rows) and wrap them
# as train / test / submit datasets.
train_df, submit_df = load_qiqc(n_rows=config.n_rows)
datasets = build_datasets(train_df, submit_df, config.holdout, config.seed)
train_dataset, test_dataset, submit_dataset = datasets

In [None]:
class WordbasedPreprocessor():
    """Word-level preprocessing steps shared by the experiments.

    NOTE(review): a later cell of this notebook defines a class with the
    same name; whichever definition runs last is the one in effect.
    """

    def tokenize(self, datasets, normalizer, tokenizer):
        """Normalize and tokenize ``question_text`` of every dataset."""
        pipeline = Pipeline(normalizer, tokenizer)
        runner = ApplyNdArray(pipeline, processes=2, dtype=object)
        return [
            runner(dataset.df.question_text.values) for dataset in datasets]

    def build_vocab(self, datasets, config):
        """Build a ``WordVocab`` over the train, test and submit tokens."""
        train_dataset, test_dataset, submit_dataset = datasets
        vocab = WordVocab(mincount=config.vocab_mincount)
        sources = (
            (train_dataset.positives.tokens, 'train-pos'),
            (train_dataset.negatives.tokens, 'train-neg'),
            (test_dataset.positives.tokens, 'test-pos'),
            (test_dataset.negatives.tokens, 'test-neg'),
            (submit_dataset.df.tokens, 'submit'),
        )
        for documents, label in sources:
            vocab.add_documents(documents, label)
        vocab.build()
        return vocab

    def build_tokenids(self, datasets, vocab, config):
        """Map the tokens of every dataset to padded id rows."""
        def to_padded_ids(tokens):
            ids = [vocab.token2id[token] for token in tokens]
            return pad_sequence(ids, config.maxlen)

        mapper = ApplyNdArray(
            to_padded_ids, processes=1, dtype='i', dims=(config.maxlen,))
        return [mapper(dataset.df.tokens.values) for dataset in datasets]

    def build_sentence_features(self, datasets, sentence_extra_featurizer):
        """Compute sentence features, standardized with the train statistics."""
        # Unpacking doubles as a check that exactly three datasets were given.
        train_dataset, test_dataset, submit_dataset = datasets
        featurize = ApplyNdArray(
            sentence_extra_featurizer, processes=1, dtype='f',
            dims=(sentence_extra_featurizer.n_dims,))
        raw_train, raw_test, raw_submit = [
            featurize(dataset.df.question_text.values)
            for dataset in datasets]
        # Fit on the training split first, then reuse its mean/std.
        train_X2 = sentence_extra_featurizer.fit_standardize(raw_train)
        test_X2 = sentence_extra_featurizer.standardize(raw_test)
        submit_X2 = sentence_extra_featurizer.standardize(raw_submit)
        return train_X2, test_X2, submit_X2
'''
    def build_embedding_matrices(self, datasets, word_embedding_featurizer,
                                 vocab, pretrained_vectors):
        pretrained_vectors_merged = np.stack(
            [wv.vectors for wv in pretrained_vectors.values()]).mean(axis=0)
        vocab.unk = (pretrained_vectors_merged == 0).all(axis=1)
        vocab.known = ~vocab.unk
        embedding_matrices = word_embedding_featurizer(
            pretrained_vectors_merged, datasets)
        return embedding_matrices
'''        

def build_word_features(self,word_extra_features):
    """Return ``word_extra_features`` unchanged.

    NOTE(review): this is defined at module level although it takes
    ``self``; it looks like a method that ended up outside its class --
    confirm whether it is still needed.
    """
    return word_extra_features

In [None]:
class Pipeline(object):
    """Compose callables: the output of each one feeds the next."""

    def __init__(self, *modules):
        self.modules = modules

    def __call__(self, x):
        """Run ``x`` through every module in the order they were given."""
        result = x
        for stage in self.modules:
            result = stage(result)
        return result

In [None]:
class WordbasedPreprocessor():
    """Word-level preprocessing steps shared by the experiments.

    This definition replaces the earlier one of the same name, so it has to
    carry every method callers use: ``build_tokenids`` and
    ``build_sentence_features`` are included here for that reason.
    """

    def tokenize(self, datasets, normalizer, tokenizer):
        """Normalize and tokenize ``question_text`` of every dataset."""
        tokenize = Pipeline(normalizer, tokenizer)
        apply_tokenize = ApplyNdArray(tokenize, processes=2, dtype=object)
        tokens = [apply_tokenize(d.df.question_text.values) for d in datasets]
        return tokens

    def build_vocab(self, datasets, config):
        """Build a ``WordVocab`` over the train, test and submit tokens."""
        train_dataset, test_dataset, submit_dataset = datasets
        vocab = WordVocab(mincount=config.vocab_mincount)
        vocab.add_documents(train_dataset.positives.tokens, 'train-pos')
        vocab.add_documents(train_dataset.negatives.tokens, 'train-neg')
        vocab.add_documents(test_dataset.positives.tokens, 'test-pos')
        vocab.add_documents(test_dataset.negatives.tokens, 'test-neg')
        vocab.add_documents(submit_dataset.df.tokens, 'submit')
        vocab.build()
        return vocab

    def build_tokenids(self, datasets, vocab, config):
        """Map the tokens of every dataset to padded id rows."""
        token2id = lambda xs: pad_sequence(  # NOQA
            [vocab.token2id[x] for x in xs], config.maxlen)
        apply_token2id = ApplyNdArray(
            token2id, processes=1, dtype='i', dims=(config.maxlen,))
        tokenids = [apply_token2id(d.df.tokens.values) for d in datasets]
        return tokenids

    def build_sentence_features(self, datasets, sentence_extra_featurizer):
        """Compute sentence features, standardized with the train statistics."""
        train_dataset, test_dataset, submit_dataset = datasets
        apply_featurize = ApplyNdArray(
            sentence_extra_featurizer, processes=1, dtype='f',
            dims=(sentence_extra_featurizer.n_dims,))
        _X2 = [apply_featurize(d.df.question_text.values) for d in datasets]
        _train_X2, _test_X2, _submit_X2 = _X2
        # Fit mean/std on the training split only, then reuse them.
        train_X2 = sentence_extra_featurizer.fit_standardize(_train_X2)
        test_X2 = sentence_extra_featurizer.standardize(_test_X2)
        submit_X2 = sentence_extra_featurizer.standardize(_submit_X2)
        return train_X2, test_X2, submit_X2

In [None]:

class PreprocessorPresets(WordbasedPreprocessor):
    """Preprocessor preset adding a pass-through word-feature step."""

    def build_word_features(self, word_extra_features):
        """Return ``word_extra_features`` unchanged."""
        return word_extra_features


In [None]:
class Preprocessor(PreprocessorPresets):
    """Preprocessor used by this notebook."""

    # Same body as the inherited implementation.
    def build_word_features(self, word_extra_features):
        """Return ``word_extra_features`` unchanged."""
        return word_extra_features

In [None]:
%%cython
import numpy as np
cimport numpy as np
from multiprocessing import Pool


cdef class ApplyNdArray:
    """Apply ``func`` to every element of an array, optionally in parallel.

    ``dtype`` and ``dims`` describe the result: the output has shape
    ``(len(arr), *dims)`` when ``dims`` is given, otherwise ``(len(arr),)``.
    With ``processes`` other than 1 the input is split into that many
    chunks and mapped over a ``multiprocessing.Pool``.
    """
    cdef func
    cdef dtype
    cdef dims
    cdef int processes

    def __init__(self, func, processes=1, dtype=object, dims=None):
        self.func = func
        self.processes = processes
        self.dtype = dtype
        self.dims = dims

    def __call__(self, arr):
        if self.processes != 1:
            return self.apply_parallel(arr)
        return self.apply(arr)

    cpdef apply(self, arr):
        """Sequentially fill a fresh array with ``func(arr[i])``."""
        cdef int i
        cdef int n = len(arr)
        cdef res
        shape = n if self.dims is None else (n, *self.dims)
        res = np.empty(shape, dtype=self.dtype)
        for i in range(n):
            res[i] = self.func(arr[i])
        return res

    cpdef apply_parallel(self, arr):
        """Run ``apply`` on each chunk in a worker pool and join the results."""
        cdef list chunks = np.array_split(arr, self.processes)
        with Pool(processes=self.processes) as workers:
            partials = workers.map(self.apply, chunks)
        return np.concatenate(partials, axis=0)

In [None]:
%%time
# Announce the stage and create the preprocessor used by the cells below.
print('Tokenize texts...')
preprocessor = Preprocessor()


In [None]:
# Instantiate the normalizer chain and the tokenizer selected by the config.
normalizer = TextNormalizer(config)
tokenizer = TextTokenizer(config)


In [None]:
# Tokenize all three datasets; the `tokens` setter stores each result as a
# `tokens` column on the wrapped DataFrame.
train_dataset.tokens, test_dataset.tokens, submit_dataset.tokens = \
    preprocessor.tokenize(datasets, normalizer, tokenizer)

In [None]:
%%time
# Count document frequencies over the train, test and submit tokens.
print('Build vocabulary...')
vocab = preprocessor.build_vocab(datasets, config)

In [None]:
%%time
print('Build sentence extra features...')
sentence_extra_featurizer = SentenceExtraFeaturizer(config)
train_dataset._X2, test_dataset._X2, submit_dataset._X2 = \
    preprocessor.build_sentence_features(
        datasets, sentence_extra_featurizer)
# NOTE(review): QIQCDataset.build reads `tids`, which no cell visible here
# assigns -- confirm that the token-id step runs before this point.
# Plain loop: build() is called only for its side effects, so there is no
# list of return values to collect or display.
for dataset in datasets:
    dataset.build(config.device)