In [1]:
from __future__ import unicode_literals, print_function
import plac
import spacy
import dill as pickle
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from pathlib import Path

from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural import Model, ReLu, Softmax, Maxout
from thinc.neural import ExtractWindow
from thinc.neural.pooling import Pooling, mean_pool, max_pool
from thinc.neural._classes.static_vectors import StaticVectors, get_word_ids
from thinc.neural._classes.hash_embed import HashEmbed
from thinc.neural._classes.embed import Embed
from thinc.neural._classes.difference import Siamese, CauchySimilarity
from thinc.neural.util import to_categorical
from thinc.neural._classes.batchnorm import BatchNorm as BN
from thinc.neural._classes.resnet import Residual

from thinc.api import layerize, with_flatten, with_getitem, flatten_add_lengths
from thinc.api import add, chain, clone, concatenate, Arg

from thinc.extra import datasets
from thinc.extra.load_nlp import get_spacy, get_vectors


In [2]:
def track_progress(**context):
    '''Build the per-epoch progress reporter.

    Reads 'model', 'train_X', 'dev_X', 'dev_y' and 'trainer' from the keyword
    arguments (any other keys are ignored) and returns a zero-argument
    callback. Each call of the callback evaluates the model on the dev set,
    once with the current weights and once with the optimizer's averaged
    weights, prints a summary line, forwards the numbers to track_stat, and
    then resets the module-level `epoch_train_acc` counter and advances the
    module-level `epoch` counter.
    '''
    model, train_X, dev_X, dev_y = [
        context[key] for key in ('model', 'train_X', 'dev_X', 'dev_y')
    ]
    n_train = len(train_X)
    trainer = context['trainer']

    def each_epoch():
        global epoch_train_acc, epoch
        # Accuracy with the weights as they currently stand.
        raw_acc = model.evaluate(dev_X, dev_y)
        # Accuracy with the optimizer's averaged parameters swapped in.
        with model.use_params(trainer.optimizer.averages):
            averaged_acc = model.evaluate(dev_X, dev_y)
        summary = (
            raw_acc,
            averaged_acc,
            float(epoch_train_acc) / n_train,
            trainer.dropout,
        )
        print("%.3f (%.3f) dev acc, %.3f train acc, %.4f drop" % summary)
        track_stat('dev', epoch, averaged_acc)
        track_stat('dev_raw', epoch, raw_acc)
        track_stat('train', epoch, epoch_train_acc / n_train)
        track_stat('batch_size', epoch, trainer.batch_size)
        # Start the next epoch with a fresh correct-prediction count.
        epoch_train_acc = 0.
        epoch += 1

    return each_epoch

def track_stat(name, i, value):
    '''Send the point (x=i, y=value) to the numeric channel called `name`.

    Does nothing while the module-level CTX is None. Channels are created on
    first use through CTX.job.create_channel and cached in the module-level
    CHANNELS dict, keyed by name.

    NOTE(review): `neptune` is referenced below, but no import for it is
    visible in this notebook's import cell -- confirm it is imported before
    running with a non-None CTX.
    '''
    if CTX is not None:
        if name in CHANNELS:
            channel = CHANNELS[name]
        else:
            channel = CTX.job.create_channel(name, neptune.ChannelType.NUMERIC)
            CHANNELS[name] = channel
        channel.send(x=i, y=value)

def preprocess(ops, nlp, rows, get_ids):
    '''Convert raw text pairs into model inputs and a float32 label array.

    ops: object whose asarray(values, dtype=...) builds the label array.
    nlp: callable applied to each text (the spaCy pipeline in this notebook).
    rows: iterable of ((text1, text2), label) items; consumed in one pass.
    get_ids: callable taking a list of parsed docs and returning one entry
        per doc; only the first entry of each single-doc call is kept.

    Returns (pairs, labels): `pairs` is a list of (ids1, ids2) tuples in row
    order, `labels` is ops.asarray(list_of_labels, dtype='float32').
    '''
    def encode(text):
        # Parse one text and pull out the ids for that single document.
        return get_ids([nlp(text)])[0]

    pairs, labels = [], []
    for (first_text, second_text), label in rows:
        first_ids = encode(first_text)
        second_ids = encode(second_text)
        pairs.append((first_ids, second_ids))
        labels.append(label)
    return pairs, ops.asarray(labels, dtype='float32')

In [5]:
def get_quora_data(src_train, src_test):
    '''Load the training CSV and split it into train and validation frames.

    Rows containing any missing value are dropped before splitting. 15% of
    the remaining rows go to the validation frame, with a fixed random_state
    of 111 so the split is repeatable.

    src_test is accepted but not read by this function.

    Returns (train_frame, validation_frame).
    '''
    complete_rows = pd.read_csv(src_train).dropna()
    train_part, validation_part = train_test_split(
        complete_rows, test_size=0.15, random_state=111)
    return train_part, validation_part

def _frame_to_pairs(frame):
    '''Turn a question-pair DataFrame into ((text1, text2), label) rows.

    Columns are selected by position: the two texts come from the
    third-to-last and second-to-last columns, the label from the column at
    position 1. Labels are converted with int().
    '''
    first_texts = frame.iloc[:, -3]
    second_texts = frame.iloc[:, -2]
    labels = frame.iloc[:, 1]
    # Walk the three columns once instead of doing a scalar .iloc lookup for
    # every cell of every row.
    return [((text1, text2), int(label))
            for text1, text2, label in zip(first_texts, second_texts, labels)]


def _load_dataset(dataset):
    '''Return (train, dev) row lists for the named dataset.

    Raises ValueError when `dataset` is not one of the recognised names.
    '''
    if dataset == 'quora':
        return datasets.quora_questions()
    if dataset == 'quora_mine':
        src_train = '../../../features/df_train_spacylemmat_fullclean.csv'
        src_test = '../../../features/df_test_spacylemmat_fullclean.csv'
        train_frame, dev_frame = get_quora_data(src_train, src_test)
        return _frame_to_pairs(train_frame), _frame_to_pairs(dev_frame)
    if dataset == 'snli':
        return datasets.snli()
    if dataset == 'stackxc':
        return datasets.stack_exchange()
    if dataset in ('quora+snli', 'snli+quora'):
        train, dev = datasets.quora_questions()
        train2, dev2 = datasets.snli()
        train.extend(train2)
        dev.extend(dev2)
        return train, dev
    raise ValueError("Unknown dataset: %s" % dataset)


def train_mine(dataset='quora_mine', width=50, depth=2, min_batch_size=128,
        max_batch_size=128, dropout=0.0, dropout_decay=0.0, pooling="mean+max",
        nb_epoch=30, pieces=3, L2=0.0, use_gpu=False, out_loc=None, quiet=False,
        job_id=None, ws_api_url=None, rest_api_url=None):
    '''Train a siamese sentence-similarity model and optionally pickle it.

    dataset: one of 'quora', 'quora_mine', 'snli', 'stackxc', 'quora+snli'
        or 'snli+quora'.
    width: embedding / hidden width; the sentence vector is width*2 wide.
    depth: stored in the training config (and overridable from the Neptune
        context). The network built here clones its layers a fixed two
        times and does not read this value.
    min_batch_size, max_batch_size: the batch size starts at the minimum and
        grows by 0.1% per update, capped at the maximum.
    dropout, dropout_decay, nb_epoch, L2: forwarded to model.begin_training
        as part of the config dict.
    pooling: 'mean+max', 'mean' or 'max'.
    pieces: number of Maxout pieces.
    use_gpu: when true, switch Model.ops to CupyOps.
    out_loc: path to pickle the trained model to; its parent directory must
        already exist. Nothing is saved when falsy.
    quiet: when true, disable the progress bar.
    job_id: when not None, hyper-parameters are read from a Neptune context.
    ws_api_url, rest_api_url: stored in the config dict; not otherwise used
        in this function.

    Raises IOError if the parent directory of out_loc does not exist, and
    ValueError for an unrecognised pooling or dataset name.
    '''
    global CTX, epoch_train_acc
    if job_id is not None:
        # NOTE(review): no import of `neptune` is visible in this notebook's
        # import cell -- confirm it is available before passing a job_id.
        CTX = neptune.Context()
        width = CTX.params.width
        L2 = CTX.params.L2
        nb_epoch = CTX.params.nb_epoch
        depth = CTX.params.depth
        max_batch_size = CTX.params.max_batch_size
    # Snapshot of the (possibly overridden) arguments. Taken before any other
    # local is created so it holds exactly the function's parameters.
    cfg = dict(locals())

    if out_loc:
        out_loc = Path(out_loc)
        if not out_loc.parent.exists():
            raise IOError("Can't open output location: %s" % out_loc)
    print(cfg)
    if pooling == 'mean+max':
        pool_layer = Pooling(mean_pool, max_pool)
    elif pooling == "mean":
        pool_layer = mean_pool
    elif pooling == "max":
        pool_layer = max_pool
    else:
        raise ValueError("Unrecognised pooling", pooling)

    print("Load spaCy")
    nlp = get_spacy('en')

    if use_gpu:
        Model.ops = CupyOps()

    print("Construct model")
    # Bind operators for the scope of the block:
    # * chain (>>): Compose models in a 'feed forward' style,
    # i.e. chain(f, g)(x) -> g(f(x))
    # * clone (**): Create n copies of a model, and chain them, i.e.
    # (f ** 3)(x) -> f''(f'(f(x))), where f, f' and f'' have distinct weights.
    # * concatenate (|): Merge the outputs of two models into a single vector,
    # i.e. (f|g)(x) -> hstack(f(x), g(x))
    # * add (+): bound to thinc.api.add
    Model.lsuv = True
    with Model.define_operators({'>>': chain, '**': clone, '|': concatenate,
                                 '+': add}):
        mwe_encode = ExtractWindow(nW=1) >> Maxout(width, pieces=pieces)

        embed = (StaticVectors('en', width)
                  + HashEmbed(width, 3000)
                  + HashEmbed(width, 3000))
        sent2vec = ( # List[spacy.token.Doc]{B}
            flatten_add_lengths  # : (ids{T}, lengths{B})
            >> with_getitem(0,      # : word_ids{T}
                 BN(embed, nO=width)
                 >> Residual(mwe_encode ** 2)
            ) # : (floats{T, W}, lengths{B})
            >> pool_layer
            >> Residual(Maxout(width*2, pieces=pieces)**2)
        )
        model = Siamese(sent2vec, CauchySimilarity(width*2))

    print("Read and parse data: %s" % dataset)
    # Every dataset name is dispatched through one chain, so 'quora' is
    # loaded and returned instead of falling through to the "unknown
    # dataset" error after loading.
    train, dev = _load_dataset(dataset)

    get_ids = get_word_ids(Model.ops)
    train_X, train_y = preprocess(model.ops, nlp, train, get_ids)
    dev_X, dev_y = preprocess(model.ops, nlp, dev, get_ids)

    with model.begin_training(train_X, train_y, **cfg) as (trainer, optimizer):
        # Register the per-epoch progress callback, handing it exactly the
        # objects it reads.
        trainer.each_epoch.append(track_progress(
            model=model, train_X=train_X, dev_X=dev_X, dev_y=dev_y,
            trainer=trainer))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        print("Accuracy before training", model.evaluate(dev_X, dev_y))
        print("Train")
        n_iter = 0
        for X, y in trainer.iterate(train_X, train_y, progress_bar=not quiet):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            assert yh.shape == y.shape, (yh.shape, y.shape)
            # No auto-diff: begin_update hands back a callback, and the
            # gradient of the loss is passed through it further down.

            assert (yh >= 0.).all(), yh
            # Count predictions that land on the same side of 0.5 as the label.
            train_acc = ((yh >= 0.5) == (y >= 0.5)).sum()
            loss = ((yh-y)**2).sum() / y.shape[0]
            track_stat('loss', n_iter, loss)
            epoch_train_acc += train_acc
            backprop(yh-y, optimizer)
            n_iter += 1

            # Slightly useful trick: start with low batch size, accelerate.
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
            track_stat('Batch size', n_iter, trainer.batch_size)
        if out_loc:
            # out_loc was already converted to a Path during validation above.
            print('Saving to', out_loc)
            with out_loc.open('wb') as file_:
                pickle.dump(model, file_, -1)

In [6]:
# Module-level state shared by track_stat, track_progress and train_mine.
CTX = None             # Neptune context; train_mine sets it when given a job_id
CHANNELS = dict()      # channel name -> channel object, filled in by track_stat
epoch_train_acc = 0.0  # correct training predictions counted in the current epoch
epoch = 0              # advanced by the per-epoch progress callback

# Train on CPU with the default hyper-parameters and pickle the model.
train_mine(out_loc='siamese_1sttry', use_gpu=False)

{'ws_api_url': None, 'job_id': None, 'dropout_decay': 0.0, 'depth': 2, 'dataset': 'quora_mine', 'quiet': False, 'pieces': 3, 'L2': 0.0, 'dropout': 0.0, 'pooling': 'mean+max', 'min_batch_size': 128, 'max_batch_size': 128, 'nb_epoch': 30, 'width': 50, 'out_loc': 'siamese_1sttry', 'use_gpu': False, 'rest_api_url': None}
Load spaCy
Construct model
Read and parse data: quora_mine


  0%|          | 0/343621 [00:00<?, ?it/s]

Accuracy before training 0.624499084747
Train


343680it [10:10, 674.55it/s]                             
  0%|          | 0/343621 [00:00<?, ?it/s]

0.769 (0.773) dev acc, 0.700 train acc, 0.0000 drop


343680it [10:08, 564.38it/s]                             
  0%|          | 0/343621 [00:00<?, ?it/s]

0.791 (0.800) dev acc, 0.788 train acc, 0.0000 drop


343680it [10:35, 462.36it/s]                             
  0%|          | 0/343621 [00:00<?, ?it/s]

0.808 (0.812) dev acc, 0.811 train acc, 0.0000 drop


343680it [10:23, 582.61it/s]                             
  0%|          | 0/343621 [00:00<?, ?it/s]

0.813 (0.818) dev acc, 0.825 train acc, 0.0000 drop


343680it [10:13, 698.02it/s]                             
  0%|          | 128/343621 [00:00<05:16, 1085.73it/s]

0.815 (0.824) dev acc, 0.835 train acc, 0.0000 drop


343680it [09:34, 767.92it/s]                             
  0%|          | 0/343621 [00:00<?, ?it/s]

0.823 (0.828) dev acc, 0.843 train acc, 0.0000 drop


343680it [09:50, 581.91it/s]                             
  0%|          | 128/343621 [00:00<08:04, 708.29it/s]

0.825 (0.831) dev acc, 0.850 train acc, 0.0000 drop


343680it [08:25, 680.27it/s]                             
  0%|          | 0/343621 [00:00<?, ?it/s]

0.826 (0.833) dev acc, 0.855 train acc, 0.0000 drop


343680it [05:56, 965.13it/s]                             
  0%|          | 0/343621 [00:00<?, ?it/s]

0.827 (0.834) dev acc, 0.860 train acc, 0.0000 drop


343680it [05:51, 978.99it/s]                             
  0%|          | 0/343621 [00:00<?, ?it/s]

0.826 (0.835) dev acc, 0.865 train acc, 0.0000 drop


343680it [05:58, 957.46it/s]                             
  0%|          | 128/343621 [00:00<05:06, 1120.07it/s]

0.828 (0.835) dev acc, 0.869 train acc, 0.0000 drop


343680it [05:27, 1050.84it/s]                            
  0%|          | 0/343621 [00:00<?, ?it/s]

0.829 (0.836) dev acc, 0.873 train acc, 0.0000 drop


343680it [06:30, 880.19it/s]                             
  0%|          | 0/343621 [00:00<?, ?it/s]

0.829 (0.836) dev acc, 0.876 train acc, 0.0000 drop


343680it [06:28, 895.27it/s]                             
  0%|          | 128/343621 [00:00<05:30, 1038.16it/s]

0.829 (0.837) dev acc, 0.879 train acc, 0.0000 drop


343680it [05:18, 1371.61it/s]                            
  0%|          | 0/343621 [00:00<?, ?it/s]

0.830 (0.837) dev acc, 0.882 train acc, 0.0000 drop


343680it [05:11, 1102.64it/s]                            
  0%|          | 128/343621 [00:00<04:30, 1269.51it/s]

0.826 (0.837) dev acc, 0.884 train acc, 0.0000 drop


343680it [05:29, 1042.83it/s]                            
  0%|          | 128/343621 [00:00<04:36, 1242.73it/s]

0.828 (0.836) dev acc, 0.887 train acc, 0.0000 drop


343680it [05:29, 1043.37it/s]                            
  0%|          | 128/343621 [00:00<05:26, 1051.34it/s]

0.828 (0.834) dev acc, 0.888 train acc, 0.0000 drop


343680it [05:59, 960.55it/s]                             
  0%|          | 128/343621 [00:00<04:36, 1240.38it/s]

0.829 (0.831) dev acc, 0.891 train acc, 0.0000 drop


343680it [04:12, 1455.04it/s]                            
  0%|          | 0/343621 [00:00<?, ?it/s]

0.830 (0.827) dev acc, 0.892 train acc, 0.0000 drop


343680it [04:13, 1433.68it/s]                            
  0%|          | 0/343621 [00:00<?, ?it/s]

0.825 (0.824) dev acc, 0.894 train acc, 0.0000 drop


343680it [04:14, 1352.93it/s]                            
  0%|          | 128/343621 [00:00<04:37, 1237.63it/s]

0.828 (0.819) dev acc, 0.895 train acc, 0.0000 drop


343680it [04:13, 1421.50it/s]                            
  0%|          | 0/343621 [00:00<?, ?it/s]

0.826 (0.816) dev acc, 0.897 train acc, 0.0000 drop


343680it [04:15, 1342.44it/s]                            
  0%|          | 0/343621 [00:00<?, ?it/s]

0.826 (0.811) dev acc, 0.897 train acc, 0.0000 drop


343680it [04:15, 1339.22it/s]                            
  0%|          | 128/343621 [00:00<04:28, 1278.69it/s]

0.826 (0.806) dev acc, 0.898 train acc, 0.0000 drop


343680it [04:13, 1354.20it/s]                            
  0%|          | 0/343621 [00:00<?, ?it/s]

0.827 (0.801) dev acc, 0.899 train acc, 0.0000 drop


343680it [04:14, 1348.03it/s]                            
  0%|          | 0/343621 [00:00<?, ?it/s]

0.825 (0.794) dev acc, 0.899 train acc, 0.0000 drop


343680it [04:14, 1412.51it/s]                            
  0%|          | 0/343621 [00:00<?, ?it/s]

0.823 (0.792) dev acc, 0.899 train acc, 0.0000 drop


343680it [04:09, 1379.70it/s]                            
  0%|          | 0/343621 [00:00<?, ?it/s]

0.823 (0.791) dev acc, 0.899 train acc, 0.0000 drop


343680it [04:14, 1349.09it/s]                            


0.825 (0.792) dev acc, 0.899 train acc, 0.0000 drop
Saving to siamese_1sttry
