In [25]:
import itertools
import os
import re
import sys

import pandas as pd
import numpy as np
import scipy as sp

from sharknado.scipy.lib.data_manipulation import shark
from sharknado.scipy.transformers import signature
from sklearn.externals import joblib

%matplotlib inline
from IPython.core.pylabtools import figsize
from IPython.display import clear_output
from seaborn import plt

# Notebook display limits: render up to 999 columns and 100 rows of a frame.
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 100)

# Hash Learning

In [115]:
# Raw inputs live in "<parent of the working directory>/data/raw".
raw_data_dir = os.path.join(os.path.dirname(os.getcwd()), "data", "raw")
# NOTE(review): joblib.load unpickles the file, so only load schemas.dat from a
# trusted source. Later cells read `schemas.shape[0]`, index it with integer
# arrays and call `.replace("_____", ...)` on its items, so it is presumably a
# 1-D NumPy array of template strings with a "_____" merchant placeholder --
# TODO confirm.
schemas = joblib.load(os.path.join(raw_data_dir, "schemas.dat"))

In [213]:
# Merchant names, one per line, read from
# "<parent of the working directory>/references/rare_merchants.txt".
ref_dir = os.path.join(os.path.dirname(os.getcwd()), "references")
with open(os.path.join(ref_dir, "rare_merchants.txt"), "r") as f:
    # Iterating the handle yields the same lines as readlines(); each line has
    # its surrounding whitespace (including the newline) stripped.
    merchants = np.array([raw_line.strip() for raw_line in f])

In [214]:
from neural_merchant_scrubbing.src.models import hash_net as hn
from neural_merchant_scrubbing.src.features import build_features as bf

In [215]:
# Re-import hash_net so edits to the module are picked up without restarting
# the kernel (`reload` is the Python 2 builtin).
reload(hn)
# Build the trainer model and the standalone encoder it wraps. The second
# positional argument (125) matches the encoder input width shown in the layer
# summary printed by the next cell; the first (52) is presumably the character
# vocabulary size -- TODO confirm against hn.build_triplet_encoder.
trainer, encoder = hn.build_triplet_encoder(
    52, 125,
    encoder_fn=hn.dense_encoder,
    dist_fn=hn.tanh_hammingoid)

In [216]:
# One line per encoder layer: name centred in 25 columns, then
# "input shape -> output shape".
for layer in encoder.layers:
    print("{0} {1} -> {2}".format(
        layer.name.center(25), layer.input_shape, layer.output_shape))

         input_70         (None, 125) -> (None, 125)
       embedding_26       (None, 125) -> (None, 125, 32)
     convolution1d_91     (None, 125, 32) -> (None, 125, 64)
     convolution1d_92     (None, 125, 32) -> (None, 125, 64)
        merge_102         [(None, 125, 64), (None, 125, 64)] -> (None, 125, 128)
      leakyrelu_107       (None, 125, 128) -> (None, 125, 128)
  globalmaxpooling1d_46   (None, 125, 128) -> (None, 128)
         dense_38         (None, 128) -> (None, 128)
      leakyrelu_108       (None, 128) -> (None, 128)
         dense_39         (None, 128) -> (None, 32)
        reshape_52        (None, 32) -> (None, 32, 1)
  binaryencoderlayer_17   (None, 32, 1) -> (None, 32)


In [217]:
# Character-level vectorizer from build_features; later cells call
# cv.transform(list_of_strings).toarray() to obtain the model inputs.
cv = bf.CharacterVectorizer()

In [218]:
import re

# Punctuation characters that permute_merchant may delete or turn into spaces.
PUNCT = re.compile(r"[:/.@*`'-]")
# Runs of two or more spaces, collapsed to one after punctuation is replaced.
SPACE = re.compile(r" {2,}")

def permute_merchant(m):
    """Return a randomly perturbed copy of the merchant name ``m``.

    Up to three perturbations are applied in order, each decided by a draw
    from the global ``np.random`` state:

    * 10% of the time, a name with 3+ words loses its last word.
    * 20% of the time, 1-5 trailing characters are cut off and the result is
      stripped. The cut is skipped when it would leave an empty string, so a
      non-empty name never becomes empty.
    * 10% of the time, every ``PUNCT`` character is replaced -- by nothing or
      by a single space, chosen 50/50 -- and runs of spaces are collapsed.

    Parameters
    ----------
    m : str
        Merchant name to perturb.

    Returns
    -------
    str
        The perturbed name (possibly identical to ``m``).
    """
    words = m.split()
    # The random draw only happens for names with 3+ words (short-circuit).
    if len(words) > 2 and np.random.random() < 0.1:
        m = " ".join(words[:-1])
    if np.random.random() < 0.2:
        # randint's upper bound is exclusive, so 1-5 characters are removed.
        # The length is always drawn so RNG consumption does not depend on m.
        truncated = m[:-np.random.randint(1, 6)].strip()
        if truncated:
            m = truncated
    if np.random.random() < 0.1:
        replacer = "" if np.random.random() < 0.5 else " "
        m = SPACE.sub(" ", PUNCT.sub(replacer, m))
    return m

def batch_generator(batch_size, schemas, merchants, cv):
    """Yield an endless stream of ``(inputs, targets)`` triplet batches.

    Every batch consumes ``2 * batch_size`` merchants and ``3 * batch_size``
    schemas. The first ``batch_size`` merchants are perturbed twice with
    ``permute_merchant`` (two variants of the same merchants); the remaining
    ``batch_size`` merchants are perturbed once. Each perturbed merchant
    replaces the ``"_____"`` placeholder of its own schema string.

    Parameters
    ----------
    batch_size : int
        Number of triplets per batch.
    schemas : array of str
        Template strings; needs at least ``3 * batch_size`` entries.
    merchants : array of str
        Merchant names; needs at least ``2 * batch_size`` entries.
    cv : object
        Vectorizer; ``cv.transform(list_of_str).toarray()`` produces each
        model input.

    Yields
    ------
    tuple
        ``([x, y, z], np.zeros(batch_size))`` where ``x`` and ``y`` are built
        from the same merchants and ``z`` from different ones.

    Raises
    ------
    ValueError
        If there are too few schemas or merchants to fill one batch. Because
        this is a generator, the error surfaces on the first ``next()`` call.
    """
    schm_step = 3 * batch_size
    mrch_step = 2 * batch_size
    schm_size = schemas.shape[0]
    mrch_size = merchants.shape[0]

    # Without these checks a short schema array fails later with an opaque
    # reshape error, and a short merchant array silently yields ragged
    # triplets because zip() truncates the third input.
    if schm_size < schm_step:
        raise ValueError(
            "need at least {0} schemas for batch_size={1}, got {2}".format(
                schm_step, batch_size, schm_size))
    if mrch_size < mrch_step:
        raise ValueError(
            "need at least {0} merchants for batch_size={1}, got {2}".format(
                mrch_step, batch_size, mrch_size))

    schm_cursor = 0
    schm_ixs = np.arange(schm_size)
    mrch_cursor = 0
    mrch_ixs = np.arange(mrch_size)
    while True:
        # Wrap to the start when the next window would reach or pass the end
        # of the index array, and reshuffle at the start of every pass.
        if schm_cursor + schm_step >= schm_size:
            schm_cursor = 0
        if schm_cursor == 0:
            np.random.shuffle(schm_ixs)

        if mrch_cursor + mrch_step >= mrch_size:
            mrch_cursor = 0
        if mrch_cursor == 0:
            np.random.shuffle(mrch_ixs)

        batch_merchants = merchants[mrch_ixs[mrch_cursor:mrch_cursor + mrch_step]]
        # x and y are independent perturbations of the same merchants; z uses
        # the other half of the merchant window.
        x_m = [permute_merchant(m) for m in batch_merchants[:batch_size]]
        y_m = [permute_merchant(m) for m in batch_merchants[:batch_size]]
        z_m = [permute_merchant(m) for m in batch_merchants[batch_size:]]

        batch_schemas = schemas[schm_ixs[schm_cursor:schm_cursor + schm_step]]
        x_s, y_s, z_s = batch_schemas.reshape((3, batch_size))

        # Drop each merchant into the placeholder of its own schema.
        x_ms = [s.replace("_____", m) for m, s in zip(x_m, x_s)]
        y_ms = [s.replace("_____", m) for m, s in zip(y_m, y_s)]
        z_ms = [s.replace("_____", m) for m, s in zip(z_m, z_s)]

        yield (
            [cv.transform(ar).toarray() for ar in (x_ms, y_ms, z_ms)],
            np.zeros(batch_size))
        schm_cursor += schm_step
        mrch_cursor += mrch_step

In [219]:
# Hold out 1000 randomly chosen schemas for validation; the rest are used for
# training.
# NOTE(review): no random seed is set in the visible cells, so this split (and
# everything downstream) differs from run to run.
schema_ixs = np.arange(schemas.shape[0])
np.random.shuffle(schema_ixs)
schema_test = schemas[schema_ixs[:1000]]
schema_train = schemas[schema_ixs[1000:]]

# Same 1000-item hold-out for the merchant names.
merch_ixs = np.arange(merchants.shape[0])
np.random.shuffle(merch_ixs)
merch_test = merchants[merch_ixs[:1000]]
merch_train = merchants[merch_ixs[1000:]]

# Endless triplet generators with a batch size of 32; test_gen is passed to
# fit_generator as validation data.
train_gen = batch_generator(32, schema_train, merch_train, cv)
test_gen = batch_generator(32, schema_test, merch_test, cv)

In [220]:
# The generators supply all-zero targets, so with the "mae" loss training
# minimises the mean absolute value of the trainer's output.
trainer.compile(optimizer="adam", loss="mae")

In [221]:
# Train from the endless generators. With the generators' batch size of 32,
# samples_per_epoch=32 * 1000 is 1000 batches per epoch and
# nb_val_samples=32 * 100 is 100 validation batches.
# NOTE(review): samples_per_epoch / nb_epoch / nb_val_samples look like the
# Keras 1.x fit_generator argument names -- confirm the installed version.
trainer.fit_generator(
    train_gen,
    samples_per_epoch=32 * 1000,
    nb_epoch=5,
    verbose=1,
    validation_data=test_gen,
    nb_val_samples=32 * 100)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x131438490>

In [230]:
# Spot-check the trainer on one hand-written triplet. Each vectorized string
# is reshaped to a (1, 125) batch, and the order mirrors the generator's
# inputs: two descriptors of the same merchant, then a different merchant.
trainer.predict([ar.reshape(1, 125) for ar in cv.transform([
    "CHECKCARD 0913 MCDONALD'S #13156 LINDEN NJ 24231684257206988001017",
    "MCDONALDS 10/28",
    "JAVA ON FOUR"]).toarray()])

array([ 0.], dtype=float32)

In [242]:
# Encode three descriptors with the standalone encoder: two McDonald's
# variants and one unrelated merchant. preds[i] is the code for the i-th string.
preds = encoder.predict(cv.transform([
    "CHECKCARD 0913 MCDONALDS #13156 LINDEN NJ 24231684257206988001017",
    "POS PIN MCDONALDS",
    "Wendy's"]).toarray())

In [244]:
# Element-wise comparison of the codes for the two McDonald's descriptors.
# NOTE(review): an all-True result is only meaningful if preds[2] ("Wendy's")
# differs from them; compare against it as well to rule out an encoder that
# emits the same code for every input.
preds[0] == preds[1]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True], dtype=bool)