In [1]:
from collections import Counter
from sklearn.model_selection import KFold
import numpy as np
import re
import os
import shutil

from urllib.parse import urlparse
from urllib.parse import urldefrag
from urllib.request import urlopen
from file_storage import FileStorage
from urllib.parse import urljoin
from IPython.display import clear_output
from collections import defaultdict

from inscriptis import get_text

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

from keras.models import Sequential, Model
from keras.layers import Embedding, LSTM, Bidirectional, Dense, Input, Masking, Lambda
import keras.backend as K
from keras.optimizers import Adam
from keras.callbacks import LambdaCallback, LearningRateScheduler, Callback

Using TensorFlow backend.


In [2]:
def get_max_lens(filename):
    max_query_len, max_document_len = -1, -1
    with open(filename) as handler:
        for line in handler:
            query, document = line.split("\t")
            query_len = len(query.split())
            document_len = len(document.split())
            if query_len > max_query_len:
                max_query_len = query_len
            if document_len > max_document_len:
                max_document_len = document_len
    return max_query_len, max_document_len

def cycle_file(filename):
    while True:
        with open(filename) as f:
            yield from f

In [3]:
#max_query_len, max_document_len = get_max_lens(TRAIN_DATA)
max_document_len, max_query_len = (35840, 1475)

In [4]:
BATCH_SIZE = 32
DATA_SIZE = 500000
QUERY_DICT_SIZE = 247074
DOCUMENT_DICT_SIZE = 583954
ACTIVATION = 'relu'
HIDDEN_DIM = 512

In [5]:
def make_seq_batches_generator(filename, batch_size, max_query_len, max_document_len):
    with open(filename) as handler:
        while True:
            query_batch = np.zeros([batch_size, max_query_len])
            docment_batch = np.zeros([batch_size, max_document_len])
            for i in range(batch_size):
                line = next(handler)
                query, document = line.split("\t")
                query = list(map(int, query.split()))
                document = list(map(int, document.split()))
                query_batch[i, :len(query)] = query
                docment_batch[i, :len(document)] = document
            yield query_batch, docment_batch

def make_non_seq_batches_generator(filename, batch_size):
    with open(filename) as handler:
        while True:
            query_batch = np.zeros([batch_size, QUERY_DICT_SIZE])
            docment_batch = np.zeros([batch_size, DOCUMENT_DICT_SIZE])
            for i in range(batch_size):
                line = next(handler)
                query, document = line.split("\t")
                query = list(map(int, query.split()))
                document = list(map(int, document.split()))
                for word in query:
                    query_batch[i, word] +=1
                for word in document:
                    docment_batch[i, word] +=1
            yield query_batch, docment_batch

def make_data_generator(positive_generator, negative_generator):
    while True:
        positive_query_batch, positive_docment_batch = next(positive_generator)
        negative_query_batch, negative_docment_batch = next(negative_generator)
        query_input = np.concatenate([positive_query_batch, negative_query_batch], axis=0)
        document_input = np.concatenate([positive_docment_batch, negative_docment_batch], axis=0)
        labels = np.concatenate(
            [np.ones(len(positive_query_batch)), -1 * np.ones(len(negative_query_batch))
        ]).reshape([-1, 1])
        yield (
            {'query_input': query_input, 'document_input': document_input},
            {'output': labels}
        )

In [6]:
def my_cosine_proximity(y_true, y_pred):
    return -K.mean(y_pred * y_true)

def mean_positive_score(y_true, y_pred):
    filter_mult = (y_true + 1) / 2
    return K.mean(y_pred * filter_mult)

def mean_positive_var(y_true, y_pred):
    mean_positive = mean_positive_score(y_true, y_pred)
    filter_mult = (y_true + 1) / 2
    return K.mean((y_pred * filter_mult - mean_positive) ** 2)

def get_pred(y_true, y_pred):
    mean_positive = mean_positive_score(y_true, y_pred)
    mean_negative = mean_negative_score(y_true, y_pred)
    
    threshold = (mean_positive + mean_negative) / 2
    
    positive_mult = (y_true + 1) / 2
    negative_mult = (1 - y_true) / 2

    return K.mean((y_pred * filter_mult - mean_positive) ** 2)

def mean_negative_score(y_true, y_pred):
    filter_mult = (1 - y_true) / 2
    return K.mean(y_pred * filter_mult)

def mean_negative_var(y_true, y_pred):
    mean_negative = mean_negative_score(y_true, y_pred)
    filter_mult = (1 - y_true) / 2
    return K.mean((y_pred * filter_mult - mean_negative) ** 2)

def normalize(embedding):
    return K.l2_normalize(embedding, axis=-1)

def dot_product(embeddings):
    return K.sum(embeddings[0] * embeddings[1], axis=-1)

def reshape_to_prediction(score):
    return K.reshape(score, (-1, 1))

def loss(y_true, y_pred):
    alpha = 1
    return (
        #my_cosine_proximity(y_true, y_pred) +
        alpha * (1 + mean_negative_score(y_true, y_pred)) ** 2 +
        alpha * (1 - mean_positive_score(y_true, y_pred)) ** 2
    )

In [7]:
def make_lstm_model(query_dict_size, document_dict_size, hidden_layers_num, activation, hidden_dim, lstm_num):
    query_input = Input(shape=(max_query_len,), name="query_input")  # shape: (BATCH_SIZE, max_query_len)
    query_masked = Masking(mask_value=0)(query_input)
    # shape: (BATCH_SIZE, max_query_len, hidden_dim)
    query_encoded = Embedding(query_dict_size, hidden_dim)(query_masked)
    for i in range(lstm_num - 1):
        # shape: (BATCH_SIZE, hidden_dim)
        query_encoded = Bidirectional(LSTM(hidden_dim, return_sequences=True))(query_encoded)
    query_hidden = Bidirectional(LSTM(hidden_dim))(query_encoded)  # shape: (BATCH_SIZE, hidden_dim)
    for i in range(hidden_layers_num):
        query_hidden = Dense(hidden_dim, activation=activation)(query_hidden)  # shape: (BATCH_SIZE, hidden_dim)
    query_embedding = Lambda(normalize)(query_hidden)  # shape: (BATCH_SIZE, hidden_dim)

    document_input = Input(shape=(max_document_len,), name="document_input")  # shape: (BATCH_SIZE, max_document_len)
    document_masked = Masking(mask_value=0)(document_input)
    # shape: (BATCH_SIZE, max_document_len, hidden_dim)
    document_encoded = Embedding(document_dict_size, hidden_dim)(document_masked)
    for i in range(lstm_num - 1):
        # shape: (BATCH_SIZE, hidden_dim)
        document_encoded = Bidirectional(LSTM(hidden_dim, return_sequences=True))(document_encoded)
    document_hidden = Bidirectional(LSTM(hidden_dim))(document_encoded)  # shape: (BATCH_SIZE, hidden_dim)
    for i in range(hidden_layers_num):
        document_hidden = Dense(hidden_dim, activation=activation)(document_hidden)  # shape: (BATCH_SIZE, hidden_dim)
    label_embedding = Lambda(normalize)(document_hidden)  # shape: (BATCH_SIZE, hidden_dim)

    score = Lambda(dot_product)([label_embedding, query_embedding])
    prediction = Lambda(reshape_to_prediction, name="output")(score)

    model = Model(inputs=[document_input, query_input], outputs=prediction)
    model.compile(
        Adam(),
        loss=my_cosine_proximity,
        metrics=[mean_positive_score, mean_negative_score, mean_positive_var, mean_negative_var, 'acc']
    )
    return model

In [8]:
def make_dense_model(query_dict_size, document_dict_size, hidden_layers_num, activation, hidden_dim):
    query_input = Input(shape=(query_dict_size,), name="query_input")  # shape: (BATCH_SIZE, QUERY_DICT_SIZE)
    query_hidden = Dense(hidden_dim, activation=activation)(query_input)  # shape: (BATCH_SIZE, hidden_dim)
    for i in range(hidden_layers_num - 1):
        query_hidden = Dense(hidden_dim, activation=activation)(query_hidden)  # shape: (BATCH_SIZE, hidden_dim)
    query_embedding = Lambda(normalize)(query_hidden)  # shape: (BATCH_SIZE, hidden_dim)

    # shape: (BATCH_SIZE, document_dict_size)
    document_input = Input(shape=(document_dict_size,), name="document_input")
    document_hidden = Dense(hidden_dim, activation=activation)(document_input)  # shape: (BATCH_SIZE, hidden_dim)
    for i in range(hidden_layers_num - 1):
        document_hidden = Dense(hidden_dim, activation=activation)(document_hidden)  # shape: (BATCH_SIZE, hidden_dim)
    label_embedding = Lambda(normalize)(document_hidden)  # shape: (BATCH_SIZE, hidden_dim)

    score = Lambda(dot_product)([label_embedding, query_embedding])
    prediction = Lambda(reshape_to_prediction, name="output")(score)

    model = Model(inputs=[document_input, query_input], outputs=prediction)
    model.compile(
        Adam(),
        loss=my_cosine_proximity,
        metrics=[mean_positive_score, mean_negative_score, mean_positive_var, mean_negative_var, 'acc']
    )
    return model

In [9]:
dense_model = make_dense_model(
    QUERY_DICT_SIZE,
    DOCUMENT_DICT_SIZE,
    2,
    ACTIVATION,
    HIDDEN_DIM
)

Instructions for updating:
Colocations handled automatically by placer.


In [None]:
history = dense_model.fit_generator(
    make_data_generator(
        make_non_seq_batches_generator("positive_train_data_35K.tsv", BATCH_SIZE),
        make_non_seq_batches_generator("negative_train_data_35K.tsv", BATCH_SIZE)
    ),
    steps_per_epoch=int(DATA_SIZE / (BATCH_SIZE * 2)),
    epochs=10,
    verbose=1,
    initial_epoch=0
)

Instructions for updating:
Use tf.cast instead.
Epoch 1/10
 196/7812 [..............................] - ETA: 16:12:17 - loss: -0.0140 - mean_positive_score: 0.1970 - mean_negative_score: 0.1829 - mean_positive_var: 0.0664 - mean_negative_var: 0.0590 - acc: 0.1555