In [1]:
from collections import Counter
from sklearn.model_selection import KFold
import numpy as np
from scipy.sparse import csr_matrix
import re
import os
import shutil

from IPython.display import clear_output
from collections import defaultdict

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

from keras.models import Sequential, Model
from keras.layers import Embedding, LSTM, Bidirectional, Dense, Input, Masking, Lambda
import keras.backend as K
from keras.optimizers import Adam
from keras.callbacks import LambdaCallback, LearningRateScheduler, Callback

Using TensorFlow backend.


In [2]:
def get_max_lens(filename):
    max_query_len, max_document_len = -1, -1
    with open(filename) as handler:
        for line in handler:
            query, document = line.split("\t")
            query_len = len(query.split())
            document_len = len(document.split())
            if query_len > max_query_len:
                max_qu
                ery_len = query_len
            if document_len > max_document_len:
                max_document_len = document_len
    return max_query_len, max_document_len

def cycle_file(filename):
    while True:
        with open(filename) as f:
            yield from f

In [3]:
#max_query_len, max_document_len = get_max_lens(TRAIN_DATA)
max_document_len, max_query_len = (35840, 1475)

In [4]:
BATCH_SIZE = 32
DATA_SIZE = 500000
QUERY_DICT_SIZE = 247074
DOCUMENT_DICT_SIZE = 583954
ACTIVATION = 'relu'
HIDDEN_DIM = 256

In [5]:
def make_seq_batches_generator(filename, batch_size, max_query_len, max_document_len):
    with open(filename) as handler:
        while True:
            query_batch = np.zeros([batch_size, max_query_len])
            docment_batch = np.zeros([batch_size, max_document_len])
            for i in range(batch_size):
                line = next(handler)
                query, document = line.split("\t")
                query = list(map(int, query.split()))
                document = list(map(int, document.split()))
                query_batch[i, :len(query)] = query
                docment_batch[i, :len(document)] = document
            yield query_batch, docment_batch

def make_non_seq_batches_generator(filename, batch_size):
    with open(filename) as handler:
        while True:
            query_batch = np.zeros([batch_size, QUERY_DICT_SIZE])
            docment_batch = np.zeros([batch_size, DOCUMENT_DICT_SIZE])
            for i in range(batch_size):
                line = next(handler)
                query, document = line.split("\t")
                query = list(map(int, query.split()))
                document = list(map(int, document.split()))
                for word in query:
                    query_batch[i, word] +=1
                for word in document:
                    docment_batch[i, word] +=1
            yield query_batch, docment_batch

def make_data_generator(positive_generator, negative_generator):
    while True:
        positive_query_batch, positive_docment_batch = next(positive_generator)
        negative_query_batch, negative_docment_batch = next(negative_generator)
        query_input = np.concatenate([positive_query_batch, negative_query_batch], axis=0)
        document_input = np.concatenate([positive_docment_batch, negative_docment_batch], axis=0)
        labels = np.concatenate(
            [np.ones(len(positive_query_batch)), -1 * np.ones(len(negative_query_batch))
        ]).reshape([-1, 1])
        yield (
            {'query_input': query_input, 'document_input': document_input},
            {'output': labels}
        )

def make_sparse_data_generator(positive_filename, negative_filename, batch_size):
    with open(positive_filename) as positive_handler, open(negative_filename) as negative_handler:
        while True:
            query_indices_axis_0, document_indices_axis_0 = [], []
            query_indices_axis_1, document_indices_axis_1 = [], []
            query_values, document_values = [], []
            for handler in [positive_handler, negative_handler]:
                for i in range(batch_size):
                    line = next(handler)
                    query, document = line.split("\t")
                    query_words = Counter(map(int, query.split()))
                    document_words = Counter(map(int, document.split()))

                    for word in query_words:
                        query_indices_axis_0.append(i)
                        query_indices_axis_1.append(word)
                        query_values.append(query_words[word])
                    for word in document_words:
                        document_indices_axis_0.append(i)
                        document_indices_axis_1.append(word)
                        document_values.append(document_words[word])
                    
            query_batch = csr_matrix(
                (query_values, (query_indices_axis_0, query_indices_axis_1)),
                shape=(BATCH_SIZE * 2, QUERY_DICT_SIZE)
            )
            docment_batch = csr_matrix(
                (document_values, (document_indices_axis_0, document_indices_axis_1)),
                shape=(BATCH_SIZE * 2, DOCUMENT_DICT_SIZE)
            )
            labels = np.concatenate([np.ones(batch_size), -1 * np.ones(batch_size)]).reshape([-1, 1])
            yield (
                {'query_input': query_batch, 'document_input': docment_batch},
                {'output': labels}
            )

In [6]:
def my_cosine_proximity(y_true, y_pred):
    return -K.mean(y_pred * y_true)

def mean_positive_score(y_true, y_pred):
    filter_mult = (y_true + 1) / 2
    return K.mean(y_pred * filter_mult)

def mean_positive_var(y_true, y_pred):
    mean_positive = mean_positive_score(y_true, y_pred)
    filter_mult = (y_true + 1) / 2
    return K.mean((y_pred * filter_mult - mean_positive) ** 2)

def get_pred(y_true, y_pred):
    mean_positive = mean_positive_score(y_true, y_pred)
    mean_negative = mean_negative_score(y_true, y_pred)
    
    threshold = (mean_positive + mean_negative) / 2
    
    positive_mult = (y_true + 1) / 2
    negative_mult = (1 - y_true) / 2

    return K.mean((y_pred * filter_mult - mean_positive) ** 2)

def mean_negative_score(y_true, y_pred):
    filter_mult = (1 - y_true) / 2
    return K.mean(y_pred * filter_mult)

def mean_negative_var(y_true, y_pred):
    mean_negative = mean_negative_score(y_true, y_pred)
    filter_mult = (1 - y_true) / 2
    return K.mean((y_pred * filter_mult - mean_negative) ** 2)

def normalize(embedding):
    return K.l2_normalize(embedding, axis=-1)

def dot_product(embeddings):
    return K.sum(embeddings[0] * embeddings[1], axis=-1)

def reshape_to_prediction(score):
    return K.reshape(score, (-1, 1))

def loss(y_true, y_pred):
    alpha = 1
    return (
        #my_cosine_proximity(y_true, y_pred) +
        alpha * (1 + mean_negative_score(y_true, y_pred)) ** 2 +
        alpha * (1 - mean_positive_score(y_true, y_pred)) ** 2
    )

In [7]:
def make_lstm_model(query_dict_size, document_dict_size, hidden_layers_num, activation, hidden_dim, lstm_num):
    query_input = Input(shape=(max_query_len,), sparse=True, name="query_input")  # shape: (BATCH_SIZE, max_query_len)
    query_masked = Masking(mask_value=0)(query_input)
    # shape: (BATCH_SIZE, max_query_len, hidden_dim)
    query_encoded = Embedding(query_dict_size, hidden_dim)(query_masked)
    for i in range(lstm_num - 1):
        # shape: (BATCH_SIZE, hidden_dim)
        query_encoded = Bidirectional(LSTM(hidden_dim, return_sequences=True))(query_encoded)
    query_hidden = Bidirectional(LSTM(hidden_dim))(query_encoded)  # shape: (BATCH_SIZE, hidden_dim)
    for i in range(hidden_layers_num):
        query_hidden = Dense(hidden_dim, activation=activation)(query_hidden)  # shape: (BATCH_SIZE, hidden_dim)
    query_embedding = Lambda(normalize)(query_hidden)  # shape: (BATCH_SIZE, hidden_dim)

    # shape: (BATCH_SIZE, max_document_len)
    document_input = Input(shape=(max_document_len,), sparse=True, name="document_input")
    document_masked = Masking(mask_value=0)(document_input)
    # shape: (BATCH_SIZE, max_document_len, hidden_dim)
    document_encoded = Embedding(document_dict_size, hidden_dim)(document_masked)
    for i in range(lstm_num - 1):
        # shape: (BATCH_SIZE, hidden_dim)
        document_encoded = Bidirectional(LSTM(hidden_dim, return_sequences=True))(document_encoded)
    document_hidden = Bidirectional(LSTM(hidden_dim))(document_encoded)  # shape: (BATCH_SIZE, hidden_dim)
    for i in range(hidden_layers_num):
        document_hidden = Dense(hidden_dim, activation=activation)(document_hidden)  # shape: (BATCH_SIZE, hidden_dim)
    label_embedding = Lambda(normalize)(document_hidden)  # shape: (BATCH_SIZE, hidden_dim)

    score = Lambda(dot_product)([label_embedding, query_embedding])
    prediction = Lambda(reshape_to_prediction, name="output")(score)

    model = Model(inputs=[document_input, query_input], outputs=prediction)
    model.compile(
        Adam(),
        loss=my_cosine_proximity,
        metrics=[mean_positive_score, mean_negative_score, mean_positive_var, mean_negative_var, 'acc']
    )
    return model

In [8]:
def make_dense_model(query_dict_size, document_dict_size, hidden_layers_num, activation, hidden_dim, sparse=False):
    # shape: (BATCH_SIZE, QUERY_DICT_SIZE)
    query_input = Input(shape=(query_dict_size,), sparse=sparse, name="query_input")
    query_hidden = Dense(hidden_dim, activation=activation)(query_input)  # shape: (BATCH_SIZE, hidden_dim)
    for i in range(hidden_layers_num - 1):
        query_hidden = Dense(hidden_dim, activation=activation)(query_hidden)  # shape: (BATCH_SIZE, hidden_dim)
    query_embedding = Lambda(normalize)(query_hidden)  # shape: (BATCH_SIZE, hidden_dim)

    # shape: (BATCH_SIZE, document_dict_size)
    document_input = Input(shape=(document_dict_size,), sparse=sparse, name="document_input")
    document_hidden = Dense(hidden_dim, activation=activation)(document_input)  # shape: (BATCH_SIZE, hidden_dim)
    for i in range(hidden_layers_num - 1):
        document_hidden = Dense(hidden_dim, activation=activation)(document_hidden)  # shape: (BATCH_SIZE, hidden_dim)
    label_embedding = Lambda(normalize)(document_hidden)  # shape: (BATCH_SIZE, hidden_dim)

    score = Lambda(dot_product)([label_embedding, query_embedding])
    prediction = Lambda(reshape_to_prediction, name="output")(score)

    model = Model(inputs=[document_input, query_input], outputs=prediction)
    model.compile(
        Adam(),
        loss=my_cosine_proximity,
        metrics=[mean_positive_score, mean_negative_score, mean_positive_var, mean_negative_var, 'acc']
    )
    return model

In [9]:
dense_model = make_dense_model(
    QUERY_DICT_SIZE,
    DOCUMENT_DICT_SIZE,
    2,
    ACTIVATION,
    HIDDEN_DIM,
    True
)

In [None]:
history = dense_model.fit_generator(
    make_sparse_data_generator("positive_train_data_35K.tsv", "negative_train_data_35K.tsv", BATCH_SIZE),
    steps_per_epoch=int(DATA_SIZE / (BATCH_SIZE * 2)),
    epochs=10,
    verbose=1,
    initial_epoch=0
)

Epoch 1/10
  41/7812 [..............................] - ETA: 40:01 - loss: -0.4400 - mean_positive_score: 0.4489 - mean_negative_score: 0.0088 - mean_positive_var: 0.2068 - mean_negative_var: 0.0021 - acc: 0.4855

In [None]:
positive_gen = make_non_seq_batches_generator("positive_train_data_35K.tsv", 1)
negative_gen = make_non_seq_batches_generator("negative_train_data_35K.tsv", 1)

In [None]:
p_q, p_d = next(positive_gen)
n_q, n_d = next(negative_gen)
(
    dense_model.predict({"document_input": p_d, "query_input": p_q})[0, 0],
    dense_model.predict({"document_input": n_d, "query_input": n_q})[0, 0]
)

In [None]:
p_q, p_d = next(positive_gen)

In [None]:
sum(p_q.toarray()[0])

In [None]:
gen = make_sparse_data_generator("positive_train_data_35K.tsv", "negative_train_data_35K.tsv", BATCH_SIZE)

In [None]:
x = next(gen)

In [None]:
x