In [1]:
import pickle
import random
import time
import sys
import numpy as np
import tensorflow as tf
import scipy
from sklearn.feature_extraction.text import CountVectorizer
import smart_open

  from ._conv import register_converters as _register_converters


## Batch Parameters

In [2]:
batch_size = 1024
lower_border = 0
upper_border = 5
lower_n_gram = 3
upper_n_gram = 3

n_negative_pairs = 50

## Network Parameters

In [3]:
l1_units = 300
l2_units = 128
learning_rate = 0.1
max_epochs = 5

## Data Parameters

In [4]:
judgments = "judgments.txt"
threshold = 3.5

data_dir = ".\\data\\"

mooc_corpus = 'courses.cor'
mooc_names = 'courses.cfn'

rpd_names='docs.cfn'
rpd_corpus='docs.cor'

model_filename = time.strftime("%Y%m%d-%H%M%S")

## Data Preparation

In [5]:
def read_file(fname):
    with smart_open.smart_open(fname) as f:
        for i, line in enumerate(f):
            yield line.decode()

def filter_judgments_below_threshold(fname, threshold):
    filtered_judgments = []
    for line in read_file(fname):
        judgment = line
        query_name, protocol, document_name, value = judgment.split(':')
        document_name = protocol + ":" + document_name
        value = value[:-2]
        value = float(value.replace(',', '.'))
        if value > threshold:
            filtered_judgments.append((query_name, document_name, value))
    return filtered_judgments

def get_query_document_text_pairs(pairs, query_names, document_names):
    prepared_query_names = []
    prepared_document_names = []

    prepared_query_corpus = []
    prepared_document_corpus = []
    
    for (query_name, document_name, judgment) in pairs:
        if (query_name in query_names) and (document_name in document_names):
            prepared_query_names.append(query_name)
            prepared_document_names.append(document_name)
            prepared_query_corpus.append(query_corpus[query_names.index(query_name)])
            prepared_document_corpus.append(document_corpus[document_names.index(document_name)])
    
    return prepared_query_names, prepared_document_names, prepared_query_corpus, prepared_document_corpus

def feed_dict(query_data, document_data, batch_id):
    #query, document = prepare_batch(query_data, document_data, batch_id)
    #return {query_batch: query, document_batch: document}    
    query_i, query_v, document_i, document_v = prepare_batch(query_data, document_data, batch_id)
    return {query_indices: query_i, query_values: query_v, document_indices: document_i, document_values : document_v}  

In [6]:
document_corpus = [line[:-1] for line in (read_file(data_dir + mooc_corpus))]
document_names = [line[:-2] for line in (read_file(data_dir + mooc_names))]

query_corpus = [line[:-1] for line in (read_file(data_dir + rpd_corpus))]
query_names = [line[:-1] for line in (read_file(data_dir + rpd_names))]

print('Total documents in corpus: ', len(document_corpus))
print('Total queries in corpus: ', len(query_corpus))

Total documents in corpus:  1276
Total queries in corpus:  29


In [7]:
filtered_judgments = filter_judgments_below_threshold(data_dir + judgments, threshold)
print('Total pairs after filtering: ', len(filtered_judgments))

Total pairs after filtering:  278


In [8]:
prepared_query_names, prepared_document_names, prepared_query_corpus, prepared_document_corpus = \
    get_query_document_text_pairs(filtered_judgments, query_names, document_names)

print(len(prepared_query_corpus), len(prepared_document_corpus))

256 256


## Setting up input data

In [9]:
document_count = len(prepared_query_corpus)
train_test_ration = 0.75
train_count = round(train_test_ration * document_count)
test_count = document_count - train_count

vectorizer = CountVectorizer(analyzer='char', ngram_range=(lower_n_gram,upper_n_gram))
vectorizer.fit(prepared_query_corpus+prepared_document_corpus)

ngram_count = len(vectorizer.vocabulary_)

query_train = vectorizer.transform(prepared_query_corpus[train_count:]).tocsr()
document_train = vectorizer.transform(prepared_document_corpus[train_count:]).tocsr()

query_test = vectorizer.transform(prepared_document_corpus[:test_count]).tocsr()
document_test = vectorizer.transform(prepared_document_corpus[:test_count]).tocsr()

Random data generation:

ngram_count = 10000

document_count = 10*batch_size

train_count = document_count

test_count = document_count

dummy_documents = scipy.sparse.random(document_number, ngram_count, dtype=np.float32)

dummy_queries = scipy.sparse.random(document_number, ngram_count, dtype=np.float32)


## Input Layer

https://github.com/tensorflow/tensorflow/issues/342

So, apparently, that is what tensorflowers consider a good solution to their sparse_placeholder not being able to be fed into the network:

sp_indices = tf.placeholder(tf.int64)

sp_shape = tf.placeholder(tf.int64)

sp_ids_val = tf.placeholder(tf.int64)

sp_ids = tf.SparseTensor(sp_indices, sp_ids_val, sp_shape)

 ** \*facepalm\* **

In [10]:
shape = [batch_size, ngram_count]
query_shape = np.array([batch_size, ngram_count], np.int64)
document_shape = np.array([batch_size, ngram_count], np.int64)

#query_batch = tf.sparse_placeholder(tf.float32, 
#                                    shape=shape, 
#                                    name='QueryBatch')
#document_batch = tf.sparse_placeholder(tf.float32, 
#                                    shape=shape, 
#                                    name='DocumentBatch')

query_indices = tf.placeholder(tf.int64)
document_indices = tf.placeholder(tf.int64)

query_values = tf.placeholder(tf.float32)
document_values = tf.placeholder(tf.float32)

query_batch = tf.SparseTensor(query_indices, query_values, shape)
document_batch = tf.SparseTensor(document_indices, document_values, shape)

Transforming data into small sparse batches:

In [11]:
def pull_batch(query_data, document_data, batch_id):
    batch_start = batch_id * batch_size
    batch_end = max(batch_start + batch_id, query_data.shape[0])
    query_in = query_data[batch_start : batch_end, :]
    document_in = document_data[batch_start : batch_end, :]

    return query_in, document_in

def to_sparse_tensor(data):
    data = data.tocoo()   
    #data = tf.SparseTensorValue(
    #    np.transpose([np.array(data.row, dtype=np.int64), np.array(data.col, dtype=np.int64)]),
    #    np.array(data.data, dtype=np.float),
    #    np.array(data.shape, dtype=np.int64))
    #    return data
    data_indices = np.transpose([np.array(data.row, dtype=np.int64), np.array(data.col, dtype=np.int64)])
    data_values = np.array(data.data, dtype=np.float)
    return data_indices, data_values

def prepare_batch(query_data, document_data, batch_id):
    query, document = pull_batch(query_data, document_data, batch_id)
    #sparse_query_batch = to_sparse_tensor(query)
    #sparse_document_batch = to_sparse_tensor(document)
    #return sparse_query_batch, sparse_document_batch
    query_indices, query_values = to_sparse_tensor(query)
    document_indices, document_values = to_sparse_tensor(document)
    return query_indices, query_values, document_indices, document_values

## Layer Operations

In [12]:
def get_weight_range(n_input_units, n_output_units):
    border = np.sqrt(6.0 / (n_input_units + n_output_units))
    return (-border, border)

def init_shape_randomly(shape, value_range):
    return tf.Variable(tf.random_uniform(shape, value_range[0], value_range[1]))

def get_weights_and_bias(n_input_units, n_output_units):
    value_range = get_weight_range(n_input_units, n_output_units)
    weights = init_shape_randomly([n_input_units, n_output_units], value_range)
    bias = init_shape_randomly([n_output_units], value_range)
    return weights, bias

def get_layer_out_values(data, weights, bias, sparse_layer=False):
    layer_function = tf.sparse_tensor_dense_matmul if sparse_layer else tf.matmul
    data_in = layer_function(data, weights) + bias
    data_out = tf.nn.relu(data_in)
    return data_out

def apply_layer(n_input_units, n_output_units, query, document, sparse_layer=False):
    weight, bias = get_weights_and_bias(n_input_units, n_output_units)
    query_out = get_layer_out_values(query, weight, bias, sparse_layer=sparse_layer)
    document_out = get_layer_out_values(document, weight, bias, sparse_layer=sparse_layer)
    return query_out, document_out

## First Layer

In [13]:
#Sparse here, since input data is sparse
query1_out, document1_out = apply_layer(ngram_count, l1_units, query_batch, document_batch, sparse_layer=True)

## Second Layer

In [14]:
#Regular dense operations, since the output of the first layer is dense
query2_out, document2_out = apply_layer(l1_units, l2_units, query1_out, document1_out)

## >Additional layers would go here<

## Output from the final layer

In [15]:
query_final = query2_out
document_final = document2_out

## Generating negative pairs

This should be redone with proper shuffling, not whatever this is:

In [16]:
def duplicate_and_shuffle_document_vectors(n_negative_pairs, document_batch):
    document_batch_copy = tf.tile(document_batch, [1, 1])
    batch_size = document_batch.get_shape().as_list()[0]
    enlarged_document_batch = document_batch
    for i in range(n_negative_pairs):
        rand = int((random.random() + i) * batch_size / n_negative_pairs)
        enlarged_document_batch = tf.concat([enlarged_document_batch,
                           tf.slice(document_batch_copy, [rand, 0], [batch_size - rand, -1]),
                           tf.slice(document_batch_copy, [0, 0], [rand, -1])],
                          0)
    return enlarged_document_batch

def duplicate_query_vectors(n_negative_pairs, query_batch):
    return tf.tile(query_batch, [n_negative_pairs + 1, 1])

def calculate_vector_norms(vectors):
    return tf.sqrt(tf.reduce_sum(tf.square(vectors), 1, True))

def prepare_negative_pairs(n_negative_pairs, query_batch, document_batch):
    query = duplicate_query_vectors(n_negative_pairs, query_batch)
    document = duplicate_and_shuffle_document_vectors(n_negative_pairs, document_batch)
    
    query_norms = duplicate_query_vectors(n_negative_pairs, calculate_vector_norms(query_batch))
    document_norms = calculate_vector_norms(document)
    
    return query, document, query_norms, document_norms

This also could be done better, if shuffling ever remade:

In [17]:
def calculate_cosine_similarities_for_batch(n_negative_pairs, query_batch, document_batch):
    query, document, query_norms, document_norms = prepare_negative_pairs(n_negative_pairs, query_batch, document_batch)
    batch_size = query_batch.shape[0]
    
    product = tf.reduce_sum(tf.multiply(query, document), 1, True)
    norm_product = tf.multiply(query_norms, document_norms)

    cosine_similarities_raw = tf.truediv(product, norm_product)
    cosine_similarities = tf.transpose(tf.reshape(tf.transpose(cosine_similarities_raw),
                                                [n_negative_pairs + 1, batch_size]))
    return cosine_similarities

## Cosine Similarity Layer

In [18]:
cosine_similarities = calculate_cosine_similarities_for_batch(n_negative_pairs, query_final, document_final)

## Softmax Layer

In [19]:
probabilities = tf.nn.softmax((cosine_similarities))
success_probabilities = tf.slice(probabilities, [0, 0], [-1, 1])

## Loss Calculation

In [20]:
def calculate_loss(success_probabilities, batch_size):
    loss = -tf.reduce_sum(tf.log(success_probabilities)) / batch_size
    return loss

def calculate_average_epoch_loss(session, loss, query_data, document_data, total_batches):
    #total_pairs = 
    epoch_loss = 0
    for i in range(total_batches):
        pair_loss = session.run(loss, feed_dict=feed_dict(query_data, document_data, i))
        epoch_loss += pair_loss

    epoch_loss /= total_batches
    return epoch_loss

In [21]:
loss = calculate_loss(cosine_similarities, batch_size)

## Training Network

In [22]:
def report_progress(epoch, progress):
    sys.stdout.write("\rEpoch {}: {}%".format(epoch, "%.2f" % progress))
    sys.stdout.flush()

In [23]:
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

In [24]:
saver = tf.train.Saver()

total_train_batches = round(train_count / batch_size) if train_count > batch_size else 1
total_test_batches = round(test_count / batch_size) if test_count > batch_size else 1

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
epoch = -1

start_time = time.time()
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(max_epochs):
        for batch_id in range(total_train_batches):
            progress = 100.0 * batch_id / total_train_batches
            report_progress(epoch, progress)
        
            sess.run(train_step, feed_dict=feed_dict(query_train, document_train, batch_id))
        
        report_progress(epoch, 100.0)
        print("\nTrain Sample (%d values): " % train_count)
        %time train_epoch_loss = \
            calculate_average_epoch_loss(sess, loss, query_train, document_train, total_train_batches)       
        print ("Loss: %-4.3f" % (train_epoch_loss))
        
        print("\nTest Sample (%d values): " % test_count)
        %time test_epoch_loss = \
            calculate_average_epoch_loss(sess, loss, query_test, document_test, total_test_batches) 
        print ("Loss: %-4.3f" % (test_epoch_loss))
        print("-----------------------------")
        
    save_path = saver.save(sess, data_dir + model_filename + ".ckpt")
    print("Model saved as: %s" % save_path)
print("TOTAL TIME: %s seconds" % (time.time() - start_time))

Epoch 0: 100.00%
Train Sample (192 values): 
Wall time: 460 ms
Loss: 1.950

Test Sample (64 values): 
Wall time: 344 ms
Loss: 1.946
-----------------------------
Epoch 1: 100.00%
Train Sample (192 values): 
Wall time: 390 ms
Loss: 0.464

Test Sample (64 values): 
Wall time: 409 ms
Loss: 0.464
-----------------------------
Epoch 2: 100.00%
Train Sample (192 values): 
Wall time: 441 ms
Loss: 0.219

Test Sample (64 values): 
Wall time: 414 ms
Loss: 0.218
-----------------------------
Epoch 3: 100.00%
Train Sample (192 values): 
Wall time: 428 ms
Loss: 0.116

Test Sample (64 values): 
Wall time: 422 ms
Loss: 0.115
-----------------------------
Epoch 4: 100.00%
Train Sample (192 values): 
Wall time: 379 ms
Loss: 0.065

Test Sample (64 values): 
Wall time: 363 ms
Loss: 0.065
-----------------------------
Model saved as: .\data\20180320-212705.ckpt
TOTAL TIME: 8.456505537033081 seconds
