In [27]:
## General libraries
import numpy as np
import tensorflow as tf
import pandas as pd
import collections
import math
import pprint

## My libraries 

import comms

**Algorithm settings**

In [28]:
### Must be set before generating the dictionary ------------------------------------------------------------
## Word-occurrence counting method -- argument of comms.create_dictionary, one of {"A", "B"}
counts_alg = "A"

## Batch-generation method -- argument of comms.generate_batch, one of {"A"}
genbatch_alg = "A"

### Must be set before building and running the model -------------------------------------------------------
## Length of each embedding vector
embedding_size = 32

## Number of negative examples (see negative sampling)
num_sampled = 16

## Number of examples in one training batch
batch_size = 32

## Number of iterations of the optimization algorithm
num_iter = 10000

**Load data, create word_bags, and split into train and test sets**

In [29]:
## Load the raw records: a list of jsons. TESTING:    pprint.pprint(data[0], depth=1)
data = comms.load_jsons()
## Dataframe with columns ["context", "word"], which here stand for ["user", "product"]
user_item_df = comms.user_item_dataframe(data)
## word_bags: list of lists of product_ids; context_ids: list of user_ids
word_bags, context_ids = comms.create_contexts(user_item_df)

In [32]:
## Number of users held out for testing: a fifth of all users, capped at 100
test_size = min(len(word_bags)//5, 100)

## Sample WITHOUT replacement: the default (replace=True) can draw the same user
## several times, which duplicates test users and shrinks the effective test set.
test_indices = np.random.choice(len(word_bags), test_size, replace=False)

## Use a set for membership tests; `in` on a numpy array scans the whole array each time.
held_out = set(test_indices.tolist())
train_indices = [i for i in range(len(word_bags)) if i not in held_out]

## Every user must land in exactly one of the two splits.
assert len(train_indices) + len(test_indices) == len(word_bags)

word_bags_test = [word_bags[i] for i in test_indices]
word_bags_train = [word_bags[i] for i in train_indices]

**Dictionary and batch generation**

In [31]:
## Build the vocabulary from the training bags only, with the counting method chosen in counts_alg
## NOTE(review): presumably `dictionary` maps a product id to an integer index and
## `reversed_dictionary` is the inverse mapping -- confirm in comms.create_dictionary
dictionary, reversed_dictionary = comms.create_dictionary(word_bags_train, counts_alg)

## Number of distinct words in vocabulary
vocabulary_size = len(dictionary)

## Prepare the object that comms.generate_batch takes as input, from the training bags and dictionary
genbatch_prereq = comms.create_genbatch_prerequisities(word_bags_train, dictionary)

def generate_batch(batch_size): 
    """Return one training batch of the requested size.

    Thin wrapper around comms.generate_batch that passes the notebook-level
    `genbatch_prereq` and `genbatch_alg`, so callers only supply the size.
    The training loop unpacks the result as `(batch, labels)`.
    """
    return comms.generate_batch(batch_size, genbatch_prereq, genbatch_alg) 

**Build the model and train it**

In [20]:
## Build the word2vec TensorFlow model from the settings chosen above
w2v_model = comms.create_w2v_tf_model(batch_size, embedding_size, vocabulary_size, num_sampled)
## Unpack the eight returned handles into individual names used by the cells below
## NOTE(review): the meaning of each element is inferred from these names -- confirm against comms
train_inputs, train_labels, embeddings, nce_weights, nce_biases, loss, optimizer, init = w2v_model

In [22]:
## Open a TensorFlow session and run the `init` op that came with the model
## NOTE(review): the session is never closed in the visible cells
sess = tf.Session()
sess.run(init)

In [25]:
## Learn the embeddings: one optimizer step per freshly generated batch
for i in range(num_iter):
    batch, labels = generate_batch(batch_size)
    ## Run the update and fetch the loss of this batch in the same call
    _, loss_eval = sess.run(
        [optimizer, loss],
        feed_dict={train_inputs: batch, train_labels: labels},
    )
    ## Report progress once every 1000 iterations
    if i % 1000 == 0:
        print("Loss at iteration",i, " : ",loss_eval)

Loss at iteration 0  :  1.45273
Loss at iteration 1000  :  1.59382
Loss at iteration 2000  :  1.73079
Loss at iteration 3000  :  1.47814
Loss at iteration 4000  :  1.80039
Loss at iteration 5000  :  1.57665
Loss at iteration 6000  :  1.48287
Loss at iteration 7000  :  1.6787
Loss at iteration 8000  :  1.9216
Loss at iteration 9000  :  1.45983


In [36]:
## Fetch the current values of the embeddings and NCE parameters from the session for the test cells below
embeddings_eval, nce_weights_eval, nce_biases_eval = sess.run([embeddings, nce_weights, nce_biases]) 

**Test the model on the data**

In [115]:
## Take the last word from each user and try to predict it using the rest.
## dictionary.get returns None for products that are not in the training vocabulary;
## an empty bag gets a None label so that it is dropped below instead of raising.
test_labels = [dictionary.get(wb[-1]) if wb else None for wb in word_bags_test]
test_batches = [[dictionary.get(w) for w in wb[:-1]] for wb in word_bags_test]

## Remove Nones: a user is dropped when the label is unknown or when none of the
## remaining products is known (this also covers users with a single product).
remove_indicators = [
    tl is None or all(x is None for x in tb)
    for tl, tb in zip(test_labels, test_batches)
]

test_labels_filter = [tl for tl, ri in zip(test_labels, remove_indicators) if not ri]
## Within the kept users, also strip the unknown products so that every remaining id
## is a valid vocabulary index; each kept batch still has at least one id.
test_batches_filter = [
    [w for w in tb if w is not None]
    for tb, ri in zip(test_batches, remove_indicators)
    if not ri
]

In [116]:
## Represent each test user by the mean embedding of the products they are known to have.
## Iterate over the FILTERED batches so that the users line up one-to-one with
## test_labels_filter, and skip unknown products (None), which are not valid row indices.
user_embeds = [
    np.mean(embeddings_eval[[w for w in tb if w is not None], :], axis=0)
    for tb in test_batches_filter
]

In [117]:
## Score every product for every test user: row-wise dot product of the NCE weights with the
## user's embedding, plus the NCE biases, passed through comms.softmax
user_preferences = [
    comms.softmax((nce_weights_eval * embed).sum(axis=1) + nce_biases_eval)
    for embed in user_embeds
]

In [149]:
## Position of each user's held-out product when products are sorted by decreasing preference.
## The double argsort gives the 0-based ascending rank, so the top-scored product gets position 1.
orders_of_selected_products = [
    len(prefs) - np.argsort(np.argsort(prefs))[label]
    for prefs, label in zip(user_preferences, test_labels_filter)
]

In [151]:
## Positions at which the products the users actually selected would be presented to them
## (position 1 is the product with the highest predicted preference)
orders_of_selected_products 

[3, 18]