In [1]:
## General libraries
import numpy as np
import tensorflow as tf
import pandas as pd
import collections
import math
import pprint

## My libraries 
import comms

Application of w2v to (user - product) pairs 
---

**Load Raw Data**

In [2]:
## Raw input: a list of JSON records (quick check: pprint.pprint(data[0], depth=1)).
data = comms.load_jsons()

## Interaction dataframe with columns ["context", "word"], used here as ["user", "product"].
user_item_df = comms.user_item_dataframe(data)

**Transform the data into a list of item-item pairs**

In [3]:
## word_bags   -- list of lists of product ids
## context_ids -- list of user ids; the prediction cell looks a user up in context_ids
##                and reads word_bags at the same position
word_bags, context_ids = comms.create_contexts(user_item_df)

**Create item dictionary**

For w2v it is important to order the words by their frequency in the text.
In text-processing applications the frequency is obvious, but here
there are two options for counting the occurrences of each word (product):
- a) by the number of users that viewed it
- b) by its frequency in word_context_pairs -- this strongly favours items viewed by someone who viewed a lot of items

**Warning**
Until now we worked with words as strings/ints -- from now on words are identified only by their index in the dictionary.

In [4]:
## dictionary          -- maps "item_id" -> index; per the original note the index is the
##                        frequency rank (index 3 means the third most frequent word)
## reversed_dictionary -- the inverse mapping, index -> "item_id"
## The "A" argument presumably selects counting option a) from the text above -- TODO confirm in comms.
## NOTE(review): word_context_pairs -- pairs of (product, product) that occurred in the views
## of some user -- is mentioned in the original comment but is not created in this cell.
dictionary, reversed_dictionary = comms.create_dictionary(word_bags, "A")

**Batch generating functions**

In [5]:
## Prerequisites for batch generation, built from the item bags and the dictionary.
## The result is handed unchanged to comms.generate_batch by the generate_batch wrapper.
genbatch_prereq = comms.create_genbatch_prerequisities(word_bags, dictionary)

In [6]:
def generate_batch(batch_size):
    """Return one training batch produced by comms.generate_batch.

    Thin wrapper that supplies the notebook-level `genbatch_prereq` and the fixed
    mode flag "A", so callers only pass `batch_size`. The training loop unpacks
    the result as a (batch, labels) pair.
    """
    batch_and_labels = comms.generate_batch(batch_size, genbatch_prereq, "A")
    return batch_and_labels

**Build the TensorFlow model and optimize it**

This is a modified copy of the official tutorial.

In [7]:
## Size of training batch
batch_size = 32

## Length of embedding vectors
embedding_size = 32

## Number of negative examples, see negative sampling
num_sampled = 16

## Number of iterations of the optimization algorithm (one batch per iteration)
num_iter = 10000

In [8]:
## Number of distinct words in vocabulary: one per key of `dictionary`
vocabulary_size = len(dictionary)

In [9]:
## Build the w2v TensorFlow model from the hyperparameters defined above.
## The returned object is unpacked into its eight components in the next cell.
w2v_model = comms.create_w2v_tf_model(batch_size, embedding_size, vocabulary_size, num_sampled)

In [10]:
## Unpack the model into the named handles used by the training and evaluation cells.
(
    train_inputs,
    train_labels,
    embeddings,
    nce_weights,
    nce_biases,
    loss,
    optimizer,
    init,
) = w2v_model

In [11]:
## Open a TensorFlow session and run `init` (taken from w2v_model) before training.
## NOTE(review): the session is never closed in the visible cells -- consider calling
## sess.close() once the embeddings have been evaluated.
sess = tf.Session()
sess.run(init)

In [12]:
## Learn the embeddings: one optimizer run per freshly generated batch
for step in range(num_iter):
    batch, labels = generate_batch(batch_size)
    feed = {train_inputs: batch, train_labels: labels}
    sess.run(optimizer, feed_dict=feed)

In [13]:
## Evaluate the trained embeddings and NCE parameters in a single sess.run call.
trained_tensors = [embeddings, nce_weights, nce_biases]
embeddings_eval, nce_weights_eval, nce_biases_eval = sess.run(trained_tensors)

In [14]:
## Report the shape of every evaluated parameter array.
for caption, evaluated in (
    ("embeddings shape \t", embeddings_eval),
    ("nce weights shape \t", nce_weights_eval),
    ("nce biases shape \t", nce_biases_eval),
):
    print(caption, evaluated.shape)

embeddings shape 	 (25, 32)
nce weights shape 	 (25, 32)
nce biases shape 	 (25,)


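**Prediction**

Given the embeddings of all the items a user has viewed, we want to predict that user's distribution over items.

Compute the average embedding $x$ of the user's items and evaluate $\mathrm{softmax}(Wx + b)$, where $W$ and $b$ are the trained NCE weights and biases.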


In [15]:
## Display one user id (position 3 in context_ids) to use as the prediction example.
context_ids[3]

'730aa47b6abdb6a103df165ba9a69ce7'

In [24]:
## Prediction by user id
user_id = "730aa47b6abdb6a103df165ba9a69ce7"

## Position of the user in context_ids; word_bags is read at the same position.
## list.index raises ValueError when the user id is unknown.
ind = context_ids.index(user_id)

## Dictionary indices of every item in this user's bag.
user_word_ids = [dictionary[word] for word in word_bags[ind]]
if not user_word_ids:
    ## np.mean over zero rows yields NaN (plus a RuntimeWarning), which would silently
    ## turn the whole predicted distribution into NaN -- fail loudly instead.
    raise ValueError("user %r has no items to average" % user_id)

## Average embedding x of the user's items (mean over the selected rows).
user_embed = np.mean(embeddings_eval[user_word_ids, :], axis=0)

## softmax(Wx + b): the matrix-vector product gives the same scores as
## np.sum(W * x, axis=1) without materializing a (vocabulary, embedding) temporary.
prob_new_word = comms.softmax(np.dot(nce_weights_eval, user_embed) + nce_biases_eval)