In [206]:
import numpy as np
import tensorflow as tf
import pandas as pd
import collections
import math

**Specify dimensions**

In [4]:
## Size of the user id space: generated user ids fall in [0, num_users)
num_users = 128
## Size of the item id space: generated item ids fall in [0, num_items)
num_items = 1024

**Generate data**

In [16]:
## Fix the seed so that the synthetic data (and everything drawn from
## np.random afterwards) is identical on every Restart & Run All
np.random.seed(42)

## Average number of (user, item) interactions generated per user
avg_items_per_user = 5
## Total number of generated interactions
data_len = avg_items_per_user * num_users

## Every interaction pairs a uniformly drawn user id with a uniformly drawn
## item id; randint excludes the upper bound, so ids lie in [0, num_users)
## and [0, num_items) respectively
user_indices = np.random.randint(0, num_users, size = data_len)
item_indices = np.random.randint(0, num_items, size = data_len)

## One row per interaction
user_item_df = pd.DataFrame({"user" : user_indices, "item" : item_indices})

**Transform the data into a list of item-item pairs**

In [115]:
grouping = user_item_df.groupby("user")
## One context per user: the list of item ids of that user's rows.
## Users with a single row are dropped -- one item alone has no context.
contexts = [items for items in grouping["item"].apply(list) if len(items) > 1]
## Every ordered pair of different items that occur together in some context
item_context_pairs = [
    (item, other)
    for context in contexts
    for item in context
    for other in context
    if item != other
]

**Create item dictionary**
The question is how to count the occurrences of each item. There are two options:
- a) by the number of users that viewed it
- b) by its frequency in `item_context_pairs` -- this strongly favours items viewed by users who viewed a lot of items

In [116]:
## option a)
## value_counts() orders items by descending number of rows, so the items
## with the highest counts come first.
## NOTE: the option b) cell assigns items_by_count as well -- whichever of
## the two cells runs last wins.
items_by_count = list(user_item_df["item"].value_counts().index)

In [117]:
## option b)
## Flatten the pairs so that both members of every pair are counted
item_context_pairs_flat = [pair[position] for pair in item_context_pairs for position in (0, 1)]
## most_common() returns (item, count) tuples ordered by descending count
items_by_count = [item for item, _count in collections.Counter(item_context_pairs_flat).most_common()]

In [118]:
## item id -> position of the item in items_by_count (0 = first in the ordering)
dictionary = {item: index for index, item in enumerate(items_by_count)}
## position -> item id, the inverse of dictionary
reversed_dictionary = {index: item for item, index in dictionary.items()}

**Warning**
Until now we have worked with items as raw ids (strings/ints). From now on, items are identified only by their index in `dictionary`, which maps item id -> item index. The item index follows one of the item orderings defined above, and this ordering IS important for the algorithm to work properly.

**Let's write functions to generate our batches for training**

Again, we can use a more refined sampling scheme (slower, and not necessarily better) or a simpler and faster one.

In [121]:
## Total number of (item, context item) pairs available for sampling
ic_pairs_len = len(item_context_pairs)

In [135]:
## Quick look: draw 10 distinct pair positions and show the first item of each drawn pair
indices = np.random.choice(ic_pairs_len, size = 10, replace = False)
[item_context_pairs[position][0] for position in indices]

[285, 868, 261, 277, 11, 829, 725, 463, 493, 97]

In [227]:
def generate_batch(batch_size): 
    """
    Draws one training batch of (item, context item) pairs.
    Primitive version -- every pair in item_context_pairs is equally likely,
    whatever the size of the context it came from.

    Arguments: 
    batch_size -- number of pairs to draw (positions are drawn without replacement)

    Return: 
    batch -- array of shape (batch_size,) with the dictionary indices of the
             first items of the drawn pairs (model inputs)
    labels -- array of shape (batch_size, 1) with the dictionary indices of the
              second items of the drawn pairs (items to predict)
    """
    chosen = np.random.choice(a = ic_pairs_len, size = batch_size, replace = False)
    sampled_pairs = [item_context_pairs[position] for position in chosen]
    batch = np.array([dictionary[first] for first, _ in sampled_pairs])
    labels = np.array([dictionary[second] for _, second in sampled_pairs])
    ## Labels are returned as a column: one row per example
    return batch, labels.reshape((-1, 1))

In [137]:
## We calculate lengths of contexts
context_lens = np.array([len(s) for s in contexts])

## We calculate the total size of the text (sum of the sizes of all contexts)
context_N = context_lens.sum()

## And the number of contexts (users that viewed two or more items -- can be even the same items)
len_context = len(contexts)

In [None]:
def generate_batch(batch_size = 32, f = lambda x : x): 
    """
    Generates batch, labels (as in tf function)
    More advanced version -- first decides how many examples every context
    contributes, then draws (item, label) pairs inside the chosen contexts.

    Arguments: 
    batch_size -- size of the batch
    f -- how to account for the different sizes of contexts -- a context of
         size n gets weight f(n/N), where N is the sum of sizes of all
         contexts. f is applied to the whole array of n/N values at once and
         the weights are normalized to probabilities, so with the default
         identity a context is chosen proportionally to its size.

    Return: 
    batch -- array of shape (batch_size,) with the dictionary indices of the
             items from which we try to predict labels
    labels -- array of shape (batch_size, 1) with the dictionary indices of
              the items that we are trying to predict from batch

    Raises: 
    ValueError -- if f produces a negative weight, or if no context with a
                  positive weight contains two different items
    """
    ## Relative size n/N of every context; N is taken from context_lens itself
    ## and the division is forced to floating point
    relative_sizes = context_lens / float(context_lens.sum())
    weights = np.asarray(f(relative_sizes), dtype = float)
    if np.any(weights < 0):
        raise ValueError("f must not produce negative weights")

    ## A context whose items are all the same cannot yield a pair with
    ## item != label, so it must never be selected
    usable = np.array([len(set(context)) > 1 for context in contexts], dtype = bool)
    weights = np.where(usable, weights, 0.0)
    total_weight = weights.sum()
    if not total_weight > 0:
        raise ValueError("no context with positive weight contains two different items")

    ## Calculate probability of selecting each context
    p_select_context = weights / total_weight
    ## Multinomial distribution, distributing batch_size examples among the
    ## contexts with probabilities p_select_context
    how_many_words_from_context = np.random.multinomial(batch_size, p_select_context)

    batch = []
    labels = []
    for context_index in np.flatnonzero(how_many_words_from_context):
        context = contexts[context_index]
        for _ in range(how_many_words_from_context[context_index]):
            ## Input item: uniform over the positions of the context
            item = context[np.random.randint(len(context))]
            ## Label: uniform over the remaining positions that hold a different item
            candidates = [other for other in context if other != item]
            label = candidates[np.random.randint(len(candidates))]
            batch.append(dictionary[item])
            labels.append(dictionary[label])

    ## Same shapes as the primitive version: inputs flat, labels as a column
    return np.array(batch), np.array(labels).reshape((-1, 1))

**Build TensorFlow model**

This is a modified copy of the official TensorFlow word2vec tutorial.

In [212]:
## Size of training batch (number of (item, label) examples per training step)
batch_size = 32

## Length of embedding vectors 
embedding_size = 32

## Number of negative examples passed to the NCE loss as num_sampled, see negative sampling 
num_sampled = 16 

## Number of iterations of optimization algorithm (one batch per iteration)
max_iter = 10000

In [210]:
## Number of distinct items the model can index.
## Batches and labels hold indices taken from `dictionary`, so the vocabulary
## is exactly the set of its keys. Counting the distinct items of
## user_item_df instead can give a larger number (option b leaves out items
## that never co-occur with a different item), which would create embedding
## rows and NCE classes that stand for no item.
vocabulary_size = len(dictionary)

In [222]:
# Input data.
# train_inputs takes one int32 index per example; train_labels takes the index
# to predict for each example as a column (shape [batch_size, 1]). Both shapes
# are fixed to batch_size, so every fed batch must have exactly that size.
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

# Look up embeddings for inputs.
# The embedding matrix has one row of length embedding_size per vocabulary
# entry and is initialised with tf.random_uniform between -1.0 and 1.0.
embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

# Construct the variables for the NCE loss:
# one weight row of length embedding_size and one bias per vocabulary entry.
# Weights start from a truncated normal with stddev 1 / sqrt(embedding_size),
# biases start at zero.
nce_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                        stddev=1.0 / math.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# Compute the average NCE loss for the batch.
# tf.nn.nce_loss automatically draws a new sample of the negative labels each
# time we evaluate the loss.
# Explanation of the meaning of NCE loss:
#   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
loss = tf.reduce_mean(
    tf.nn.nce_loss(weights=nce_weights,
                   biases=nce_biases,
                   labels=train_labels,
                   inputs=embed,
                   num_sampled=num_sampled,
                   num_classes=vocabulary_size))

# Construct the SGD optimizer using a learning rate of 1.0.
# Note that `optimizer` holds the result of minimize(loss), i.e. the object
# the training loop passes to sess.run for every step.
optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

# Add variable initializer.
init = tf.global_variables_initializer()


In [223]:
## Open a session and run the variable initializer in it.
## The session is kept in `sess` because the following cells train and read
## the embeddings through it.
sess = tf.Session()
sess.run(init)

In [228]:
## Train the embeddings: every iteration draws a fresh batch and runs one optimizer step on it
for step in range(max_iter):
    batch, labels = generate_batch(batch_size)
    sess.run(optimizer, feed_dict = {train_inputs : batch, train_labels : labels})

In [229]:
## Fetch the current value of the embeddings variable from the session
embed_run = sess.run(embeddings)

**Prediction**

Given the embeddings of the items a user has viewed, we need to predict that user's distribution over items.

Just calculate the average embedding $x$ of those items and compute $\mathrm{softmax}(Wx + b)$.


In [None]:
## TODO: think the previous cell (prediction) through a bit more