### The following is heavily influenced by this [blog post](http://adventuresinmachinelearning.com/word2vec-tutorial-tensorflow/)

In [None]:
#import data science libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#data types
import collections

#import general libraries
import gc
import time
import random
import datetime

#deep learning 
import tensorflow as tf

In [108]:
#list_of_products = pd.read_csv('./data/subset_products_list_july30.csv')
# Read the subset CSV; the groupby cells that follow use its user_id,
# order_id and product_id columns.
df_product_embeddings = pd.read_csv('./data/subset_data_for_embeds_july30.csv')

In [27]:
# Basket size: number of product rows in every (user, order) pair.
count_product_user = (
    df_product_embeddings
    .groupby(['user_id', 'order_id'])[['product_id']]
    .count()
)

In [31]:
# Mean, then median, number of products per order.
basket_sizes = count_product_user['product_id']
print(basket_sizes.mean())
print(basket_sizes.median())

10.085320529540459
8.0


In [107]:
# Number of distinct products each user has ever bought.
count_product_user_unique = (
    df_product_embeddings
    .groupby('user_id')[['product_id']]
    .nunique()
)
count_product_user_unique['product_id'].describe()

count    185274.000000
mean         31.871094
std          30.529451
min           1.000000
25%          11.000000
50%          22.000000
75%          43.000000
max         418.000000
Name: product_id, dtype: float64

In [None]:
# Compact dtypes shared by the two order_products files.
order_products_dtypes = {'order_id': np.int32, 'product_id': np.int32,
                         'add_to_cart_order': np.int16, 'reordered': np.int8}
orders_dtypes = {'order_id': np.int32, 'user_id': np.int32, 'order_number': np.int32,
                 'order_dow': np.int8, 'order_hour_of_day': np.int8,
                 'days_since_prior_order': np.float16}

order_products_prior_df = pd.read_csv('./data/order_products__prior.csv', engine='c',
                                      dtype=order_products_dtypes)
order_products_train_df = pd.read_csv('./data/order_products__train.csv', engine='c',
                                      dtype=order_products_dtypes)
orders_df = pd.read_csv('./data/orders.csv', engine='c', dtype=orders_dtypes)
products_df = pd.read_csv("./data/products.csv", engine='c')


def _orders_with_products(order_products_df):
    """Join order lines with order and product metadata, sorted by user then order number."""
    return (
        orders_df
        .merge(order_products_df, how='inner', on='order_id')
        .merge(products_df, how='inner', on='product_id')
        .sort_values(['user_id', 'order_number'], axis=0)
    )


df_train = _orders_with_products(order_products_train_df)
df_prior = _orders_with_products(order_products_prior_df)

# Prior orders first, then the train orders appended underneath.
df_products_orders_all = pd.concat([df_prior, df_train])

In [None]:
# Preview the first rows of the combined prior + train order lines.
df_products_orders_all.head()

In [None]:
# Flat Python list of product names, one entry per order line, in frame order.
list_of_products = df_products_orders_all['product_name'].tolist()

In [None]:
#number of unique products in dataset
# Build the set once: list_of_products has one entry per order line, so
# de-duplicating it twice (once to print, once to assign) is wasted work.
n_products = len(set(list_of_products))
print(n_products)

In [None]:
#build skip-gram dataset
def embeddings_pre_processing(categories, n_categories):
    """Encode raw category labels as integer ids ranked by frequency.

    Id 0 is reserved for the 'UNK' placeholder. The ``n_categories - 1`` most
    frequent labels receive the following ids in descending frequency order,
    and every label outside that set is encoded as 0.

    Returns a 4-tuple:
        idx_categories -- ``categories`` translated to ids, in the same order
        count -- ``[['UNK', n_unknown], (label, frequency), ...]``
        most_common_category_dict -- label -> id
        reversed_most_common_category_dict -- id -> label
    """
    count = [['UNK', -1]]
    count += collections.Counter(categories).most_common(n_categories - 1)

    # Ids are handed out in the order the entries appear in ``count``.
    label_to_id = {}
    for entry in count:
        label_to_id[entry[0]] = len(label_to_id)

    encoded = []
    n_unknown = 0
    for label in categories:
        label_id = label_to_id.get(label)
        if label_id is None:
            # Label fell outside the kept vocabulary: map it to 'UNK'.
            label_id = 0
            n_unknown += 1
        encoded.append(label_id)

    # The placeholder entry is a list precisely so its tally can be patched in here.
    count[0][1] = n_unknown
    id_to_label = {label_id: label for label, label_id in label_to_id.items()}

    return encoded, count, label_to_id, id_to_label

In [None]:
# Encode every product name as an integer id and keep both lookup tables.
(idx_products,
 count_products,
 most_common_products_dict,
 reversed_most_common_products_dict) = embeddings_pre_processing(list_of_products, n_products)

In [None]:
# Read cursor into the encoded sequence; generate_batch advances it between calls.
data_index = 0


# generate batch data
def generate_batch(data, batch_size, num_skips, skip_window):
    """Produce one skip-gram batch of (input, context) id pairs.

    A window of ``2 * skip_window + 1`` consecutive items slides over ``data``
    starting at the module-level ``data_index`` and wrapping around at the end.
    For each window position the centre item is emitted ``num_skips`` times,
    each time paired with a different randomly chosen non-centre item of the
    window.

    Returns ``(batch, context)``: int32 arrays of shape ``(batch_size,)`` and
    ``(batch_size, 1)``. ``data_index`` is updated as a side effect.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    n_items = len(data)
    span = 2 * skip_window + 1  # [ skip_window input_word skip_window ]
    batch = np.empty(batch_size, dtype=np.int32)
    context = np.empty((batch_size, 1), dtype=np.int32)

    # Pre-fill the sliding window.
    window = collections.deque(maxlen=span)
    for _ in range(span):
        window.append(data[data_index])
        data_index = (data_index + 1) % n_items

    for centre_no in range(batch_size // num_skips):
        row = centre_no * num_skips
        used_positions = [skip_window]  # never pair the centre with itself
        pick = skip_window
        for _ in range(num_skips):
            # Redraw until we hit a window slot not used for this centre yet.
            while pick in used_positions:
                pick = random.randint(0, span - 1)
            used_positions.append(pick)
            batch[row] = window[skip_window]  # the input item
            context[row, 0] = window[pick]    # one of its context items
            row += 1
        # Slide the window one step to the right.
        window.append(data[data_index])
        data_index = (data_index + 1) % n_items

    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + n_items - span) % n_items

    return batch, context


In [None]:
# Skip-gram training hyper-parameters.
batch_size = 300      # Number of (input, context) pairs per training batch.
embedding_size = 300  # Dimension of the embedding vector.
skip_window = 3       # How many words to consider left and right.
num_skips = 1         # How many times to reuse an input to generate a context.
products_size = n_products  # Vocabulary size: number of rows in the embedding table.

In [None]:
valid_size = 5     # Random set of words to evaluate similarity on.
valid_window = 50  # Only pick dev samples in the head of the distribution.
# Draw valid_size distinct ids from 0..valid_window-1. Ids are assigned in
# descending frequency order, so these are among the most common products.
# NOTE(review): id 0 is the 'UNK' placeholder and can be drawn here, and the
# draw is unseeded -- confirm both are intended.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [None]:
# Clear the process-wide default graph so that re-running the notebook starts clean.
tf.reset_default_graph()
# Last expression of the cell: displays the default graph object.
tf.get_default_graph()

In [None]:
# Dedicated graph object that the skip-gram model is built into.
graph = tf.Graph()
# Passed to tf.nn.nce_loss as `num_sampled` when the model graph is built.
num_sampled = 20000

In [None]:
with graph.as_default():

    # Inputs fed on every training step: the centre product ids and, for each
    # of them, one context product id.
    train_inputs = tf.placeholder(dtype=tf.int32, shape=(batch_size,))
    train_context = tf.placeholder(dtype=tf.int32, shape=(batch_size, 1))
    # Fixed ids whose nearest neighbours are printed during training.
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Embedding table (one row per product id) and the rows for this batch.
    embeddings = tf.get_variable('embeddings_graph', [products_size, embedding_size])
    embed_product_ids = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Construct the variables for the NCE loss
    nce_weights = tf.get_variable('nce_weights', [products_size, embedding_size])
    nce_biases = tf.get_variable('biases', [products_size], initializer=tf.zeros_initializer)

    # Per-example NCE loss, averaged over the batch.
    per_example_nce = tf.nn.nce_loss(
        weights=nce_weights,
        biases=nce_biases,
        labels=train_context,
        inputs=embed_product_ids,
        num_sampled=num_sampled,
        num_classes=products_size,
    )
    nce_loss = tf.reduce_mean(per_example_nce)

    # Plain SGD with a learning rate of 1.0.
    sgd = tf.train.GradientDescentOptimizer(learning_rate=1.0)
    optimizer = sgd.minimize(nce_loss)

    # Scale every embedding row to unit L2 length so that the matmul below
    # yields cosine similarities between validation rows and all rows.
    squared_row_sums = tf.reduce_sum(tf.square(embeddings), axis=1, keepdims=True)
    norm = tf.sqrt(squared_row_sums)
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

    # Add variable initializer.
    init = tf.global_variables_initializer()

In [None]:
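# NOTE: unused alternative kept for reference -- a full-softmax version of the
# model; every line in this cell is commented out.
# #create embeddings variable and lookup tensor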
# embeddings = tf.get_variable('embeddings_y', [20000, embedding_size])
# embed_product_ids = tf.nn.embedding_lookup(embeddings, train_inputs)

# #create weights and biases
# weights = tf.get_variable('weights_embeds', [20000, embedding_size])
# biases = tf.get_variable('biases', [20000], initializer=tf.zeros_initializer)
# hidden_out = tf.matmul(embed_product_ids, tf.transpose(weights)) + biases

# # convert train_context to a one-hot format
# train_one_hot = tf.one_hot(train_labels, 20000)
# cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=hidden_out, labels=train_one_hot))
# # Construct the SGD optimizer using a learning rate of 0.1.
# optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(cross_entropy)

# norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
# normalized_embeddings = embeddings / norm
# valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
# similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

# init = tf.global_variables_initializer()


In [None]:
def get_embeddings(graph, num_steps, loss_every=2000, similarity_every=4000, top_k=5):
    """Train the skip-gram model and return the L2-normalised embeddings.

    Relies on the module-level ops built in the graph cell (``init``,
    ``train_inputs``, ``train_context``, ``optimizer``, ``nce_loss``,
    ``similarity``, ``normalized_embeddings``) and on ``generate_batch`` fed
    with ``idx_products`` for the training data.

    Args:
        graph: the ``tf.Graph`` the session is opened on.
        num_steps: number of optimisation steps; one batch is drawn per step.
        loss_every: print the average loss every this many steps (must be > 0).
        similarity_every: print the nearest neighbours of the validation
            products every this many steps (must be > 0).
        top_k: how many nearest neighbours to print per validation product.

    Returns:
        The evaluated ``normalized_embeddings`` tensor after the last step.
    """
    with tf.Session(graph=graph) as session:

        # We must initialize all variables before we use them.
        init.run()
        print('Initialized')

        average_loss = 0
        for step in range(num_steps):
            batch_inputs, batch_context = generate_batch(idx_products, batch_size, num_skips, skip_window)
            feed_dict = {train_inputs: batch_inputs, train_context: batch_context}

            # We perform one update step by evaluating the optimizer op (including it
            # in the list of returned values for session.run()
            _, loss_val = session.run([optimizer, nce_loss], feed_dict=feed_dict)
            average_loss += loss_val

            if step % loss_every == 0:
                if step > 0:
                    # The average loss is an estimate of the loss over the
                    # last `loss_every` batches; at step 0 it is one batch.
                    average_loss /= loss_every
                print(' ')
                print('Average loss at step ', step, ': ', average_loss)
                average_loss = 0

            if step % similarity_every == 0:
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_word = reversed_most_common_products_dict[valid_examples[i]]
                    # Position 0 of the descending sort is the product itself
                    # (similarity 1), so skip it. Iterating over `nearest`
                    # rather than range(top_k) avoids an IndexError when the
                    # vocabulary has fewer than top_k + 1 entries.
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    neighbours = ''.join(
                        ' %s,' % reversed_most_common_products_dict[neighbour_id]
                        for neighbour_id in nearest
                    )
                    print('Nearest to %s:' % valid_word + neighbours)
                    print(' ')

        final_embeddings = normalized_embeddings.eval()
        return final_embeddings

In [None]:
%%time
# Train for 50,000 steps and keep the returned normalised embedding matrix.
product_embeddings_results = get_embeddings(graph, num_steps=50000)

In [None]:
# Peek at the embedding row for id 1, the most frequent product (id 0 is 'UNK').
product_embeddings_results[1]

In [None]:
# One row per integer id holding the product name it stands for.
index_df = (
    pd.DataFrame
    .from_dict(reversed_most_common_products_dict, orient='index')
    .rename(columns={0: 'product_name'})
)

# Put the name column next to the embedding columns; rows are aligned on the integer id.
embedding_df = pd.DataFrame(product_embeddings_results)
embeddings_w_product_name = pd.concat([index_df, embedding_df], axis=1)

In [None]:
# Persist product names with their embedding columns; index=False leaves the
# integer id index out of the file.
embeddings_w_product_name.to_csv('./data/sample__product_embeddings_results_long_trial.csv', index=False)