In [1]:
%load_ext autoreload
%autoreload 1

In [2]:
%aimport utils

In [3]:
import numpy as np
import pandas as pd
import heapq
import random
import os
import time
import tensorflow as tf
from tqdm import tqdm
from os import path
from scipy.spatial.distance import pdist, squareform
from sklearn.decomposition import PCA
from utils import load_embeddings_and_ids, User

In [4]:
# use a single GPU because we want to be nice with other people :)
os.environ["CUDA_VISIBLE_DEVICES"]="1"

###  Load pre-trained ResNet50 image embeddings

In [5]:
resnet50_embeddings,\
artwork_ids,\
artwork_id2index = load_embeddings_and_ids(
'/mnt/workspace/Ugallery/ResNet50/', 'flatten_1.npy', 'ids')

In [6]:
n_artworks = len(artwork_ids)
n_artworks

13297

###  Load transactions

In [7]:
sales_df = pd.read_csv('./valid_sales.csv')
artworks_df = pd.read_csv('./valid_artworks.csv')

In [8]:
artist_ids = np.full((n_artworks,), -1, dtype=int)
for _artworkId, _artistId in zip(artworks_df.id, artworks_df.artist_id):
    i = artwork_id2index[_artworkId]
    artist_ids[i] = _artistId

In [9]:
artistId2artworkIndexes = dict()
for i, _artistId in enumerate(artist_ids):
    if _artistId == -1:
        continue
    try:
        artistId2artworkIndexes[_artistId].append(i)
    except KeyError:
        artistId2artworkIndexes[_artistId] = [i]

### Collect transactions per user (making sure we hide the last nonfirst purchase basket per user)

#### create list of users

In [14]:
user_ids = sales_df.customer_id.unique()
user_id2index = { _id:i for i,_id in enumerate(user_ids) }
users = [User(uid) for uid in user_ids]
n_users = len(user_ids)

#### collect and sanity check transactions per user

In [11]:
sorted_sales_df = sales_df.sort_values('order_date')

In [15]:
# clear structures to prevent possible duplicate elements
for user in users:
    user.clear()

# collect transactions per user sorted by timestamp
for uid, aid, t in zip(sorted_sales_df.customer_id,
                       sorted_sales_df.artwork_id,
                       sorted_sales_df.order_date):
    users[user_id2index[uid]].append_transaction(aid,t,artwork_id2index,artist_ids)
    assert users[user_id2index[uid]]._uid == uid
    
# bin transctions with same timestamps into purchase baskets
for user in users:
    user.build_purchase_baskets()
    user.sanity_check_purchase_baskets()
    user.remove_last_nonfirst_purchase_basket(artwork_id2index, artist_ids)
    user.sanity_check_purchase_baskets()

### Compute minimun cosine distance from each user profile to each item in the dataset
\* using R200 vectors obtained with PCA(200) over ResNet50 embeddings

In [16]:
resnet50_PCA200 = PCA(n_components=200).fit_transform(resnet50_embeddings)

In [17]:
resnet50_PCA200.shape

(13297, 200)

In [18]:
distmat = squareform(pdist(resnet50_PCA200, 'cosine'))

In [19]:
user2artwork_mindist = np.empty((n_users, n_artworks))

In [20]:
for ui in tqdm(range(n_users)):
    for ai in range(n_artworks):
        user2artwork_mindist[ui][ai] = min(distmat[ai][j] for j in users[ui].artwork_idxs)

100%|██████████| 2919/2919 [00:42<00:00, 69.41it/s]


### Generate training data

In [71]:
def hash(ui, pi, ni):
    return  ((pi * n_artworks) + ni) * n_users + ui

In [72]:
train_instances = []

In [73]:
test_instances = []

In [74]:
used_instances = set()

In [75]:
_collisions = 0

In [76]:
def sanity_check_instance(instance, pos_is_purchased=True):
    ui, pi, ni = instance    
    try:
        assert 0 <= ui < n_users
        assert 0 <= pi < n_artworks
        assert 0 <= ni < n_artworks
        assert pi != ni
        user = users[ui]
        if pos_is_purchased is True:
            assert pi in user.artwork_idxs_set
        else:
            assert pi not in user.artwork_idxs_set
        assert ni not in user.artwork_idxs_set
        assert artist_ids[ni] not in user.artist_ids_set
    except AssertionError:
        print('ui = ', ui)
        print('pi = ', pi)
        print('ni = ', ni)
        raise

In [77]:
def append_instance(container, instance, **kwargs):    
    global _collisions
    h = hash(*instance)
    if h in used_instances:
        _collisions += 1
        return False
    sanity_check_instance(instance, **kwargs)
    container.append(instance)
    used_instances.add(h)
    return True

##### 1) Each purchased item should trivially be ranked higher than any item of non-purchased artists

In [56]:
def sample_artwork_index__notsharingartist(profile_artist_ids):
    while True:
        i = random.randint(0, n_artworks-1)
        if artist_ids[i] not in profile_artist_ids:
            return i

In [57]:
def generate_samples__rank_purchased_above_nonpurchased(n_neg_per_pos=10, n_test_samples=5000):
    
    # --- train instances
    print('sampling train instances ....')
    for ui, user in enumerate(users):
        u_artwork_idxs = user.artwork_idxs
        u_artist_ids = user.artist_ids_set        
        for pi in u_artwork_idxs:
            for _ in range(n_neg_per_pos):
                for __ in range(4):
                    ni = sample_artwork_index__notsharingartist(u_artist_ids)
                    if append_instance(train_instances, (ui, pi, ni)):
                        break
        
    # --- test instances
    print('sampling test instances ....')
    while n_test_samples > 0:
        ui = random.randint(0,n_users-1)
        user = users[ui]
        pi = random.choice(user.artwork_idxs)
        ni = sample_artwork_index__notsharingartist(user.artist_ids_set)
        if append_instance(test_instances, (ui, pi, ni)):
            n_test_samples -= 1

In [78]:
generate_samples__rank_purchased_above_nonpurchased(n_neg_per_pos=250, n_test_samples=30000)
print(len(train_instances), len(test_instances))
print('collisions = ', _collisions)

sampling train instances ....
sampling test instances ....
1395250 30000
collisions =  14276


##### 2) Given a user, any non-purchased item sharing the same artist with a purchased item should be ranked higher than any item of a non-purchased artist as long as ResNet50 doesn't disagree by much

In [64]:
def sample_artwork_index__nonpurchased_sharingartist(artist_id, artwork_idxs_set):
    candidate_idxs = artistId2artworkIndexes[artist_id]
    for _ in range(10): # try at most 10 times
        i = random.choice(candidate_idxs)
        if i not in artwork_idxs_set:
            return i
    return None # failed to find

In [65]:
def reject_ui_pi_ni_triplet(ui, pi, ni, threshold=0.55):
    dp = user2artwork_mindist[ui][pi]
    dn = user2artwork_mindist[ui][ni]
    assert dp + dn > 0
    return dp / (dp + dn) > threshold

In [66]:
def sample_artwork_index__notsharingartist_tripletacceptable(ui, pi, threshold):
    while True:
        ni = sample_artwork_index__notsharingartist(users[ui].artist_ids_set)
        if not reject_ui_pi_ni_triplet(ui, pi, ni, threshold=threshold):
            return ni

In [67]:
def generate_samples__rank_purchased_artist_above_nonpurchased_artist(instances_container, n_samples_per_user=100):
    for ui in range(n_users):
        user = users[ui]
        for _ in range(n_samples_per_user):
            for __ in range(5):
                aid = artist_ids[random.choice(user.artwork_idxs)]
                assert aid != -1
                pi = sample_artwork_index__nonpurchased_sharingartist(aid, user.artwork_idxs_set)
                if pi is None:
                    continue
                ni = sample_artwork_index__notsharingartist_tripletacceptable(ui, pi, 0.55)
                if append_instance(instances_container, (ui, pi, ni), pos_is_purchased=False):
                    break

In [79]:
print('sampling train instances ...')
generate_samples__rank_purchased_artist_above_nonpurchased_artist(train_instances, n_samples_per_user=600)
print('sampling test instances ...')
generate_samples__rank_purchased_artist_above_nonpurchased_artist(test_instances, n_samples_per_user=30)
print(len(train_instances), len(test_instances))
print('collisions = ', _collisions)

sampling train instances ...
sampling test instances ...
3114447 115954
collisions =  18501


### Build Tensorflow Network Graph

In [80]:
class Network:
    def __init__(self, n_users, n_items, user_latent_dim, item_latent_dim, item_visual_dim,
                 pretrained_dim=2048,
                 learning_rate=1e-4):
        
        assert (user_latent_dim == item_latent_dim + item_visual_dim)
        
        print('Network::__init__: learning_rate = ', learning_rate)
        
        self._item_visual_dim = item_visual_dim
        
        # --- placeholders
        self._pretrained_image_embeddings = tf.placeholder(shape=[None, pretrained_dim], dtype=tf.float32,
                                                     name='pretrained_image_embeddings')    
        self._user_index = tf.placeholder(shape=[None], dtype=tf.int32,
                                          name='user_index')
        self._positive_item_index = tf.placeholder(shape=[None], dtype=tf.int32,
                                                   name='positive_item_index')
        self._negative_item_index = tf.placeholder(shape=[None], dtype=tf.int32,
                                                   name='negative_item_index')
            
        # ------------------------------------
        # ---- Global trainable variables
        
        # -- user latent factor matrix
        # (n_users x user_latent_dim)
        self._user_latent_factors = tf.Variable(
            tf.random_uniform([n_users, user_latent_dim], -1.0, 1.0),
            name='user_latent_factors'
        )
        
        # -- item latent factor matrix
        # (n_items x item_latent_dim)
        self._item_latent_factors = tf.Variable(
            tf.random_uniform([n_items, item_latent_dim], -1.0, 1.0),
            name='item_latent_factors'
        )
        
        # -- item latent biases
        self._item_latent_biases = tf.Variable(
            tf.random_uniform([n_items], -1.0, 1.0),
            name='item_latent_biases'
        )
        
        # -- global visual bias
        self._visual_bias = tf.Variable(
            tf.random_uniform([pretrained_dim], -1.0, 1.0),
            name='visual_bias'
        )
        
        # -------------------------------
        # ---- minibatch tensors
        
        # -- user
        self._user_latent_vector = tf.gather(self._user_latent_factors, self._user_index)
        
        # -- positive item
        self._pos_vector,\
        self._pos_latent_bias,\
        self._pos_visual_bias = self.get_item_variables(self._positive_item_index)
        self._pos_score = tf.reduce_sum(self._user_latent_vector * self._pos_vector, 1) +\
                    self._pos_latent_bias +\
                    self._pos_visual_bias
        
        # -- negative item
        self._neg_vector,\
        self._neg_latent_bias,\
        self._neg_visual_bias = self.get_item_variables(self._negative_item_index)
        self._neg_score = tf.reduce_sum(self._user_latent_vector * self._neg_vector, 1) +\
                    self._neg_latent_bias +\
                    self._neg_visual_bias
        
        # -------------------------------
        # ---- train-test tensors
        
        # -- train loss
        delta_score = self._pos_score - self._neg_score
        ones = tf.fill(tf.shape(self._user_latent_vector)[:1], 1.0)
        loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=delta_score, labels=ones)
        loss = tf.reduce_mean(loss, name='train_loss')
        self._train_loss = loss
        
        # -- test accuracy
        accuracy = tf.reduce_sum(tf.cast(delta_score > .0, tf.float32), name = 'test_accuracy')
        self._test_accuracy = accuracy
        
        # -- optimizer
        self._optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self._train_loss)
        
    def get_item_variables(self, item_index):
        pre_vector = tf.gather(self._pretrained_image_embeddings, item_index)
        # 1) item vector
        #    1.1) visual vector
        visual_vector = self.trainable_image_embedding(pre_vector, self._item_visual_dim)
        #    1.2) latent vector
        latent_vector = tf.gather(self._item_latent_factors, item_index)
        #    1.3) concatenation
        final_vector = tf.concat([visual_vector, latent_vector], 1)
        # 2) latent bias
        latent_bias = tf.gather(self._item_latent_biases, item_index)
        # 3) visual bias
        visual_bias = tf.reduce_sum(pre_vector * self._visual_bias, 1)
        # return
        return final_vector, latent_bias, visual_bias
        
    @staticmethod
    def trainable_image_embedding(X, output_dim):
        with tf.variable_scope("trainable_image_embedding", reuse=tf.AUTO_REUSE):
            fc1 = tf.layers.dense( # None -> output_dim
                inputs=X,
                units=output_dim,
                name='fc1'
            )
            return fc1
    
    def optimize_and_get_train_loss(self, sess, pretrained_image_embeddings,
                                    user_index, positive_item_index, negative_item_index):
        return sess.run([
            self._optimizer,
            self._train_loss,
        ], feed_dict={
            self._pretrained_image_embeddings: pretrained_image_embeddings,
            self._user_index: user_index,
            self._positive_item_index: positive_item_index,
            self._negative_item_index: negative_item_index,
        })
    
    def get_train_loss(self, sess, pretrained_image_embeddings, user_index, positive_item_index, negative_item_index):
        return sess.run(
            self._train_loss, feed_dict={
            self._pretrained_image_embeddings: pretrained_image_embeddings,
            self._user_index: user_index,
            self._positive_item_index: positive_item_index,
            self._negative_item_index: negative_item_index,
        })
    
    def get_test_accuracy(self, sess, pretrained_image_embeddings, user_index, positive_item_index, negative_item_index):
        return sess.run(
            self._test_accuracy, feed_dict={
            self._pretrained_image_embeddings: pretrained_image_embeddings,
            self._user_index: user_index,
            self._positive_item_index: positive_item_index,
            self._negative_item_index: negative_item_index,
        })

In [159]:
# # DEBUGGING 
# with tf.Graph().as_default():
#     network = Network(n_users=10, n_items=10, user_latent_dim=40, item_latent_dim=20,
#                       item_visual_dim=20, pretrained_dim=2048)
#     with tf.Session() as sess:
#         sess.run(tf.global_variables_initializer())
#         tmp_debug = sess.run([
#             network._user_latent_factors,
#             network._item_latent_factors,
#             network._pos_vector,
#             network._pos_latent_bias,
#             network._pos_visual_bias,
#             network._pos_score,
#             network._neg_vector,
#             network._neg_latent_bias,
#             network._neg_visual_bias,
#             network._neg_score,
#             network._user_latent_vector,
#         ], feed_dict={
#             network._pretrained_image_embeddings: resnet50_embeddings,
#             network._user_index: [0, 2, 5],
#             network._positive_item_index: [1, 3, 9],
#             network._negative_item_index: [2, 7, 8],
#         })

### Training Network

In [81]:
def generate_minibatches(tuples, batch_size):
    n_tuples = len(tuples)
    n_batches = (n_tuples // batch_size) + int(n_tuples % batch_size > 0)
    
    indexes = list(range(n_tuples))
    random.shuffle(indexes)
    
    print('n_tuples = ', n_tuples)
    print('n_batches = ', n_batches)
    
    user_index_batches = [None] * n_batches
    pos_index_batches = [None] * n_batches
    neg_index_batches = [None] * n_batches
    
    for i in range(n_batches):
        jmin = i * batch_size
        jmax = min(jmin + batch_size, n_tuples)
        actual_batch_size = jmax - jmin
        
        user_index_batch = np.empty((actual_batch_size,), dtype=int)
        pos_index_batch = np.empty((actual_batch_size,), dtype=int)
        neg_index_batch = np.empty((actual_batch_size,), dtype=int)
        
        for j in range(actual_batch_size):
            t = tuples[indexes[jmin+j]]
            user_index_batch[j] = t[0]
            pos_index_batch[j] = t[1]
            neg_index_batch[j] = t[2]

        user_index_batches[i] = user_index_batch
        pos_index_batches[i] = pos_index_batch
        neg_index_batches[i] = neg_index_batch
        
    return dict(
        user_index_batches = user_index_batches,
        pos_index_batches  = pos_index_batches,
        neg_index_batches  = neg_index_batches,
        n_batches               = n_batches,
    )

In [82]:
def sanity_check_minibatches(minibatches):
    user_index_batches = minibatches['user_index_batches']
    pos_index_batches = minibatches['pos_index_batches']
    neg_index_batches = minibatches['neg_index_batches']
    n_batches = minibatches['n_batches']
    assert n_batches == len(user_index_batches)
    assert n_batches == len(pos_index_batches)
    assert n_batches == len(neg_index_batches)
    assert n_batches > 0
    
    for user_index, pos_index, neg_index in zip(
        user_index_batches,
        pos_index_batches,
        neg_index_batches
    ):
        n = user_index.shape[0]
        assert n == pos_index.shape[0]
        assert n == neg_index.shape[0]
        
        for i in range(n):
            ui = user_index[i]
            pi = pos_index[i]
            ni = neg_index[i]
            assert pi != ni
            assert ni not in users[ui].artwork_idxs_set

In [83]:
MODEL_PATH = '/mnt/workspace/pamessina_models/ugallery/VBPR/v2_hidinglast/'

In [96]:
def train_network(train_instances, test_instances, batch_size=64, max_epochs=60,
                  learning_rate=1e-4, early_stopping_epochs=5, min_elapsed_epochs_to_save=3, session_config=None):
    
    train_minibatches = generate_minibatches(train_instances, batch_size)
    test_minibatches = generate_minibatches(test_instances, batch_size)
    sanity_check_minibatches(train_minibatches)
    sanity_check_minibatches(test_minibatches)
    n_train_batches = train_minibatches['n_batches']
    n_test_batches = test_minibatches['n_batches']
    n_test_instances = len(test_instances)
    
    with tf.Graph().as_default():
        network = Network(
            n_users=n_users,
            n_items=n_artworks,
            user_latent_dim=128,
            item_latent_dim=64,
            item_visual_dim=64,
            pretrained_dim=2048,
            learning_rate=learning_rate,
        )
        with tf.Session(config=session_config) as sess:
            try:
                saver = tf.train.Saver()
                saver.restore(sess, tf.train.latest_checkpoint(MODEL_PATH))
                print('model successfully restored from checkpoint!')
            except ValueError:
                print('no checkpoint found: initializing variables with random values')
                os.makedirs(MODEL_PATH, exist_ok=True)
                sess.run(tf.global_variables_initializer())

            # ========= BEFORE TRAINING ============
            
            initial_test_acc = 0.
            for user_index, pos_index, neg_index in zip(
                test_minibatches['user_index_batches'],
                test_minibatches['pos_index_batches'],
                test_minibatches['neg_index_batches']
            ):
                minibatch_test_acc = network.get_test_accuracy(
                    sess, resnet50_embeddings, user_index, pos_index, neg_index)
                initial_test_acc += minibatch_test_acc
            initial_test_acc = (initial_test_acc / n_test_instances) * 100.

            print("Before training: test_accuracy = %f%%" % initial_test_acc)
            
            best_test_acc = initial_test_acc
            last_improvement_epoch = -1
            last_improvement_epoch_train_loss = None
            last_save_epoch = -1

            # ========= TRAINING ============
            
            print ('Starting training ...')

            for epoch in range(max_epochs):
                
                start_time = time.time()

                # --- training
                epoch_train_loss = 0.
                for user_index, pos_index, neg_index in zip(
                    train_minibatches['user_index_batches'],
                    train_minibatches['pos_index_batches'],
                    train_minibatches['neg_index_batches']
                ):
                    _, minibatch_train_loss = network.optimize_and_get_train_loss(
                        sess, resnet50_embeddings, user_index, pos_index, neg_index)
                    epoch_train_loss += minibatch_train_loss
                epoch_train_loss /= n_train_batches

                # --- testing
                epoch_test_acc = 0.
                for user_index, pos_index, neg_index in zip(
                    test_minibatches['user_index_batches'],
                    test_minibatches['pos_index_batches'],
                    test_minibatches['neg_index_batches']
                ):
                    minibatch_test_acc = network.get_test_accuracy(
                        sess, resnet50_embeddings, user_index, pos_index, neg_index)
                    epoch_test_acc += minibatch_test_acc
                epoch_test_acc = (epoch_test_acc / n_test_instances) * 100.
                
                # elapsed time
                elapsed_seconds = time.time() - start_time
                
                # --- check for improvements and update best model if necessary
                print("epoch %d: train_loss = %.15f, test_accuracy = %f%%, elapsed_seconds = %f" % (
                        epoch, epoch_train_loss, epoch_test_acc, elapsed_seconds))                
                if ((epoch_test_acc > best_test_acc) or (
                    epoch_test_acc == best_test_acc and (
                        last_improvement_epoch_train_loss is not None and\
                        epoch_train_loss < last_improvement_epoch_train_loss
                    )
                )) and (epoch - last_save_epoch >= min_elapsed_epochs_to_save or epoch == max_epochs-1):
                    saver = tf.train.Saver()
                    save_path = saver.save(sess, MODEL_PATH)
                    last_save_epoch = epoch
                    last_improvement_epoch = epoch
                    last_improvement_epoch_train_loss = epoch_train_loss
                    best_test_acc = epoch_test_acc                    
                    print("   ** improvement detected: model saved to path ", save_path)
                else:                    
                    if (epoch - last_improvement_epoch >= early_stopping_epochs):
                        print("   *** %d epochs with no improvements -> early stopping :(" % early_stopping_epochs)
                        return

In [None]:
train_network(train_instances, test_instances,
              batch_size=10000,
              max_epochs=2000,
              learning_rate=1e-3,
              early_stopping_epochs=7,
              min_elapsed_epochs_to_save=5)