In [8]:
%load_ext autoreload
%autoreload 1

In [9]:
%aimport utils

In [127]:
import numpy as np
import pandas as pd
import random
import os
import time
from math import ceil
from utils import load_embeddings_and_ids, User, VisualDuplicateDetector, get_decaying_learning_rates

In [11]:
# use a single GPU because we want to be nice with other people :)
# Only the device with index "1" is exposed to this process. The variable is
# set here, before the TensorFlow import further down in the notebook.
# NOTE(review): the device index is hard-coded -- adjust it for your machine.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

###  Load pre-trained ResNet50 image embeddings

In [16]:
# Load the pre-computed ResNet50 features via the project helper in utils.
# The returned mapping is read below through the keys 'featmat', 'index2id'
# and 'id2index'.
# NOTE(review): absolute, machine-specific path -- consider a configurable DATA_DIR.
resnet50 = load_embeddings_and_ids('/mnt/workspace/Ugallery/ResNet50/', 'flatten_1.npy', 'ids')

In [18]:
# Unpack the loaded bundle:
#   resnet50_embeddings : feature matrix (presumably one row per artwork -- TODO confirm)
#   artwork_ids         : index -> artwork id
#   artwork_id2index    : artwork id -> index
resnet50_embeddings, artwork_ids, artwork_id2index = (
    resnet50['featmat'],
    resnet50['index2id'],
    resnet50['id2index'],
)

In [21]:
# Number of artworks for which an embedding was loaded; used below as the size
# of every per-artwork array (cluster_ids, artist_ids) and by hash_triple.
n_artworks = len(artwork_ids)
n_artworks

13297

###  Load visual clusters

In [22]:
import json

In [31]:
# Read the artwork-id -> cluster-id mapping (JSON object keys are strings).
_clusters_json_path = '/mnt/workspace/Ugallery/Clustering/artworkId2clusterId(resnet50+alexnet+inceptionv3+vgg19+incepresv2).json'
with open(_clusters_json_path) as f:
    artworkId2clusterId = json.load(f)

# cluster_ids[i] is the cluster of the artwork at embedding index i;
# -1 marks artworks that do not appear in the JSON file.
cluster_ids = np.full(n_artworks, -1, dtype=int)
for k, v in artworkId2clusterId.items():
    _artwork_index = artwork_id2index[int(k)]
    cluster_ids[_artwork_index] = v

In [32]:
# Quick visual check: a minimum of -1 would mean that some artwork has no
# cluster assigned (the -1 placeholder was never overwritten).
cluster_ids.min(), cluster_ids.max(), cluster_ids.shape

(0, 99, (13297,))

In [34]:
# Count the distinct values present in cluster_ids.
# NOTE(review): this equals "max id + 1" only if the ids are contiguous and
# no -1 placeholder is left -- later cells index lists by cluster id.
n_clusters = len(np.unique(cluster_ids))
n_clusters

100

In [35]:
# Inverted index: clusterId2artworkIndexes[c] lists the embedding indexes of
# the artworks assigned to cluster c.
#
# Guard first: the list below is indexed directly by cluster id, so a leftover
# -1 placeholder (artwork missing from the clustering file) would silently be
# appended to the LAST cluster through negative indexing, and an id that is
# >= n_clusters would raise an opaque IndexError in the middle of the loop.
assert cluster_ids.min() >= 0 and cluster_ids.max() < n_clusters, (
    'cluster ids must lie in [0, n_clusters): got min=%d, max=%d, n_clusters=%d'
    % (cluster_ids.min(), cluster_ids.max(), n_clusters))

clusterId2artworkIndexes = [[] for _ in range(n_clusters)]
for i, cluster_id in enumerate(cluster_ids):
    clusterId2artworkIndexes[cluster_id].append(i)

###  Load transactions

In [37]:
# Raw transaction and artwork tables. Columns read later in this notebook:
#   sales_df    : customer_id, artwork_id, order_date
#   artworks_df : id, artist_id
sales_df = pd.read_csv('./data/valid_sales.csv')
artworks_df = pd.read_csv('./data/valid_artworks.csv')

In [38]:
# artist_ids[i] is the artist of the artwork at embedding index i;
# -1 is kept for artworks that are absent from artworks_df.
artist_ids = np.full(n_artworks, -1, dtype=int)
for _artworkId, _artistId in zip(artworks_df.id, artworks_df.artist_id):
    artist_ids[artwork_id2index[_artworkId]] = _artistId

In [39]:
# Inverted index: artist id -> list of embedding indexes of that artist's
# artworks. Artworks without a known artist (-1) are left out.
artistId2artworkIndexes = {}
for i, _artistId in enumerate(artist_ids):
    if _artistId != -1:
        artistId2artworkIndexes.setdefault(_artistId, []).append(i)

### Collect transactions per user (hiding each user's last non-first purchase basket)

#### create list of users

In [128]:
# One User object per distinct customer, in order of first appearance in
# sales_df; user_id2index maps a customer id to its position in `users`.
user_ids = sales_df.customer_id.unique()
n_users = len(user_ids)
user_id2index = dict(zip(user_ids, range(n_users)))
users = list(map(User, user_ids))
n_users

2919

#### collect and sanity check transactions per user

In [123]:
# Order the sales by date so that transactions are appended chronologically.
# NOTE(review): read_csv above is called without parse_dates, so order_date is
# presumably sorted as text -- that is only chronological for ISO-8601-like
# strings. TODO confirm the column format.
sorted_sales_df = sales_df.sort_values('order_date')

In [129]:
# Start from empty profiles so that re-running this cell cannot duplicate
# transactions.
for user in users:
    user.clear()

# Append every sale to its buyer's profile, following the date-sorted order.
for uid, aid, t in zip(sorted_sales_df.customer_id,
                       sorted_sales_df.artwork_id,
                       sorted_sales_df.order_date):
    _buyer = users[user_id2index[uid]]
    _buyer.append_transaction(aid, t, artwork_id2index, artist_ids, cluster_ids)
    # the id -> index mapping must point at the matching User object
    assert _buyer._uid == uid

# Bin transactions with the same timestamp into purchase baskets, hide the
# last non-first basket of each user, then refresh the derived id collections.
for user in users:
    user.build_purchase_baskets()
    user.sanity_check_purchase_baskets()
    user.remove_last_nonfirst_purchase_basket(
        artwork_id2index, artist_ids, cluster_ids)
    user.sanity_check_purchase_baskets()
    user.refresh_nonpurchased_cluster_ids(n_clusters)
    user.refresh_cluster_ids()
    user.refresh_artist_ids()

### Generate training data

In [130]:
def hash_triple(ui, pi, ni):
    """Encode a (user, positive, negative) index triple as a single integer.

    The triple is packed positionally using the module-level sizes
    n_artworks and n_users, so two different in-range triples never share
    the same code.
    """
    item_pair_code = pi * n_artworks + ni
    return item_pair_code * n_users + ui

In [131]:
def sanity_check_instance(instance, pos_is_purchased=True,
                          pos_sharing_cluster_artist=None,
                          neg_notsharing_cluster_artist=None):
    """Assert that a (user, positive, negative) triple is a valid sample.

    instance: tuple (ui, pi, ni) of user index, positive artwork index and
        negative artwork index.
    pos_is_purchased: True -> the positive must be in the user's profile;
        False -> it must not be; any other value skips this check.
    pos_sharing_cluster_artist: if truthy, the positive's artist and cluster
        must both appear in the user's profile.
    neg_notsharing_cluster_artist: if truthy, the negative's artist and
        cluster must both be absent from the user's profile.

    The negative must never be a purchased artwork, and the pair must not be
    flagged by visdup_detector. On failure the offending indexes are printed
    and the AssertionError is re-raised.
    """
    ui, pi, ni = instance
    try:
        # range and distinctness checks
        assert 0 <= ui < n_users
        assert 0 <= pi < n_artworks
        assert 0 <= ni < n_artworks
        assert pi != ni
        assert not visdup_detector.same(pi, ni)

        owner = users[ui]

        # positive item: membership in the profile must match the flag
        if pos_is_purchased is True or pos_is_purchased is False:
            assert (pi in owner.artwork_idxs_set) is pos_is_purchased

        if pos_sharing_cluster_artist:
            assert artist_ids[pi] in owner.artist_ids_set
            assert cluster_ids[pi] in owner.cluster_ids_set

        # negative item: never purchased, optionally unrelated to the profile
        assert ni not in owner.artwork_idxs_set
        if neg_notsharing_cluster_artist:
            assert artist_ids[ni] not in owner.artist_ids_set
            assert cluster_ids[ni] not in owner.cluster_ids_set
    except AssertionError:
        for label, value in (('ui = ', ui), ('pi = ', pi), ('ni = ', ni)):
            print(label, value)
        raise

In [132]:
def append_instance(container, instance, **kwargs):
    """Try to add a (user, positive, negative) triple to `container`.

    The triple is rejected (returns False) when its hash_triple code is
    already in the module-level used_hashes set -- in which case
    _hash_collisions is incremented -- or when visdup_detector.same reports
    the positive/negative pair. Otherwise it is validated with
    sanity_check_instance (extra keyword arguments are forwarded), appended,
    its code is recorded, and True is returned.
    """
    global _hash_collisions
    ui, pi, ni = instance

    code = hash_triple(ui, pi, ni)
    already_sampled = code in used_hashes
    if already_sampled:
        _hash_collisions += 1

    # the duplicate detector is only consulted for triples not seen before
    accepted = not already_sampled and not visdup_detector.same(pi, ni)
    if accepted:
        sanity_check_instance(instance, **kwargs)
        container.append(instance)
        used_hashes.add(code)
    return accepted

In [None]:
# Project helper from utils; its same(pi, ni) method is used by the sampling
# code to reject positive/negative pairs it reports as visual duplicates.
# NOTE(review): this cell has no execution count in the saved notebook, yet
# later cells depend on visdup_detector -- make sure it runs on a fresh kernel.
visdup_detector = VisualDuplicateDetector(cluster_ids, resnet50_embeddings)

In [133]:
# Reset all sampling state. Re-running this cell discards every instance
# generated so far.
visdup_detector.count = 0  # reported as 'visual_collisions' by the sampling cells
used_hashes = set()        # hash_triple codes of all accepted instances (train and test share it)
_hash_collisions = 0       # number of candidates rejected because their code was already used
train_instances = []
test_instances = []

In [134]:
# Sampling budget: the totals below are split evenly among N_STRATEGIES
# sampling strategies in the next cell.
N_STRATEGIES = 2
TOTAL_SAMPLES__TRAIN = 10000000
TOTAL_SAMPLES__TEST =  300000

In [135]:
# Per-strategy share of the train / test budgets (truncated to an integer).
N_SAMPLES_PER_STRATEGY__TRAIN, N_SAMPLES_PER_STRATEGY__TEST = (
    int(_total / N_STRATEGIES)
    for _total in (TOTAL_SAMPLES__TRAIN, TOTAL_SAMPLES__TEST)
)
N_SAMPLES_PER_STRATEGY__TRAIN, N_SAMPLES_PER_STRATEGY__TEST

(5000000, 150000)

## A) Original BPR strategy

##### Given a user, his purchased items should be ranked higher than any of his non-purchased items

In [80]:
def sample_artwork_index__nonpurchased(purchased_artwork_idxs):
    """Draw a uniformly random artwork index that is not in `purchased_artwork_idxs`.

    Uses rejection sampling over [0, n_artworks - 1], so it only terminates
    if at least one artwork is outside the given collection.
    """
    candidate = random.randint(0, n_artworks - 1)
    while candidate in purchased_artwork_idxs:
        candidate = random.randint(0, n_artworks - 1)
    return candidate

In [79]:
def generate_samples__rank_purchased_above_nonpurchased(instances_container, n_samples):
    """Sample (user, purchased, non-purchased) triples for every user.

    Each user gets ceil(n_samples / n_users) sampling slots; for each slot up
    to 5 candidate triples are drawn until append_instance accepts one.
    A slot whose 5 candidates are all rejected contributes nothing, so the
    number of instances added can be lower than requested.
    """
    slots_per_user = ceil(n_samples / n_users)
    for user_index, user in enumerate(users):
        purchased_list = user.artwork_idxs
        purchased_set = user.artwork_idxs_set
        for _slot in range(slots_per_user):
            attempts_left = 5
            while attempts_left > 0:
                attempts_left -= 1
                positive = random.choice(purchased_list)
                negative = sample_artwork_index__nonpurchased(purchased_set)
                if append_instance(instances_container, (user_index, positive, negative)):
                    break

In [89]:
# Strategy A: fill the train and test pools with purchased-vs-nonpurchased
# triples. Both pools share used_hashes, so a triple accepted for training is
# never accepted again for testing.
# NOTE(review): the counters printed below are cumulative since the last
# reset of the sampling state, not specific to this cell.
print('sampling train instances ...')
generate_samples__rank_purchased_above_nonpurchased(train_instances, n_samples=N_SAMPLES_PER_STRATEGY__TRAIN)
print('sampling test instances ...')
generate_samples__rank_purchased_above_nonpurchased(test_instances, n_samples=N_SAMPLES_PER_STRATEGY__TEST)
print(len(train_instances), len(test_instances))
print('hash_collisions = ', _hash_collisions)
print('visual_collisions = ', visdup_detector.count)

sampling train instances ...
sampling test instances ...
9999117 300383
hash_collisions =  1370203
visual_collisions =  1


## B) Domain-specific strategies

##### 1) Given a user, his purchased items should be ranked higher than any of his non-purchased items that do not share artist and visual cluster with items in the user's profile

In [136]:
def sample_artwork_index__notsharingartist_notsharingcluster(nonpurchased_cluster_ids_list, purchased_artist_ids_set):
    """Draw an artwork index from a non-purchased cluster whose artist is not in the profile.

    A cluster is picked uniformly from `nonpurchased_cluster_ids_list`, then
    an artwork uniformly from that cluster; the draw is repeated until the
    artwork's artist is not in `purchased_artist_ids_set` (no retry limit).
    """
    found = None
    while found is None:
        cluster = random.choice(nonpurchased_cluster_ids_list)
        candidate = random.choice(clusterId2artworkIndexes[cluster])
        if artist_ids[candidate] not in purchased_artist_ids_set:
            found = candidate
    return found

In [138]:
def generate_samples__rank_purchased_above_notsharingartist_notsharingcluster(instances_container, n_samples):
    """Sample (user, purchased, unrelated non-purchased) triples for every user.

    The negative comes from a cluster the user never bought from and from an
    artist absent from the user's profile. Each user gets
    ceil(n_samples / n_users) slots and each slot allows up to 5 candidate
    triples before it is abandoned.
    """
    slots_per_user = ceil(n_samples / n_users)
    for user_index, user in enumerate(users):
        purchased_list = user.artwork_idxs
        unvisited_clusters = user.nonp_cluster_ids
        profile_artists = user.artist_ids_set
        for _slot in range(slots_per_user):
            attempts_left = 5
            while attempts_left > 0:
                attempts_left -= 1
                positive = random.choice(purchased_list)
                negative = sample_artwork_index__notsharingartist_notsharingcluster(
                    unvisited_clusters, profile_artists)
                if append_instance(instances_container, (user_index, positive, negative)):
                    break

In [145]:
# Strategy B.1: purchased items above non-purchased items that share neither
# artist nor cluster with the profile.
# NOTE(review): n_samples is hard-coded (1190000 / 32700) in place of the
# per-strategy constants that are commented out below. Judging from the
# recorded execution counts and outputs, this cell was run after strategy B.2
# to top the pools up to roughly 10M / 300K -- confirm before running the
# notebook top to bottom.
print('sampling train instances ...')
generate_samples__rank_purchased_above_notsharingartist_notsharingcluster(
#     train_instances, n_samples=N_SAMPLES_PER_STRATEGY__TRAIN)
    train_instances, n_samples=1190000)
print('sampling test instances ...')
generate_samples__rank_purchased_above_notsharingartist_notsharingcluster(
#     test_instances, n_samples=N_SAMPLES_PER_STRATEGY__TEST)
    test_instances, n_samples=32700)
print(len(train_instances), len(test_instances))
print('hash_collisions = ', _hash_collisions)
print('visual_collisions = ', visdup_detector.count)

sampling train instances ...
sampling test instances ...
9999919 302244
hash_collisions =  783934
visual_collisions =  0


##### 2) Given a user, any non-purchased item sharing artist and visual cluster with items in the user's profile should be ranked higher than any of his non-purchased items that do not share artist and visual cluster with items in the user's profile

In [141]:
def sample_artwork_index__nonpurchased_sharingartist_sharingcluster(
        purchased_artwork_idxs_set, purchased_artist_ids_list, purchased_cluster_ids_set):
    """Draw a non-purchased artwork that shares artist and cluster with the profile.

    Up to 20 attempts are made: an artist is picked from
    `purchased_artist_ids_list`, then one of that artist's artworks. The
    artwork is returned if it is not in `purchased_artwork_idxs_set` and its
    cluster is in `purchased_cluster_ids_set`. Returns None when every
    attempt fails.
    """
    for _attempt in range(20):
        artist = random.choice(purchased_artist_ids_list)
        candidate = random.choice(artistId2artworkIndexes[artist])
        if candidate in purchased_artwork_idxs_set:
            continue  # already purchased
        if cluster_ids[candidate] not in purchased_cluster_ids_set:
            continue  # cluster not present in the profile
        return candidate
    return None

In [143]:
def generate_samples__rank_nonpurchased_sharingartistcluster_above_notsharingartistcluster(instances_container, n_samples):
    """Sample (user, related non-purchased, unrelated non-purchased) triples.

    The positive is a non-purchased artwork sharing artist and cluster with
    the user's profile; the negative shares neither. Each user gets
    ceil(n_samples / n_users) slots with up to 5 attempts per slot; an
    attempt is also consumed when no suitable positive could be found.
    """
    slots_per_user = ceil(n_samples / n_users)
    for user_index, user in enumerate(users):

        # local names are distinct from the module-level artist_ids / cluster_ids arrays
        purchased_set = user.artwork_idxs_set
        profile_artist_list = user.artist_ids
        profile_cluster_set = user.cluster_ids_set
        unvisited_clusters = user.nonp_cluster_ids
        profile_artist_set = user.artist_ids_set

        for _slot in range(slots_per_user):
            attempts_left = 5
            while attempts_left > 0:
                attempts_left -= 1
                positive = sample_artwork_index__nonpurchased_sharingartist_sharingcluster(
                    purchased_set, profile_artist_list, profile_cluster_set)
                if positive is None:
                    continue
                negative = sample_artwork_index__notsharingartist_notsharingcluster(
                    unvisited_clusters, profile_artist_set)
                accepted = append_instance(
                    instances_container, (user_index, positive, negative),
                    pos_is_purchased=False,
                    pos_sharing_cluster_artist=True,
                    neg_notsharing_cluster_artist=True)
                if accepted:
                    break

In [144]:
# Strategy B.2: related non-purchased items above unrelated non-purchased ones.
# The sampler can fail to find a positive for a user, so this strategy may
# add fewer instances than requested (see the recorded totals below).
print('sampling train instances ...')
generate_samples__rank_nonpurchased_sharingartistcluster_above_notsharingartistcluster(
    train_instances, n_samples=N_SAMPLES_PER_STRATEGY__TRAIN)
print('sampling test instances ...')
generate_samples__rank_nonpurchased_sharingartistcluster_above_notsharingartistcluster(
    test_instances, n_samples=N_SAMPLES_PER_STRATEGY__TEST)
print(len(train_instances), len(test_instances))
print('hash_collisions = ', _hash_collisions)
print('visual_collisions = ', visdup_detector.count)

sampling train instances ...
sampling test instances ...
8809226 267223
hash_collisions =  537807
visual_collisions =  0


### Training Model

In [90]:
def generate_minibatches(tuples, batch_size):
    """Shuffle (user, positive, negative) triples and split them into minibatches.

    tuples: sequence of 3-element index tuples.
    batch_size: maximum number of triples per minibatch; only the last
        minibatch may be smaller.

    Returns a dict with three parallel lists of 1-D integer arrays
    ('user_index_batches', 'pos_index_batches', 'neg_index_batches') and the
    number of minibatches under 'n_batches'. The shuffle uses the `random`
    module, and the sizes are printed as a side effect.
    """
    n_tuples = len(tuples)
    n_batches = ceil(n_tuples / batch_size)

    # the batches must cover every tuple, and the last batch cannot be empty
    assert n_batches * batch_size >= n_tuples
    assert (n_batches - 1) * batch_size < n_tuples

    order = list(range(n_tuples))
    random.shuffle(order)

    print('n_tuples = ', n_tuples)
    print('n_batches = ', n_batches)

    def _column(rows, position):
        # integer array holding element `position` of every row
        values = np.empty((len(rows),), dtype=int)
        values[:] = [row[position] for row in rows]
        return values

    user_index_batches = []
    pos_index_batches = []
    neg_index_batches = []
    for start in range(0, n_tuples, batch_size):
        rows = [tuples[k] for k in order[start:start + batch_size]]
        user_index_batches.append(_column(rows, 0))
        pos_index_batches.append(_column(rows, 1))
        neg_index_batches.append(_column(rows, 2))

    return {
        'user_index_batches': user_index_batches,
        'pos_index_batches': pos_index_batches,
        'neg_index_batches': neg_index_batches,
        'n_batches': n_batches,
    }

In [91]:
def sanity_check_minibatches(minibatches):
    """Assert the structural and semantic consistency of a minibatch dict.

    Checks that the three batch lists have 'n_batches' (> 0) entries, that
    the arrays of each minibatch have equal length, and that for every triple
    the positive differs from the negative and the negative is not in the
    user's purchased set. Raises AssertionError on the first violation.
    """
    batch_lists = [minibatches[key] for key in (
        'user_index_batches', 'pos_index_batches', 'neg_index_batches')]
    n_batches = minibatches['n_batches']
    for batches in batch_lists:
        assert n_batches == len(batches)
    assert n_batches > 0

    for user_index, pos_index, neg_index in zip(*batch_lists):
        n = user_index.shape[0]
        assert n == pos_index.shape[0]
        assert n == neg_index.shape[0]

        # lengths are equal at this point, so zip visits every position
        for ui, pi, ni in zip(user_index, pos_index, neg_index):
            assert pi != ni
            assert ni not in users[ui].artwork_idxs_set

In [161]:
# Directory used by train_network both as the checkpoint location
# (restore / save) and for 'train_logs.csv'. The trailing slash matters:
# the log file name is appended by plain string concatenation.
# NOTE(review): absolute, user-specific path.
MODEL_PATH = '/mnt/workspace/pamessina_models/ugallery/VBPR/v5(10M-300K,+prof,+nonprof_favc,-nonfavc)/'

In [162]:
import tensorflow as tf
from Networks import VBPR_Network_Train, TrainLogger

In [163]:
def _evaluate_test_accuracy(network, sess, test_minibatches, n_test_instances):
    """Return the accuracy of `network` over all test minibatches.

    The values returned by network.get_test_accuracy for every minibatch are
    summed and divided by n_test_instances (so that method presumably returns
    a per-batch count of correctly ranked triples -- TODO confirm in Networks).
    """
    total = 0.
    for user_index, pos_index, neg_index in zip(
        test_minibatches['user_index_batches'],
        test_minibatches['pos_index_batches'],
        test_minibatches['neg_index_batches']
    ):
        total += network.get_test_accuracy(
            sess, resnet50_embeddings, user_index, pos_index, neg_index)
    return total / n_test_instances


def train_network(train_minibatches, test_minibatches,
                  n_train_instances, n_test_instances, batch_size,
                  max_seconds_training=3600,
                  min_seconds_to_check_improvement=60,
                  early_stopping_checks=4,
                  learning_rates=None):
    """Train the VBPR network with time-based evaluation and early stopping.

    Parameters
    ----------
    train_minibatches, test_minibatches : dict
        Minibatch dicts as produced by generate_minibatches.
    n_train_instances, n_test_instances : int
        Number of triples in each set; n_test_instances normalizes the test
        accuracy, both are written to the training log.
    batch_size : int
        Training batch size (only written to the training log).
    max_seconds_training : float
        Budget of pure optimization time (evaluation time is not counted).
        Training stops as soon as the budget is used up.
    min_seconds_to_check_improvement : float
        Optimization time that must accumulate between two test evaluations.
    early_stopping_checks : int
        Consecutive evaluations without improvement that trigger a switch to
        the next learning rate, or a return if there is none left.
    learning_rates : list of float, optional
        Learning-rate schedule; defaults to [1e-3].

    The model is restored from the latest checkpoint in MODEL_PATH when one
    exists, otherwise variables are freshly initialized. A checkpoint is
    saved whenever the test accuracy improves. A tie in accuracy only counts
    as an improvement if the smoothed training loss is lower than at the
    previous improvement made during this call. Returns None.

    Raises
    ------
    ValueError
        If train_minibatches contains no batches (the loop would never
        make progress).
    """
    # a None sentinel avoids sharing one mutable default list between calls
    if learning_rates is None:
        learning_rates = [1e-3]

    n_train_batches = train_minibatches['n_batches']
    if n_train_batches == 0:
        raise ValueError('train_minibatches contains no batches')

    print('learning_rates = ', learning_rates)

    with tf.Graph().as_default():
        network = VBPR_Network_Train(
            n_users=n_users,
            n_items=n_artworks,
            user_latent_dim=128,
            item_latent_dim=64,
            item_visual_dim=64,
            pretrained_dim=2048,
        )
        with tf.Session() as sess:
            # one Saver serves both restore and every later save: building a
            # new one per checkpoint keeps adding save/restore ops to the graph
            saver = tf.train.Saver()
            try:
                saver.restore(sess, tf.train.latest_checkpoint(MODEL_PATH))
                print('model successfully restored from checkpoint!')
            except ValueError:
                print('no checkpoint found: initializing variables with random values')
                os.makedirs(MODEL_PATH, exist_ok=True)
                sess.run(tf.global_variables_initializer())
            trainlogger = TrainLogger(MODEL_PATH + 'train_logs.csv')

            # ========= BEFORE TRAINING ============

            initial_test_acc = _evaluate_test_accuracy(
                network, sess, test_minibatches, n_test_instances)

            print("Before training: test_accuracy = %f" % initial_test_acc)

            best_test_acc = initial_test_acc
            seconds_training = 0
            elapsed_seconds_from_last_check = 0
            checks_with_no_improvement = 0
            last_improvement_loss = None

            # ========= TRAINING ============

            print('Starting training ...')
            lr_i = 0
            train_loss_ema = None # exponential moving average

            while seconds_training < max_seconds_training:

                for train_i, (user_index, pos_index, neg_index) in enumerate(zip(
                    train_minibatches['user_index_batches'],
                    train_minibatches['pos_index_batches'],
                    train_minibatches['neg_index_batches']
                )):
                    # optimize and get training loss (only this call is timed)
                    start_t = time.time()
                    _, minibatch_train_loss = network.optimize_and_get_train_loss(
                        sess, resnet50_embeddings, user_index, pos_index, neg_index, learning_rates[lr_i])
                    delta_t = time.time() - start_t

                    # update train loss exponential moving average
                    train_loss_ema = minibatch_train_loss if train_loss_ema is None else\
                                    0.999 * train_loss_ema + 0.001 * minibatch_train_loss

                    # update time tracking variables
                    seconds_training += delta_t
                    elapsed_seconds_from_last_check += delta_t

                    # check for improvements using test set if it's time to do so
                    if elapsed_seconds_from_last_check >= min_seconds_to_check_improvement:

                        # --- testing
                        test_acc = _evaluate_test_accuracy(
                            network, sess, test_minibatches, n_test_instances)

                        print(("train_i=%d, train_loss = %.12f, test_accuracy = %.6f,"
                               " check_secs = %.2f, total_secs = %.2f") % (
                                train_i, train_loss_ema, test_acc, elapsed_seconds_from_last_check, seconds_training))

                        # check for improvements
                        if (test_acc > best_test_acc) or (
                            test_acc == best_test_acc and (
                                last_improvement_loss is not None and\
                                last_improvement_loss > train_loss_ema
                            )
                        ):
                            last_improvement_loss = train_loss_ema
                            best_test_acc = test_acc
                            checks_with_no_improvement = 0
                            save_path = saver.save(sess, MODEL_PATH)
                            print("   ** improvement detected: model saved to path ", save_path)
                            model_updated = True
                        else:
                            checks_with_no_improvement += 1
                            model_updated = False

                        # --- logging ---
                        trainlogger.log_update(
                            train_loss_ema, test_acc, n_train_instances, n_test_instances,
                            elapsed_seconds_from_last_check, batch_size, learning_rates[lr_i], 't' if model_updated else 'f')

                        # --- check for early stopping
                        if checks_with_no_improvement >= early_stopping_checks:
                            if lr_i + 1 < len(learning_rates):
                                lr_i += 1
                                checks_with_no_improvement = 0
                                print("   *** %d checks with no improvements -> using a smaller learning_rate = %f" % (
                                    early_stopping_checks, learning_rates[lr_i]))
                            else:
                                print("   *** %d checks with no improvements -> early stopping :(" % early_stopping_checks)
                                return

                        # --- reset check variables
                        elapsed_seconds_from_last_check = 0

                    # honour the time budget inside the epoch as well, instead
                    # of only between two full passes over the training set
                    if seconds_training >= max_seconds_training:
                        break
            print('====== TIMEOUT ======')

In [149]:
# Shuffle the training triples, split them into minibatches of (at most)
# train_batch_size elements, then validate every triple.
train_batch_size = 100000
train_minibatches = generate_minibatches(train_instances, train_batch_size)
sanity_check_minibatches(train_minibatches)

n_tuples =  9999919
n_batches =  100


In [151]:
# Same for the test triples. With the recorded 302244 test triples a batch
# size of 101000 gives 3 minibatches.
test_batch_size = 101000
test_minibatches = generate_minibatches(test_instances, test_batch_size)
sanity_check_minibatches(test_minibatches)

n_tuples =  302244
n_batches =  3


In [168]:
# Learning-rate schedule consumed by train_network: it moves to the next rate
# after `early_stopping_checks` evaluations without improvement.
# NOTE(review): the arguments are presumably (initial rate, lower bound,
# decay factor) -- confirm against utils.get_decaying_learning_rates.
learning_rates = get_decaying_learning_rates(0.00025, 1e-4, 0.5)
learning_rates

[0.00025, 0.000125]

In [169]:
# Train with a budget of 3600 / 6 = 600 seconds of optimization time, testing
# every 120 seconds and giving up on a learning rate after 2 consecutive
# checks without improvement.
train_network(
    train_minibatches, test_minibatches,
    len(train_instances), len(test_instances), train_batch_size,
    max_seconds_training=3600 / 6,
    min_seconds_to_check_improvement=60 * 2,
    early_stopping_checks=2,
    learning_rates=learning_rates)

learning_rates =  [0.00025, 0.000125]
INFO:tensorflow:Restoring parameters from /mnt/workspace/pamessina_models/ugallery/VBPR/v5(10M-300K,+prof,+nonprof_favc,-nonfavc)/
model successfully restored from checkpoint!
Before training: test_accuracy = 0.999656
Starting training ...
train_i=39, train_loss = 0.000000004078, test_accuracy = 0.999656, check_secs = 120.09, total_secs = 120.09
train_i=79, train_loss = 0.000000004014, test_accuracy = 0.999656, check_secs = 120.05, total_secs = 240.14
   *** 2 checks with no improvements -> using a smaller learning_rate = 0.000125
train_i=19, train_loss = 0.000000003932, test_accuracy = 0.999656, check_secs = 120.04, total_secs = 360.18
train_i=60, train_loss = 0.000000003863, test_accuracy = 0.999656, check_secs = 120.14, total_secs = 480.32
   *** 2 checks with no improvements -> early stopping :(
