In [342]:
import numpy as np
import pandas as pd
import heapq
import random
import os
import time
from tqdm import tqdm
from os import path
from scipy.spatial.distance import pdist, squareform

###  Load pre-trained ResNet50 image embeddings

In [2]:
def load_embeddings_and_ids(dirpath, embedding_file, ids_file):
    embeddings = np.load(path.join(dirpath, embedding_file))
    with open(path.join(dirpath, ids_file)) as f:
        ids = [int(x) for x in f.readlines()]
        id2index = { _id:i for i,_id in enumerate(ids) }    
    assert (embeddings.shape[0] == len(ids))
    return embeddings, ids, id2index

In [3]:
resnet50_embeddings,\
resnet50_ids,\
resnet50_id2index = load_embeddings_and_ids(
'/mnt/workspace/Ugallery/ResNet50/', 'flatten_1.npy', 'ids')

###  Load UGallery transactions

In [4]:
sales_df = pd.read_csv('./valid_sales.csv')
artworks_df = pd.read_csv('./valid_artworks.csv')

In [5]:
artwork_ids = list(artworks_df.id)
artwork_id_2_index = { _id:i+1 for i,_id in enumerate(artwork_ids) }

In [6]:
n_artworks = len(artwork_ids)

In [7]:
artist_ids = list(artworks_df.artist_id)
artwork_id_2_artist_id = {i:j for i,j in zip(artwork_ids, artist_ids)}

###  reduce embedding matrix to only rows of artworks appearing in transactions

In [8]:
embeddings = np.zeros((n_artworks + 1, 2048))
for i, _id in enumerate(artwork_ids):
    embeddings[i+1] = resnet50_embeddings[resnet50_id2index[_id]]

In [9]:
assert(n_artworks + 1 == embeddings.shape[0])
n_artworks

7742

### Collect transactions per user

In [179]:
class User:
    def __init__(self, uid):
        self._uid = uid
        self.artwork_ids = []
        self.artwork_idxs = []
        self.artwork_idxs_set = set()
        self.timestamps = []
        self.artist_ids = set()
        
    def clear(self):
        self.artwork_ids.clear()
        self.artwork_idxs.clear()
        self.artwork_idxs_set.clear()
        self.timestamps.clear()
        self.artist_ids.clear()
        
    def append_transaction(self, artwork_id, timestamp):
        self.artwork_ids.append(artwork_id)
        self.artwork_idxs.append(artwork_id_2_index[artwork_id])
        self.artwork_idxs_set.add(artwork_id_2_index[artwork_id])
        self.timestamps.append(timestamp)
        
    def build_purchase_baskets(self):
        baskets = []
        prev_t = None
        offset = 0
        count = 0
        for i, t in enumerate(self.timestamps):
            if t != prev_t:
                if prev_t is not None:
                    baskets.append((offset, count))
                    offset = i
                count = 1
            else:
                count += 1
            prev_t = t
        baskets.append((offset, count))
        self.baskets = baskets
        
    def sanity_check_purchase_baskets(self):
        ids = self.artwork_ids
        ts = self.timestamps
        baskets = self.baskets
        assert(len(ids) == len(ts))
        n = len(ts)
        assert (n > 0)
        for b in baskets:
            for j in range(b[0], b[0] + b[1] - 1):
                assert(ts[j] == ts[j+1])
        for i in range(1, len(baskets)):
            b1 = baskets[i-1]
            b2 = baskets[i]
            assert(b1[0] + b1[1] == b2[0])
        assert(baskets[0][0] == 0)
        assert(baskets[-1][0] + baskets[-1][1] == n)

#### create list of users

In [180]:
user_ids = sales_df.customer_id.unique()
user_id_2_index = { _id:i for i,_id in enumerate(user_ids) }
n_users = len(user_ids)
users = [User(uid) for uid in user_ids]

#### collect and sanitiy check transactions per user

In [181]:
sorted_sales_df = sales_df.sort_values('order_date')

In [182]:
# clear structures to prevent possible duplicate elements
for user in users:
    user.clear()

# collect transactions per user sorted by timestamp
for uid, aid, t in zip(sorted_sales_df.customer_id,
                       sorted_sales_df.artwork_id,
                       sorted_sales_df.order_date):
    i = user_id_2_index[uid]
    users[i].append_transaction(aid,t)
    users[i].artist_ids.add(artwork_id_2_artist_id[aid])
    
# bin transctions with same timestamps into purchase baskets
for user in users:
    user.build_purchase_baskets()
    user.sanity_check_purchase_baskets()

In [17]:
# x = 6
# users[x].artwork_ids,\
# users[x].timestamps,\
# users[x].baskets

### Find Top K most disimilar images not sharing artists for each user
#### (to draw negative samples from)

In [14]:
distmat = squareform(pdist(embeddings, 'cosine'))

In [183]:
TOPK = 40
for user in tqdm(users):
    u_artwork_idxs = user.artwork_idxs
    u_artist_ids = user.artist_ids
    h = []
    for artwork_id, artist_id in zip(artwork_ids, artist_ids):        
        if artist_id in u_artist_ids:
            continue
        i = artwork_id_2_index[artwork_id]
        score = min(distmat[i][j] for j in u_artwork_idxs)
        if (len(h) < TOPK):
            heapq.heappush(h, (score, i))
        else:
            heapq.heappushpop(h, (score, i))
    assert(len(h) == TOPK)
    user.negative_idxs = [i for _,i in h]
    user.negative_idxs_set = set(user.negative_idxs)

100%|██████████| 2919/2919 [00:27<00:00, 105.54it/s]


### Generate training data

In [321]:
train_instances = []

In [322]:
test_instances = []

In [323]:
def sanity_check_data(data):
    for t in data:
        profile = t[0]
        pos = t[1]
        neg = t[2]
        user = users[t[3]]
        try:
            assert all(i in user.artwork_idxs_set for i in profile)
            assert pos in user.artwork_idxs_set
            assert neg not in user.artwork_idxs_set
            assert artwork_id_2_artist_id[artwork_ids[neg-1]] not in user.artist_ids
            assert neg in user.negative_idxs_set
        except AssertionError:
            print(t)
            print(user._uid)
            print(user.artwork_idxs)
            print(user.artwork_idxs_set)
            print(user.negative_idxs)
            print(user.negative_idxs_set)
            raise

##### 1) Given the past, rank higher each one of next purchase basket

In [324]:
def generate_samples__given_past_rank_next(n_neg):
    for ui, user in tqdm(enumerate(users)):
        past_items = []
        n = len(user.baskets)
        for i in range(n-1):
            cur_b = user.baskets[i]        
            for j in range(cur_b[0], cur_b[0] + cur_b[1]):
                past_items.append(user.artwork_idxs[j])
            next_b  = user.baskets[i+1]
            profile = past_items.copy()
            for pi in range(next_b[0], next_b[0] + next_b[1]):
                neg = random.sample(user.negative_idxs, n_neg+1)
                p = user.artwork_idxs[pi]
                for k in range(n_neg):
                    train_instances.append((profile, p, neg[k], ui))
                test_instances.append((profile, p, neg[-1], ui))

In [325]:
generate_samples__given_past_rank_next(n_neg=6)
sanity_check_data(train_instances)
sanity_check_data(test_instances)
len(train_instances), len(test_instances)

2919it [00:00, 121128.39it/s]


(15984, 2664)

##### 2) Given only the present, hide one and rank it higher

In [326]:
def generate_samples__given_present_hide_rank_one(n_neg):
    for ui, user in tqdm(enumerate(users)):
        for b in user.baskets:
            if b[1] < 2:
                continue
            bs = b[0]
            be = b[0] + b[1]
            for i in range(bs, be):            
                profile = [user.artwork_idxs[j] for j in range(bs, be) if j != i]
                pi = user.artwork_idxs[i]            
                neg = random.sample(user.negative_idxs, n_neg+1)
                for j in range(n_neg):
                    train_instances.append((profile, pi, neg[j], ui))
                test_instances.append((profile, pi, neg[-1], ui))

In [327]:
generate_samples__given_present_hide_rank_one(n_neg=6)
sanity_check_data(train_instances)
sanity_check_data(test_instances)
len(train_instances), len(test_instances)

2919it [00:00, 128959.67it/s]


(31116, 5186)

##### 3) Given the past and the present, hide one and rank it higher

In [328]:
def generate_samples__given_past_present_hide_rank_one(n_neg):
    for ui, user in tqdm(enumerate(users)):
        if (len(user.baskets) < 2):
            continue
        u_baskets = user.baskets
        u_artwork_idxs = user.artwork_idxs
        n = len(u_baskets)
        purchased = []
        for i in range(n):
            b = u_baskets[i]
            purchased.extend(u_artwork_idxs[j] for j in range(b[0], b[0] + b[1]))
            if i == 0:
                continue        
            assert len(purchased) == b[0] + b[1]
            jmax = b[0] + (b[1] if b[1] >= 2 else 0)
            for j in range(jmax):
                profile = [x for k,x in enumerate(purchased) if k != j]
                p = u_artwork_idxs[j]
                neg = random.sample(user.negative_idxs, n_neg+1)
                for k in range(n_neg):
                    train_instances.append((profile, p, neg[k], ui))
                test_instances.append((profile, p, neg[-1], ui))

In [329]:
generate_samples__given_past_present_hide_rank_one(3)
sanity_check_data(train_instances)
sanity_check_data(test_instances)
len(train_instances), len(test_instances)

2919it [00:00, 6480.24it/s]


(153123, 45855)

#### sort train and test instances by profile size

In [330]:
train_instances.sort(key=lambda x: len(x[0]))
test_instances.sort(key=lambda x: len(x[0]))

### Build Tensorflow Network Graph

In [331]:
import tensorflow as tf

In [355]:
class Network:
    def __init__(self):
        
        # --- placeholders
        self._pretrained_embeddings = tf.placeholder(shape=[None, 2048], dtype=tf.float32,
                                                     name='pretrained_embeddings')            
        self._profile_item_indexes = tf.placeholder(shape=[None,None], dtype=tf.int32,
                                                    name='profile_item_indexes')
        self._profile_item_counts = tf.placeholder(shape=[None], dtype=tf.float32,
                                                   name='profile_item_counts')
        self._positive_item_index = tf.placeholder(shape=[None], dtype=tf.int32,
                                                   name='positive_item_index')
        self._negative_item_index = tf.placeholder(shape=[None], dtype=tf.int32,
                                                   name='negative_item_index')
            
        # ---- user profile vector
        # profile item embeddings average
        tmp = tf.gather(self._pretrained_embeddings, self._profile_item_indexes)
        self._profile_item_embeddings = self.trainable_item_embedding(tmp)        
        self._profile_items_average =\
            tf.reduce_sum(self._profile_item_embeddings, axis=1) /\
            tf.reshape(self._profile_item_counts, [-1, 1])
        # user hidden layer
        self._user_hidden = tf.layers.dense(
            inputs=self._profile_items_average,
            units=128,
            activation=tf.nn.selu,
            name='user_hidden'
        )
        # user final vector
        self._user_vector = tf.layers.dense(
            inputs=self._user_hidden,
            units=128,
            activation=tf.nn.selu,
            name='user_vector'
        )
        
        # ---- positive item vector
        tmp = tf.gather(self._pretrained_embeddings, self._positive_item_index)
        self._positive_item_vector = self.trainable_item_embedding(tmp)
        
        # ---- negative item vector
        tmp = tf.gather(self._pretrained_embeddings, self._negative_item_index)
        self._negative_item_vector = self.trainable_item_embedding(tmp)
        
        # --- train loss
        dot_pos = tf.reduce_sum(tf.multiply(self._user_vector, self._positive_item_vector), 1)
        dot_neg = tf.reduce_sum(tf.multiply(self._user_vector, self._negative_item_vector), 1)
        dot_delta = dot_pos - dot_neg
        ones = tf.fill(tf.shape(self._user_vector)[:1], 1.0)
        loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=dot_delta, labels=ones)
        loss = tf.reduce_mean(loss, name='train_loss')
        self._train_loss = loss
        
        # --- test loss
        loss = tf.reduce_sum(tf.cast(dot_delta <= .0, tf.float32), name = 'test_loss')
        self._test_loss = loss
        
        # --- optimizer
        self._optimizer = tf.train.AdamOptimizer(0.2e-5).minimize(self._train_loss)
        
    @staticmethod
    def trainable_item_embedding(X):
        with tf.variable_scope("trainable_item_embedding", reuse=tf.AUTO_REUSE):
            fc1 = tf.layers.dense( # None -> 256
                inputs=X,
                units=256,
                activation=tf.nn.selu,
                name='fc1'
            )
            fc2 = tf.layers.dense( # 256 -> 128
                inputs=fc1,
                units=128,
                activation=tf.nn.selu,
                name='fc2'
            )
            return fc2
    
    def optimize_and_get_train_loss(self, sess, pretrained_embeddings, profile_item_indexes, profile_item_counts,
             positive_item_index, negative_item_index):
        return sess.run([
            self._optimizer,
            self._train_loss,
        ], feed_dict={
            self._pretrained_embeddings: pretrained_embeddings,
            self._profile_item_indexes: profile_item_indexes,
            self._profile_item_counts: profile_item_counts,
            self._positive_item_index: positive_item_index,
            self._negative_item_index: negative_item_index,
        })
    
    def get_train_loss(self, sess, pretrained_embeddings, profile_item_indexes, profile_item_counts,
             positive_item_index, negative_item_index):
        return sess.run(
            self._train_loss, feed_dict={
            self._pretrained_embeddings: pretrained_embeddings,
            self._profile_item_indexes: profile_item_indexes,
            self._profile_item_counts: profile_item_counts,
            self._positive_item_index: positive_item_index,
            self._negative_item_index: negative_item_index,
        })
    
    def get_test_loss(self, sess, pretrained_embeddings, profile_item_indexes, profile_item_counts,
             positive_item_index, negative_item_index):
        return sess.run(
            self._test_loss, feed_dict={
            self._pretrained_embeddings: pretrained_embeddings,
            self._profile_item_indexes: profile_item_indexes,
            self._profile_item_counts: profile_item_counts,
            self._positive_item_index: positive_item_index,
            self._negative_item_index: negative_item_index,
        })

### Training Network

In [334]:
def generate_minibatches(tuples, batch_size):
    n_tuples = len(tuples)
    n_batches = (n_tuples // batch_size) + int(n_tuples % batch_size > 0)
    
    print('n_tuples = ', n_tuples)
    print('n_batches = ', n_batches)
    
    profile_indexes_batches = [None] * n_batches
    profile_size_batches = [None] * n_batches
    positive_index_batches = [None] * n_batches
    negative_index_batches = [None] * n_batches
    
    for i in range(n_batches):
        jmin = i * batch_size
        jmax = min(jmin + batch_size, n_tuples)
        actual_batch_size = jmax - jmin
        
        profile_maxlen = max(len(tuples[j][0]) for j in range(jmin, jmax))
        profile_indexes_batch = np.zeros((actual_batch_size, profile_maxlen), dtype=int)
        profile_size_batch = np.empty((actual_batch_size,))
        positive_index_batch = np.empty((actual_batch_size,), dtype=int)
        negative_index_batch = np.empty((actual_batch_size,), dtype=int)
        
        for j in range(actual_batch_size):            
            # profile indexes
            for k,v in enumerate(tuples[jmin+j][0]):
                profile_indexes_batch[j][k] = v
            # profile size
            profile_size_batch[j] = len(tuples[jmin+j][0])        
            # positive index
            positive_index_batch[j] = tuples[jmin+j][1]
            # negative index
            negative_index_batch[j] = tuples[jmin+j][2]
            
        profile_indexes_batches[i] = profile_indexes_batch
        profile_size_batches[i] = profile_size_batch
        positive_index_batches[i] = positive_index_batch
        negative_index_batches[i] = negative_index_batch
        
    return dict(
        profile_indexes_batches = profile_indexes_batches,
        profile_size_batches    = profile_size_batches,
        positive_index_batches  = positive_index_batches,
        negative_index_batches  = negative_index_batches,
        n_batches               = n_batches,
    )

In [335]:
def sanity_check_minibatches(minibatches):
    for profile_indexes, profile_size, positive_index, negative_index in zip(
        minibatches['profile_indexes_batches'],
        minibatches['profile_size_batches'],
        minibatches['positive_index_batches'],
        minibatches['negative_index_batches']
    ):
        assert all(profile_indexes[i].shape[0] >= profile_size[i] for i in range(profile_size.shape[0]))

In [336]:
MODEL_PATH = '/mnt/workspace/pamessina_models/ugallery/youtube_like/'

In [351]:
def train_network(train_instances, test_instances, batch_size=64, max_epochs=60, session_config=None):
    
    train_minibatches = generate_minibatches(train_instances, batch_size)
    test_minibatches = generate_minibatches(test_instances, batch_size)    
    sanity_check_minibatches(train_minibatches)
    sanity_check_minibatches(test_minibatches)
    n_train_batches = train_minibatches['n_batches']
    n_test_batches = test_minibatches['n_batches']
    n_test_instances = len(test_instances)
    
    with tf.Graph().as_default():      
        network = Network()  
        with tf.Session(config=session_config) as sess:
            try:
                saver = tf.train.Saver()            
                saver.restore(sess, tf.train.latest_checkpoint(MODEL_PATH))
                print('model successfully restored from checkpoint!')
            except ValueError:
                print('no checkpoint found: initializing variables with random values')
                os.makedirs(MODEL_PATH, exist_ok=True)
                sess.run(tf.global_variables_initializer())

            # ========= BEFORE TRAINING ============
            
            initial_test_loss = 0.
            for profile_indexes, profile_size, positive_index, negative_index in zip(
                test_minibatches['profile_indexes_batches'],
                test_minibatches['profile_size_batches'],
                test_minibatches['positive_index_batches'],
                test_minibatches['negative_index_batches']
            ):
                minibatch_test_loss = network.get_test_loss(
                    sess, embeddings, profile_indexes, profile_size, positive_index, negative_index)
                initial_test_loss += minibatch_test_loss
            initial_test_loss = (initial_test_loss / n_test_instances) * 100.

            print("Before training: test_loss = %f%%" % initial_test_loss)
            
            best_test_loss = initial_test_loss
            last_improvement_epoch = -1

            # ========= TRAINING ============
            
            print ('Starting training ...')

            for epoch in range(max_epochs):
                
                start_time = time.time()

                # --- training
                epoch_train_loss = 0.
                for profile_indexes, profile_size, positive_index, negative_index in zip(
                    train_minibatches['profile_indexes_batches'],
                    train_minibatches['profile_size_batches'],
                    train_minibatches['positive_index_batches'],
                    train_minibatches['negative_index_batches']
                ):
                    _, minibatch_train_loss = network.optimize_and_get_train_loss(
                        sess, embeddings, profile_indexes, profile_size, positive_index, negative_index)                
                    epoch_train_loss += minibatch_train_loss
                epoch_train_loss /= n_train_batches

                # --- testing
                epoch_test_loss = 0.
                for profile_indexes, profile_size, positive_index, negative_index in zip(
                    test_minibatches['profile_indexes_batches'],
                    test_minibatches['profile_size_batches'],
                    test_minibatches['positive_index_batches'],
                    test_minibatches['negative_index_batches']
                ):
                    minibatch_test_loss = network.get_test_loss(
                        sess, embeddings, profile_indexes, profile_size, positive_index, negative_index)
                    epoch_test_loss += minibatch_test_loss
                epoch_test_loss = (epoch_test_loss / n_test_instances) * 100.
                
                elapsed_seconds = time.time() - start_time
                
                # --- check for improvements and update best model if necessary
                print("epoch %d: train_loss = %f, test_loss = %f%%, elapsed_seconds = %f" % (
                        epoch, epoch_train_loss, epoch_test_loss, elapsed_seconds))                
                if (epoch_test_loss < best_test_loss):
                    saver = tf.train.Saver()
                    save_path = saver.save(sess, MODEL_PATH)
                    best_test_loss = epoch_test_loss
                    last_improvement_epoch = epoch
                    print("   ** improvement detected: model saved to path ", save_path)
                else:                    
                    if (epoch - last_improvement_epoch >= 6):
                        print("   *** 6 epochs with no improvements -> early stopping :(")
                        return

In [354]:
train_network(train_instances, test_instances, batch_size=256, max_epochs=60)

n_tuples =  153123
n_batches =  599
n_tuples =  45855
n_batches =  180
INFO:tensorflow:Restoring parameters from /mnt/workspace/pamessina_models/ugallery/youtube_like/
model successfully restored from checkpoint!
Before training: test_loss = 0.030531%
Starting training ...
epoch 0: train_loss = 0.000222, test_loss = 0.028350%, elapsed_seconds = 47.148772
   ** improvement detected: model saved to path  /mnt/workspace/pamessina_models/ugallery/youtube_like/
epoch 1: train_loss = 0.000016, test_loss = 0.028350%, elapsed_seconds = 47.048991
epoch 2: train_loss = 0.000001, test_loss = 0.028350%, elapsed_seconds = 47.054279
epoch 3: train_loss = 0.000000, test_loss = 0.028350%, elapsed_seconds = 47.088204
epoch 4: train_loss = 0.000000, test_loss = 0.028350%, elapsed_seconds = 47.014436
epoch 5: train_loss = 0.000000, test_loss = 0.028350%, elapsed_seconds = 47.044549
epoch 6: train_loss = 0.000000, test_loss = 0.026169%, elapsed_seconds = 47.029676
   ** improvement detected: model saved t