In [1]:
%load_ext autoreload
%autoreload 1

In [2]:
%aimport utils, Networks

In [3]:
import numpy as np
import pandas as pd
import random
import os
import time
from math import ceil
from sklearn.preprocessing import StandardScaler
from utils import load_embeddings_and_ids, concatenate_featmats, User,\
        VisualSimilarityHandler, VisualSimilarityHandler_ContentAndStyle, get_decaying_learning_rates

In [4]:
# use a single GPU because we want to be nice with other people :)
os.environ["CUDA_VISIBLE_DEVICES"]="0"

###  Load pre-trained image embeddings

In [5]:
resnet50 = load_embeddings_and_ids('/mnt/workspace/Ugallery/ResNet50/', 'flatten_1.npy', 'ids')

In [6]:
grammat_b1c1 = load_embeddings_and_ids('/mnt/workspace/Ugallery/gram_matrix/', 'block1_conv1.npy', 'ids')

###  Concatenate embeddings + z-score normalization

In [7]:
embedding_list = [
    resnet50,
    grammat_b1c1,
]

In [8]:
# Build the universe of artwork ids as the union of the ids known to every embedding source.
artwork_ids_set = set()
for embedding in embedding_list:
    artwork_ids_set.update(embedding['index2id'])
# NOTE(review): the order of `artwork_ids` is whatever iteration order the set yields.
# A later cell asserts that this order equals `pca200_index2id`, so the ordering is
# load-bearing -- do not sort or rebuild this list differently without re-checking that assert.
artwork_ids = list(artwork_ids_set)
# Reverse lookup: artwork id -> row index used by every per-artwork array in this notebook.
artwork_id2index = {_id:i for i,_id in enumerate(artwork_ids)}
n_artworks = len(artwork_ids)
n_artworks

13297

In [9]:
featmat_list = [tmp['featmat'] for tmp in embedding_list]
id2index_list = [tmp['id2index'] for tmp in embedding_list]
concat_featmat = concatenate_featmats(artwork_ids, featmat_list, id2index_list)

In [10]:
concat_featmat = StandardScaler().fit_transform(concat_featmat)

In [11]:
concat_featmat.shape

(13297, 6144)

###  Load clusters

In [12]:
import json

In [13]:
def load_clusters(json_path):
    """Load an artwork-id -> cluster-id mapping from JSON and index it by artwork row.

    Args:
        json_path: path of a JSON object whose keys are artwork ids (as strings)
            and whose values are cluster ids.

    Returns:
        (cluster_ids, artId2clustId): `cluster_ids` is an int array of length
        `n_artworks` where position `artwork_id2index[id]` holds the cluster of
        that artwork (-1 for artworks absent from the JSON); `artId2clustId` is
        the raw dictionary exactly as parsed.

    Raises:
        KeyError: if the JSON mentions an artwork id missing from `artwork_id2index`.
    """
    with open(json_path) as json_file:
        artId2clustId = json.load(json_file)

    # -1 everywhere first, so artworks that the JSON does not mention stay recognisable.
    cluster_ids = np.full(n_artworks, -1, dtype=int)
    for raw_artwork_id in artId2clustId:
        row = artwork_id2index[int(raw_artwork_id)]  # JSON keys are strings
        cluster_ids[row] = artId2clustId[raw_artwork_id]
    return cluster_ids, artId2clustId

In [14]:
def get_art_indexes_per_cluster(cluster_ids, n_clusters):
    """Group artwork indexes by the cluster they belong to.

    Args:
        cluster_ids: sequence in which position i holds the cluster id of the
            artwork with index i. Negative ids mean "no cluster assigned".
        n_clusters: number of clusters; valid ids are 0 .. n_clusters - 1.

    Returns:
        A list with `n_clusters` entries; entry c is the list (in ascending
        order) of artwork indexes whose cluster id is c.

    Raises:
        IndexError: if a cluster id is >= n_clusters.
    """
    clusterId2artworkIndexes = [[] for _ in range(n_clusters)]
    for i, cluster_id in enumerate(cluster_ids):
        # load_clusters() marks artworks without a cluster with -1. Indexing the
        # list with a negative id would silently file such artworks under a
        # cluster counted from the end, so they are skipped instead.
        if cluster_id < 0:
            continue
        clusterId2artworkIndexes[cluster_id].append(i)
    return clusterId2artworkIndexes

In [15]:
# with open('/mnt/workspace/Ugallery/Clustering/artworkId2clusterId(resnet50+alexnet+inceptionv3+vgg19+incepresv2).json') as f:

content_cluster_ids, artId2clustId_content = load_clusters(
    '/mnt/workspace/Ugallery/Clustering/artworkId2clusterId(resnet50).json')

style_cluster_ids, artId2clustId_style = load_clusters(
    '/mnt/workspace/Ugallery/Clustering/artworkId2clusterId(grammat_b1c1).json')

In [16]:
print(content_cluster_ids.min(), content_cluster_ids.max(), content_cluster_ids.shape)
print(style_cluster_ids.min(), style_cluster_ids.max(), style_cluster_ids.shape)

0 99 (13297,)
0 99 (13297,)


In [17]:
n_content_clusters = len(set(content_cluster_ids))
n_content_clusters

100

In [18]:
n_style_clusters = len(set(style_cluster_ids))
n_style_clusters

100

In [19]:
clustId2artIndexes_content = get_art_indexes_per_cluster(content_cluster_ids, n_content_clusters)
clustId2artIndexes_style = get_art_indexes_per_cluster(style_cluster_ids, n_style_clusters)

In [20]:
content_cluster_ids[0], style_cluster_ids[0]

(20, 32)

###  Load PCA200 embeddings

In [21]:
pca200 = load_embeddings_and_ids(
#     '/mnt/workspace/Ugallery/PCA200(resnet50+alexnet+inceptionv3+vgg19+incepresv2)/',
    '/mnt/workspace/Ugallery/PCA200(resnet50)/',
    'embeddings.npy',
    'ids.npy',
)

In [22]:
pca200_embeddings = pca200['featmat']
pca200_index2id = pca200['index2id']
pca200_id2index = pca200['id2index']

In [23]:
pca200_embeddings.shape

(13297, 200)

In [24]:
assert np.array_equal(artwork_ids, pca200_index2id)

###  Load transactions

In [25]:
sales_df = pd.read_csv('./data/valid_sales.csv')
artworks_df = pd.read_csv('./data/valid_artworks.csv')

In [26]:
# artist_ids[i] is the artist id of the artwork stored at row i;
# -1 is kept for artworks that never show up in artworks_df.
artist_ids = np.full((n_artworks,), -1, dtype=int)
for _artworkId, _artistId in zip(artworks_df.id, artworks_df.artist_id):
    # A KeyError here means artworks_df lists an artwork that has no embedding row.
    i = artwork_id2index[_artworkId]
    artist_ids[i] = _artistId

In [27]:
# Inverted index: artist id -> list of artwork rows by that artist (ascending row order).
# Rows whose artist is unknown (-1) are left out.
artistId2artworkIndexes = {}
for i, _artistId in enumerate(artist_ids):
    if _artistId != -1:
        artistId2artworkIndexes.setdefault(_artistId, []).append(i)

### Collect transactions per user (hiding each user's last non-first purchase basket)

#### create list of users

In [28]:
# One User object per distinct customer, in order of first appearance in sales_df.
user_ids = sales_df.customer_id.unique()
# customer id -> position in `users` / `user_ids`
user_id2index = { _id:i for i,_id in enumerate(user_ids) }
users = [User(uid) for uid in user_ids]
n_users = len(user_ids)
n_users

2919

#### collect and sanity check transactions per user

In [29]:
sorted_sales_df = sales_df.sort_values('order_date')

In [30]:
# clear structures to prevent possible duplicate elements
for user in users:
    user.clear()

# collect transactions per user sorted by timestamp
for uid, aid, t in zip(sorted_sales_df.customer_id,
                       sorted_sales_df.artwork_id,
                       sorted_sales_df.order_date):
    users[user_id2index[uid]].append_transaction(
        aid, t, artwork_id2index, artist_ids, content_cluster_ids, style_cluster_ids)
    assert users[user_id2index[uid]]._uid == uid
    
# bin transctions with same timestamps into purchase baskets
for user in users:
    user.build_purchase_baskets()
    user.sanity_check_purchase_baskets()
    user.remove_last_nonfirst_purchase_basket(
        artwork_id2index, artist_ids, content_cluster_ids, style_cluster_ids)
    user.sanity_check_purchase_baskets()
    user.refresh_nonpurchased_cluster_ids(n_content_clusters, n_style_clusters)
    user.refresh_cluster_ids()
    user.refresh_artist_ids()

### Generate training data

In [31]:
_MOD = 402653189
_BASE = 92821
def hash_triple(profile, pi, ni):
    """Polynomial rolling hash of a (profile, positive, negative) training triple.

    The profile items followed by `pi` and `ni` are folded left to right as
    h <- (h * _BASE + x) mod _MOD, starting from 0, so the result depends on
    the order of the profile items.

    Args:
        profile: iterable of artwork indexes.
        pi: index of the positive artwork.
        ni: index of the negative artwork.

    Returns:
        An integer in [0, _MOD).
    """
    digest = 0
    for term in (*profile, pi, ni):
        digest = (digest * _BASE % _MOD + term) % _MOD
    return digest

In [32]:
def sanity_check_instance(instance,
                          pos_in_profile=True,
                          
                          pos_sharing_content_cluster=None,
                          pos_sharing_style_cluster=None,
                          pos_sharing_any_cluster=None,
                          pos_sharing_both_clusters=None,
                          pos_sharing_artist=None,
                          
                          neg_sharing_artist=None,                          
                          neg_sharing_content_cluster=None,
                          neg_sharing_style_cluster=None,
                          neg_sharing_any_cluster=None,
                          
                          profile_set=None,
                          content_clusters_set=None,
                          style_clusters_set=None,
                          artists_set=None,                          
                         ):
    """Assert that a sampled training instance satisfies the constraints of its strategy.

    Args:
        instance: tuple (profile, pi, ni, ui) -- profile artwork indexes, positive
            index, negative index and user index (-1 for instances without a real user).
        pos_in_profile, pos_sharing_*, neg_sharing_*: tri-state expectations.
            None skips the check; True/False is the value the corresponding
            membership test must evaluate to.
        profile_set: set used for the "in profile" checks; defaults to the
            user's `artwork_idxs_set`.
        content_clusters_set, style_clusters_set, artists_set: sets the cluster /
            artist expectations are tested against. They must be provided whenever
            a flag that uses them is not None (membership in None raises TypeError).

    Raises:
        AssertionError: when any check fails; the instance is printed first.

    Reads the globals n_artworks, n_users, users, vissimhandler,
    content_cluster_ids, style_cluster_ids and artist_ids.
    """
    profile, pi, ni, ui = instance
    try:
        # checks that apply to every instance, with or without a real user
        assert 0 <= pi < n_artworks
        assert 0 <= ni < n_artworks
        assert pi != ni        
        assert not vissimhandler.same(pi,ni)
        # ui == -1: no user to validate against, so the remaining checks are skipped
        if ui == -1: return
        
        assert 0 <= ui < n_users
        user = users[ui]
        
        # every profile item must be something this user owns
        assert all(i in user.artwork_idxs_set for i in profile)
        
        user_profile = user.artwork_idxs_set if profile_set is None else profile_set
        
        # neg not in profile            
        assert ni not in user_profile
        
        # pos in profile
        if pos_in_profile is not None:
            assert (pi in user_profile) == pos_in_profile
            
        # pos sharing content cluster
        if pos_sharing_content_cluster is not None:
            assert (content_cluster_ids[pi] in content_clusters_set) == pos_sharing_content_cluster
            
        # pos sharing style cluster
        if pos_sharing_style_cluster is not None:
            assert (style_cluster_ids[pi] in style_clusters_set) == pos_sharing_style_cluster
            
        # pos sharing any cluster
        if pos_sharing_any_cluster is not None:
            assert ((content_cluster_ids[pi] in content_clusters_set) or
                    (style_cluster_ids[pi] in style_clusters_set)) == pos_sharing_any_cluster
        
        # pos sharing both clusters
        if pos_sharing_both_clusters is not None:
            assert ((content_cluster_ids[pi] in content_clusters_set) and
                    (style_cluster_ids[pi] in style_clusters_set)) == pos_sharing_both_clusters
            
        # pos sharing artist
        if pos_sharing_artist is not None:
            assert (artist_ids[pi] in artists_set) == pos_sharing_artist
        
        
        # neg sharing artist
        if neg_sharing_artist is not None:
            assert (artist_ids[ni] in artists_set) == neg_sharing_artist
            
        # neg sharing content cluster
        if neg_sharing_content_cluster is not None:
            assert (content_cluster_ids[ni] in content_clusters_set) == neg_sharing_content_cluster
            
        # neg sharing style cluster
        if neg_sharing_style_cluster is not None:
            assert (style_cluster_ids[ni] in style_clusters_set) == neg_sharing_style_cluster
            
        # neg sharing any cluster
        if neg_sharing_any_cluster is not None:
            assert ((content_cluster_ids[ni] in content_clusters_set) or
                    (style_cluster_ids[ni] in style_clusters_set)) == neg_sharing_any_cluster

    except AssertionError:
        # dump the offending instance (raw indexes) before propagating the failure
        print('profile = ', profile)
        print('pi = ', pi)
        print('ni = ', ni)
        print('ui = ', ui)
        raise

In [33]:
def append_instance(container, instance, **kwargs):
    """Append `instance` to `container` unless it is a repeat or visually degenerate.

    An instance is rejected when
      * the hash of its (profile, pi, ni) is already in the global `used_hashes`
        (the global `_hash_collisions` counter is incremented), or
      * `vissimhandler.same(pi, ni)` reports the positive and negative as the same.
    Accepted instances are validated with `sanity_check_instance(instance, **kwargs)`
    and their hash is recorded.

    NOTE: duplicates are detected by hash only, so two different triples that
    happen to share a hash value are treated as duplicates as well.

    Returns:
        True if the instance was appended, False otherwise.
    """
    global _hash_collisions
    profile, pi, ni, ui = instance

    fingerprint = hash_triple(profile, pi, ni)
    seen_before = fingerprint in used_hashes
    if seen_before:
        _hash_collisions += 1

    # the visual check is only consulted for instances that passed the hash check
    accepted = not seen_before and not vissimhandler.same(pi, ni)
    if accepted:
        sanity_check_instance(instance, **kwargs)
        container.append(instance)
        used_hashes.add(fingerprint)
    return accepted

In [34]:
def print_triple(t):
    """Print a (profile, pi, ni, ui) instance using real artwork / user ids.

    Internal row indexes are translated through the globals `artwork_ids` and
    `user_ids`; a user index of -1 is printed as -1.
    """
    profile, pi, ni, ui = t
    print ('profile = ', list(map(artwork_ids.__getitem__, profile)))
    print ('pi = ', artwork_ids[pi])
    print ('ni = ', artwork_ids[ni])
    if ui == -1:
        print ('ui = ', -1)
    else:
        print ('ui = ', user_ids[ui])

In [35]:
def print_num_samples(sampler_func):
    """Decorator that re-runs a sampler until it has added at least `n_samples` instances.

    The wrapped sampler must have the signature
    `sampler_func(instances_container, n_samples)` and append to
    `instances_container` in place. After every call the wrapper prints the
    target, the number of instances actually added and the shortfall (delta);
    while the shortfall is positive the sampler is called again asking only for
    the missing amount.

    If a round adds no instance at all while instances are still missing, the
    wrapper reports it and stops instead of retrying forever (this happens when
    the sampler has exhausted the distinct instances it is able to produce).

    Returns:
        The wrapping function; it returns None and keeps the sampler's name/docstring.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(sampler_func)
    def wrapper(instances_container, n_samples):
        while True:
            len_before = len(instances_container)
            sampler_func(instances_container, n_samples)
            actual_samples = len(instances_container) - len_before
            delta = n_samples - actual_samples
            print('  target samples: %d' % n_samples)
            print('  actual samples: %d' % actual_samples)
            print('  delta: %d' % (delta))
            if delta <= 0: break
            if actual_samples <= 0:
                # no progress in a full round: asking again would loop forever
                print('  ** delta > 0 but no new instances were produced -> giving up')
                break
            print('  ** delta > 0 -> sampling more instances again ...')
            n_samples = delta
    return wrapper

In [36]:
# vissimhandler = VisualSimilarityHandler_ContentAndStyle(
#     content_cluster_ids, style_cluster_ids, pca200_embeddings)
vissimhandler = VisualSimilarityHandler(content_cluster_ids, pca200_embeddings)

In [37]:
# Reset every piece of state the samplers below read or write, so this cell
# marks a clean starting point for (re)generating train/test instances.
RANDOM_SEED = 0
# All samplers draw from the `random` module; seeding it here makes those draws
# repeatable when the sampling cells are re-run in the same order.
random.seed(RANDOM_SEED)
vissimhandler.count = 0      # counter reported below as "visual_collisions"
used_hashes = set()          # hashes of accepted instances (shared by train and test)
_hash_collisions = 0         # number of instances rejected because their hash was already used
train_instances = []
test_instances = []

In [38]:
# Number of sampling strategies of each family; used only to split the budget evenly.
N_STRATEGIES_FAKE = 2
N_STRATEGIES_REAL = 4
# Fraction of the total budget given to the "fake profile" strategies;
# the remaining (1 - FAKE_COEF) goes to the real-user strategies.
FAKE_COEF = 1.
TOTAL_SAMPLES__TRAIN = 10000000
# Test budget is 5% of the train budget (a float here; ceil() below turns the shares into ints).
TOTAL_SAMPLES__TEST =  TOTAL_SAMPLES__TRAIN * 0.05

# Per-strategy budgets, rounded up.
N_SAMPLES_PER_FAKE_STRATEGY__TRAIN = ceil(TOTAL_SAMPLES__TRAIN * FAKE_COEF / N_STRATEGIES_FAKE)
N_SAMPLES_PER_FAKE_STRATEGY__TEST = ceil(TOTAL_SAMPLES__TEST * FAKE_COEF / N_STRATEGIES_FAKE)
N_SAMPLES_PER_REAL_STRATEGY__TRAIN = ceil(TOTAL_SAMPLES__TRAIN * (1. - FAKE_COEF) / N_STRATEGIES_REAL)
N_SAMPLES_PER_REAL_STRATEGY__TEST = ceil(TOTAL_SAMPLES__TEST * (1. - FAKE_COEF) / N_STRATEGIES_REAL)

print(N_SAMPLES_PER_FAKE_STRATEGY__TRAIN, N_SAMPLES_PER_FAKE_STRATEGY__TEST)
print(N_SAMPLES_PER_REAL_STRATEGY__TRAIN, N_SAMPLES_PER_REAL_STRATEGY__TEST)

5000000 250000
0 0


In [39]:
# Probability with which sample_artwork_index() draws from the same cluster as
# the reference item (otherwise it draws from a uniformly chosen cluster).
FINE_GRAINED_THRESHOLD = 0.7
# Passed as `margin` to vissimhandler.validate_triple() when filtering negatives.
VISUAL_CONFIDENCE_THRESHOLD = 0.1

## Original BPR strategy

#### 1) given profile, recommend profile (real users)
Given a user's profile, all items in the profile should be ranked higher than items outside the profile

In [40]:
def sample_artwork_index(pi):
    """Draw a random artwork index, biased towards the clusters of artwork `pi`.

    With probability FINE_GRAINED_THRESHOLD the draw comes from the cluster that
    contains `pi`; otherwise from a cluster picked uniformly at random. In both
    cases the content or the style clustering is used with equal probability,
    and the artwork is picked uniformly inside the chosen cluster.

    Returns:
        An artwork index (it may be `pi` itself).
    """
    stay_in_cluster = random.random() <= FINE_GRAINED_THRESHOLD
    use_content = random.randint(0, 1) == 0

    if stay_in_cluster:
        labels = content_cluster_ids if use_content else style_cluster_ids
        cluster = labels[pi]
    else:
        upper = n_content_clusters if use_content else n_style_clusters
        cluster = random.randint(0, upper - 1)

    members = clustId2artIndexes_content if use_content else clustId2artIndexes_style
    return random.choice(members[cluster])

In [41]:
def sample_artwork_index__outsideprofile(profile_set, pi):
    """Keep drawing with sample_artwork_index(pi) until the result is not in `profile_set`."""
    ni = sample_artwork_index(pi)
    while ni in profile_set:
        ni = sample_artwork_index(pi)
    return ni

In [42]:
@print_num_samples
def generate_samples__rank_profile_above_nonprofile(instances_container, n_samples):
    """Strategy 1: items of a user's profile rank above items outside the profile.

    The budget is split evenly over all users (rounded up). For each slot a
    positive is drawn from the user's profile and a negative from outside it;
    a slot gives up after 5 rejected attempts.
    """
    quota_per_user = ceil(n_samples / n_users)
    for ui, user in enumerate(users):
        profile = user.artwork_idxs
        profile_set = user.artwork_idxs_set
        for _slot in range(quota_per_user):
            attempts = 0
            while attempts < 5:
                attempts += 1
                pi = random.choice(profile)
                ni = sample_artwork_index__outsideprofile(profile_set, pi)
                stored = append_instance(
                    instances_container, (profile, pi, ni, ui),
                    pos_in_profile=True,
                    profile_set=profile_set)
                if stored:
                    break

In [44]:
# print('=======================================\nsampling train instances ...')
# generate_samples__rank_profile_above_nonprofile(
#     train_instances, n_samples=N_SAMPLES_PER_REAL_STRATEGY__TRAIN)

# print('=======================================\nsampling test instances ...')
# generate_samples__rank_profile_above_nonprofile(
#     test_instances, n_samples=N_SAMPLES_PER_REAL_STRATEGY__TEST)

# print(len(train_instances), len(test_instances))
# print('hash_collisions = ', _hash_collisions)
# print('visual_collisions = ', vissimhandler.count)

##### 2) Given profile, recommend profile (fake 1-item profiles)
Given a fake profile of a single item, such item should be ranked higher than any other item

In [45]:
def sample_artwork_index__nonidentical(pi):
    """Keep drawing with sample_artwork_index(pi) until the result differs from `pi`."""
    candidate = sample_artwork_index(pi)
    while candidate == pi:
        candidate = sample_artwork_index(pi)
    return candidate

In [46]:
@print_num_samples
def generate_samples__rank_single_item_above_anything_else(instances_container, n_samples):
    """Strategy 2: for a fake one-item profile, that item ranks above any other item.

    The budget is split evenly over all artworks (rounded up). For every artwork
    the loop keeps sampling negatives until the full per-item quota has been
    accepted -- there is no cap on the number of attempts.
    """
    quota_per_item = ceil(n_samples / n_artworks)
    for pi in range(n_artworks):
        profile = (pi,)
        accepted = 0
        while accepted < quota_per_item:
            ni = sample_artwork_index__nonidentical(pi)
            if append_instance(instances_container, (profile, pi, ni, -1)):
                accepted += 1

In [47]:
print('=======================================\nsampling train instances ...')
generate_samples__rank_single_item_above_anything_else(
    train_instances, n_samples=N_SAMPLES_PER_FAKE_STRATEGY__TRAIN)

print('=======================================\nsampling test instances ...')
generate_samples__rank_single_item_above_anything_else(
    test_instances, n_samples=N_SAMPLES_PER_FAKE_STRATEGY__TEST)

print(len(train_instances), len(test_instances))
print('hash_collisions = ', _hash_collisions)
print('visual_collisions = ', vissimhandler.count)

sampling train instances ...
  target samples: 5000000
  actual samples: 5012969
  delta: -12969
sampling test instances ...
  target samples: 250000
  actual samples: 252643
  delta: -2643
5012969 252643
hash_collisions =  2317206
visual_collisions =  13127


## Domain-specific strategies

##### 3) Recommend visually similar items from favorite artists (real users)
Given a user, any item outside the user's profile that shares artist and visual cluster with items in the user's profile should be ranked higher than any item from an artist and visual cluster not present in the user's profile

In [48]:
def sample_artwork_index__outsideprofile__sharing_artist_content_cluster(
        profile_set, artists_list, content_clusters_set):
    """Try to draw an artwork by one of the given artists, inside one of the
    given content clusters, that is not in the profile.

    An artist is picked from `artists_list`, then one of that artist's artworks.
    Up to 4 draws are made.

    Returns:
        The artwork index, or None if none of the 4 draws qualified.
    """
    attempts_left = 4
    while attempts_left > 0:
        attempts_left -= 1
        artist = random.choice(artists_list)
        candidate = random.choice(artistId2artworkIndexes[artist])
        in_known_cluster = content_cluster_ids[candidate] in content_clusters_set
        if in_known_cluster and candidate not in profile_set:
            return candidate
    return None  # failed to find

In [49]:
def sample_artwork_index__notsharing_artist_content_cluster(
        artists_set, unused_content_clusters_list):
    """Draw an artwork from one of the given content clusters whose artist is
    not in `artists_set`.

    A cluster is picked from `unused_content_clusters_list`, then an artwork
    inside it; the draw is repeated (without limit) until the artist condition holds.
    """
    while True:
        cluster = random.choice(unused_content_clusters_list)
        candidate = random.choice(clustId2artIndexes_content[cluster])
        if artist_ids[candidate] not in artists_set:
            return candidate

In [50]:
@print_num_samples
def generate_samples__rank_sharing_artist_content_cluster_above_notsharing_artist_content_cluster(
        instances_container, n_samples):
    """Strategy 3: unseen items sharing artist and content cluster with the
    user's profile rank above items sharing neither.

    The budget is split evenly over all users (rounded up). Each slot makes up
    to 5 attempts; an attempt is wasted when no positive could be found or when
    the instance is rejected by append_instance.
    """
    quota_per_user = ceil(n_samples / n_users)
    for ui, user in enumerate(users):
        profile = user.artwork_idxs
        profile_set = user.artwork_idxs_set
        artists_list = user.artist_ids
        artists_set = user.artist_ids_set
        content_clusters_set = user.content_cluster_ids_set
        unused_content_clusters_list = user.nonp_content_cluster_ids

        # expectations forwarded to sanity_check_instance; identical for every slot of this user
        expectations = dict(
            pos_in_profile=False,
            pos_sharing_artist=True,
            pos_sharing_content_cluster=True,
            neg_sharing_artist=False,
            neg_sharing_content_cluster=False,
            profile_set=profile_set,
            content_clusters_set=content_clusters_set,
            artists_set=artists_set,
        )

        for _slot in range(quota_per_user):
            for _attempt in range(5):
                pi = sample_artwork_index__outsideprofile__sharing_artist_content_cluster(
                    profile_set, artists_list, content_clusters_set)
                if pi is None:
                    continue
                ni = sample_artwork_index__notsharing_artist_content_cluster(
                    artists_set, unused_content_clusters_list)
                if append_instance(instances_container, (profile, pi, ni, ui), **expectations):
                    break

In [51]:
# print('=======================================\nsampling train instances ...')
# generate_samples__rank_sharing_artist_content_cluster_above_notsharing_artist_content_cluster(
#     train_instances, n_samples=N_SAMPLES_PER_REAL_STRATEGY__TRAIN)

# print('=======================================\nsampling test instances ...')
# generate_samples__rank_sharing_artist_content_cluster_above_notsharing_artist_content_cluster(
#     test_instances, n_samples=N_SAMPLES_PER_REAL_STRATEGY__TEST)

# print(len(train_instances), len(test_instances))
# print('hash_collisions = ', _hash_collisions)
# print('visual_collisions = ', vissimhandler.count)

##### 4) Recommend visually similar items from favorite artists (fake 1-item profiles)
Given a fake profile of a single item, other items sharing same artist should be ranked higher than items from different artists as long as the PCA200 embedding agrees

In [52]:
def sample_artwork_index__nonidentical_sharing_artist(i):
    """Draw another artwork by the same artist as artwork `i`.

    Requires that the artist of `i` is known (not -1) and has at least two
    artworks; both conditions are asserted.
    """
    artist = artist_ids[i]
    assert artist != -1
    same_artist_idxs = artistId2artworkIndexes[artist]
    assert len(same_artist_idxs) >= 2
    pi = random.choice(same_artist_idxs)  # sharing artist
    while pi == i:  # redraw until non-identical
        pi = random.choice(same_artist_idxs)
    return pi

In [53]:
def sample_artwork_index__notsharing_artist__visually_acceptable(i, pi):
    """Try to draw a negative for the pair (anchor `i`, positive `pi`).

    A candidate is accepted when its artist differs from the artist of `i` and
    `vissimhandler.validate_triple(i, pi, candidate, margin=VISUAL_CONFIDENCE_THRESHOLD)`
    holds. Up to 4 candidates are drawn with sample_artwork_index(i).

    Returns:
        The accepted artwork index, or None if all 4 candidates were rejected.
    """
    attempt = 0
    while attempt < 4:
        attempt += 1
        ni = sample_artwork_index(i)
        different_artist = artist_ids[ni] != artist_ids[i]
        # the visual check is only consulted for candidates by another artist
        if different_artist and vissimhandler.validate_triple(
                i, pi, ni, margin=VISUAL_CONFIDENCE_THRESHOLD):
            return ni
    return None

In [54]:
@print_num_samples
def generate_samples__rank_sharing_artist_above_notsharing_artist__visuallyacceptable__single_item(
        instances_container, n_samples):
    """Strategy 4: for a fake one-item profile (i,), another artwork by the same
    artist ranks above an artwork by a different artist that passes the visual check.

    Only artworks whose artist is known and has at least two artworks are used
    as anchors; the budget is split evenly over them (rounded up). Each slot
    makes up to 5 attempts. Progress is printed on a single, overwritten line.

    Raises:
        ZeroDivisionError: if no artwork qualifies as an anchor.
    """
    
    # anchors = artworks with a known artist that has at least one other artwork
    n_valid_items = sum(1 for i in range(n_artworks) if artist_ids[i] != -1 and\
                        len(artistId2artworkIndexes[artist_ids[i]]) >= 2)
    n_samples_per_item = ceil(n_samples / n_valid_items)
    
    
    print('(debug) n_valid_items = ', n_valid_items)
    print('(debug) n_samples_per_item = ', n_samples_per_item)
    count = 0
    for i in range(n_artworks):
        a = artist_ids[i]
        # same filter as the one used to compute n_valid_items above
        if a == -1 or len(artistId2artworkIndexes[a]) < 2:
            continue
        count += 1
        # progress report every 100 anchors and on the last one; '\r' overwrites the line
        if (count == n_valid_items or count % 100 == 0):
            print('(debug) %d/%d' % (count, n_valid_items), flush=True, end='\r')
        profile = (i,)
        for _ in range(n_samples_per_item):
            # up to 5 attempts per slot
            for __ in range(5):
                pi = sample_artwork_index__nonidentical_sharing_artist(i)
                ni = sample_artwork_index__notsharing_artist__visually_acceptable(i, pi)
                if ni is None:
                    continue
                # ui = -1: no real user behind this instance
                if append_instance(instances_container, (profile, pi, ni, -1)):
                    break

In [55]:
print('=======================================\nsampling train instances ...')
generate_samples__rank_sharing_artist_above_notsharing_artist__visuallyacceptable__single_item(
    train_instances, n_samples=N_SAMPLES_PER_FAKE_STRATEGY__TRAIN)

print('=======================================\nsampling test instances ...')
generate_samples__rank_sharing_artist_above_notsharing_artist__visuallyacceptable__single_item(
    test_instances, n_samples=N_SAMPLES_PER_FAKE_STRATEGY__TEST)

print(len(train_instances), len(test_instances))
print('hash_collisions = ', _hash_collisions)
print('visual_collisions = ', vissimhandler.count)

sampling train instances ...
(debug) n_valid_items =  7587
(debug) n_samples_per_item =  660
  target samples: 5000000
  actual samples: 4939496
  delta: 60504
  ** delta > 0 -> sampling more instances again ...
(debug) n_valid_items =  7587
(debug) n_samples_per_item =  8
  target samples: 60504
  actual samples: 59582
  delta: 922
  ** delta > 0 -> sampling more instances again ...
(debug) n_valid_items =  7587
(debug) n_samples_per_item =  1
  target samples: 922
  actual samples: 7447
  delta: -6525
sampling test instances ...
(debug) n_valid_items =  7587
(debug) n_samples_per_item =  33
  target samples: 250000
  actual samples: 245653
  delta: 4347
  ** delta > 0 -> sampling more instances again ...
(debug) n_valid_items =  7587
(debug) n_samples_per_item =  1
  target samples: 4347
  actual samples: 7457
  delta: -3110
10019494 505753
hash_collisions =  2834378
visual_collisions =  13127


##### 5) Predict next purchase basket
Given all previous purchases, rank each item of the next purchase basket higher than any item from a never-purchased artist and cluster

In [70]:
@print_num_samples
def generate_samples__given_past_rank_next(instances_container, n_samples):
    """Strategy 5: given all baskets up to position bi as profile, items of
    basket bi+1 rank above items from an artist and content cluster the user never bought.

    Only users with at least two baskets take part. The budget is split evenly
    over those users and then over each user's consecutive basket pairs (both
    rounded up). Each slot makes up to 5 attempts.

    Raises:
        ZeroDivisionError: if no user has at least two baskets.
    """
    
    n_valid_users = sum(1 for user in users if len(user.baskets) >= 2) # at least 2 purchase baskets
    n_samples_per_user = ceil(n_samples / n_valid_users)
    
    for ui, user in enumerate(users):
        n = len(user.baskets)
        if n <= 1:
            continue
        past_items = []        
        n_samples_per_basket = ceil(n_samples_per_user / (n-1))
        for bi in range(n-1):
            # a basket is indexed as (b[0], b[1]); judging by the range below these are
            # the start offset and the item count inside user.artwork_idxs
            cur_b = user.baskets[bi]
            for j in range(cur_b[0], cur_b[0] + cur_b[1]):
                past_items.append(user.artwork_idxs[j])
            next_b  = user.baskets[bi+1]
            # snapshot: past_items keeps growing, stored instances must not change afterwards
            profile = past_items.copy()
            for _ in range(n_samples_per_basket):
                for __ in range(5):
                    # positive: a uniformly chosen item of the next basket
                    pi = user.artwork_idxs[random.randint(next_b[0], next_b[0] + next_b[1] - 1)]
                    ni = sample_artwork_index__notsharing_artist_content_cluster(
                        user.artist_ids_set, user.nonp_content_cluster_ids)
                    if append_instance(instances_container, (profile, pi, ni, ui),
                                       neg_sharing_artist=False,
                                       neg_sharing_content_cluster=False,
                                       artists_set=user.artist_ids_set,
                                       content_clusters_set=user.content_cluster_ids_set,
                                      ):
                        break

In [71]:
print('=======================================\nsampling train instances ...')
generate_samples__given_past_rank_next(
    train_instances, n_samples=N_SAMPLES_PER_REAL_STRATEGY__TRAIN)

print('=======================================\nsampling test instances ...')
generate_samples__given_past_rank_next(
    test_instances, n_samples=N_SAMPLES_PER_REAL_STRATEGY__TEST)

print(len(train_instances), len(test_instances))
print('hash_collisions = ', _hash_collisions)
print('visual_collisions = ', vissimhandler.count)

sampling train instances ...
  target samples: 250000
  actual samples: 250450
  delta: -450
sampling test instances ...
  target samples: 12500
  actual samples: 13136
  delta: -636
9762710 499482
hash_collisions =  2050355
visual_collisions =  9325


##### 6) Predict hidden item in the k-th purchase basket given first k
Given the first k purchase baskets of a user, hide one item in the k-th purchase basket, use the rest as profile and rank the hidden item higher than any item from a never purchased artist and cluster

In [72]:
@print_num_samples
def generate_samples__hide_and_predict_one_from_last__first_k_purchase_baskets(instances_container, n_samples):
    """Strategy 6: hide one item of the k-th basket, use everything else bought
    up to and including that basket as profile, and rank the hidden item above
    items from an artist and content cluster the user never bought.

    Only baskets with at least two items are used. The budget is split evenly
    over users having such a basket, then over their valid baskets, then over
    the items of each basket (all rounded up). Each slot makes up to 5 attempts.

    Raises:
        ZeroDivisionError: if no user has a basket with at least two items.
    """
    
    # per user: number of baskets holding at least two items
    n_valid_baskets_list = [sum(1 for b in user.baskets if b[1] >= 2) for user in users]
    n_valid_users = sum(1 for x in n_valid_baskets_list if x > 0)
    n_samples_per_user = ceil(n_samples / n_valid_users)
    
    for ui, (user, n_valid_baskets) in enumerate(zip(users, n_valid_baskets_list)):
        if n_valid_baskets == 0:
            continue
        n_samples_per_basket = ceil(n_samples_per_user / n_valid_baskets)
        u_artwork_idxs = user.artwork_idxs
        purchased = []
        for b in user.baskets:            
            # basket = (b[0], b[1]); used here as start offset and item count in u_artwork_idxs
            bs = b[0]
            be = b[0] + b[1]
            purchased.extend(u_artwork_idxs[j] for j in range(bs, be))
            # holds only if baskets are contiguous and visited in order
            assert len(purchased) == be
            # single-item baskets still extend the history, but nothing can be hidden in them
            if b[1] < 2:
                continue            
            n_samples_per_item = ceil(n_samples_per_basket / b[1])            
            for i in range(bs, be):
                # everything bought so far except the hidden item at position i
                profile = [purchased[j] for j in range(be) if j != i]
                assert len(profile) == be - 1
                assert len(profile) > 0
                pi = purchased[i]
                for _ in range(n_samples_per_item):
                    for __ in range(5):
                        ni = sample_artwork_index__notsharing_artist_content_cluster(
                            user.artist_ids_set, user.nonp_content_cluster_ids)
                        if append_instance(instances_container, (profile, pi, ni, ui),
                                           neg_sharing_artist=False,
                                           neg_sharing_content_cluster=False,
                                           artists_set=user.artist_ids_set,
                                           content_clusters_set=user.content_cluster_ids_set,
                                          ):
                            break

In [73]:
print('=======================================\nsampling train instances ...')
generate_samples__hide_and_predict_one_from_last__first_k_purchase_baskets(
    train_instances, n_samples=N_SAMPLES_PER_REAL_STRATEGY__TRAIN)

print('=======================================\nsampling test instances ...')
generate_samples__hide_and_predict_one_from_last__first_k_purchase_baskets(
    test_instances, n_samples=N_SAMPLES_PER_REAL_STRATEGY__TEST)

print(len(train_instances), len(test_instances))
print('hash_collisions = ', _hash_collisions)
print('visual_collisions = ', vissimhandler.count)

sampling train instances ...
  target samples: 250000
  actual samples: 250914
  delta: -914
sampling test instances ...
  target samples: 12500
  actual samples: 13705
  delta: -1205
10013624 513187
hash_collisions =  2060634
visual_collisions =  9325


#### sort train and test instances by profile size

In [56]:
# Shuffle first, then sort by profile length: list.sort() is stable, so instances
# with the same profile length stay in shuffled order relative to each other.
random.shuffle(train_instances)
train_instances.sort(key=lambda x: len(x[0]))
# test instances are only sorted (not shuffled)
test_instances.sort(key=lambda x: len(x[0]))

### Train Model

In [57]:
def generate_minibatches(tuples, max_users_items_per_batch):
    """Cut a list of instances into consecutive, zero-padded minibatches.

    Each instance costs `len(profile) + 3`. Instances are taken in the given
    order and a new batch is started whenever adding the next instance would
    push the running cost above `max_users_items_per_batch`.

    Args:
        tuples: non-empty sequence of instances whose first three fields are
            (profile, positive index, negative index).
        max_users_items_per_batch: cost budget of one batch; every single
            instance must fit in it.

    Returns:
        dict with the lists `profile_indexes_batches` (int arrays of shape
        (batch, longest profile in batch), padded with 0), `profile_size_batches`
        (float arrays), `positive_index_batches` and `negative_index_batches`
        (int arrays), plus `n_batches`.

    Raises:
        AssertionError: if `tuples` is empty or one instance exceeds the budget.
    """
    # ---- 1) greedy split into contiguous [start, stop) ranges ----
    batch_ranges = []
    start = 0
    running_cost = 0
    for position, instance in enumerate(tuples):
        cost = len(instance[0]) + 3
        running_cost += cost
        if running_cost > max_users_items_per_batch:
            batch_ranges.append((start, position))
            start = position
            running_cost = cost
            assert running_cost <= max_users_items_per_batch
    assert start < len(tuples)
    batch_ranges.append((start, len(tuples)))

    n_tuples = len(tuples)
    n_batches = len(batch_ranges)
    print('n_tuples = ', n_tuples)
    print('n_batches = ', n_batches)

    # the ranges must tile [0, n_tuples) without gaps or overlaps
    assert batch_ranges[0][0] == 0
    for left, right in zip(batch_ranges, batch_ranges[1:]):
        assert left[1] == right[0]
    assert batch_ranges[-1][1] == n_tuples
    assert sum(stop - begin for begin, stop in batch_ranges) == n_tuples

    # ---- 2) materialise one set of arrays per range ----
    profile_indexes_batches = []
    profile_size_batches = []
    positive_index_batches = []
    negative_index_batches = []

    for begin, stop in batch_ranges:
        members = [tuples[j] for j in range(begin, stop)]
        sizes = [len(member[0]) for member in members]

        padded_profiles = np.zeros((len(members), max(sizes)), dtype=int)
        for row, member, size in zip(padded_profiles, members, sizes):
            if size:
                row[:size] = member[0]

        profile_indexes_batches.append(padded_profiles)
        profile_size_batches.append(np.array(sizes, dtype=float))
        positive_index_batches.append(np.array([member[1] for member in members], dtype=int))
        negative_index_batches.append(np.array([member[2] for member in members], dtype=int))

    return dict(
        profile_indexes_batches = profile_indexes_batches,
        profile_size_batches    = profile_size_batches,
        positive_index_batches  = positive_index_batches,
        negative_index_batches  = negative_index_batches,
        n_batches               = n_batches,
    )

In [58]:
def sanity_check_minibatches(minibatches):
    """Assert the structural invariants of a minibatch dict.

    Expects a dict with the keys 'profile_indexes_batches',
    'profile_size_batches', 'positive_index_batches',
    'negative_index_batches' and 'n_batches' (e.g. the value returned by
    generate_minibatches). The checks are:

    * 'n_batches' is positive and equals the length of each batch list;
    * within every batch the four arrays have the same number of rows;
    * the positive and the negative item index differ in every row;
    * no profile size exceeds the padded profile width;
    * every padding slot (columns at or past the profile size) holds 0.

    Raises:
        AssertionError: if any of the invariants above is violated.
    """
    n_batches = minibatches['n_batches']
    batch_lists = {
        key: minibatches[key] for key in (
            'profile_indexes_batches',
            'profile_size_batches',
            'positive_index_batches',
            'negative_index_batches',
        )
    }
    for key, batches in batch_lists.items():
        assert n_batches == len(batches), \
            '%s has %d batches, expected %d' % (key, len(batches), n_batches)
    assert n_batches > 0, 'there must be at least one batch'

    for b, (profile_indexes, profile_size, positive_index, negative_index) in enumerate(zip(
        batch_lists['profile_indexes_batches'],
        batch_lists['profile_size_batches'],
        batch_lists['positive_index_batches'],
        batch_lists['negative_index_batches']
    )):
        n = profile_size.shape[0]
        assert n == profile_indexes.shape[0], 'batch %d: profile_indexes row count mismatch' % b
        assert n == positive_index.shape[0], 'batch %d: positive_index row count mismatch' % b
        assert n == negative_index.shape[0], 'batch %d: negative_index row count mismatch' % b

        # vectorized over the whole batch instead of one Python iteration per tuple
        assert np.all(positive_index != negative_index), \
            'batch %d: some tuple uses the same item as positive and negative' % b

        maxlen = profile_indexes.shape[1]
        # truncate exactly like int(profile_size[i]) would
        sizes = profile_size.astype(int)
        assert np.all(sizes <= maxlen), \
            'batch %d: a profile size exceeds the padded width %d' % (b, maxlen)

        # padding_mask[i, j] is True for the slots j >= sizes[i], which must be zero-filled
        padding_mask = np.arange(maxlen) >= sizes[:, None]
        assert np.all(profile_indexes[padding_mask] == 0), \
            'batch %d: non-zero value found in a padding slot' % b

In [59]:
import tensorflow as tf
from Networks import ContentBasedLearn2RankNetwork_Train, TrainLogger

In [78]:
# # DEBUGGING NETWORK
# with tf.Graph().as_default():
#     network = ContentBasedLearn2RankNetwork_Train(
#         pretrained_embedding_dim=concat_featmat.shape[1],
#         user_layer_units=[300, 300, 200],
#         item_layer_units=[200, 200],
#         weight_decay=0.0001,
#         profile_pooling_mode='AVG+MAX',
#     )
#     with tf.Session() as sess:
#         sess.run(tf.global_variables_initializer())
#         debug_ = sess.run(
#             [network._profile_masks__avgpool,
#              network._profile_masks__maxpool,
#              network._masked_profile_item_embeddings__avgpool,
#              network._masked_profile_item_embeddings__maxpool,
#              network._profile_items_avgpool,
#              network._profile_items_maxpool,
#             ], feed_dict={
#                 network._pretrained_embeddings: concat_featmat,
#                 network._profile_item_indexes: [[0,1,2,9,9,9], [1,2,3,4,5,6]],
#                 network._profile_sizes: [3,6],
#             }
#         )
#         for x in tf.global_variables():
#             print(x)        
# #         for op in sess.graph.get_operations():
# #             print(op.name)

In [60]:
def _compute_test_accuracy(network, sess, pretrained_embeddings, test_minibatches, n_test_instances):
    """Sum network.get_test_accuracy over all test minibatches and divide by n_test_instances.

    NOTE(review): dividing the sum by the number of instances only yields an
    accuracy if get_test_accuracy returns a per-minibatch count of correct
    predictions -- confirm against Networks.ContentBasedLearn2RankNetwork_Train.
    """
    accumulated = 0.
    for profile_indexes, profile_size, positive_index, negative_index in zip(
        test_minibatches['profile_indexes_batches'],
        test_minibatches['profile_size_batches'],
        test_minibatches['positive_index_batches'],
        test_minibatches['negative_index_batches']
    ):
        accumulated += network.get_test_accuracy(
            sess, pretrained_embeddings, profile_indexes, profile_size, positive_index, negative_index)
    return accumulated / n_test_instances


def train_network(train_minibatches, test_minibatches,
                  n_train_instances, n_test_instances, batch_size,
                  pretrained_embeddings,
                  user_layer_units,
                  item_layer_units,
                  profile_pooling_mode,
                  model_path,
                  max_seconds_training=3600,
                  min_seconds_to_check_improvement=60,
                  early_stopping_checks=4,
                  weight_decay=0.001,
                  learning_rates=[1e-3]):
    """Train a ContentBasedLearn2RankNetwork_Train with time-based checkpoints and early stopping.

    A fresh graph and session are created. The latest checkpoint found under
    `model_path` is restored; if there is none, the directory is created and
    the variables are initialized. The test accuracy is measured once before
    training and used as the initial best score.

    Training then repeatedly sweeps over the training minibatches. Whenever at
    least `min_seconds_to_check_improvement` seconds of optimization time have
    accumulated since the last check, the test accuracy is recomputed, the
    result is logged through TrainLogger (to `model_path + 'train_logs.csv'`),
    and the model is saved to `model_path` if the test accuracy improved (or
    tied with a lower smoothed training loss than at the last improvement).
    After `early_stopping_checks` consecutive checks without improvement the
    next entry of `learning_rates` is used; when no entry is left the function
    returns (early stopping).

    Args:
        train_minibatches, test_minibatches: dicts with the keys
            'profile_indexes_batches', 'profile_size_batches',
            'positive_index_batches' and 'negative_index_batches'.
        n_train_instances: only forwarded to the training log.
        n_test_instances: divisor applied to the summed test accuracy; also logged.
        batch_size: only forwarded to the training log.
        pretrained_embeddings: 2-D matrix fed to the network on every call;
            its second dimension is the network's pretrained_embedding_dim.
        user_layer_units, item_layer_units, profile_pooling_mode, weight_decay:
            forwarded unchanged to the network constructor.
        model_path: checkpoint location; 'train_logs.csv' is appended to it
            by plain string concatenation.
        max_seconds_training: optimization-time budget in seconds. Only the time
            spent inside optimize_and_get_train_loss is counted, and the budget
            is checked between full sweeps, so a run can exceed it by up to
            one sweep (plus evaluation time).
        min_seconds_to_check_improvement: optimization seconds between test checks.
        early_stopping_checks: non-improving checks tolerated per learning rate.
        learning_rates: non-empty sequence of learning rates, used in order.

    Returns:
        None.

    Raises:
        ValueError: if `learning_rates` is empty.
    """
    if len(learning_rates) == 0:
        raise ValueError('learning_rates must contain at least one learning rate')

    print('learning_rates = ', learning_rates)

    with tf.Graph().as_default():
        network = ContentBasedLearn2RankNetwork_Train(
            pretrained_embedding_dim=pretrained_embeddings.shape[1],
            user_layer_units=user_layer_units,
            item_layer_units=item_layer_units,
            weight_decay=weight_decay,
            profile_pooling_mode=profile_pooling_mode,
        )

        print('Variables to be trained:')
        for x in tf.global_variables():
            print('\t', x)

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=0.99,
            allow_growth=True
        )
        config = tf.ConfigProto(gpu_options=gpu_options)
        with tf.Session(config=config) as sess:
            # One Saver for both restoring and saving: building a new Saver adds
            # save/restore ops to the graph, so it must not be done per checkpoint.
            saver = tf.train.Saver()
            try:
                # latest_checkpoint returns None when nothing was saved yet,
                # which makes restore raise ValueError
                saver.restore(sess, tf.train.latest_checkpoint(model_path))
                print('model successfully restored from checkpoint!')
            except ValueError:
                print('no checkpoint found: initializing variables with random values')
                os.makedirs(model_path, exist_ok=True)
                sess.run(tf.global_variables_initializer())
            trainlogger = TrainLogger(model_path + 'train_logs.csv')

            # ========= BEFORE TRAINING ============

            initial_test_acc = _compute_test_accuracy(
                network, sess, pretrained_embeddings, test_minibatches, n_test_instances)

            print("Before training: test_accuracy = %f" % initial_test_acc)

            best_test_acc = initial_test_acc
            seconds_training = 0
            elapsed_seconds_from_last_check = 0
            checks_with_no_improvement = 0
            last_improvement_loss = None

            # ========= TRAINING ============

            print ('Starting training ...')
            lr_i = 0
            # exponential moving average of the minibatch loss; it starts at 0
            # with no bias correction, so early values underestimate the loss
            train_loss_ema = 0.

            while seconds_training < max_seconds_training:

                for train_i, (profile_indexes, profile_size, positive_index, negative_index) in enumerate(zip(
                    train_minibatches['profile_indexes_batches'],
                    train_minibatches['profile_size_batches'],
                    train_minibatches['positive_index_batches'],
                    train_minibatches['negative_index_batches']
                )):
                    # optimize and get training loss
                    start_t = time.time()
                    _, minibatch_train_loss = network.optimize_and_get_train_loss(
                        sess, learning_rates[lr_i], pretrained_embeddings, profile_indexes,
                        profile_size, positive_index, negative_index)
                    delta_t = time.time() - start_t

                    # update train loss exponential moving average
                    train_loss_ema = 0.999 * train_loss_ema + 0.001 * minibatch_train_loss

                    # update time tracking variables
                    seconds_training += delta_t
                    elapsed_seconds_from_last_check += delta_t

                    # check for improvements using test set if it's time to do so
                    if elapsed_seconds_from_last_check >= min_seconds_to_check_improvement:

                        # --- testing
                        test_acc = _compute_test_accuracy(
                            network, sess, pretrained_embeddings, test_minibatches, n_test_instances)

                        print(("train_i=%d, train_loss = %.12f, test_accuracy = %.7f,"
                               " check_secs = %.2f, total_secs = %.2f") % (
                                train_i, train_loss_ema, test_acc, elapsed_seconds_from_last_check, seconds_training))

                        # check for improvements: strictly better accuracy, or equal
                        # accuracy with a lower smoothed loss than at the last improvement
                        if (test_acc > best_test_acc) or (
                            test_acc == best_test_acc and (
                                last_improvement_loss is not None and\
                                last_improvement_loss > train_loss_ema
                            )
                        ):
                            last_improvement_loss = train_loss_ema
                            best_test_acc = test_acc
                            checks_with_no_improvement = 0
                            save_path = saver.save(sess, model_path)
                            print("   ** improvement detected: model saved to path ", save_path)
                            model_updated = True
                        else:
                            checks_with_no_improvement += 1
                            model_updated = False

                        # --- logging ---
                        trainlogger.log_update(
                            train_loss_ema, test_acc, n_train_instances, n_test_instances,
                            elapsed_seconds_from_last_check, batch_size, learning_rates[lr_i], 't' if model_updated else 'f')

                        # --- check for early stopping
                        if checks_with_no_improvement >= early_stopping_checks:
                            if lr_i + 1 < len(learning_rates):
                                lr_i += 1
                                checks_with_no_improvement = 0
                                print("   *** %d checks with no improvements -> using a smaller learning_rate = %.8f" % (
                                    early_stopping_checks, learning_rates[lr_i]))
                            else:
                                print("   *** %d checks with no improvements -> early stopping :(" % early_stopping_checks)
                                return

                        # --- reset check variables
                        elapsed_seconds_from_last_check = 0
            print('====== TIMEOUT ======')

In [61]:
# Pack the training tuples into minibatches and verify their structural
# invariants (consistent row counts, zero padding, positive != negative)
# before they are handed to train_network.
train_minibatches = generate_minibatches(train_instances, max_users_items_per_batch=5000*10)
sanity_check_minibatches(train_minibatches)

n_tuples =  10019494
n_batches =  802


In [62]:
# Same packing and sanity checks for the test tuples, using the same
# max_users_items_per_batch budget as the training minibatches.
test_minibatches = generate_minibatches(test_instances, max_users_items_per_batch=5000*10)
sanity_check_minibatches(test_minibatches)

n_tuples =  505753
n_batches =  41


In [63]:
# Learning-rate schedule consumed by train_network, which moves to the next
# entry after `early_stopping_checks` non-improving checks. Per the output
# below: starts at 1e-4, each entry is 0.6x the previous one, and the last
# entry is just above 1e-6.
learning_rates = get_decaying_learning_rates(1e-4, 1e-6, 0.6)
learning_rates

[0.0001,
 6e-05,
 3.6e-05,
 2.16e-05,
 1.296e-05,
 7.776e-06,
 4.6656e-06,
 2.79936e-06,
 1.679616e-06,
 1.0077696e-06]

In [64]:
# Display the thresholds currently in scope so they are recorded next to the run.
# NOTE(review): presumably these correspond to the 'fg.7' / 'vcf.1' tags in MODEL_PATH -- confirm.
FINE_GRAINED_THRESHOLD, VISUAL_CONFIDENCE_THRESHOLD

(0.7, 0.1)

In [131]:
# !ls /mnt/workspace/pamessina_models/ugallery/youtube_like/

In [65]:
# Checkpoint location for this run, passed to train_network as `model_path`
# (train_network also writes 'train_logs.csv' under it). The commented-out
# assignments are the locations used by earlier experiments, kept for reference.
# MODEL_PATH = ('/mnt/workspace/pamessina_models/ugallery/youtube_like/v9(10M-300K,avg+max,'
#              '+prof(rl&fk)-nprof,+nproffavc(rl&fk)-nfavc_loc,+nxt+hide1+hideacc-nfavc_glob)/')
# MODEL_PATH = ('/mnt/workspace/pamessina_models/ugallery/youtube_like/v16c(10M-400K,avg+max,rsnt50+alxnt+incepv3,'
#               'u(300,300,200)i(200,200),+p(rl&fk)-np,+npfavc(rl&fk)-nfavc,+nxt+hdaccklst-nfavc,fg.6,vcf.1)/')
# MODEL_PATH = ('/mnt/workspace/pamessina_models/ugallery/youtube_like/v26(10M-500K,avg+max,rsnt50+grmmtb1c1,'
#               'u(300,300,200)i(200,200),+p(fk)-np,+npfavc(fk)-nfavc,fg.6,vcf.1,wd.0001)/')
MODEL_PATH = ('/mnt/workspace/pamessina_models/ugallery/youtube_like/v30(10M-500K(fk=1.),avg+max,rsnt50+grmmtb1c1,'
              'u(300,300,200)i(200,200),+p(fk)-np,+npfavc(fk)-nfavc,fg.7,vcf.1,wd.0001)/')
MODEL_PATH

'/mnt/workspace/pamessina_models/ugallery/youtube_like/v30(10M-500K(fk=1.),avg+max,rsnt50+grmmtb1c1,u(300,300,200)i(200,200),+p(fk)-np,+npfavc(fk)-nfavc,fg.7,vcf.1,wd.0001)/'

In [66]:
# Mean number of rows per training minibatch, rounded up. Minibatches have
# variable sizes, so this average is what gets reported as `batch_size`;
# train_network only forwards that value to the training log.
avg_train_batch_size = ceil(np.mean([b.shape[0] for b in train_minibatches['profile_indexes_batches']]))
avg_train_batch_size

12494

In [67]:
# Launch training. train_network first tries to restore the latest checkpoint
# under MODEL_PATH, so re-running this cell continues from the saved model
# rather than starting from scratch.
train_network(
    train_minibatches, test_minibatches,
    len(train_instances), len(test_instances),
    batch_size=avg_train_batch_size,  # only written to the training log
    pretrained_embeddings=concat_featmat,
    user_layer_units=[300,300,200],
    item_layer_units=[200,200],
    profile_pooling_mode='AVG+MAX',
    model_path = MODEL_PATH,
    max_seconds_training=3600 * 5,  # 5 hours of optimization time
    min_seconds_to_check_improvement=90,  # evaluate on the test set every ~90 s of optimization
    early_stopping_checks=2,  # non-improving checks tolerated before lowering the learning rate
    weight_decay=.0001,
    learning_rates=learning_rates,
)

learning_rates =  [0.0001, 6e-05, 3.6e-05, 2.16e-05, 1.296e-05, 7.776e-06, 4.6656e-06, 2.79936e-06, 1.679616e-06, 1.0077696e-06]
Variables to be trained:
	 <tf.Variable 'trainable_item_embedding/fc1/kernel:0' shape=(6144, 200) dtype=float32_ref>
	 <tf.Variable 'trainable_item_embedding/fc1/bias:0' shape=(200,) dtype=float32_ref>
	 <tf.Variable 'trainable_item_embedding/fc2/kernel:0' shape=(200, 200) dtype=float32_ref>
	 <tf.Variable 'trainable_item_embedding/fc2/bias:0' shape=(200,) dtype=float32_ref>
	 <tf.Variable 'user_hidden_1/kernel:0' shape=(400, 300) dtype=float32_ref>
	 <tf.Variable 'user_hidden_1/bias:0' shape=(300,) dtype=float32_ref>
	 <tf.Variable 'user_hidden_2/kernel:0' shape=(300, 300) dtype=float32_ref>
	 <tf.Variable 'user_hidden_2/bias:0' shape=(300,) dtype=float32_ref>
	 <tf.Variable 'user_vector/kernel:0' shape=(300, 200) dtype=float32_ref>
	 <tf.Variable 'user_vector/bias:0' shape=(200,) dtype=float32_ref>
	 <tf.Variable 'beta1_power:0' shape=() dtype=float32_ref>


train_i=114, train_loss = 0.070105076726, test_accuracy = 0.9993594, check_secs = 90.04, total_secs = 1713.05
   ** improvement detected: model saved to path  /mnt/workspace/pamessina_models/ugallery/youtube_like/v30(10M-500K(fk=1.),avg+max,rsnt50+grmmtb1c1,u(300,300,200)i(200,200),+p(fk)-np,+npfavc(fk)-nfavc,fg.7,vcf.1,wd.0001)/
train_i=412, train_loss = 0.068724140850, test_accuracy = 0.9993930, check_secs = 90.11, total_secs = 1803.16
   ** improvement detected: model saved to path  /mnt/workspace/pamessina_models/ugallery/youtube_like/v30(10M-500K(fk=1.),avg+max,rsnt50+grmmtb1c1,u(300,300,200)i(200,200),+p(fk)-np,+npfavc(fk)-nfavc,fg.7,vcf.1,wd.0001)/
train_i=711, train_loss = 0.067543312405, test_accuracy = 0.9993772, check_secs = 90.25, total_secs = 1893.41
train_i=206, train_loss = 0.066512797893, test_accuracy = 0.9993475, check_secs = 90.16, total_secs = 1983.56
   *** 2 checks with no improvements -> using a smaller learning_rate = 0.00003600
train_i=504, train_loss = 0.06546