In [1]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
cp -r drive/My\ Drive/workspace/deep\ learning\ based\ recommendation\ systems/CML/citeulike-t/ .

In [1]:
!nvidia-smi

Wed Dec 26 03:11:30 2018       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 396.44                 Driver Version: 396.44                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8    30W / 149W |      0MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
!apt-get install psmisc

In [41]:
!fuser -v /dev/nvidia*

                     USER        PID ACCESS COMMAND
/dev/nvidia0:        root         75 F...m python3
/dev/nvidiactl:      root         75 F...m python3
/dev/nvidia-uvm:     root         75 F.... python3


In [0]:
# NOTE: PID 75 is the python3 process listed by the `fuser` cell above in this
# particular session; it is not stable across sessions, so re-run `fuser` and
# substitute the PID it reports before executing this cell.
!kill -9 75

# Data Preprocessing

## Read Data

In [0]:
from collections import defaultdict
import numpy as np
from scipy.sparse import dok_matrix, lil_matrix
from tqdm import tqdm

In [0]:
def citeulike(tag_occurence_thres=10):
    """
    Load the citeulike-t dataset from the ``citeulike-t/`` directory.

    ``users.dat`` holds one line per user: ``<count> item item ...``.
    ``tag-item.dat`` holds one line per tag: ``<count> item item ...``.
    In both files the leading count is skipped when reading the item ids.

    :param tag_occurence_thres: keep only the tags whose leading count is at
           least this value
    :return: a tuple ``(user_item_matrix, features)`` of ``dok_matrix``:
             ``user_item_matrix`` has shape (n_users, n_items) with 1 for every
             user-item interaction; ``features`` has shape (n_items, n_kept_tags)
             with 1 where the item carries the tag
    """
    # Read each file exactly once and close it deterministically.
    with open("citeulike-t/users.dat") as user_file:
        # drop the first element of every line (the number of items the user liked)
        items_per_user = [[int(item) for item in line.strip().split(" ")[1:]]
                          for line in user_file]

    # One row per line of the file, so that a user without any item keeps its
    # row index instead of shifting (and overflowing) the rows that follow.
    n_users = len(items_per_user)
    n_items = max(item for items in items_per_user for item in items) + 1

    # store the interactions in a sparse matrix
    user_item_matrix = dok_matrix((n_users, n_items), dtype=np.int32)
    for u, items in enumerate(items_per_user):
        for item in items:
            user_item_matrix[u, item] = 1

    with open("citeulike-t/tag-item.dat") as tag_file:
        tag_rows = [line.strip().split(" ") for line in tag_file]

    # only keep the tags attached to at least tag_occurence_thres items
    kept_tags = [[int(i) for i in row[1:]]
                 for row in tag_rows
                 if int(row[0]) >= tag_occurence_thres]
    n_features = len(kept_tags)
    print("{} features over tag_occurence_thres ({})".format(n_features, tag_occurence_thres))

    # the file is tag -> items; the matrix is item -> tags, hence the transposed indexing
    features = dok_matrix((n_items, n_features), dtype=np.int32)
    for feature_index, tagged_items in enumerate(kept_tags):
        features[tagged_items, feature_index] = 1

    return user_item_matrix, features

In [4]:
user_item_matrix, features = citeulike(tag_occurence_thres=5)

8311 features over tag_occurence_thres (5)


In [5]:
n_users, n_items = user_item_matrix.shape # 7947 users, 25975 items
n_users, n_items 

(7947, 25975)

In [0]:
# make feature as dense matrix, add a small number to smooth
dense_features = features.toarray() + 1E-10

In [8]:
dense_features.shape # 25975 items, 8311 tags (with tag_occurence_thres=5)

(25975, 8311)

## Split Data

In [0]:
def split_data(user_item_matrix, split_ratio=(3, 1, 1), seed=1):
    """
    Randomly partition every user's interactions into train/validation/test.

    Users with fewer than 5 interactions are left out of all three splits.
    The global numpy random state is re-seeded with ``seed``.

    :param user_item_matrix: sparse (n_users, n_items) interaction matrix
    :param split_ratio: relative sizes of the (train, validation, test) parts;
           the test part receives whatever remains after the first two
    :param seed: seed passed to ``np.random.seed``
    :return: (train, validation, test) ``dok_matrix`` of the input's shape
    """
    # seed the global generator so that the split is deterministic
    np.random.seed(seed)

    shape = user_item_matrix.shape
    train, validation, test = dok_matrix(shape), dok_matrix(shape), dok_matrix(shape)

    # lil format exposes the non-zero column indices of each row through `.rows`
    interactions = lil_matrix(user_item_matrix)
    ratio_total = sum(split_ratio)

    for user in tqdm(range(interactions.shape[0]), desc="Split data into train/valid/test"):
        liked = list(interactions.rows[user])
        # skip users with fewer than 5 interactions
        if len(liked) < 5:
            continue

        np.random.shuffle(liked)

        n_liked = len(liked)
        train_end = int(n_liked * split_ratio[0] / ratio_total)
        valid_end = train_end + int(n_liked * split_ratio[1] / ratio_total)

        parts = ((train, liked[:train_end]),
                 (validation, liked[train_end:valid_end]),
                 (test, liked[valid_end:]))
        for target, chunk in parts:
            for item in chunk:
                target[user, item] = 1

    sizes = [len(matrix.nonzero()[0]) for matrix in (train, validation, test)]
    print("{}/{}/{} train/valid/test samples".format(*sizes))
    return train, validation, test

In [10]:
# get train/valid/test user-item matrices
train, valid, test = split_data(user_item_matrix)

Split data into train/valid/test: 100%|██████████| 7947/7947 [00:01<00:00, 5518.17it/s]


73427/23311/28842 train/valid/test samples


## Evaluator

In [0]:
from scipy.sparse import lil_matrix


class RecallEvaluator(object):
    """Computes recall@K of a model's item ranking against held-out user-item pairs."""

    def __init__(self, model, train_user_item_matrix, test_user_item_matrix):
        """
        Create a evaluator for recall@K evaluation
        :param model: the model we are going to evaluate; it must expose
               ``item_scores`` and ``score_user_ids``
        :param train_user_item_matrix: the user-item pairs used in the training set. These pairs will be ignored
               in the recall calculation. May be None, in which case nothing is ignored
        :param test_user_item_matrix: the held-out user-item pairs we make prediction against
        """
        self.model = model
        self.test_user_item_matrix = lil_matrix(test_user_item_matrix)
        self.user_to_test_set = {u: set(row)
                                 for u, row in enumerate(self.test_user_item_matrix.rows) if row}

        # The None check has to happen BEFORE the conversion to lil_matrix,
        # otherwise it can never be true and the None case crashes.
        if train_user_item_matrix is not None:
            self.train_user_item_matrix = lil_matrix(train_user_item_matrix)
            self.user_to_train_set = {u: set(row)
                                      for u, row in enumerate(self.train_user_item_matrix.rows) if row}
            self.max_train_count = max((len(row) for row in self.train_user_item_matrix.rows),
                                       default=0)
        else:
            self.train_user_item_matrix = None
            self.user_to_train_set = {}
            self.max_train_count = 0

        # top-k ops keyed by the number of requested candidates, so that repeated
        # calls to eval() reuse one graph node instead of adding a new one each time
        self._top_k_ops = {}

    @staticmethod
    def _recall_at_k(ranked_items, train_set, test_set, k):
        """
        Recall@k of one user's ranking.

        Items found in ``train_set`` are skipped and do not count towards ``k``.
        ``test_set`` must be non-empty, otherwise a ZeroDivisionError is raised.
        """
        top_n_items = 0
        hits = 0
        for i in ranked_items:
            # ignore item in the training set
            if i in train_set:
                continue
            if i in test_set:
                hits += 1
            top_n_items += 1
            if top_n_items == k:
                break
        return hits / float(len(test_set))

    def eval(self, sess, users, k=50):
        """
        Compute the Top-K recall for a particular user given the predicted scores to items
        :param sess: the TensorFlow session holding the model's variables
        :param users: the users to eval the recall; each must have at least one held-out item
        :param k: compute the recall for the top K items
        :return: recall@K, one value per entry of ``users``
        """
        # compute the top (K +  Max Number Of Training Items for any user) items for each user,
        # so that K items remain after the training items are filtered out
        n_candidates = k + self.max_train_count
        top_k_op = self._top_k_ops.get(n_candidates)
        if top_k_op is None:
            top_k_op = tf.nn.top_k(self.model.item_scores, n_candidates)
            self._top_k_ops[n_candidates] = top_k_op

        _, user_tops = sess.run(top_k_op, {self.model.score_user_ids: users})

        return [self._recall_at_k(tops,
                                  self.user_to_train_set.get(user_id, set()),
                                  self.user_to_test_set.get(user_id, set()),
                                  k)
                for user_id, tops in zip(users, user_tops)]

## Sampler

In [0]:
import numpy
from multiprocessing import Process, Queue
from scipy.sparse import lil_matrix

In [0]:
# each process will run this function in parallel to generate 
# (user, pos-item) and for each (user,pos-item) pair, fetch N negatives

def sample_function(user_item_matrix, batch_size, n_negative, result_queue, check_negative=True):
    """
    Worker loop: endlessly push (user-positive-item pairs, negative items) batches to a queue.

    :param user_item_matrix: the user-item matrix for positive user-item pairs
    :param batch_size: number of samples to return
    :param n_negative: number of negative samples per user-positive-item pair
    :param result_queue: the output queue; each item put on it is a tuple of arrays
           of shape (batch_size, 2) and (batch_size, n_negative)
    :param check_negative: when True, re-draw any negative sample that is actually a
           positive item of the corresponding user
    :return: None (the function never returns normally)
    :raises ValueError: if there are fewer positive pairs than ``batch_size``
    """
    user_item_matrix = lil_matrix(user_item_matrix)
    n_items = user_item_matrix.shape[1]
    # nonzero() returns two aligned index arrays (users, items);
    # after the transpose user_item_pairs has shape (n_interactions, 2)
    user_item_pairs = numpy.asarray(user_item_matrix.nonzero()).T
    # key: user, value: set of the user's positive items
    user_to_positive_set = {u: set(row) for u, row in enumerate(user_item_matrix.rows)}

    # Only full batches are emitted. Without a single full batch the loop below
    # would spin forever without ever producing anything, so fail loudly instead.
    n_batches = len(user_item_pairs) // batch_size
    if n_batches == 0:
        raise ValueError("batch_size ({}) is larger than the number of positive pairs ({})".format(
            batch_size, len(user_item_pairs)))

    # Use a private generator seeded from OS entropy. Worker processes created by
    # fork inherit an identical copy of the global numpy random state, which would
    # make every worker produce exactly the same sequence of batches.
    rng = numpy.random.RandomState()

    # loop forever; the bounded queue blocks the worker whenever it is full
    while True:
        rng.shuffle(user_item_pairs)
        for batch_index in range(n_batches):

            user_positive_items_pairs = user_item_pairs[batch_index * batch_size:
                                                        (batch_index + 1) * batch_size, :]

            # sample negative samples
            negative_samples = rng.randint(0, n_items, size=(batch_size, n_negative))

            # Check if we sample any positive items as negative samples.
            # Note: this step can be optional as the chance that we sample a positive item is fairly low given a
            # large item set.
            if check_negative:
                for row, user_positive in enumerate(user_positive_items_pairs):
                    positives = user_to_positive_set[user_positive[0]]
                    for col in range(n_negative):
                        neg = negative_samples[row, col]
                        while neg in positives:
                            negative_samples[row, col] = neg = rng.randint(0, n_items)
            result_queue.put((user_positive_items_pairs, negative_samples))



In [0]:
# create warp sampler
# A generator that, in parallel, generates tuples: 
# user-positive-item pairs, negative-items
# of the shapes (Batch Size, 2) and (Batch Size, N_Negative)

class WarpSampler(object):
    """
    Produces training batches in background processes.

    Every worker runs ``sample_function`` and pushes tuples
    (user-positive-item pairs, negative items) of the shapes
    (batch_size, 2) and (batch_size, n_negative) to a shared bounded queue.
    Can be used as a context manager, which calls ``close()`` on exit.
    """

    def __init__(self, user_item_matrix, batch_size=10000, n_negative=10, n_workers=5, check_negative=True):
        """
        :param user_item_matrix: the user-item matrix for positive user-item pairs
        :param batch_size: number of user-positive-item pairs per batch
        :param n_negative: number of negative samples per user-positive-item pair
        :param n_workers: number of sampling processes to start
        :param check_negative: forwarded to ``sample_function``
        """
        # at most two pending batches per worker; workers block on put() when it is full
        self.result_queue = Queue(maxsize=n_workers*2)
        self.processors = []
        for _ in range(n_workers):
            worker = Process(target=sample_function, args=(user_item_matrix,
                                                           batch_size,
                                                           n_negative,
                                                           self.result_queue,
                                                           check_negative))
            # The workers loop forever; marking them as daemons lets the interpreter
            # exit even when close() was never called.
            worker.daemon = True
            worker.start()
            self.processors.append(worker)

    def next_batch(self):
        """Block until a batch is available and return it."""
        return self.result_queue.get()

    def close(self):
        """Terminate and join all the worker processes. Safe to call more than once."""
        for p in self.processors:  # type: Process
            p.terminate()
            p.join()
        self.processors = []

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

# Model

In [0]:
import functools
import numpy
import tensorflow as tf
import toolz
from tqdm import tqdm

In [0]:

def doublewrap(function):
    """
    Turn ``function`` (a decorator taking the wrapped callable as its first
    argument, followed by optional arguments only) into a decorator usable
    both bare (``@deco``) and with arguments (``@deco(...)``).

    A call with exactly one positional callable argument and no keyword
    arguments is treated as the bare form.
    """

    @functools.wraps(function)
    def decorator(*args, **kwargs):
        bare_usage = len(args) == 1 and not kwargs and callable(args[0])
        if bare_usage:
            # @deco -> args[0] is the callable being decorated
            return function(args[0])

        # @deco(...) -> hand back the real decorator with the arguments bound
        def bind(wrapee):
            return function(wrapee, *args, **kwargs)

        return bind

    return decorator


@doublewrap
def define_scope(function, scope=None, *args, **kwargs):
    """
    Decorator turning a graph-building method into a lazily evaluated,
    cached property.

    The wrapped method runs at most once per instance, inside a
    tf.variable_scope(); its result is stored on the instance and returned
    by every later access, so its operations are added to the graph only
    once. Extra decorator arguments are forwarded to the variable scope,
    whose name defaults to the name of the wrapped method.
    """
    scope_name = scope or function.__name__
    cache_attr = '_cache_' + function.__name__

    @property
    @functools.wraps(function)
    def decorator(self):
        # already built for this instance: reuse the stored result
        if hasattr(self, cache_attr):
            return getattr(self, cache_attr)

        with tf.variable_scope(scope_name, *args, **kwargs):
            result = function(self)
            setattr(self, cache_attr, result)
        return result

    return decorator


In [0]:
class CML(object):
    """
    TensorFlow (1.x graph mode) model that learns user and item embeddings in a
    joint space with a hinge loss on squared Euclidean distances, optionally
    combined with a feature-projection loss and a covariance loss.

    Every graph component is a ``define_scope`` property: it is built on first
    access and cached on the instance afterwards.
    """

    def __init__(self,
                 n_users,
                 n_items,
                 embed_dim=20,
                 features=None,
                 margin=1.5,
                 master_learning_rate=0.1,
                 clip_norm=1.0,
                 hidden_layer_dim=128,
                 dropout_rate=0.2,
                 feature_l2_reg=0.1,
                 feature_projection_scaling_factor=0.5,
                 use_rank_weight=True,
                 use_cov_loss=True,
                 cov_loss_weight=0.1
                 ):
        """

        :param n_users: number of users i.e. |U|
        :param n_items: number of items i.e. |V|
        :param embed_dim: embedding size i.e. K (default 20)
        :param features: (optional) the feature vectors of items, shape: (|V|, N_Features).
               Set it to None will disable feature loss(default: None)
        :param margin: hinge loss threshold i.e. z
        :param master_learning_rate: learning rate passed to the Adam optimizers built in ``optimize``
        :param clip_norm: clip norm threshold (default 1.0)
        :param hidden_layer_dim: the size of feature projector's hidden layer (default: 128)
        :param dropout_rate: the dropout rate between the hidden layer to final feature projection layer
        :param feature_l2_reg: feature loss weight
        :param feature_projection_scaling_factor: scale the feature projection before compute l2 loss. Ideally,
               the scaled feature projection should be mostly within the clip_norm
        :param use_rank_weight: whether to use rank weight
        :param use_cov_loss: use covariance loss to discourage redundancy in the user/item embedding
        :param cov_loss_weight: multiplier applied to the covariance loss
        """
        self.n_users = n_users
        self.n_items = n_items
        self.embed_dim = embed_dim

        self.clip_norm = clip_norm
        self.margin = margin
        # the item features are embedded in the graph as a float32 constant
        if features is not None:
            self.features = tf.constant(features, dtype=tf.float32)
        else:
            self.features = None

        self.master_learning_rate = master_learning_rate
        self.hidden_layer_dim = hidden_layer_dim
        self.dropout_rate = dropout_rate
        self.feature_l2_reg = feature_l2_reg
        self.feature_projection_scaling_factor = feature_projection_scaling_factor
        self.use_rank_weight = use_rank_weight
        self.use_cov_loss = use_cov_loss
        self.cov_loss_weight = cov_loss_weight


        # training inputs: (user, positive item) index pairs and, per pair, the sampled negative items
        self.user_positive_items_pairs = tf.placeholder(tf.int32, [None, 2], 'user_pos_items')
        self.negative_samples = tf.placeholder(tf.int32, [None, None], name='neg_items')
        # evaluation input: the ids of the users to score all the items for
        self.score_user_ids = tf.placeholder(tf.int32, [None], name='eval_user_ids')


        # Accessing the properties builds the corresponding parts of the graph
        # (each property body runs once, on first access).
        self.user_embeddings
        self.item_embeddings
        self.embedding_loss
        self.feature_loss
        self.loss
        self.optimize


    @define_scope
    def user_embeddings(self):
        """
        :return: the user embedding variable, shape (n_users, embed_dim), initialized
                 from a normal distribution with stddev 1 / sqrt(embed_dim)
        """
        return tf.Variable(tf.random_normal([self.n_users, self.embed_dim],
                                            stddev=1 / (self.embed_dim ** 0.5), dtype=tf.float32))

    @define_scope
    def item_embeddings(self):
        """
        :return: the item embedding variable, shape (n_items, embed_dim), initialized
                 from a normal distribution with stddev 1 / sqrt(embed_dim)
        """
        return tf.Variable(tf.random_normal([self.n_items, self.embed_dim],
                                            stddev=1 / (self.embed_dim ** 0.5), dtype=tf.float32))

    @define_scope
    def mlp_layer_1(self):
        """
        :return: the hidden layer of the feature projector: a ReLU dense layer with
                 hidden_layer_dim units applied to the item features
        """
        return tf.layers.dense(inputs=self.features,
                               units=self.hidden_layer_dim,
                               activation=tf.nn.relu, name="mlp_layer_1")

    @define_scope
    def mlp_layer_2(self):
        """
        :return: the output layer of the feature projector: dropout followed by a
                 linear dense layer with embed_dim units
        """
        # NOTE(review): no `training` argument is passed here. Per the TF 1.x docs
        # tf.layers.dropout defaults to training=False, in which case it returns its
        # input unchanged — confirm whether dropout was meant to be active while training.
        dropout = tf.layers.dropout(inputs=self.mlp_layer_1, rate=self.dropout_rate)
        return tf.layers.dense(inputs=dropout, units=self.embed_dim, name="mlp_layer_2")

    @define_scope
    def feature_projection(self):
        """
        :return: the projection of the feature vectors to the user-item embedding,
                 or None when the model was created without features
        """

        # feature loss
        if self.features is not None:
            # fully-connected layer
            output = self.mlp_layer_2 * self.feature_projection_scaling_factor

            # projection to the embedding
            return tf.clip_by_norm(output, self.clip_norm, axes=[1], name="feature_projection")

    @define_scope
    def feature_loss(self):
        """
        :return: the l2 loss of the distance between the items' embeddings and their feature
                 projections, scaled by feature_l2_reg; a constant 0 when there are no features
        """
        loss = tf.constant(0, dtype=tf.float32)
        if self.feature_projection is not None:
            # the distance between feature projection and the item's actual location in the embedding
            feature_distance = tf.reduce_sum(tf.squared_difference(
                self.item_embeddings,
                self.feature_projection), 1)

            # apply regularization weight
            loss += tf.reduce_sum(feature_distance, name="feature_loss") * self.feature_l2_reg

        return loss
    @define_scope
    def covariance_loss(self):
        """
        :return: the sum of the off-diagonal entries of the covariance matrix of the
                 concatenated item and user embeddings, multiplied by cov_loss_weight
        """

        X = tf.concat((self.item_embeddings, self.user_embeddings), 0)
        n_rows = tf.cast(tf.shape(X)[0], tf.float32)
        # center every embedding dimension before computing the covariance
        X = X - (tf.reduce_mean(X, axis=0))
        cov = tf.matmul(X, X, transpose_a=True) / n_rows

        # zero the diagonal (the variances) so that only the cross-dimension terms are summed
        return tf.reduce_sum(tf.matrix_set_diag(cov, tf.zeros(self.embed_dim, tf.float32))) * self.cov_loss_weight

    @define_scope
    def embedding_loss(self):
        """
        :return: the distance metric loss
        """
        # Let
        # N = batch size,
        # K = embedding size,
        # W = number of negative samples per a user-positive-item pair

        # user embedding (N, K)
        users = tf.nn.embedding_lookup(self.user_embeddings,
                                       self.user_positive_items_pairs[:, 0],
                                       name="users")

        # positive item embedding (N, K)
        pos_items = tf.nn.embedding_lookup(self.item_embeddings, self.user_positive_items_pairs[:, 1],
                                           name="pos_items")
        # positive item to user distance (N)
        pos_distances = tf.reduce_sum(tf.squared_difference(users, pos_items), 1, name="pos_distances")

        # negative item embedding (N, K, W)
        neg_items = tf.transpose(tf.nn.embedding_lookup(self.item_embeddings, self.negative_samples),
                                 (0, 2, 1), name="neg_items")
        # distance to negative items (N x W)
        distance_to_neg_items = tf.reduce_sum(tf.squared_difference(tf.expand_dims(users, -1), neg_items), 1,
                                              name="distance_to_neg_items")

        # best negative item (among W negative samples) their distance to the user embedding (N)
        closest_negative_item_distances = tf.reduce_min(distance_to_neg_items, 1, name="closest_negative_distances")

        # compute hinge loss (N)
        loss_per_pair = tf.maximum(pos_distances - closest_negative_item_distances + self.margin, 0,
                                   name="pair_loss")

        if self.use_rank_weight:
            # indicator matrix for impostors (N x W)
            impostors = (tf.expand_dims(pos_distances, -1) - distance_to_neg_items + self.margin) > 0
            # approximate the rank of positive item by (number of impostor / W per user-positive pair)
            rank = tf.reduce_mean(tf.cast(impostors, dtype=tf.float32), 1, name="rank_weight") * self.n_items
            # apply rank weight
            loss_per_pair *= tf.log(rank + 1)

        # the embedding loss
        loss = tf.reduce_sum(loss_per_pair, name="loss")

        return loss

    @define_scope
    def loss(self):
        """
        :return: the total loss = embedding loss + feature loss (+ covariance loss when use_cov_loss is set)
        """
        loss = self.embedding_loss + self.feature_loss
        if self.use_cov_loss:
            loss += self.covariance_loss
        return loss

    @define_scope
    def clip_by_norm_op(self):
        """
        :return: the two assign ops that clip every row of the user and item embeddings to clip_norm
        """
        return [tf.assign(self.user_embeddings, tf.clip_by_norm(self.user_embeddings, self.clip_norm, axes=[1])),
                tf.assign(self.item_embeddings, tf.clip_by_norm(self.item_embeddings, self.clip_norm, axes=[1]))]

    @define_scope
    def optimize(self):
        """
        :return: the list of ops of one training step: the optimizer update(s) followed by the clipping ops
        """
        # Two Adam optimizers sharing master_learning_rate. The first one minimizes the total
        # loss, un-normalized, w.r.t. the user/item embeddings only.
        # The second one minimizes the feature loss normalized by the number of items.
        gds = []

        # update user_embeddings and item_embeddings only (the feature projector is not in var_list)
        gds.append(tf.train
                   .AdamOptimizer(self.master_learning_rate)
                   .minimize(self.loss, var_list=[self.user_embeddings, self.item_embeddings]))

        # No var_list is given here, so minimize() falls back to its default variable set.
        # NOTE(review): presumably that includes the embeddings as well as the feature
        # projector weights — confirm against the tf.train.Optimizer.minimize docs.
        if self.feature_projection is not None:
            gds.append(tf.train
                       .AdamOptimizer(self.master_learning_rate)
                       .minimize(self.feature_loss / self.n_items))

        # clip_by_norm_op is first accessed (hence built) inside this context.
        # NOTE(review): the intent appears to be that clipping runs after the updates — confirm.
        with tf.control_dependencies(gds):
            return gds + [self.clip_by_norm_op]

    @define_scope
    def item_scores(self):
        """
        :return: the score of every item for each user in score_user_ids, defined as the
                 negative squared distance between the user and the item embeddings
        """
        # (N_USER_IDS, 1, K)
        user = tf.expand_dims(tf.nn.embedding_lookup(self.user_embeddings, self.score_user_ids), 1)
        # (1, N_ITEM, K)
        item = tf.expand_dims(self.item_embeddings, 0)
        # score = minus distance (N_USER, N_ITEM)
        return -tf.reduce_sum(tf.squared_difference(user, item), 2, name="scores")
        

In [0]:
# number of (user, positive item) pairs per mini-batch produced by the sampler
BATCH_SIZE = 50000
# number of negative items sampled for every (user, positive item) pair
N_NEGATIVE = 20
# number of training mini-batches run between two validation recall reports
EVALUATION_EVERY_N_BATCHES = 30
# size of the user/item embedding vectors
EMBED_DIM = 100

In [0]:
sampler = WarpSampler(train, batch_size=BATCH_SIZE, n_negative=N_NEGATIVE, check_negative=True)

In [0]:
def optimize(model, sampler, train, valid):
    """
    Optimize the model. TODO: implement early-stopping

    Runs forever, alternating between a recall report on a sample of the
    validation users and EVALUATION_EVERY_N_BATCHES training mini-batches;
    interrupt it to stop. The TensorFlow session is closed when the function
    exits, including on KeyboardInterrupt.

    :param model: model to optimize
    :param sampler: mini-batch sampler
    :param train: train user-item matrix
    :param valid: validation user-item matrix
    :return: None
    """
    # the context manager releases the session (and its device memory) on any exit
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        if model.feature_projection is not None:
            # initialize item embedding with feature projection
            sess.run(tf.assign(model.item_embeddings, model.feature_projection))

        # sample some users to calculate recall validation; never ask for more
        # users than available, which would make choice(replace=False) fail
        candidate_users = list(set(valid.nonzero()[0]))
        valid_users = numpy.random.choice(candidate_users,
                                          size=min(1000, len(candidate_users)),
                                          replace=False)

        # create evaluator on validation set; the matrices never change, so build it once
        validation_recall = RecallEvaluator(model, train, valid)

        while True:
            # compute recall on validate set
            valid_recalls = []

            # compute recall in chunks to utilize speedup provided by Tensorflow
            for user_chunk in toolz.partition_all(100, valid_users):
                # keep a flat list of per-user recalls so that the mean stays
                # well-defined when the last chunk is shorter than the others
                valid_recalls.extend(validation_recall.eval(sess, user_chunk))
            print("\nRecall on (sampled) validation set: {}".format(numpy.mean(valid_recalls)))
            # TODO: early stopping based on validation recall

            # train model
            losses = []
            # run n mini-batches
            for _ in tqdm(range(EVALUATION_EVERY_N_BATCHES), desc="Optimizing..."):
                user_pos, neg = sampler.next_batch()
                _, loss = sess.run((model.optimize, model.loss),
                                   {model.user_positive_items_pairs: user_pos,
                                    model.negative_samples: neg})

                losses.append(loss)

            print("\nTraining loss {}".format(numpy.mean(losses)))



## Run

In [0]:
# WITHOUT features
# Train a user-item joint embedding, where the items a user likes will be pulled closer to this users.
# Once the embedding is trained, the recommendations are made by finding the k-Nearest-Neighbor to each user.

'''
model = CML(n_users,
            n_items,
            # set features to None to disable feature projection
            features=None,
            # size of embedding
            embed_dim=EMBED_DIM,
            # the size of hinge loss margin.
            margin=1.9,
            # clip the embedding so that their norm <= clip_norm
            clip_norm=1,
            # learning rate for Adam
            master_learning_rate=0.001,

            # whether to enable rank weight. If True, the loss will be scaled by the estimated
            # log-rank of the positive items. If False, no weight will be applied.

            # This is particularly useful to speed up the training for large item set.

            # Weston, Jason, Samy Bengio, and Nicolas Usunier.
            # "Wsabie: Scaling up to large vocabulary image annotation." IJCAI. Vol. 11. 2011.
            use_rank_weight=True,

            # whether to enable covariance regularization to encourage efficient use of the vector space.
            # More useful when the size of embedding is smaller (e.g. < 20 ).
            use_cov_loss=False,

            # weight of the cov_loss
            cov_loss_weight=1
            )
'''

#optimize(model, sampler, train, valid)

# WITH features
# In this case, we additionally train a feature projector to project raw item features into the
# embedding. The projection serves as "a prior" to inform the item's potential location in the embedding.
# We use a two fully-connected layers NN as our feature projector. (This model is much more computation intensive.
# A GPU machine is recommended)
model = CML(n_users,
            n_items,
            # enable feature projection
            features=dense_features,
            embed_dim=EMBED_DIM,
            margin=2.0,
            clip_norm=1.1,
            master_learning_rate=0.001,
            # the size of the hidden layer in the feature projector NN
            hidden_layer_dim=512,
            # dropout rate between hidden layer and output layer in the feature projector NN
            dropout_rate=0.3,
            # scale the output of the NN so that the magnitude of the NN output is closer to the item embedding
            feature_projection_scaling_factor=1,
            # the penalty to the distance between projection and item's actual location in the embedding
            # tune this to adjust how much the embedding should be biased towards the item features.
            feature_l2_reg=0.1,

            # whether to enable rank weight. If True, the loss will be scaled by the estimated
            # log-rank of the positive items. If False, no weight will be applied.

            # This is particularly useful to speed up the training for large item set.

            # Weston, Jason, Samy Bengio, and Nicolas Usunier.
            # "Wsabie: Scaling up to large vocabulary image annotation." IJCAI. Vol. 11. 2011.
            use_rank_weight=True,

            # whether to enable covariance regularization to encourage efficient use of the vector space.
            # More useful when the size of embedding is smaller (e.g. < 20 ).
            use_cov_loss=True,

            # weight of the cov_loss
            cov_loss_weight=1
            )

In [25]:
optimize(model, sampler, train, valid)

Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.0016383116883116884


Optimizing...: 100%|██████████| 30/30 [00:49<00:00,  1.88s/it]



Training loss 1124511.75


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.0032647546454727307


Optimizing...: 100%|██████████| 30/30 [01:02<00:00,  1.93s/it]



Training loss 1028574.1875


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.029292800587997603


Optimizing...: 100%|██████████| 30/30 [01:02<00:00,  2.05s/it]



Training loss 970049.5625


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.08448525566066117


Optimizing...: 100%|██████████| 30/30 [01:02<00:00,  2.37s/it]



Training loss 923925.625


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.13494490023424277


Optimizing...: 100%|██████████| 30/30 [01:00<00:00,  1.87s/it]



Training loss 883120.75


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.17766364605711704


Optimizing...: 100%|██████████| 30/30 [01:02<00:00,  2.06s/it]



Training loss 843492.75


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.21492896631396785


Optimizing...: 100%|██████████| 30/30 [01:01<00:00,  1.94s/it]



Training loss 803676.0625


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.2515709561003286


Optimizing...: 100%|██████████| 30/30 [01:01<00:00,  1.77s/it]



Training loss 763691.9375


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.2667060986743216


Optimizing...: 100%|██████████| 30/30 [01:03<00:00,  2.26s/it]



Training loss 723543.3125


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.28197055978245406


Optimizing...: 100%|██████████| 30/30 [01:02<00:00,  2.29s/it]



Training loss 683006.5625


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.2850923503213439


Optimizing...: 100%|██████████| 30/30 [01:01<00:00,  1.88s/it]



Training loss 642707.0


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.2936665777129699


Optimizing...: 100%|██████████| 30/30 [01:03<00:00,  2.33s/it]



Training loss 602380.0


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.3007926875543155


Optimizing...: 100%|██████████| 30/30 [01:01<00:00,  1.96s/it]



Training loss 569565.6875


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.3062782155450583


Optimizing...: 100%|██████████| 30/30 [01:01<00:00,  2.19s/it]



Training loss 543835.375


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.3043429385742585


Optimizing...: 100%|██████████| 30/30 [01:00<00:00,  1.66s/it]



Training loss 528104.125


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.3077634118627458


Optimizing...: 100%|██████████| 30/30 [01:02<00:00,  1.90s/it]



Training loss 515471.6875


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.30559217742387945


Optimizing...: 100%|██████████| 30/30 [01:03<00:00,  2.31s/it]



Training loss 506996.4375


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.3066431415350338


Optimizing...: 100%|██████████| 30/30 [01:02<00:00,  2.37s/it]



Training loss 499637.09375


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.30524213879912054


Optimizing...: 100%|██████████| 30/30 [01:01<00:00,  2.32s/it]



Training loss 494370.3125


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.305382014411204


Optimizing...: 100%|██████████| 30/30 [01:02<00:00,  2.39s/it]



Training loss 488775.21875


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.3059263902095092


Optimizing...: 100%|██████████| 30/30 [01:01<00:00,  2.38s/it]



Training loss 484852.3125


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.30499439454282506


Optimizing...: 100%|██████████| 30/30 [01:01<00:00,  1.94s/it]



Training loss 480915.5625


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.30254963214458375


Optimizing...: 100%|██████████| 30/30 [01:02<00:00,  2.33s/it]



Training loss 477416.125


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.29860009523478265


Optimizing...: 100%|██████████| 30/30 [01:02<00:00,  2.43s/it]



Training loss 474817.03125


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.2974426060577856


Optimizing...: 100%|██████████| 30/30 [01:00<00:00,  2.27s/it]



Training loss 471428.53125


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.30583635876551957


Optimizing...:  17%|█▋        | 5/30 [00:07<00:44,  1.79s/it]


KeyboardInterrupt: ignored

In [29]:
# Run the training loop; the output below alternates a validation-recall
# report with a 30-step "Optimizing..." progress bar and the training loss.
optimize(model, sampler, train, valid)

Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.00040854978354978356


Optimizing...: 100%|██████████| 30/30 [00:50<00:00,  1.57s/it]



Training loss 1124872.125


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.0045845633616517795


Optimizing...: 100%|██████████| 30/30 [01:00<00:00,  1.60s/it]



Training loss 1028325.5625


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.03862928372630982


Optimizing...: 100%|██████████| 30/30 [01:00<00:00,  1.65s/it]



Training loss 968517.625


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.08326469167923764


Optimizing...: 100%|██████████| 30/30 [01:00<00:00,  1.71s/it]



Training loss 922649.9375


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.14150332877752242


Optimizing...: 100%|██████████| 30/30 [01:01<00:00,  2.09s/it]



Training loss 881353.0


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.19787792202355747


Optimizing...: 100%|██████████| 30/30 [01:00<00:00,  1.99s/it]



Training loss 841243.0625


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.23785660051875832


Optimizing...: 100%|██████████| 30/30 [01:00<00:00,  1.96s/it]



Training loss 801391.875


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.25938440136905677


Optimizing...: 100%|██████████| 30/30 [01:00<00:00,  2.20s/it]



Training loss 760502.0


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.2854228630997434


Optimizing...: 100%|██████████| 30/30 [01:00<00:00,  2.41s/it]



Training loss 719331.0


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.2958023261147635


Optimizing...: 100%|██████████| 30/30 [00:59<00:00,  2.07s/it]



Training loss 679528.4375


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.3082457127193138


Optimizing...: 100%|██████████| 30/30 [01:00<00:00,  1.85s/it]



Training loss 638196.9375


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.31032007736535117


Optimizing...: 100%|██████████| 30/30 [00:59<00:00,  2.01s/it]



Training loss 598607.625


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.32097388446742375


Optimizing...: 100%|██████████| 30/30 [01:00<00:00,  1.84s/it]



Training loss 566449.5625


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.32634969700589744


Optimizing...: 100%|██████████| 30/30 [01:01<00:00,  2.33s/it]



Training loss 543756.5625


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.3297015587222254


Optimizing...: 100%|██████████| 30/30 [00:59<00:00,  2.21s/it]



Training loss 527899.75


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.3212603423730331


Optimizing...: 100%|██████████| 30/30 [00:59<00:00,  2.09s/it]



Training loss 515877.96875


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.32504738989942733


Optimizing...: 100%|██████████| 30/30 [00:59<00:00,  1.91s/it]



Training loss 508100.90625


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.3229886360710546


Optimizing...: 100%|██████████| 30/30 [01:00<00:00,  2.15s/it]



Training loss 500490.28125


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.3200317332026791


Optimizing...: 100%|██████████| 30/30 [00:59<00:00,  1.97s/it]



Training loss 494982.4375


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.3202092644756774


Optimizing...: 100%|██████████| 30/30 [01:00<00:00,  2.09s/it]



Training loss 489810.34375


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.32317859678766947


Optimizing...: 100%|██████████| 30/30 [00:58<00:00,  1.74s/it]



Training loss 486082.15625


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.32274721703005205


Optimizing...: 100%|██████████| 30/30 [00:59<00:00,  1.84s/it]



Training loss 482077.9375


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.319208460155125


Optimizing...: 100%|██████████| 30/30 [01:00<00:00,  2.17s/it]



Training loss 478802.21875


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.3212613430316729


Optimizing...: 100%|██████████| 30/30 [01:01<00:00,  2.62s/it]



Training loss 475900.15625


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.3186211369525877


Optimizing...: 100%|██████████| 30/30 [00:58<00:00,  2.34s/it]



Training loss 471882.84375


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.3253496016869533


Optimizing...: 100%|██████████| 30/30 [00:58<00:00,  1.87s/it]



Training loss 469329.59375


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.31989974640891855


Optimizing...: 100%|██████████| 30/30 [00:59<00:00,  2.01s/it]



Training loss 467386.46875


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.31916385493496235


Optimizing...: 100%|██████████| 30/30 [00:58<00:00,  1.99s/it]



Training loss 464687.5


Optimizing...:   0%|          | 0/30 [00:00<?, ?it/s]


Recall on (sampled) validation set: 0.3177630212563393


Optimizing...:  13%|█▎        | 4/30 [00:05<00:34,  1.34s/it]


KeyboardInterrupt: ignored

# Visualize the TensorFlow graph

In [0]:
from IPython.display import clear_output, Image, display, HTML
import tensorflow as tf
import numpy as np
from google.colab import files

def strip_consts(graph_def, max_const_size=32):
    """Copy ``graph_def`` into a new GraphDef, shrinking large constants.

    Each node of ``graph_def`` is merged into a freshly added node of a new
    ``tf.GraphDef``. For copied nodes whose op is ``'Const'`` and whose
    ``tensor_content`` is longer than ``max_const_size`` bytes, the copy's
    content is replaced by a UTF-8 ``<stripped N bytes>`` marker, where N is
    the original length.

    Args:
        graph_def: object exposing a ``node`` sequence of graph nodes.
        max_const_size: largest ``tensor_content`` length (in bytes) that is
            kept as-is; anything strictly longer is replaced.

    Returns:
        The newly built GraphDef.
    """
    slimmed = tf.GraphDef()
    for source_node in graph_def.node:
        copied = slimmed.node.add()
        copied.MergeFrom(source_node)
        # Only constant nodes carry a payload worth trimming.
        if copied.op != 'Const':
            continue
        payload = copied.attr['value'].tensor
        n_bytes = len(payload.tensor_content)
        if n_bytes <= max_const_size:
            continue
        marker = "<stripped %d bytes>" % n_bytes
        payload.tensor_content = marker.encode("utf8")
    return slimmed

def show_graph(train_graph, max_const_size=32):
    """Visualize a TensorFlow graph inline in the notebook.

    The graph definition is passed through ``strip_consts``, embedded as text
    in a small HTML page that loads the ``tf-graph-basic`` component, and
    shown inside an iframe via ``IPython.display``.

    Args:
        train_graph: either an object exposing ``as_graph_def()`` (such as a
            ``tf.Graph``) or an already-built graph definition, which is used
            as-is.
        max_const_size: forwarded to ``strip_consts``; constants whose
            ``tensor_content`` exceeds this many bytes are replaced by a
            placeholder before rendering.

    Returns:
        None. The iframe is rendered as a side effect.
    """
    # Accept both a Graph and a GraphDef: only convert when conversion exists.
    if hasattr(train_graph, 'as_graph_def'):
        graph_def = train_graph.as_graph_def()
    else:
        graph_def = train_graph
    strip_def = strip_consts(graph_def, max_const_size=max_const_size)
    # NOTE(review): the page depends on an externally hosted HTML import;
    # confirm it still loads in the browsers this notebook targets.
    # A random element id keeps multiple rendered graphs from colliding.
    code = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))

    # The page is placed in the iframe's srcdoc attribute, so its double
    # quotes must be escaped to avoid terminating the attribute early.
    iframe = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """.format(code.replace('"', '&quot;'))
    display(HTML(iframe))

In [24]:
# Render the current default TensorFlow graph inline.
show_graph(tf.get_default_graph())