data from http://jmcauley.ucsd.edu/data/tradesy/
```
wget http://jmcauley.ucsd.edu/data/tradesy/tradesy.json.gz
wget http://jmcauley.ucsd.edu/data/tradesy/tradesy_item_urls.json.gz
```


In [1]:
import tensorflow as tf
import os
import pickle
import random
import gzip
import struct
from collections import defaultdict
import gensim as gs
import numpy as np
try:
    # noinspection PyUnresolvedReferences
    if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
        print("notebook")
        from tqdm import tqdm_notebook as tqdm
    else:
        raise RuntimeError
except (NameError, RuntimeError):
    from tqdm import tqdm

notebook


In [2]:
user_data = eval(gzip.open("./tradesy.json.gz", 'r').read())

In [3]:
def load_data(user_data, max_uid=1000000):
    line_list =[]
    user_list = []
    item_dic = {}
    few_buyers =[]
    data = defaultdict(set)
    for d  in user_data:
        user_id = d["uid"]
        items = d["lists"]["bought"]
        
        item_list = [int(i) for i in items]
        if item_list:
            max_i = max(item_list)
            if max_i < max_uid:
                user_list.append(user_id)
                line_list.append(items)
            
    dictionary = gs.corpora.Dictionary(line_list)
    dictionary.filter_extremes(no_below=1)
    dictionary.compactify()
    for u, items in zip(user_list, line_list):
        data[u].update([dictionary.token2id[item] for item in items if item in dictionary.token2id])
    for u,i in data.items():
        if len(i) < 5:   # 5 same as the paper's
            few_buyers.append(u)
    for u in few_buyers:
        del data[u]
    d = {}
    user_list = []
    for idx,(u,i) in enumerate(data.items()):
        d[idx] = i
        user_list.append(u)
    user_count = len(data.keys())
    item_count = len(dictionary)
    return user_count, item_count, d, dictionary, user_list

In [4]:
def generate_test(data):
    user_test = dict()
    for u, i_list in data.items():
        if i_list:
            user_test[u] = np.random.choice(list(i_list))
    return user_test

In [5]:
max_uid=1000000
user_count, item_count, data, dictionary,u_list  = load_data(user_data, max_uid)
print("item count: ", item_count)
print("user count: ", user_count)
ui_test = generate_test(data)

item count:  32807
user count:  1076


In [6]:
#file_name = "./items.pickle"
#with open(file_name ,mode='wb') as f:
#    pickle.dump(list(dictionary.token2id.keys()), f, protocol=4)

In [7]:
def readImageFeatures(path, dictionary):
    f = open(path, 'rb')
    imgs = {}
    uids = dictionary.token2id.keys()
    count = 0
    while f:
        userId = f.read(10)
        userId = userId.strip()
        if userId == '':
            break
        uid =  userId.decode('ascii')
        if uid in uids:
            feature = [struct.unpack('f', f.read(4)) for _ in range(4096)]
            imgs[dictionary.token2id[uid]] = feature
            count += 1
            if count == len(uids):
                break
        else:
            f.read(4*4096)
            
    file_name = "./images.pickle"
    with open(file_name ,mode='wb') as f:
        pickle.dump(image_features, f, protocol=4)
    return imgs

In [8]:
#image_features = readImageFeatures("./image_features_tradesy.b", dictionary)

In [9]:
file_name = "./images.pickle"
with open(file_name, mode='rb') as f:
          image_features =  pickle.load(f)

In [10]:
len(image_features)

32807

In [11]:
len(image_features[13348])

4096

In [12]:
def uniform_sample_batch(train_ratings, item_count, image_features, sample_count=20000, batch_size=5):
    for i in range(sample_count):
        t = []
        iv = []
        jv = []
        for b in range(batch_size):
            u = random.sample(train_ratings.keys(), 1)[0]
            i = random.sample(train_ratings[u], 1)[0]
            j = random.randint(0, item_count-1)
            while j in train_ratings[u]:
                j = random.randint(0, item_count-1)
            t.append([u, i, j])
            iv.append(image_features[i])
            jv.append(image_features[j])
        yield np.asarray(t), np.hstack(tuple(iv)), np.hstack(tuple(jv))

def test_batch_generator_by_user(train_ratings, test_ratings, item_count, image_features, n_user=10):  
    for u in np.random.choice(list(test_ratings.keys()), n_user):
        i = test_ratings[u]
        t = []
        ilist = []
        jlist = []
        for j in range(item_count):
            if j != test_ratings[u] and not (j in train_ratings[u]):
                t.append([u, i, j])
                ilist.append(image_features[i])
                jlist.append(image_features[j])
        yield np.asarray(t), np.hstack(tuple(ilist)), np.hstack(tuple(jlist))

In [13]:
def weight_variable(shape):
    return tf.Variable(tf.random_normal(shape, mean=0.0, stddev=0.01))

def bias_variable(shape):
    return tf.Variable(tf.random_normal(shape, mean=0.0, stddev=0.01))

In [14]:
def vbpr(user_count, item_count, hidden_dim=20, hidden_img_dim=128, learning_rate = 0.001,l2_regulization = 1.0):
    image_dim = 4096
    u = tf.placeholder(tf.int32, [None])
    i = tf.placeholder(tf.int32, [None])
    j = tf.placeholder(tf.int32, [None])
    iv = tf.placeholder(tf.float32, [4096, None])
    jv = tf.placeholder(tf.float32, [4096, None])
    
    with tf.device("/cpu:0"):
        
        user_emb_w = weight_variable([user_count+1, hidden_dim])
        user_img_w = weight_variable([user_count+1, hidden_img_dim])
        item_emb_w = weight_variable([item_count+1, hidden_dim])
        item_b = bias_variable([item_count+1, 1])
        
        u_emb = tf.nn.embedding_lookup(user_emb_w, u)
        u_img = tf.nn.embedding_lookup(user_img_w, u)
        
        i_emb = tf.nn.embedding_lookup(item_emb_w, i)
        i_b = tf.nn.embedding_lookup(item_b, i)
        j_emb = tf.nn.embedding_lookup(item_emb_w, j)
        j_b = tf.nn.embedding_lookup(item_b, j)
    
    with tf.device("/cpu:0"):
   
        img_emb_w = weight_variable([4096, hidden_img_dim])

        img_i_j = tf.matmul(tf.transpose(iv - jv),img_emb_w)

        x = i_b - j_b + tf.reduce_sum(tf.matmul(u_emb, tf.transpose(i_emb - j_emb)), 1, keep_dims=True) +\
            tf.reduce_sum(tf.matmul(u_img, tf.transpose(img_i_j)),1, keep_dims=True)

        auc = tf.reduce_mean(tf.to_float(x > 0))

        l2_norm = tf.add_n([
                tf.reduce_sum(tf.norm(u_emb)), 
                tf.reduce_sum(tf.norm(u_img)),
                tf.reduce_sum(tf.norm(i_emb)),
                tf.reduce_sum(tf.norm(j_emb)),
                tf.reduce_sum(tf.norm(img_emb_w)),
                tf.reduce_sum(tf.norm(i_b)),
                tf.reduce_sum(tf.norm(j_b))
            ])

        loss = l2_regulization * l2_norm - tf.reduce_mean(tf.log(tf.sigmoid(x)))
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return u, i, j, iv, jv, loss, auc, train_op

In [15]:
test_n_user = 10 #len(ui_test)
sample_count = 500
n_epoch = 500


In [None]:
with tf.Graph().as_default(), tf.Session() as session:
    with tf.variable_scope('vbpr'):
        u, i, j, iv, jv, loss, auc, train_op = vbpr(user_count, item_count,learning_rate = 0.0001)
    
    session.run(tf.global_variables_initializer())
    for epoch in tqdm(range(n_epoch)):
        
        _loss_train = 0.0
        for d,i_img,j_img  in uniform_sample_batch(data, item_count, image_features, sample_count=sample_count):
            _loss, _ = session.run([loss, train_op], feed_dict={
                    u:d[:,0], i:d[:,1], j:d[:,2], iv: i_img, jv: j_img
                })
            _loss_train += _loss
            
        print("epoch ", epoch, " train_loss:", _loss_train/sample_count)
        
        if epoch % 10 == 0 and epoch != 0:
            _auc_all = 0.0
            _loss_test = 0.0
            for d,i_img,j_img in tqdm(test_batch_generator_by_user(data, ui_test, item_count, image_features, n_user=test_n_user)):
                _loss, _auc = session.run([loss, auc], feed_dict={
                        u:d[:,0], i:d[:,1], j:d[:,2], iv: i_img, jv: j_img
                })
                _loss_test += _loss
                _auc_all += _auc
            print( "test_loss: ", _loss_test/test_n_user, " auc: ", _auc_all/test_n_user)

epoch  0  train_loss: 4.55186868095
epoch  1  train_loss: 1.45956698442
epoch  2  train_loss: 1.27494984078
epoch  3  train_loss: 1.18805670714
epoch  4  train_loss: 1.10765860271
epoch  5  train_loss: 1.0338605653
epoch  6  train_loss: 0.973891965866
epoch  7  train_loss: 0.928053256631
epoch  8  train_loss: 0.900440600753
epoch  9  train_loss: 0.886892068744
epoch  10  train_loss: 0.878287965417
test_loss:  14.3501008034  auc:  0.499853638003
epoch  11  train_loss: 0.872071281791
epoch  12  train_loss: 0.861356754899
epoch  13  train_loss: 0.857188465357
epoch  14  train_loss: 0.851950132012
epoch  15  train_loss: 0.846854774833
epoch  16  train_loss: 0.842722093821
epoch  17  train_loss: 0.838809738874
epoch  18  train_loss: 0.83522105515
epoch  19  train_loss: 0.831964465499
epoch  20  train_loss: 0.827382264137
test_loss:  10.7508340836  auc:  0.5
epoch  21  train_loss: 0.824874441266
epoch  22  train_loss: 0.823237832308
epoch  23  train_loss: 0.82103567636
epoch  24  train_loss: