data from http://jmcauley.ucsd.edu/data/tradesy/
```
wget http://jmcauley.ucsd.edu/data/tradesy/tradesy.json.gz
wget http://jmcauley.ucsd.edu/data/tradesy/tradesy_item_urls.json.gz
```


In [23]:
import gzip
import numpy as np
import tensorflow as tf
from collections import defaultdict
import gensim as gs
try:
    # noinspection PyUnresolvedReferences
    if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
        print("notebook")
        from tqdm import tqdm_notebook as tqdm
    else:
        raise RuntimeError
except (NameError, RuntimeError):
    from tqdm import tqdm

notebook


In [24]:
user_data = eval(gzip.open("./tradesy.json.gz", 'r').read())

In [25]:
len(user_data)

128152

In [26]:
user_data[5]

{'lists': {'bought': ['845', '833', '829'],
  'selling': [],
  'sold': [],
  'want': []},
 'uid': '6'}

In [27]:
item_set = set()
user_set = set()
for d in user_data:
    item_set.update(d["lists"]["bought"])
    if len(d["lists"]["bought"]) >= 5:
        user_set.add(d["uid"])
len(item_set)
len(user_set)

2777

In [28]:
import struct

def readImageFeatures(path):
    f = open(path, 'rb')
    while True:
        userId = f.read(10)
        userId = userId.strip()
        if userId == '': break
        feature = []
        for i in range(4096):
            feature.append(struct.unpack('f', f.read(4)))
        yield userId, feature

In [29]:
#data = {}
#max_uid = -1
#max_iid = -1
#for d in userData:
#   if len(d["lists"]["bought"]) > 5:
#        data[d["uid"]] = d["lists"]["bought"]
#        max_iid = max(max_iid,max([int(itm) for itm in d["lists"]["bought"]]))
#    max_uid = max(max_uid, int(d["uid"]))

In [30]:
#print("real user num:", len(data))
#print("max uid: ", max_uid)
#print("max iid: ", max_iid)
#item_count = max_iid
#user_count = max_uid

In [31]:
def load_data(data_path):
    data = defaultdict(set)
    max_u_id = -1
    max_i_id = -1
    with open(data_path, 'r') as f:
        f.readline()
        for idx, line in enumerate(f):
            u, i, _, _ = line.split(",")
            u = int(u)
            i = int(i)
            data[u].add(i)
            max_u_id = max(u, max_u_id)
            max_i_id = max(i, max_i_id)
            if idx == 1000:
                break
    return max_u_id, max_i_id, data

In [32]:
def map_data(data_path):
    line_list =[]
    user_list = []
    item_dic = {}
    few_buyers =[]
    data = defaultdict(set)
    with open(data_path, 'r') as f:
        for idx, line in enumerate(f):
            line = line[:-1] # remove \n
            l = line.split(",")
            user_id = l[0]
            items = l[1:]
            user_list.append(user_id)
            line_list.append(items)
    dictionary = gs.corpora.Dictionary(line_list)

    for u, items in zip(user_list, line_list):
        data[u].update([dictionary.token2id[item] for item in items])
    for u,i in data.items():
        if len(i) < 10:
            few_buyers.append(u)
    for u in few_buyers:
        del data[u]
    d = {}
    user_list = []
    for idx,(u,i) in enumerate(data.items()):
        d[idx] = i
        user_list.append(u)
    user_count = len(data.keys())
    item_count = len(dictionary)
    return (user_count, item_count, d)

In [33]:
def map_data_2(user_data):
    line_list =[]
    user_list = []
    item_dic = {}
    few_buyers =[]
    data = defaultdict(set)
    for d  in user_data:
        user_id = d["uid"]
        items = d["lists"]["bought"]
        user_list.append(user_id)
        line_list.append(items)
    dictionary = gs.corpora.Dictionary(line_list)
    dictionary.filter_extremes(no_below=2)
    dictionary.compactify()
    for u, items in zip(user_list, line_list):
        data[u].update([dictionary.token2id[item] for item in items if item in dictionary.token2id])
    for u,i in data.items():
        if len(i) < 5:   # 5 same as the paper's
            few_buyers.append(u)
    for u in few_buyers:
        del data[u]
    d = {}
    user_list = []
    for idx,(u,i) in enumerate(data.items()):
        d[idx] = i
        user_list.append(u)
    user_count = len(data.keys())
    item_count = len(dictionary)
    return user_count, item_count, d, dictionary.token2id, user_list

In [34]:
def generate_test(data):
    user_test = dict()
    for u, i_list in data.items():
        if i_list:
            user_test[u] = np.random.choice(list(i_list))
    return user_test

In [65]:
def generate_train_batch(data, ui_test, item_count, batch_size=512):
    batch = []
    for _ in range(batch_size):
        u = np.random.choice(list(data.keys()))
        items = list(data[u])
        i = np.random.choice(items)
        while i == ui_test[u]:
            i = np.random.choice(items)
        
        j = np.random.randint(item_count)
        while j in items:
            j = np.random.randint( item_count)
        batch.append([u, i, j])
    return np.asarray(batch)

def generate_test_batch(data, ui_test, item_count, n_user=100, max_test_dim=30000):
    for u in np.random.choice(list(ui_test.keys()), n_user):
        t = []
        i = ui_test[u]
        j_list = np.random.randint(item_count, size=max_test_dim)
        for j in j_list:
            if j not in data[u] and  j != ui_test[u]:
                t.append([u, i, j])
        yield np.asarray(t)

In [55]:
def generate_test_batch(train_ratings, test_ratings, item_count, n_user=100, max_test_dim=30000):
    # using leave one cv
    for u in test_ratings.keys():
        i = test_ratings[u]
        t = []
        ilist = []
        jlist = []
        for j in range(item_count):
            if j != test_ratings[u] and not (j in train_ratings[u]):
                # find item not in test[u] and train[u]
                t.append([u, i, j])
                #ilist.append(image_features[i])
                #jlist.append(image_features[j])
        yield np.asarray(t)#, numpy.vstack(tuple(ilist)), numpy.vstack(tuple(jlist))

In [66]:
def weight_variable(shape):
    return tf.Variable(tf.random_normal(shape, mean=0.0, stddev=0.01))

def bias_variable(shape):
    return tf.Variable(tf.random_normal(shape, mean=0.0, stddev=0.01))

In [67]:
def vbpr(user_count, item_count, hidden_dim):
    
    u = tf.placeholder(tf.int32, [None])
    i = tf.placeholder(tf.int32, [None])
    j = tf.placeholder(tf.int32, [None])

    user_w = weight_variable([user_count, hidden_dim])
    item_w = weight_variable([item_count, hidden_dim])
    item_b = bias_variable([item_count, 1])
        
        
    u_e = tf.nn.embedding_lookup(user_w, u)
        
    i_e = tf.nn.embedding_lookup(item_w, i)
    i_b = tf.nn.embedding_lookup(item_b, i)
        
    j_e = tf.nn.embedding_lookup(item_w, j)
    j_b = tf.nn.embedding_lookup(item_b, j)
    
    #hidden_img_dim=128
    #iv = tf.placeholder(tf.float32, [None, 4096])
    #jv = tf.placeholder(tf.float32, [None, 4096])
    #user_img_w = weight_variable([user_count+1, hidden_img_dim])
    #u_img = tf.nn.embedding_lookup(user_img_w, u)
    #img_emb_w = weight_variable([4096, hidden_img_dim])

    #img_i_j = tf.matmul(iv - jv,  img_emb_w)
    
    # MF  
    x = i_b - j_b + tf.reduce_sum(tf.matmul(u_e, tf.transpose((i_e - j_e))), 1, keep_dims=True)
        #+tf.reduce_sum(tf.mul(u_img, img_i_j),1, keep_dims=True)
    
    auc_per_user = tf.reduce_mean(tf.cast(x > 0,"float"))
    #auc_per_user = tf.reduce_sum(tf.cast(x > 0,"float"))
    
    
    regu_rate = 0.001
    l2_norm = tf.add_n([
            tf.reduce_sum(tf.norm(u_e)), 
            tf.reduce_sum(tf.norm(i_e)),
            tf.reduce_sum(tf.norm(j_e)),
            regu_rate * tf.reduce_sum(tf.norm(i_b)),
            regu_rate * tf.reduce_sum(tf.norm(j_b))
            #,tf.reduce_sum(tf.mul(u_img, u_img)),
            #tf.reduce_sum(tf.mul(img_emb_w, img_emb_w)),
        ])
    

    loss = - tf.reduce_mean(tf.log(tf.sigmoid(x)+1.0e-10)) + l2_norm
    
    train_op = tf.train.AdamOptimizer(0.001).minimize(loss)
    return u, i, j, auc_per_user, loss, train_op, user_w, item_w, item_b, x

In [68]:
user_count, item_count, data, token2id,u_list  = map_data_2(user_data)
#user_count, item_count, data = map_data("./data.csv")
#user_count, item_count, data = load_data("./ml-20m/ratings.csv")
print("item count: ", item_count)
print("user count: ", user_count)
ui_test = generate_test(data)

item count:  851
user count:  14


In [69]:
n_epoch = 50
n_iter = 100
n_test_user = 10
if item_count > 40000:
    test_dim =  40000
else:
    test_dim =  item_count
latent_dim = 10
batch_size = 2048

In [70]:
data[1000]

KeyError: 1000

In [71]:
u_list[1000]

IndexError: list index out of range

In [72]:
for u in user_data:
    if u["uid"] == "459542":
        print(u["lists"]["bought"])

['294380', '278059', '272025', '266617', '189569', '103990', '40750']


In [73]:
token2id['294380']

KeyError: '294380'

In [74]:
with tf.Session() as session:
    u, i, j, auc_per_user, loss, train_op,W_u,W_i,b_i,x = vbpr(user_count, item_count, latent_dim)
    session.run(tf.global_variables_initializer())
    
    for epoch in range(n_epoch):
        _batch_loss = 0
        for _ in tqdm(range(n_iter)): 
            uij = generate_train_batch(data, ui_test, item_count, batch_size)
            _loss, _ = session.run([loss, train_op], feed_dict={u:uij[:,0], i:uij[:,1], j:uij[:,2]})
            _batch_loss += _loss
                   
        print("epoch: ", epoch, ", loss: ", _batch_loss / n_iter)

        _auc_sum = 0.0
        _loss_sum = 0.0
        user_count = 0
        for t_uij in tqdm(generate_test_batch(data, ui_test, item_count,n_user=n_test_user, max_test_dim=test_dim)):
            _auc_per_user, _test_loss = session.run([auc_per_user, loss], feed_dict={u:t_uij[:,0], i:t_uij[:,1], j:t_uij[:,2]})
            user_count += 1
            _auc_sum += _auc_per_user
            _loss_sum += _test_loss
            
        _auc = _auc_sum /user_count # eq (1) in the paper
        
            
        print("test loss: ", _test_loss/user_count, ", test auc: ", _auc)
    wu, wi, bi = session.run([W_u, W_i, b_i])


epoch:  0 , loss:  1.1906452167

test loss:  0.0744731128216 , test auc:  0.500191508979

epoch:  1 , loss:  0.641840184331

test loss:  0.0739879488945 , test auc:  0.62669634521

epoch:  2 , loss:  0.588314171433

test loss:  0.0788678109646 , test auc:  0.56174582094

epoch:  3 , loss:  0.543907094598

test loss:  0.0793892264366 , test auc:  0.621329340339

epoch:  4 , loss:  0.507039065063

test loss:  0.0799556076527 , test auc:  0.534989665449



KeyboardInterrupt: 

In [None]:
def get_ranking(item_id):
    #iid = self.item2id[item_id]
    iid = item_id
    item_vec = bi[iid] + wi[iid]
    scores = np.dot(wu, item_vec)
    res = [
        (np.argsort(scores)[::-1][index],  np.sort(scores)[::-1][index]) for index in range(10)]
    return res
get_ranking(1)

In [None]:
np.random.randint(item_count,size=test_dim)