In [9]:
from itertools import count # 迭代器
from collections import defaultdict # 使用dict时，如果引用的Key不存在，就会抛出KeyError。如果希望key不存在时，返回一个默认值，就可以用defaultdict
from scipy.sparse import csr # csr_matrix，全名为Compressed Sparse Row，是按行对矩阵进行压缩的。CSR需要三类数据：数值，列号，以及行偏移量。CSR是一种编码的方式，其中，数值与列号的含义，与coo里是一致的。行偏移表示某一行的第一个元素在values里面的起始偏移位置。 
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
# import tensorflow as tf
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from tqdm import tqdm_notebook as tqdm # 可以显示循环的进度条的库

Instructions for updating:
non-resource variables are not supported in the long term


In [10]:
def vectorize_dic(dic, ix=None, p=None):
    """
    Build a one-hot encoded scipy CSR matrix from parallel feature columns.

    Every sample (row) gets one entry of value 1.0 per feature group, placed in
    the column that ``ix`` assigns to the key ``str(value) + str(feature_name)``.

    parameters:
    -----------
    dic -- dictionary of feature lists. Keys are the names of the features; each
           list holds one value per sample, so all lists must have equal length.
    ix -- mapping from ``str(value) + str(feature_name)`` to a column index
          (default None: a fresh auto-incrementing defaultdict is created).
          When the defaultdict returned by a previous call is passed back in,
          unseen keys are assigned new indices in place.
    p -- dimension of the feature space (number of columns in the sparse matrix)
         (default None: ``len(ix)`` after all values have been indexed).

    returns:
    --------
    (matrix, ix) -- the ``(n_samples, p)`` CSR matrix and the index mapping.
                    Entries whose column index is >= p are dropped, so values
                    not covered by a caller-supplied ``p`` contribute nothing.
    """
    # Imported here from the public namespace: the ``scipy.sparse.csr``
    # submodule is deprecated in recent SciPy releases.
    from scipy.sparse import csr_matrix

    if ix is None:
        counter = count(0)
        # Each first-time lookup of a key hands out the next free column index.
        ix = defaultdict(lambda: next(counter))

    n = len(list(dic.values())[0])  # num samples
    g = len(dic)  # num groups
    nz = n * g  # number of non-zeros

    col_ix = np.empty(nz, dtype=int)

    for i, (k, lis) in enumerate(dic.items()):
        # Append the feature name to the value so that equal ids coming from
        # different feature groups do not collide on the same column.
        # The stride-g slice interleaves groups: sample r owns slots r*g .. r*g+g-1.
        col_ix[i::g] = [ix[str(el) + str(k)] for el in lis]

    row_ix = np.repeat(np.arange(n), g)
    data = np.ones(nz)

    if p is None:
        p = len(ix)

    # Keep only the entries that fit inside the requested number of columns.
    keep = col_ix < p

    return csr_matrix((data[keep], (row_ix[keep], col_ix[keep])), shape=(n, p)), ix

# Column names for the tab-separated rating files, which are read without a header row.
cols = ['user','item','rating','timestamp']

train = pd.read_csv('data/ua.base',delimiter='\t',names = cols)
test = pd.read_csv('data/ua.test',delimiter='\t',names = cols)

# Build the feature index on the training set: one column per distinct user and per distinct item.
x_train,ix = vectorize_dic({'users':train['user'].values,  'items':train['item'].values})

# Reuse the training index and column count for the test set; per vectorize_dic,
# entries whose column index falls outside the training columns are dropped.
x_test,ix = vectorize_dic({'users':test['user'].values,   'items':test['item'].values},ix,x_train.shape[1])

print(x_train)
y_train = train['rating'].values
y_test = test['rating'].values

# toarray() returns a plain ndarray; todense() would return np.matrix, which
# NumPy no longer recommends and which forces 2-D semantics on every slice.
x_train = x_train.toarray()
x_test = x_test.toarray()


  (0, 0)	1.0
  (0, 943)	1.0
  (1, 0)	1.0
  (1, 944)	1.0
  (2, 0)	1.0
  (2, 945)	1.0
  (3, 0)	1.0
  (3, 946)	1.0
  (4, 0)	1.0
  (4, 947)	1.0
  (5, 0)	1.0
  (5, 948)	1.0
  (6, 0)	1.0
  (6, 949)	1.0
  (7, 0)	1.0
  (7, 950)	1.0
  (8, 0)	1.0
  (8, 951)	1.0
  (9, 0)	1.0
  (9, 952)	1.0
  (10, 0)	1.0
  (10, 953)	1.0
  (11, 0)	1.0
  (11, 954)	1.0
  (12, 0)	1.0
  :	:
  (90557, 1765)	1.0
  (90558, 942)	1.0
  (90558, 1773)	1.0
  (90559, 942)	1.0
  (90559, 1862)	1.0
  (90560, 942)	1.0
  (90560, 1874)	1.0
  (90561, 942)	1.0
  (90561, 1876)	1.0
  (90562, 942)	1.0
  (90562, 1943)	1.0
  (90563, 942)	1.0
  (90563, 1957)	1.0
  (90564, 942)	1.0
  (90564, 1974)	1.0
  (90565, 942)	1.0
  (90565, 1977)	1.0
  (90566, 942)	1.0
  (90566, 2006)	1.0
  (90567, 942)	1.0
  (90567, 2116)	1.0
  (90568, 942)	1.0
  (90568, 2154)	1.0
  (90569, 942)	1.0
  (90569, 2265)	1.0


In [11]:
def batcher(X_, y_=None, batch_size=-1):
    """
    Yield consecutive mini-batches of ``X_`` (and ``y_`` when given).

    parameters:
    -----------
    X_ -- sliceable array of samples exposing ``.shape[0]`` (number of samples)
    y_ -- optional targets aligned with ``X_`` along the first axis (default None)
    batch_size -- number of samples per batch; -1 (default) means a single
                  batch holding every sample

    yields:
    -------
    (ret_x, ret_y) -- slices of ``X_`` and ``y_`` for one batch; ``ret_y`` is
                      None when ``y_`` is None. The last batch may be smaller.

    raises:
    -------
    ValueError -- if ``batch_size`` is smaller than 1 and is not -1. Because this
                  is a generator, the error surfaces on the first iteration.
    """
    n_samples = X_.shape[0]

    if batch_size == -1:
        batch_size = n_samples
    if batch_size < 1:
        raise ValueError('Parameter batch_size={} is unsupported'.format(batch_size))

    for i in range(0, n_samples, batch_size):
        upper_bound = min(i + batch_size, n_samples)
        ret_x = X_[i:upper_bound]
        # Use the same bounds for the targets so both slices always line up.
        ret_y = y_[i:upper_bound] if y_ is not None else None
        # Yield unconditionally: batches must also be produced when no targets
        # are supplied (previously nothing was yielded in that case).
        yield (ret_x, ret_y)


In [12]:
# n: number of training samples, p: number of one-hot feature columns.
n,p = x_train.shape

# Number of latent factors per feature.
k = 10

# Inputs: a batch of feature rows and the matching column of targets.
x = tf.placeholder(tf.float32,[None,p])

y = tf.placeholder(tf.float32,[None,1])

# Model parameters: global bias, per-feature linear weights, and the k x p factor matrix.
w0 = tf.Variable(tf.zeros([1]))
w = tf.Variable(tf.zeros([p]))

v = tf.Variable(tf.random_normal([k,p],mean=0,stddev=0.01))

# Linear part: w0 + sum_j w_j * x_j, kept as a column (batch x 1).
# `keepdims` replaces the deprecated `keep_dims` argument.
linear_terms = tf.add(w0,tf.reduce_sum(tf.multiply(w,x),1,keepdims=True))

# Pairwise part: 0.5 * sum over factors f of [ (x . v_f)^2 - (x^2 . v_f^2) ],
# also kept as a column (batch x 1).
pair_interactions = 0.5 * tf.reduce_sum(
    tf.subtract(
        tf.pow(
            tf.matmul(x,tf.transpose(v)),2),
        tf.matmul(tf.pow(x,2),tf.transpose(tf.pow(v,2)))
    ),axis = 1 , keepdims=True)

# Model prediction: linear part plus pairwise interactions.
y_hat = tf.add(linear_terms,pair_interactions)


Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [17]:
# Show the placeholders, variables and derived tensors as this cell's output value.
(
    x, y,
    w0, w, v,
    linear_terms, pair_interactions, y_hat,
)

(<tf.Tensor 'Placeholder:0' shape=(?, 2623) dtype=float32>,
 <tf.Tensor 'Placeholder_1:0' shape=(?, 1) dtype=float32>,
 <tf.Variable 'Variable:0' shape=(1,) dtype=float32_ref>,
 <tf.Variable 'Variable_1:0' shape=(2623,) dtype=float32_ref>,
 <tf.Variable 'Variable_2:0' shape=(10, 2623) dtype=float32_ref>,
 <tf.Tensor 'Add:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'mul_1:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'Add_1:0' shape=(?, 1) dtype=float32>)

In [18]:
# L2 regularization strengths for the linear weights and the factor matrix.
lambda_w = tf.constant(0.001,name='lambda_w')
lambda_v = tf.constant(0.001,name='lambda_v')

# Reduce each penalty separately before adding them. Adding the element-wise
# terms first would broadcast w (shape [p]) against v (shape [k, p]) and count
# the w penalty k times in the final sum.
l2_norm = tf.add(
    tf.multiply(lambda_w,tf.reduce_sum(tf.pow(w,2))),
    tf.multiply(lambda_v,tf.reduce_sum(tf.pow(v,2)))
)

# Mean squared error of the batch, plus the regularization term.
error = tf.reduce_mean(tf.square(y-y_hat))
loss = tf.add(error,l2_norm)

train_op = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)


In [19]:
epochs = 10
batch_size = 1000

# Launch the graph
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)

    n_train = x_train.shape[0]
    for epoch in tqdm(range(epochs), unit='epoch'):
        # np.random.permutation returns a new shuffled index array and leaves its
        # input untouched (np.random.shuffle would shuffle in place and return None).
        perm = np.random.permutation(n_train)
        # Iterate over batches by slicing the permuted indices, so only one batch
        # of rows is copied at a time instead of the whole dense training matrix.
        for start in range(0, n_train, batch_size):
            batch_idx = perm[start:start + batch_size]
            bX = x_train[batch_idx]
            bY = y_train[batch_idx]
            _,t = sess.run([train_op,loss], feed_dict={x: bX.reshape(-1, p), y: bY.reshape(-1, 1)})
            print(t)


    # Evaluate on the test set: collect the per-batch MSE and the batch sizes.
    errors = []
    batch_sizes = []
    for bX, bY in batcher(x_test, y_test):
        errors.append(sess.run(error, feed_dict={x: bX.reshape(-1, p), y: bY.reshape(-1, 1)}))
        batch_sizes.append(len(bY))
    print(errors)
    # Weight each batch MSE by its size so the result is the true overall MSE
    # even when the batches are not all the same length.
    RMSE = np.sqrt(np.average(errors, weights=batch_sizes))
    print (RMSE)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if __name__ == '__main__':


  0%|          | 0/10 [00:00<?, ?epoch/s]

13.637489
13.219606
12.790924
12.416634
12.022548
11.359671
10.8659
10.624169
10.000281
9.934617
9.638191
9.3424
8.7310295
8.61268
8.156904
8.148585
8.031114
7.263902
7.2327795
7.0194516
6.686267
6.4089303
6.320253
6.362725
6.0588274
5.711551
5.5643554
5.3654647
5.1732507
4.925365
5.0028114
5.005258
4.7162566
4.4186916
4.126847
4.5740647
4.0861883
3.9602137
3.7781017
3.8134696
3.8454242
3.6713402
3.6564813
3.2993655
3.2562907
3.2040613
3.1602733
3.0038388
3.0109925
2.9495778
3.0443244
2.9650962
2.7029681
2.649049
2.654093
2.6453044
2.565784
2.514612
2.5396843
2.4336364
2.255894
2.3072667
2.281745
2.2080207
2.1849387
2.2642558
2.0877209
2.0225976
2.1096034
2.0502627
2.0714374
1.9867121
1.9562231
1.8763671
1.9546022
1.8212669
1.9017518
1.8069891
1.7511406
1.7504866
1.7185751
1.7495823
1.6398888
1.6476532
1.7036997
1.693061
1.6302996
1.6642617
1.6186167
1.6763173
1.6044655
1.5477719
1.5358776
1.5388563
1.5677267
1.529953
1.6532717
1.5960456
1.4823977
1.532373
1.467837
1.466068
1.4772359
1