In [1]:
import os
import re
import sys
import time
import implicit
import numpy as np
import tensorflow as tf
import pandas as pd
from scipy import sparse
from tqdm import  tqdm

In [3]:
IS_LOCAL = False
HOME_DIR = '/mnt/E/Projects/Content-based-Neural-Recommender-Systems/' if IS_LOCAL else '../'
os.environ['HOME_DIR'] = HOME_DIR

sys.path.append("..")
from utils.prepare_data import get_zen_data, zen_text_iterator

DATA_DIR = f'{HOME_DIR}data/zen/'
WORKING_DIR = f'{HOME_DIR}models/'
os.chdir(WORKING_DIR)

In [4]:
!pwd

/data/home/Xetd71/Content-based-Neural-Recommender-Systems/models


## Load data

In [5]:
PREPROC_DIR = f'{DATA_DIR}preproc/'

In [6]:
users_matrix = np.load(f'{PREPROC_DIR}users_matrix.npy')
items_matrix2 = np.load(f'{PREPROC_DIR}items_matrix2.npy')
ratings_train_matrix = np.load(f'{PREPROC_DIR}ratings_train_matrix.npy')
ratings_test_matrix = np.load(f'{PREPROC_DIR}ratings_test_matrix.npy')

In [7]:
# users_matrix.shape, 
items_matrix2.shape, ratings_train_matrix.shape, ratings_test_matrix.shape

((328050, 160), (54189923, 3), (13590245, 3))

In [8]:
def get_train_data_generator(batch_size=64):
    while True:
        idxs = np.random.randint(0, ratings_train_matrix.shape[0], batch_size)
        items = ratings_train_matrix[idxs]
#         yield users_matrix[items[:, 0]], items_matrix[items[:, 1]], items[:, 2].reshape((-1,1))
        yield items[:, 0], items_matrix2[items[:, 1]], items[:, 2].reshape((-1,1))

In [9]:
# def get_simple_train_data_generator(batch_size=64):
#     while True:
#         idxs = np.random.randint(0, ratings_train_matrix.shape[0], batch_size)
#         items = ratings_train_matrix[idxs]
#         yield np.concatenate((users_matrix[items[:, 0]], items_matrix[items[:, 1]]), axis=1), items[:, 2].reshape((-1,1))

In [10]:
def get_test_data_generator(batch_size=64):
    for idx in tqdm(np.arange(0, ratings_test_matrix.shape[0], batch_size)):
        items = ratings_test_matrix[idx:idx+batch_size]
#         yield users_matrix[items[:, 0]], items_matrix[items[:, 1]], items[:, 2].reshape((-1,1))
        yield items[:, 0], items_matrix2[items[:, 1]], items[:, 2].reshape((-1,1))

In [11]:
def reset_tf_session():
    curr_session = tf.get_default_session()
    # close current session
    if curr_session is not None:
        curr_session.close()
    # reset graph
    tf.reset_default_graph()

In [12]:
class MLP():
    def __init__(self, n_users, user_embedding_size, item_embedding_size, layers, sess, batch_size, log_file, epsilon=1e-5):
        self.n_users = n_users
        self.user_embedding_size = user_embedding_size
        self.item_embedding_size = item_embedding_size
        self.layers = layers
        self.sess = sess
        self.batch_size = batch_size
        self.log_file = open(log_file, 'w')
        self.epsilon = epsilon

    def define_graph(self, learning_rate):
        self.user_id = tf.placeholder(tf.int32, shape=[None], name='user_id')
        self.user_embeddings = tf.get_variable(
            'user_embeddings',
            shape=[self.n_users, self.user_embedding_size],
            dtype=tf.float32
        )
        user_embedding = tf.nn.embedding_lookup(
            self.user_embeddings, self.user_id)
        self.item_embedding = tf.placeholder(
            tf.float32, shape=[None, self.item_embedding_size])
        self.y = tf.placeholder(tf.float32, name='y')
        

        layer = tf.concat([user_embedding, self.item_embedding], axis=1)
        for layer_size in self.layers:
            layer = tf.layers.dense(
                layer, layer_size, tf.nn.elu, kernel_initializer=tf.random_normal_initializer)
            layer = tf.layers.dropout(layer, 0.1)
        self.p = tf.layers.dense(layer, 1, tf.nn.sigmoid, kernel_initializer=tf.random_normal_initializer)
        self.p = tf.clip_by_value(self.p, self.epsilon, 1 - self.epsilon)
        self.loss = -tf.reduce_sum(self.y * tf.log(self.p) + (1 - self.y) * tf.log(1 - self.p))
        self.optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(self.loss)
    
    def train(self, steps):
        loss = []
        start_time = time.time()
        for it, (batch_user, batch_item, batch_rating) in enumerate(get_train_data_generator(self.batch_size)):
            if (it + 1) % steps == 0:
                break
            _, l = self.sess.run((self.optimizer, self.loss),
                                feed_dict={self.user_id: batch_user, self.item_embedding: batch_item, self.y: batch_rating})
            loss.append(l)
            if (it + 1) % 10_000 == 0:
                log_info = 'Iteration: {}, loss: {:.5f}, elapsed time: {:.5f}'.format(
                    it+1, np.mean(loss) / self.batch_size, time.time() - start_time)
                print(log_info)
                print(log_info, file=self.log_file, flush=True)
                start_time = time.time()
                

    def execute(self, epochs, epoch_steps, learning_rate):
        self.define_graph(learning_rate)
        init = tf.global_variables_initializer()
        self.sess.run(init)
        
        for epoch in np.arange(epochs)+1:
            self.train(epoch_steps)
            print(f"Epoch: {epoch}")
            model.save(f'/data/home/Xetd71/model1/model{epoch}')
            
    
    def test(self, steps=None):
        loss = []
        for it, (user_id, item, rating) in enumerate(get_test_data_generator(self.batch_size)):
            if steps and (it + 1) % steps == 0:
                break
            p = self.predict(user_id, item)
            loss.append(rating * np.log(p) + (1 - rating) * tf.log(1 - p))
        return np.mean(loss)
    
    
    def predict(self, user_id, item):
        return self.sess.run([self.p], feed_dict={self.user_id: user_id, self.item_embedding: item})[0]
    
    def save(self, path):
        saver = tf.train.Saver()
        saver.save(self.sess, path)

In [13]:
len(set(ratings_train_matrix[:,0]))

42977

In [14]:
ratings_train_matrix.shape[0] // 64

846717

In [15]:
items_matrix2[0].shape

(160,)

In [16]:
160+96

256

In [None]:
reset_tf_session()
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
model = MLP(
    n_users = 42977,
    user_embedding_size=96,
    item_embedding_size=160,
    layers=[256, 128, 64],
    sess=sess,
    batch_size=64,
    log_file='/data/home/Xetd71/model1/log.txt'
)
model.execute(
    epochs=10,
    epoch_steps=100_000,
    learning_rate=0.01
)

Iteration: 10000, loss: 1.10621, elapsed time: 20.20563
Iteration: 20000, loss: 1.10033, elapsed time: 19.85891
Iteration: 30000, loss: 1.09843, elapsed time: 20.34593
Iteration: 40000, loss: 1.09820, elapsed time: 20.40916
Iteration: 50000, loss: 1.09707, elapsed time: 20.31811
Iteration: 60000, loss: 1.09513, elapsed time: 308.38502
Iteration: 70000, loss: 1.09552, elapsed time: 35.58398
Iteration: 80000, loss: 1.09506, elapsed time: 35.69402
Iteration: 90000, loss: 1.09517, elapsed time: 27.44035
Epoch: 1
Iteration: 10000, loss: 1.09363, elapsed time: 27.22769
Iteration: 20000, loss: 1.09688, elapsed time: 27.12457
Iteration: 30000, loss: 1.09644, elapsed time: 26.93956
Iteration: 40000, loss: 1.09557, elapsed time: 26.86666
Iteration: 50000, loss: 1.09354, elapsed time: 26.95596
Iteration: 60000, loss: 1.09492, elapsed time: 26.96832


In [None]:
model.test()