In [1]:
import os
import re
import sys
import time
import implicit
import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import  tqdm

In [2]:
# Environment switch: selects which hard-coded base directory is used for HOME_DIR.
IS_LOCAL = False
HOME_DIR = ('/mnt/E/Projects/' if IS_LOCAL else '/home/Xetd71/') + 'Content-based-Neural-Recommender-Systems/'
# Exported before the project imports below — presumably read by the `utils` package; verify.
os.environ['HOME_DIR'] = HOME_DIR

# Extend the import path so the project-local `utils` package can be imported
# (assumes the repository root is two directories up — TODO confirm).
sys.path.append("../..")
from utils.prepare_data import zen, tokenize
from utils.evaluate import test
from utils import tqdm_utils

# Data and model directories derived from HOME_DIR.
DATA_DIR = f'{HOME_DIR}data/zen/'
WORKING_DIR = f'{HOME_DIR}models/dssm'
# From here on the process working directory is WORKING_DIR, so relative paths resolve there.
os.chdir(WORKING_DIR)

100%|██████████| 4349/4349 [00:13<00:00, 328.05it/s]


# Load data

In [3]:
# Number of users; equals the first dimension of users_factors loaded below
# ((42977, 64) in the recorded output) and bounds the per-user loop that builds users_matrix.
N_USERS = 42977

In [4]:
# Load the items table through the project helper; its 'title' and 'content' columns are used below.
items_df = zen.items_df()

loading items: 328050it [00:35, 9239.84it/s] 


In [5]:
# Directory holding the preprocessed .npy arrays (user factors and rating triples) loaded next.
PREPROC_DIR = f'{DATA_DIR}preproc/'

In [6]:
# users: per-user arrays; the recorded shapes are (42977, 64) and (42977, 1).
# NOTE(review): neither array is referenced again in the visible part of this notebook.
users_factors = np.load(f'{PREPROC_DIR}users_factors.npy')
users_mean_ratings = np.load(f'{PREPROC_DIR}users_mean_ratings.npy')
print(users_factors.shape, users_mean_ratings.shape)

# ratings: three-column arrays; later cells read the columns as (user, item, rating).
# ratings_0_matrix and ratings_1_matrix presumably hold the rows with rating 0 and 1 respectively —
# their recorded row counts add up to the row count of ratings_matrix; verify against preprocessing.
ratings_matrix = np.load(f'{PREPROC_DIR}ratings_matrix.npy')
ratings_0_matrix = np.load(f'{PREPROC_DIR}ratings_0_matrix.npy')
ratings_1_matrix = np.load(f'{PREPROC_DIR}ratings_1_matrix.npy')
print(ratings_matrix.shape, ratings_0_matrix.shape, ratings_1_matrix.shape)

(42977, 64) (42977, 1)
(67780168, 3) (61329513, 3) (6450655, 3)


In [7]:
# Share of all rating rows that come from ratings_0_matrix (w0) and ratings_1_matrix (w1).
w0 = len(ratings_0_matrix) / len(ratings_matrix)
w1 = len(ratings_1_matrix) / len(ratings_matrix)
(w0, w1)

(0.9048297578725387, 0.09517024212746124)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# One text per item: title and content joined by a single space, as a NumPy array for the vectorizer.
items_texts = (items_df['title'] + ' ' + items_df['content']).values

In [10]:
%%time
# Fractional document-frequency cutoff, used below as min_df and (as 1 - remove_f) as max_df.
remove_f = 0.000001
# Upper bound on the vocabulary kept by the vectorizer; make_model() also uses it as the input width.
VOCAB_SIZE = 1_000_000
# NOTE(review): scikit-learn interprets a float min_df/max_df as a proportion of documents. With the
# ~328k items reported for items_matrix, 1e-6 of the corpus is less than one document, so min_df
# probably filters nothing and only max_features limits the vocabulary — confirm this is intended.
bag_of_words = CountVectorizer(tokenizer=tokenize, min_df=remove_f, max_df=1-remove_f, max_features=VOCAB_SIZE)
# Fit the vocabulary and build the item x term count matrix in one pass.
items_matrix = bag_of_words.fit_transform(items_texts)

CPU times: user 1min 32s, sys: 954 ms, total: 1min 33s
Wall time: 1min 33s


In [11]:
# Display the sparse item x term matrix summary (shape, dtype, stored elements, format).
items_matrix

<328050x1000000 sparse matrix of type '<class 'numpy.int64'>'
	with 53898613 stored elements in Compressed Sparse Row format>

In [12]:
from scipy import sparse

In [13]:
# Group the item ids found in ratings_1_matrix by user: {user: [item, item, ...]}.
user_items_ids = {}
for user, item, _ in tqdm(ratings_1_matrix):
    # setdefault creates the list on first sight of a user, then we append to it.
    user_items_ids.setdefault(user, []).append(item)

100%|██████████| 6450655/6450655 [00:10<00:00, 627753.73it/s]


In [14]:
# One sparse row per user: the mean bag-of-words vector over the items listed for that user
# in user_items_ids. Rows are stacked in user-index order (0 .. N_USERS - 1).
users_matrix = sparse.vstack([
    sparse.csr_matrix(items_matrix[user_items_ids[user_idx]].mean(axis=0))
    for user_idx in tqdm(np.arange(N_USERS))
])

100%|██████████| 42977/42977 [09:33<00:00, 74.95it/s]


In [15]:
# Sanity check: expect (N_USERS, VOCAB_SIZE).
users_matrix.shape

(42977, 1000000)

In [16]:
# Scratch arithmetic left over from exploration; the result is not assigned or used anywhere.
1000000-1e6

0.0

In [17]:
# Scratch literal left over from exploration; not assigned or used anywhere.
# NOTE(review): its meaning is not evident from the notebook — consider deleting this cell.
625489

625489

In [18]:
def ratings_batch(ratings_matrix=ratings_matrix, batch_size=64):
    """Draw a random mini-batch of rating rows and look up their dense features.

    Rows are sampled uniformly with replacement from ``ratings_matrix``, whose
    columns are read as (user index, item index, rating).

    Args:
        ratings_matrix: three-column array of rating rows to sample from.
            The default is bound once, when this function is defined.
        batch_size: number of rows to draw.

    Returns:
        A tuple ``(user_features, item_features, ratings)``: the densified rows
        of ``users_matrix`` and ``items_matrix`` for the sampled users/items,
        and the third column of the sampled rows.
    """
    n_rows = ratings_matrix.shape[0]
    picked = ratings_matrix[np.random.randint(0, n_rows, batch_size)]
    user_ids, item_ids, labels = picked[:, 0], picked[:, 1], picked[:, 2]
    # .toarray() densifies each sampled sparse row to its full vocabulary width.
    return users_matrix[user_ids].toarray(), items_matrix[item_ids].toarray(), labels.T

def normalized_ratings_batch(batch_size=64):
    """Draw a class-balanced mini-batch.

    Half of the batch (rounded down) is sampled from ``ratings_0_matrix`` and
    the remainder from ``ratings_1_matrix``; the two halves are concatenated
    component-wise, rows from ``ratings_0_matrix`` first.

    Args:
        batch_size: total number of rows in the returned batch.

    Returns:
        A tuple ``(user_features, item_features, ratings)`` with the same
        layout as the result of ``ratings_batch``.
    """
    n_from_zero = batch_size // 2
    # Sample in the same order as before (0-matrix first) so the RNG stream is unchanged.
    zero_part = ratings_batch(ratings_0_matrix, n_from_zero)
    one_part = ratings_batch(ratings_1_matrix, batch_size - n_from_zero)
    # Pair up matching components of both halves and join each pair along axis 0.
    return tuple(np.concatenate(pair) for pair in zip(zero_part, one_part))

## Train model

In [32]:
import tensorflow as tf
import keras
from keras import backend as K
from keras.callbacks import TensorBoard

In [33]:
# reset graph when you change architecture!
def reset_tf_session():
    """Close the current TensorFlow session, clear the Keras graph and start a new session.

    Returns:
        The newly created ``tf.InteractiveSession``, already registered with Keras.
    """
    active = tf.get_default_session()
    if active is not None:
        # Close the session that is currently the default one.
        active.close()

    # Drop the existing Keras graph state before building a new model.
    K.clear_session()

    # Let TensorFlow grow its GPU allocation on demand.
    gpu_config = tf.ConfigProto()
    gpu_config.gpu_options.allow_growth = True

    session = tf.InteractiveSession(config=gpu_config)
    K.set_session(session)
    return session

In [34]:
# import necessary building blocks
from keras.models import Model
from keras.layers import Dense, Input, Embedding, dot, Reshape, Dropout, Activation, Conv1D

In [35]:
def make_model():
    """Build the two-tower model over bag-of-words inputs.

    Each tower maps a ``VOCAB_SIZE``-wide input through Dense(300, elu) ->
    Dense(300, elu) -> Dropout(0.3) -> Dense(128, linear). The towers do not
    share weights. The prediction is the sigmoid of the dot product of the
    two 128-dimensional tower outputs.

    Returns:
        An uncompiled Keras ``Model`` with inputs ``[user_embedding, item_embedding]``.
    """
    def tower(features):
        # Every call creates fresh layers, so each tower gets its own weights.
        hidden = Dense(300, activation="elu", input_shape=(VOCAB_SIZE,))(features)
        hidden = Dense(300, activation="elu", input_shape=(300,))(hidden)
        hidden = Dropout(0.3)(hidden)
        return Dense(128, activation=None, input_shape=(300,))(hidden)

    user_embedding = Input(dtype='float32', shape=(VOCAB_SIZE,), name='user_embedding')
    item_embedding = Input(dtype='float32', shape=(VOCAB_SIZE,), name='item_embedding')

    # User tower is built first, then the item tower, which fixes the auto-generated layer names.
    user_vector = tower(user_embedding)
    item_vector = tower(item_embedding)

    similarity = dot([user_vector, item_vector], axes=1)
    pred_y = Activation("sigmoid")(similarity)

    return Model(inputs=[user_embedding, item_embedding], outputs=pred_y)

In [36]:
# describe model
s = reset_tf_session()  # clear default graph
model = make_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_embedding (InputLayer)     (None, 1000000)      0                                            
__________________________________________________________________________________________________
item_embedding (InputLayer)     (None, 1000000)      0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 300)          300000300   user_embedding[0][0]             
__________________________________________________________________________________________________
dense_4 (Dense)                 (None, 300)          300000300   item_embedding[0][0]             
__________________________________________________________________________________________________
dense_2 (D

In [37]:
# Training hyper-parameters: rows per batch, generator batches per epoch, and number of epochs.
BATCH_SIZE = 32
STEPS_PER_EPOCH = 10_000
EPOCHS = 100

s = reset_tf_session()  # clear default graph
model = make_model()  # define our model

# prepare model for fitting (loss, optimizer, etc)
model.compile(
    loss='binary_crossentropy',
    # Adam with default settings. NOTE(review): no clipnorm/clipvalue argument is passed,
    # so no gradient clipping is applied here.
    optimizer=keras.optimizers.adam(),
)

In [38]:
# for saving the model after every epoch
from keras.models import save_model

class ModelSaveCallback(keras.callbacks.Callback):
    """Keras callback that saves the model every ``period`` epochs.

    Args:
        file_name: format template for the target path; it is formatted with
            the zero-based epoch index passed to ``on_epoch_end``.
        period: save when ``(epoch + 1)`` is a multiple of this value.
    """

    def __init__(self, file_name, period):
        super().__init__()
        self.period = period
        self.file_name = file_name

    def on_epoch_end(self, epoch, logs):
        """Save the model if this epoch falls on the configured period."""
        model_filename = self.file_name.format(epoch)
        if (epoch + 1) % self.period != 0:
            return
        save_model(self.model, model_filename)
        print(f"Model saved in {model_filename}")

In [39]:
def dssm_predict(user_id, items):
    """Score a list of items for one user with the global ``model``.

    Args:
        user_id: row index of the user in ``users_matrix``.
        items: row indices of the candidate items in ``items_matrix``.

    Returns:
        The first output column of ``model.predict``, one score per item.
    """
    # Repeat the user's row once per candidate so both inputs have the same number of rows.
    user_rows = users_matrix[[user_id] * len(items)]
    item_rows = items_matrix[items]
    scores = model.predict([user_rows, item_rows])
    return scores[:, 0]

def prediction_hist(p, bins=5):
    """Count predictions falling into ``bins`` equal-width bins over [0, 1].

    Bins are half-open, ``[lo, hi)``, except the last one, which also includes
    1.0. Each value in [0, 1] is therefore counted exactly once. The previous
    implementation used closed intervals on both sides, so a value lying on an
    interior bin edge was counted in two neighbouring bins.

    Args:
        p: array-like of predicted probabilities.
        bins: number of equal-width bins.

    Returns:
        A list of ``bins`` integer counts, lowest bin first. Values outside
        [0, 1] are not counted.
    """
    counts, _ = np.histogram(np.asarray(p), bins=bins, range=(0.0, 1.0))
    return counts.tolist()

def dssm_get_prediction_hist(bins=5):
    """Histogram of the global ``model``'s predictions on one freshly sampled balanced batch.

    Args:
        bins: number of histogram bins, forwarded to ``prediction_hist``.

    Returns:
        The list of per-bin counts produced by ``prediction_hist``.
    """
    # The sampled ratings are not needed here; only the model's outputs are binned.
    user_batch, item_batch, _ = normalized_ratings_batch(batch_size=BATCH_SIZE)
    scores = model.predict([user_batch, item_batch])[:, 0]
    return prediction_hist(scores, bins)

In [40]:
# for saving the model after every epoch
from keras.models import save_model

class ModelLogFileCallback(keras.callbacks.Callback):
    """Keras callback that evaluates NDCG after every epoch and appends a summary line to a log file.

    The log file is opened, written and closed once per epoch. Holding one
    handle open from ``on_train_begin`` to ``on_train_end`` would leak it
    whenever training aborts with an exception, because ``on_train_end`` is
    then never reached.

    Args:
        file_name: path of the text log; lines are appended, never truncated.
    """

    def __init__(self, file_name):
        super(ModelLogFileCallback, self).__init__()
        self.file_name = file_name

    def on_train_begin(self, logs=None):
        """Start the wall-clock timer for the first epoch."""
        self.epoch_time = time.time()

    def on_train_end(self, logs=None):
        """Nothing to release: the log file is closed after every write."""

    def on_epoch_end(self, epoch, logs=None):
        """Compute metrics, publish ``ndcg`` into ``logs`` and write the epoch summary."""
        # Keep the caller's dict (even if empty) so the 'ndcg' entry is visible to later callbacks.
        logs = logs if logs is not None else {}
        ndcg = test.ndcg(dssm_predict)
        p_hist = dssm_get_prediction_hist()
        cur_time = time.time()
        logs['ndcg'] = ndcg
        msg = "Epoch {},   loss: {:.4f},   val ndcg: {:.4f},   p_hist: [{}],   time: {:.4f}".format(
            epoch,
            logs.get('loss', float('nan')),  # nan keeps the format valid if no loss was reported
            ndcg,
            ', '.join(map(str, p_hist)),
            cur_time - self.epoch_time,
        )
        print(msg)
        # Append and close immediately so the line is on disk and no handle outlives this call.
        with open(self.file_name, 'a') as log_file:
            print(msg, file=log_file)
        self.epoch_time = cur_time

In [41]:
# Epoch index to resume from; passed as `initial_epoch` to fit_generator below (0 = train from scratch).
last_finished_epoch = 0

# To continue from a saved snapshot, adapt the template below.
# NOTE(review): `path` and `top_3_accuracy` are not defined anywhere in the visible notebook, and
# this model is compiled without custom metrics — the snippet needs editing before it can be used.
# from keras.models import load_model
# s = reset_tf_session()
# last_finished_epoch = 103
# model = load_model(path("model4/model_{}".format(last_finished_epoch)), 
#                    custom_objects={"top_3_accuracy": top_3_accuracy})

In [42]:
def train_iterator(batch_size):
    """Endless generator of balanced training batches for ``fit_generator``.

    Args:
        batch_size: number of rows per yielded batch.

    Yields:
        ``([user_features, item_features], ratings)`` as produced by
        ``normalized_ratings_batch``.
    """
    while True:
        # Alternative without class balancing:
        #   ratings_batch(batch_size=batch_size, ratings_matrix=ratings_matrix)
        user_vecs, item_vecs, labels = normalized_ratings_batch(batch_size=batch_size)
        yield [user_vecs, item_vecs], labels

In [43]:
import time
# TensorBoard callback writing to a new log directory per run, named by the current Unix timestamp.
tensorboard = TensorBoard(log_dir=f"/data/home/Xetd71/DSSM/logs/{time.time()}")

In [44]:
# fit the model with our eternal generator!
model.fit_generator(
    train_iterator(BATCH_SIZE), 
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=EPOCHS,
    callbacks=[
        ModelSaveCallback("/data/home/Xetd71/DSSM/model_{}", 1),
        ModelLogFileCallback("/data/home/Xetd71/DSSM/logs.txt"),
        tensorboard,
    ],
    verbose=1,
    initial_epoch=last_finished_epoch,
#     class_weight={0: w0, 1: w1}
)

Epoch 1/100


ResourceExhaustedError: OOM when allocating tensor with shape[1000000,300] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node training/Adam/mul_11 (defined at /anaconda/envs/py36/lib/python3.6/site-packages/keras/optimizers.py:466)  = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](Adam/beta_1/read, training/Adam/Variable_2/read)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Caused by op 'training/Adam/mul_11', defined at:
  File "/anaconda/envs/py36/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/anaconda/envs/py36/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/anaconda/envs/py36/lib/python3.6/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/anaconda/envs/py36/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/anaconda/envs/py36/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 505, in start
    self.io_loop.start()
  File "/anaconda/envs/py36/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "/anaconda/envs/py36/lib/python3.6/asyncio/base_events.py", line 427, in run_forever
    self._run_once()
  File "/anaconda/envs/py36/lib/python3.6/asyncio/base_events.py", line 1440, in _run_once
    handle._run()
  File "/anaconda/envs/py36/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/anaconda/envs/py36/lib/python3.6/site-packages/tornado/ioloop.py", line 758, in _run_callback
    ret = callback()
  File "/anaconda/envs/py36/lib/python3.6/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/anaconda/envs/py36/lib/python3.6/site-packages/tornado/gen.py", line 1233, in inner
    self.run()
  File "/anaconda/envs/py36/lib/python3.6/site-packages/tornado/gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "/anaconda/envs/py36/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 357, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/anaconda/envs/py36/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/anaconda/envs/py36/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 267, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/anaconda/envs/py36/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/anaconda/envs/py36/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 534, in execute_request
    user_expressions, allow_stdin,
  File "/anaconda/envs/py36/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/anaconda/envs/py36/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/anaconda/envs/py36/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/anaconda/envs/py36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2819, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/anaconda/envs/py36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2845, in _run_cell
    return runner(coro)
  File "/anaconda/envs/py36/lib/python3.6/site-packages/IPython/core/async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "/anaconda/envs/py36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3020, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/anaconda/envs/py36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3191, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "/anaconda/envs/py36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-44-14f5739a2d8e>", line 12, in <module>
    initial_epoch=last_finished_epoch,
  File "/anaconda/envs/py36/lib/python3.6/site-packages/keras/legacy/interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "/anaconda/envs/py36/lib/python3.6/site-packages/keras/engine/training.py", line 2080, in fit_generator
    self._make_train_function()
  File "/anaconda/envs/py36/lib/python3.6/site-packages/keras/engine/training.py", line 990, in _make_train_function
    loss=self.total_loss)
  File "/anaconda/envs/py36/lib/python3.6/site-packages/keras/legacy/interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "/anaconda/envs/py36/lib/python3.6/site-packages/keras/optimizers.py", line 466, in get_updates
    m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
  File "/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 856, in _run_op
    return getattr(ops.Tensor, operator)(a._AsTensor(), *args)
  File "/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py", line 878, in binary_op_wrapper
    return func(x, y, name=name)
  File "/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py", line 1131, in _mul_dispatch
    return gen_math_ops.mul(x, y, name=name)
  File "/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/ops/gen_math_ops.py", line 5042, in mul
    "Mul", x=x, y=y, name=name)
  File "/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
    return func(*args, **kwargs)
  File "/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3274, in create_op
    op_def=op_def)
  File "/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
    self._traceback = tf_stack.extract_stack()

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[1000000,300] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node training/Adam/mul_11 (defined at /anaconda/envs/py36/lib/python3.6/site-packages/keras/optimizers.py:466)  = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](Adam/beta_1/read, training/Adam/Variable_2/read)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

