In [1]:
import os
import re
import sys
import time
import gzip
import json
import implicit
import numpy as np
import tensorflow as tf
import pandas as pd
from scipy import sparse
from tqdm import tqdm_notebook as tqdm

In [2]:
IS_LOCAL = False
HOME_DIR = '/mnt/E/Projects/Content-based-Neural-Recommender-Systems/' if IS_LOCAL else '../../'
os.environ['HOME_DIR'] = HOME_DIR

sys.path.append("../..")
from utils.prepare_data import utf8_preview

DATA_DIR = f'{HOME_DIR}data/zen/'
WORKING_DIR = f'{HOME_DIR}/models/als_item_vector_prediction/'
os.chdir(WORKING_DIR)

## Load data

In [3]:
image_size = 96
items_df = []
for line in tqdm(gzip.GzipFile(f"{DATA_DIR}items.json.gz", "r"), 'loading items'):
    j = json.loads(line)
    j["content"] = j["content"].encode("utf8")  # storing in utf8 saves RAM
    j["title"] = j["title"].encode("utf8")
    if np.isnan(j["image"]).any():
        j["image"] = [0]*image_size
    items_df.append(j)
items_df = pd.DataFrame(items_df).apply(utf8_preview)

HBox(children=(IntProgress(value=1, bar_style='info', description='loading items', max=1, style=ProgressStyle(…




In [4]:
test_size = 0.2

In [5]:
split_n = int(items_df.shape[0]*(1-test_size))
items_train_df, items_test_df = items_df.iloc[:split_n], items_df.iloc[split_n:]

In [6]:
split_n

262440

In [7]:
items_train_df.shape, items_test_df.shape

((262440, 4), (65610, 4))

In [8]:
def split_items(user_items, user_ratings, split_n):
    user_items, user_ratings = np.array(user_items), np.array(user_ratings)
    train_idxs = user_items < split_n
    test_idxs = train_idxs ^ True
    return (user_items[train_idxs], user_ratings[train_idxs]), (user_items[test_idxs], user_ratings[test_idxs])

In [9]:
users_train_df, users_test_df = [], []
for line in tqdm(gzip.GzipFile(f"{DATA_DIR}train.json.gz", "r"), 'loading users'):
    j = json.loads(line)
    user_items = []
    user_ratings = []
    for item, rating in j["trainRatings"].items():
        user_items.append(int(item))
        user_ratings.append(int(rating))

    user_train, user_test = split_items(user_items, user_ratings, split_n)
    users_train_df.append({
        'userId': j["userId"],
        'userItems': np.array(user_train[0]),
        'userRatings': np.array(user_train[1]),
    })
    users_test_df.append({
        'userId': j["userId"],
        'userItems': np.array(user_test[0]),
        'userRatings': np.array(user_test[1]),
    })
users_train_df = pd.DataFrame(users_train_df)
users_test_df = pd.DataFrame(users_test_df)

HBox(children=(IntProgress(value=1, bar_style='info', description='loading users', max=1, style=ProgressStyle(…




In [10]:
train_df, test_df = users_train_df, users_test_df

In [11]:
tmp = []
for t in train_df['userItems']:
    tmp += t.tolist()
for t in test_df['userItems']:
    tmp += t.tolist()

In [12]:
max(tmp)

328049

In [13]:
len(set(tmp))

242356

In [14]:
max(train_df['userItems'].apply(max))

262436

## CSR-matrix

In [15]:
user_ids = np.concatenate([[v]*v_len for v, v_len in zip(
    train_df['userId'].values, train_df['userItems'].apply(len).values)])
items_ids = np.concatenate(train_df['userItems'].values)
ratings = np.concatenate(np.array(list(map(lambda x: x - x.mean(), train_df['userRatings'].values))))
item_user_data = sparse.csr_matrix((ratings, (items_ids, user_ids)), dtype=np.float32)

In [16]:
mean_ratings = np.array(list(map(np.mean, train_df['userRatings'].values)))

## ALS

In [17]:
item_user_data.shape

(262437, 42977)

In [18]:
users_n = train_df.shape[0]
items_n = train_df['userItems'].apply(max).max()
print(f'users_n: {users_n}\titems_n: {items_n}')

users_n: 42977	items_n: 262436


In [19]:
als_model = implicit.als.AlternatingLeastSquares(
    factors=64,
    regularization=0.01,
    iterations=20,
    calculate_training_loss=True
)
als_model.fit(item_user_data)

100%|██████████| 20.0/20 [02:20<00:00,  7.00s/it, loss=-.0051] 


In [20]:
als_model.user_factors.shape

(42977, 64)

In [21]:
als_model.item_factors.shape

(262437, 64)

In [22]:
als_item_factors = als_model.item_factors

In [48]:
als_model.item_factors

array([[-5.05262380e-03, -4.06796020e-03, -4.03186120e-03, ...,
         1.55082997e-02,  3.85848503e-03,  5.13289729e-03],
       [ 1.13298872e-03, -1.84161053e-03,  7.36503804e-04, ...,
         7.61957839e-04, -1.35281123e-03, -5.82765148e-04],
       [ 8.29866854e-04, -2.04821746e-03,  1.00235466e-03, ...,
         1.20832480e-03,  1.77613998e-04, -5.55109698e-04],
       ...,
       [ 3.12762240e-13,  7.91967240e-13,  6.88148052e-13, ...,
         3.03570672e-13,  5.03090679e-13,  5.57207275e-13],
       [-3.79268755e-03,  1.41567586e-03, -2.53510545e-03, ...,
         1.70553988e-03, -2.13955436e-03, -1.99202754e-04],
       [-1.11668836e-04, -1.10197783e-04, -5.40284556e-04, ...,
         6.78873621e-04,  2.43592658e-03,  1.34661025e-03]], dtype=float32)

In [49]:
als_model.item_factors[0]

array([-5.0526238e-03, -4.0679602e-03, -4.0318612e-03, -4.4577086e-04,
       -1.0077970e-03,  3.5765825e-03, -3.7928505e-03, -2.0355291e-03,
        5.7448326e-03,  6.0828752e-03,  7.0429774e-04,  3.8939731e-03,
       -3.0015018e-03, -1.0716072e-03,  8.4128249e-03, -1.0839609e-03,
        3.4444178e-03, -7.2769960e-03,  2.8345420e-04,  1.1460707e-03,
        3.9740289e-03, -1.2699587e-03,  7.7811764e-03,  5.7185376e-03,
        2.0686530e-03, -6.3279546e-03,  1.0030845e-02,  1.8581138e-03,
       -5.0841016e-03,  6.0986234e-03,  1.0523131e-02,  6.7430054e-04,
        1.5817288e-03, -1.9593611e-03, -5.7953843e-03, -9.1385422e-03,
        4.8826533e-04, -4.0258975e-03,  3.7696105e-03, -7.3920283e-03,
        1.6722352e-03, -7.1544624e-03,  3.5342048e-03,  4.8683264e-04,
       -2.9618014e-04,  8.4422491e-03,  3.1182331e-03, -1.2281337e-03,
       -4.5516933e-03,  4.0876297e-03,  5.8943468e-05,  2.4722945e-03,
       -2.3465338e-03,  1.6390193e-04,  5.4877187e-04,  3.7642368e-03,
      

In [50]:
als_model.item_factors[0].sum()

0.064279005

## items_context

In [23]:
PREPROC_DIR = f'{DATA_DIR}preproc/'
items_matrix = np.load(f'{PREPROC_DIR}items_matrix2.npy')

In [24]:
items_matrix.shape

(328050, 160)

In [25]:
items_matrix_train, items_matrix_test = items_matrix[:split_n], items_matrix[split_n:]

In [26]:
items_matrix_train.shape, items_matrix_test.shape

((262440, 160), (65610, 160))

In [27]:
# assert(als_item_factors.shape[0] == items_matrix_train.shape[0])

## model to predict als-embedding

In [28]:
ITEM_EMBEDDING_SHAPE = 160
ALS_EMBEDDING_SHAPE = 64

In [29]:
import tensorflow as tf

In [30]:
def reset_tf_session():
    curr_session = tf.get_default_session()
    # close current session
    if curr_session is not None:
        curr_session.close()
    # reset graph
    tf.reset_default_graph()
    
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    return tf.Session(config=config)

In [31]:
sess = reset_tf_session()
tf.set_random_seed(42)

In [32]:
class model:
    item_embedding = tf.placeholder('float32', shape=[None, ITEM_EMBEDDING_SHAPE])
    als_item = tf.placeholder('float32', shape=[None, ALS_EMBEDDING_SHAPE])
    
    layer = tf.layers.dense(item_embedding, 256, tf.nn.elu, kernel_initializer=tf.random_normal_initializer)
    layer = tf.layers.dropout(layer, 0.2)
    layer = tf.layers.dense(layer, 256, tf.nn.elu, kernel_initializer=tf.random_normal_initializer)
    layer = tf.layers.dropout(layer, 0.2)
    layer = tf.layers.dense(layer, 128, tf.nn.elu, kernel_initializer=tf.random_normal_initializer)
    layer = tf.layers.dropout(layer, 0.2)
    layer = tf.layers.dense(layer, 64, None, kernel_initializer=tf.random_normal_initializer)
    
    loss = tf.reduce_sum(tf.square(layer - als_item))

In [33]:
# define optimizer operation to minimize the loss
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
train_step = optimizer.minimize(model.loss)

# will be used to save/load network weights.
# you need to reset your default graph and define it in the same way to be able to load the saved weights!
saver = tf.train.Saver()

# intialize all variables
sess.run(tf.global_variables_initializer())

In [34]:
EPOCHS = 10
BATCH_SIZE = 64
N_BATCHES_PER_EPOCH = 90_000

# SPLIT_N = 262437
SPLIT_N = min(als_item_factors.shape[0], items_matrix_train.shape[0])
als_item_factors = als_item_factors[:SPLIT_N]
items_matrix_train = items_matrix_train[:SPLIT_N]

In [35]:
def items_embedding_als_batch():
    indxs = np.random.randint(0, SPLIT_N, BATCH_SIZE)
    return {
        model.item_embedding: items_matrix_train[indxs],
        model.als_item: als_item_factors[indxs]
    }

In [36]:
import tqdm_utils

In [37]:
# to make training reproducible
np.random.seed(42)

for epoch in range(EPOCHS):
    
    train_loss = 0
    pbar = tqdm_utils.tqdm_notebook_failsafe(range(N_BATCHES_PER_EPOCH))
    counter = 0
    for i, _ in enumerate(pbar):
        batch_train_loss, _ = sess.run(
            [model.loss, train_step], 
            items_embedding_als_batch()
        )
        train_loss += batch_train_loss
        counter += 1
        pbar.set_description("Training loss: %f" % (train_loss / counter))
        
    train_loss /= N_BATCHES_PER_EPOCH
    
    print('Epoch: {}, train loss: {}'.format(epoch, train_loss))

print("Finished!")

HBox(children=(IntProgress(value=0, max=90000), HTML(value='')))


Epoch: 0, train loss: 25839046.61228021


HBox(children=(IntProgress(value=0, max=90000), HTML(value='')))


Epoch: 1, train loss: 7.492560412728124


HBox(children=(IntProgress(value=0, max=90000), HTML(value='')))


Epoch: 2, train loss: 7.36990354383207


HBox(children=(IntProgress(value=0, max=90000), HTML(value='')))


Epoch: 3, train loss: 7.5161634179261


HBox(children=(IntProgress(value=0, max=90000), HTML(value='')))


Epoch: 4, train loss: 7.517902467620207


HBox(children=(IntProgress(value=0, max=90000), HTML(value='')))


Epoch: 5, train loss: 7.4348320710059665


HBox(children=(IntProgress(value=0, max=90000), HTML(value='')))


Epoch: 6, train loss: 7.573778399551577


HBox(children=(IntProgress(value=0, max=90000), HTML(value='')))


Epoch: 7, train loss: 7.406600799159871


HBox(children=(IntProgress(value=0, max=90000), HTML(value='')))

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceed


Epoch: 9, train loss: 7.50356530638155
Finished!


In [38]:
items_matrix_test.shape

(65610, 160)

In [39]:
%%time
als_item_factors_predict = sess.run(model.layer, feed_dict={model.item_embedding: items_matrix_test})

CPU times: user 50.5 ms, sys: 29.3 ms, total: 79.8 ms
Wall time: 78.5 ms


In [40]:
als_item_factors_predict.shape

(65610, 64)

In [41]:
split_n

262440

In [42]:
sum(als_item_factors[0]**2)

0.0017362930558815215

In [43]:
als_item_factors_predict

array([[ 0.00864422,  0.0083251 ,  0.01091623, ..., -0.01176611,
         0.00303996,  0.00153351],
       [ 0.00864422,  0.0083251 ,  0.01091623, ..., -0.01176611,
         0.00303996,  0.00153351],
       [ 0.00864422,  0.0083251 ,  0.01091623, ..., -0.01176611,
         0.00303996,  0.00153351],
       ...,
       [ 0.00864422,  0.0083251 ,  0.01091623, ..., -0.01176611,
         0.00303996,  0.00153351],
       [ 0.00864422,  0.0083251 ,  0.01091623, ..., -0.01176611,
         0.00303996,  0.00153351],
       [ 0.00864422,  0.0083251 ,  0.01091623, ..., -0.01176611,
         0.00303996,  0.00153351]], dtype=float32)

In [44]:
from sklearn.metrics import log_loss

In [45]:
loss = 0
eps = 1e-06
for _, (idd, items, ratings) in tqdm(test_df.iterrows()):
    predictied_ratings = als_item_factors_predict.dot(als_model.user_factors[idd])[items-split_n]+mean_ratings[idd]
#     predictied_ratings = (predictied_ratings - predictied_ratings.min()) / (predictied_ratings.max() - predictied_ratings.min())
    predictied_ratings = np.clip(predictied_ratings, eps, 1 - eps)
#     print(predictied_ratings[ratings==1])
    if (ratings == 0).all() or (ratings == 1).all():
        continue
    loss += log_loss(ratings.astype(np.float64), predictied_ratings)
loss /= test_df.shape[0]

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [46]:
loss

0.3671378158508988

In [47]:
predictied_ratings[ratings==1]

array([0.03495412, 0.03495412, 0.03495412, 0.03495412, 0.03495412,
       0.03495412, 0.03495412, 0.03495412, 0.03495412, 0.03495412,
       0.03495412, 0.03495412, 0.03495412, 0.03495412, 0.03495412,
       0.03495412, 0.03495412, 0.03495412, 0.03495412, 0.03495412,
       0.03495412, 0.03495412, 0.03495412, 0.03495412, 0.03495412,
       0.03495412, 0.03495412, 0.03495412, 0.03495412, 0.03495412,
       0.03495412, 0.03495412, 0.03495412, 0.03495412, 0.03495412,
       0.03495412], dtype=float32)