In [1]:
import os
import re
import sys
import time
import gzip
import json
import implicit
import numpy as np
import tensorflow as tf
import pandas as pd
from scipy import sparse
from tqdm import tqdm_notebook as tqdm

In [2]:
IS_LOCAL = False
HOME_DIR = '/mnt/E/Projects/Content-based-Neural-Recommender-Systems/' if IS_LOCAL else '../../'
os.environ['HOME_DIR'] = HOME_DIR

sys.path.append("../..")
from utils.prepare_data import utf8_preview

DATA_DIR = f'{HOME_DIR}data/zen/'
WORKING_DIR = f'{HOME_DIR}/models/als_item_vector_prediction/'
os.chdir(WORKING_DIR)

## Load data

In [3]:
image_size = 96
items_df = []
for line in tqdm(gzip.GzipFile(f"{DATA_DIR}items.json.gz", "r"), 'loading items'):
    j = json.loads(line)
    j["content"] = j["content"].encode("utf8")  # storing in utf8 saves RAM
    j["title"] = j["title"].encode("utf8")
    if np.isnan(j["image"]).any():
        j["image"] = [0]*image_size
    items_df.append(j)
items_df = pd.DataFrame(items_df).apply(utf8_preview)

HBox(children=(IntProgress(value=1, bar_style='info', description='loading items', max=1, style=ProgressStyle(…




In [4]:
test_size = 0.2

In [5]:
split_n = int(items_df.shape[0]*(1-test_size))
items_train_df, items_test_df = items_df.iloc[:split_n], items_df.iloc[split_n:]

In [6]:
split_n

262440

In [7]:
items_train_df.shape, items_test_df.shape

((262440, 4), (65610, 4))

In [8]:
def split_items(user_items, user_ratings, split_n):
    user_items, user_ratings = np.array(user_items), np.array(user_ratings)
    train_idxs = user_items < split_n
    test_idxs = train_idxs ^ True
    return (user_items[train_idxs], user_ratings[train_idxs]), (user_items[test_idxs], user_ratings[test_idxs])

In [9]:
users_train_df, users_test_df = [], []
for line in tqdm(gzip.GzipFile(f"{DATA_DIR}train.json.gz", "r"), 'loading users'):
    j = json.loads(line)
    user_items = []
    user_ratings = []
    for item, rating in j["trainRatings"].items():
        user_items.append(int(item))
        user_ratings.append(int(rating))

    user_train, user_test = split_items(user_items, user_ratings, split_n)
    users_train_df.append({
        'userId': j["userId"],
        'userItems': np.array(user_train[0]),
        'userRatings': np.array(user_train[1]),
    })
    users_test_df.append({
        'userId': j["userId"],
        'userItems': np.array(user_test[0]),
        'userRatings': np.array(user_test[1]),
    })
users_train_df = pd.DataFrame(users_train_df)
users_test_df = pd.DataFrame(users_test_df)

HBox(children=(IntProgress(value=1, bar_style='info', description='loading users', max=1, style=ProgressStyle(…




In [10]:
train_df, test_df = users_train_df, users_test_df

In [11]:
tmp = []
for t in train_df['userItems']:
    tmp += t.tolist()
for t in test_df['userItems']:
    tmp += t.tolist()

In [12]:
max(tmp)

328049

In [13]:
len(set(tmp))

242356

In [14]:
max(train_df['userItems'].apply(max))

262436

## CSR-matrix

In [15]:
user_ids = np.concatenate([[v]*v_len for v, v_len in zip(
    train_df['userId'].values, train_df['userItems'].apply(len).values)])
items_ids = np.concatenate(train_df['userItems'].values)
ratings = np.concatenate(np.array(list(map(lambda x: x - x.mean(), train_df['userRatings'].values))))
item_user_data = sparse.csr_matrix((ratings, (items_ids, user_ids)), dtype=np.float32)

In [None]:
mean_ratings = np.array(list(map(np.mean, train_df['userRatings'].values)))

## ALS

In [None]:
item_user_data.shape

(262437, 42977)

In [None]:
users_n = train_df.shape[0]
items_n = train_df['userItems'].apply(max).max()
print(f'users_n: {users_n}\titems_n: {items_n}')

users_n: 42977	items_n: 262436


In [None]:
als_model = implicit.als.AlternatingLeastSquares(
    factors=64,
    regularization=0.01,
    iterations=20,
    calculate_training_loss=True
)
als_model.fit(item_user_data)

100%|██████████| 20.0/20 [02:20<00:00,  7.00s/it, loss=-.0051] 


In [None]:
als_model.user_factors.shape

(42977, 64)

In [None]:
als_model.item_factors.shape

(262437, 64)

In [None]:
als_item_factors = als_model.item_factors

## items_context

In [None]:
PREPROC_DIR = f'{DATA_DIR}preproc/'
items_matrix = np.load(f'{PREPROC_DIR}items_matrix2.npy')

In [None]:
items_matrix.shape

(328050, 160)

In [None]:
items_matrix_train, items_matrix_test = items_matrix[:split_n], items_matrix[split_n:]

In [None]:
items_matrix_train.shape, items_matrix_test.shape

((262440, 160), (65610, 160))

In [None]:
# assert(als_item_factors.shape[0] == items_matrix_train.shape[0])

## model to predict als-embedding

In [None]:
ITEM_EMBEDDING_SHAPE = 160
ALS_EMBEDDING_SHAPE = 64

In [None]:
import tensorflow as tf

In [None]:
def reset_tf_session():
    curr_session = tf.get_default_session()
    # close current session
    if curr_session is not None:
        curr_session.close()
    # reset graph
    tf.reset_default_graph()
    
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    return tf.Session(config=config)

In [None]:
sess = reset_tf_session()
tf.set_random_seed(42)

In [None]:
class model:
    item_embedding = tf.placeholder('float32', shape=[None, ITEM_EMBEDDING_SHAPE])
    als_item = tf.placeholder('float32', shape=[None, ALS_EMBEDDING_SHAPE])
    
    layer = tf.layers.dense(item_embedding, 256, tf.nn.elu, kernel_initializer=tf.random_normal_initializer)
    layer = tf.layers.dropout(layer, 0.2)
    layer = tf.layers.dense(layer, 256, tf.nn.elu, kernel_initializer=tf.random_normal_initializer)
    layer = tf.layers.dropout(layer, 0.2)
    layer = tf.layers.dense(layer, 128, tf.nn.elu, kernel_initializer=tf.random_normal_initializer)
    layer = tf.layers.dropout(layer, 0.2)
    layer = tf.layers.dense(layer, 64, None, kernel_initializer=tf.random_normal_initializer)
    
    loss = tf.reduce_sum(tf.square(layer - als_item))

In [None]:
# define optimizer operation to minimize the loss
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
train_step = optimizer.minimize(model.loss)

# will be used to save/load network weights.
# you need to reset your default graph and define it in the same way to be able to load the saved weights!
saver = tf.train.Saver()

# intialize all variables
sess.run(tf.global_variables_initializer())

In [None]:
EPOCHS = 10
BATCH_SIZE = 64
N_BATCHES_PER_EPOCH = 90_000

# SPLIT_N = 262437
SPLIT_N = min(als_item_factors.shape[0], items_matrix_train.shape[0])
als_item_factors = als_item_factors[:SPLIT_N]
items_matrix_train = items_matrix_train[:SPLIT_N]

In [None]:
def items_embedding_als_batch():
    indxs = np.random.randint(0, SPLIT_N, BATCH_SIZE)
    return {
        model.item_embedding: items_matrix_train[indxs],
        model.als_item: als_item_factors[indxs]
    }

In [None]:
import tqdm_utils

In [None]:
# to make training reproducible
np.random.seed(42)

for epoch in range(EPOCHS):
    
    train_loss = 0
    pbar = tqdm_utils.tqdm_notebook_failsafe(range(N_BATCHES_PER_EPOCH))
    counter = 0
    for i, _ in enumerate(pbar):
        batch_train_loss, _ = sess.run(
            [model.loss, train_step], 
            items_embedding_als_batch()
        )
        train_loss += batch_train_loss
        counter += 1
        pbar.set_description("Training loss: %f" % (train_loss / counter))
        
    train_loss /= N_BATCHES_PER_EPOCH
    
    print('Epoch: {}, train loss: {}'.format(epoch, train_loss))

print("Finished!")

HBox(children=(IntProgress(value=0, max=90000), HTML(value='')))

In [None]:
items_matrix_test.shape

In [None]:
%%time
als_item_factors_predict = sess.run(model.layer, feed_dict={model.item_embedding: items_matrix_test})

In [None]:
als_item_factors_predict.shape

In [None]:
split_n

In [None]:
sum(als_item_factors[0]**2)

In [None]:
als_item_factors_predict

In [None]:
from sklearn.metrics import log_loss

In [None]:
loss = 0
eps = 1e-06
for _, (idd, items, ratings) in tqdm(test_df.iterrows()):
    predictied_ratings = als_item_factors_predict.dot(als_model.user_factors[idd])[items-split_n]+mean_ratings[idd]
#     predictied_ratings = (predictied_ratings - predictied_ratings.min()) / (predictied_ratings.max() - predictied_ratings.min())
    predictied_ratings = np.clip(predictied_ratings, eps, 1 - eps)
#     print(predictied_ratings[ratings==1])
    if (ratings == 0).all() or (ratings == 1).all():
        continue
    loss += log_loss(ratings.astype(np.float64), predictied_ratings)
loss /= test_df.shape[0]

In [None]:
loss

In [None]:
predictied_ratings[ratings==1]