In [1]:
import os
import re
import sys
import time
import implicit
import numpy as np
import tensorflow as tf
import pandas as pd
from scipy import sparse
from tqdm import tqdm
# from tqdm import tqdm_notebook as tqdm

In [3]:
# Notebook configuration: choose the project root and derive the data/model paths.
IS_LOCAL = False
if IS_LOCAL:
    HOME_DIR = '/mnt/E/Projects/Content-based-Neural-Recommender-Systems/'
else:
    HOME_DIR = '../'
DATA_DIR = f'{HOME_DIR}data/zen/'
WORKING_DIR = f'{HOME_DIR}models/mlp/'

# Exported before prepare_data is imported below.
# NOTE(review): presumably prepare_data reads HOME_DIR from the environment - confirm.
os.environ['HOME_DIR'] = HOME_DIR

# sys.path.append("../..")
from prepare_data import get_zen_data, zen_text_iterator, tokenize

# NOTE(review): when HOME_DIR is the relative '../', every path derived from it is
# resolved against the new working directory after this call, and re-running the
# cell changes the working directory again.
os.chdir(WORKING_DIR)

## Load data

In [4]:
# Load the dataset through the project helper; the result unpacks into items_df
# and a (train_df, test_df) pair.
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt, so the
# saved run did not finish loading - re-run before trusting anything downstream.
items_df, (train_df, test_df) = get_zen_data()

loading items: 328050it [00:32, 10015.82it/s]
loading users: 10431it [00:40, 234.66it/s]

KeyboardInterrupt: 

In [None]:
import implicit
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

## Item-user CSR matrix

In [None]:
# Flatten the per-user lists in train_df into parallel 1-D arrays (one entry per
# user-item pair) and build the item x user sparse matrix from them.
items_per_user = train_df['userItems'].apply(len).values
# Repeat each userId once per item of that user. np.repeat keeps the id dtype even
# when a user has no items (an empty Python list would promote the ids to float).
user_ids = np.repeat(train_df['userId'].values, items_per_user)
items_ids = np.concatenate(train_df['userItems'].values)
# Centre every user's ratings on that user's own mean. The per-user arrays go to
# np.concatenate as a plain list: wrapping arrays of different lengths in
# np.array(...) first raises ValueError on NumPy >= 1.24.
ratings = np.concatenate(
    [np.asarray(user_ratings) - np.mean(user_ratings)
     for user_ratings in train_df['userRatings'].values])
# Rows are indexed by item id and columns by user id; the shape is inferred from
# the largest ids present.
item_user_data = sparse.csr_matrix((ratings, (items_ids, user_ids)), dtype=np.float32)

In [None]:
# Mean rating of every row (user) of train_df, in row order.
mean_ratings = np.array([np.mean(user_ratings) for user_ratings in train_df['userRatings'].values])

## ALS

In [None]:
# Sanity check: dimensions of the sparse matrix (rows = item ids, columns = user ids).
item_user_data.shape

In [None]:
# Number of rows in train_df.
users_n = len(train_df)
# Largest value found in any row's userItems list.
# NOTE(review): this is a maximum id, not a count - confirm that is what items_n
# is meant to report.
items_n = (
    train_df['userItems']
    .apply(max)
    .max()
)
print(f'users_n: {users_n}\titems_n: {items_n}')

In [None]:
# Hyperparameters of the ALS factorisation, gathered in one place.
als_params = dict(
    factors=96,
    regularization=0.01,
    iterations=20,
    calculate_training_loss=True,
)
als_model = implicit.als.AlternatingLeastSquares(**als_params)
# NOTE(review): the matrix passed here has items as rows and users as columns;
# which orientation fit() expects depends on the installed implicit version - confirm.
als_model.fit(item_user_data)

In [None]:
# Sanity check: dimensions of the fitted user factor matrix.
als_model.user_factors.shape

In [None]:
# Sanity check: dimensions of the fitted item factor matrix.
als_model.item_factors.shape

In [None]:
# User feature matrix: the ALS user factors with each user's mean rating appended
# as one extra trailing column.
# NOTE(review): this requires user_factors to have exactly len(mean_ratings) rows,
# in the same user order - confirm.
users_matrix = np.concatenate(
    (als_model.user_factors, mean_ratings[:, np.newaxis]),
    axis=1,
)
users_matrix.shape

## Doc2Vec

In [16]:
!pwd

/data/home/Xetd71/Content-based-Neural-Recommender-Systems/models/mlp


In [None]:
# Location of the pre-trained Doc2Vec model. The default is the original
# machine-specific absolute path; set the DOC2VEC_MODEL_PATH environment variable
# to load the model from elsewhere without editing the notebook.
DOC2VEC_MODEL_PATH = os.environ.get(
    'DOC2VEC_MODEL_PATH', '/data/home/Xetd71/doc2vec_model/doc2vec_model')
# NOTE(review): gensim model files are pickle-based - only load models from a
# trusted source.
doc2vec_model = Doc2Vec.load(DOC2VEC_MODEL_PATH)

In [None]:
# One string per row of items_df: the title and the content joined by a single space.
# NOTE(review): a missing title or content would propagate as a missing value
# here - confirm both columns are fully populated.
items_text = (items_df['title'] + ' ' + items_df['content']).values

In [None]:
# Sanity check: number of item texts.
items_text.shape

In [None]:
# Embed every item text with the loaded Doc2Vec model: tokenize the text, infer
# one vector for it, and stack all vectors into a single array. tqdm reports
# progress over the items.
items_processed_text = np.array([
    doc2vec_model.infer_vector(tokenize(item_text))
    for item_text in tqdm(items_text)
])

In [None]:
# Sanity check: dimensions of the stacked text vectors.
items_processed_text.shape

In [None]:
# Convert every entry of the 'image' column to an array and stack the results.
# NOTE(review): presumably each entry is a fixed-length feature vector - confirm.
items_processed_images = np.array([np.array(image) for image in items_df['image'].values])

In [None]:
# Sanity check: dimensions of the stacked image features.
items_processed_images.shape

In [None]:
# Full item feature matrix: ALS item factors, Doc2Vec text vectors and image
# features placed side by side. Concatenating along axis=1 requires all three
# arrays to have the same number of rows.
# NOTE(review): presumably row i of every array describes the same item - confirm
# the alignment between the ALS item indices and the items_df row order.
items_matrix = np.concatenate((als_model.item_factors, items_processed_text, items_processed_images), axis=1)

In [None]:
# Sanity check: dimensions of the combined item feature matrix.
items_matrix.shape

In [None]:
# Content-only item features: text vectors and image features side by side,
# without the ALS item factors.
items_matrix2 = np.concatenate((items_processed_text, items_processed_images), axis=1)

## Ratings

In [None]:
def get_user_item_rating_matrix(users_df):
    """Flatten per-user item/rating lists into one (user, item, rating) row per pair.

    Args:
        users_df: DataFrame with a 'userId' column and list-like 'userItems' and
            'userRatings' columns. Within a row, both lists are expected to have
            the same length.

    Returns:
        Integer ndarray of shape (n_pairs, 3) whose columns are
        [userId, itemId, rating]. All values are cast to int, so fractional
        ratings are truncated. An empty DataFrame yields an array of shape (0, 3).

    Raises:
        ValueError: if the total number of items and ratings differ.
    """
    # np.concatenate cannot handle an empty sequence, so short-circuit here.
    if len(users_df) == 0:
        return np.empty((0, 3), dtype=int)

    items_per_user = users_df['userItems'].apply(len).values.astype(int)
    # Repeat each userId once per item of that user.
    user_ids = np.repeat(users_df['userId'].values, items_per_user).astype(int)
    # The per-user sequences are concatenated from a plain list; building an
    # ndarray from sequences of different lengths raises on NumPy >= 1.24.
    # The builtin int replaces the removed np.int alias (same dtype).
    items_ids = np.concatenate(list(users_df['userItems'].values)).astype(int)
    ratings = np.concatenate(list(users_df['userRatings'].values)).astype(int)
    return np.column_stack((user_ids, items_ids, ratings))

In [None]:
# (user, item, rating) rows for the users in train_df.
ratings_train_matrix = get_user_item_rating_matrix(train_df)

In [None]:
# Sanity check: number of training rows and columns.
ratings_train_matrix.shape

In [None]:
# (user, item, rating) rows for the users in test_df.
ratings_test_matrix = get_user_item_rating_matrix(test_df)

In [None]:
# Sanity check: number of test rows and columns.
ratings_test_matrix.shape

In [None]:
# Fraction of all (user, item, rating) rows that belong to the test split.
ratings_test_matrix.shape[0]/(ratings_train_matrix.shape[0] + ratings_test_matrix.shape[0])

## Save user, item, and rating matrices

In [None]:
# Output directory for the preprocessed arrays, located under DATA_DIR.
PREPROC_DIR = f'{DATA_DIR}preproc/'
# exist_ok=True keeps the cell safe to re-run and removes the gap between
# checking for the directory and creating it.
os.makedirs(PREPROC_DIR, exist_ok=True)

In [None]:
# Show the output directory path that the save cells write to.
PREPROC_DIR

In [None]:
# Persist the user/item feature matrices and both flattened rating tables as
# .npy files inside PREPROC_DIR.
preproc_outputs = (
    (f'{PREPROC_DIR}users_matrix.npy', users_matrix),
    (f'{PREPROC_DIR}items_matrix.npy', items_matrix),
    (f'{PREPROC_DIR}ratings_train_matrix.npy', ratings_train_matrix),
    (f'{PREPROC_DIR}ratings_test_matrix.npy', ratings_test_matrix),
)
for out_path, out_array in preproc_outputs:
    np.save(out_path, out_array)

In [None]:
# Also persist the raw ALS user factors (without the mean-rating column) and the
# content-only item features.
extra_outputs = (
    (f'{PREPROC_DIR}als_model.user_factors.npy', als_model.user_factors),
    (f'{PREPROC_DIR}items_matrix2.npy', items_matrix2),
)
for out_path, out_array in extra_outputs:
    np.save(out_path, out_array)