In [1]:
import os
import re
import sys
import time
import implicit
import numpy as np
import tensorflow as tf
import pandas as pd
from scipy import sparse
from tqdm import tqdm
# from tqdm import tqdm_notebook as tqdm

import implicit
from gensim.models.doc2vec import Doc2Vec

In [2]:
# Toggle between the local workstation layout and the remote-server layout.
IS_LOCAL = True

# Pick the directory that contains the project checkout for the chosen environment.
if IS_LOCAL:
    projects_root = '/mnt/E/Projects/'
else:
    projects_root = '/home/Xetd71/'

HOME_DIR = projects_root + 'Content-based-Neural-Recommender-Systems/'

# Export the project root through the process environment as well.
os.environ['HOME_DIR'] = HOME_DIR

# sys.path.append("../..")
from prepare_data import zen, tokenize

# Dataset directory, located under the project root (HOME_DIR already ends with '/').
DATA_DIR = HOME_DIR + 'data/zen/'
# WORKING_DIR = f'{HOME_DIR}utils/'
# os.chdir(WORKING_DIR)

## Load data

In [3]:
# Load the item table first, then the per-user interaction table.
items_df = zen.items_df()
train_df = zen.users_df()

loading items: 328050it [00:28, 11662.02it/s]
loading users: 42977it [01:03, 678.07it/s]


## CSR-matrix

In [4]:
# Build the item-by-user sparse matrix of mean-centred ratings.
#
# Each row of train_df holds one user together with parallel sequences
# 'userItems' and 'userRatings'; they are flattened into COO-style triplets.

# Number of interactions recorded for each user.
interactions_per_user = train_df['userItems'].apply(len).values

# Repeat every user id once per interaction. np.repeat keeps the id dtype even
# for users with zero interactions (an empty Python list would inject a
# float64 array into the concatenation and upcast all ids).
user_ids = np.repeat(train_df['userId'].values, interactions_per_user)
items_ids = np.concatenate(train_df['userItems'].values)

# Subtract each user's own mean rating. The per-user arrays have different
# lengths, so they are passed to np.concatenate as a plain list: wrapping the
# ragged list in np.array(...) raises ValueError on NumPy >= 1.24.
ratings = np.concatenate([
    user_ratings - user_ratings.mean()
    for user_ratings in train_df['userRatings'].values
])

# Rows are indexed by item id and columns by user id.
item_user_data = sparse.csr_matrix((ratings, (items_ids, user_ids)), dtype=np.float32)

In [5]:
# One mean rating per user, in train_df row order.
mean_ratings = np.array([
    np.mean(user_ratings) for user_ratings in train_df['userRatings'].values
])

## ALS

In [6]:
# Dimensions of the sparse matrix: rows are indexed by item id and columns by
# user id, following the (items_ids, user_ids) index pair used to build it.
item_user_data.shape

(328050, 42977)

In [7]:
# Number of user rows in the training frame.
users_n = len(train_df)

# Largest item id that appears in any user's item list.
# NOTE(review): this is a maximum id, not a count - if item ids are 0-based the
# number of items is items_n + 1; confirm before using it as a dimension.
largest_item_id_per_user = train_df['userItems'].apply(max)
items_n = largest_item_id_per_user.max()

print(f'users_n: {users_n}\titems_n: {items_n}')

users_n: 42977	items_n: 328049


In [8]:
# Hyper-parameters of the ALS factorisation, collected in one place.
als_hyperparams = dict(
    factors=64,
    regularization=0.01,
    iterations=20,
    calculate_training_loss=True,
)

# Fit the model on the item-by-user matrix built above.
als_model = implicit.als.AlternatingLeastSquares(**als_hyperparams)
als_model.fit(item_user_data)

100%|██████████| 20.0/20 [02:02<00:00,  5.96s/it, loss=0.00909]


In [9]:
# Shape check of the user factor matrix held by the fitted ALS model.
als_model.user_factors.shape

(42977, 64)

In [10]:
# Shape check of the item factor matrix held by the fitted ALS model.
als_model.item_factors.shape

(328050, 64)

In [27]:
# users_matrix = np.concatenate((als_model.user_factors, np.array([mean_ratings]).T), axis=1)
# users_matrix.shape

In [31]:
# User-side features: the ALS user factors plus each user's mean rating.
users_factors = als_model.user_factors

# Wrap the 1-D vector of per-user means and transpose it into a single column.
users_mean_ratings = np.array([mean_ratings]).T

# Display both shapes. The names must match the assignments above: the
# previous `user_factors` / `user_mean_ratings` spellings are never defined
# and raise NameError on a fresh kernel.
users_factors.shape, users_mean_ratings.shape

((42977, 64), (42977, 1))

## Doc2Vec

In [12]:
import gensim
import logging
# Configure root logging at INFO level with a timestamped message format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [13]:
# doc2vec_model = gensim.models.Doc2Vec(documents=zen.text_iterator(), vector_size=96, window=10, min_count=3, workers=4, epochs=7)
# doc2vec_model.save('/home/Xetd71/doc2vec_model/doc2vec_model')

In [14]:
# Location of the previously trained Doc2Vec model.
# NOTE(review): this absolute path is independent of HOME_DIR and differs from
# the save path in the commented-out training cell - confirm which one is current.
doc2vec_model_path = '/data/home/Xetd71/doc2vec_model/doc2vec_model'
doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_path)

In [15]:
# Join every item's title and content with a single space, one string per item.
item_titles = items_df['title']
item_contents = items_df['content']
items_text = (item_titles + ' ' + item_contents).values

In [16]:
# Number of concatenated title + content strings.
items_text.shape

(328050,)

In [20]:
# Tokenise each item's text and infer one Doc2Vec vector for it, in item order;
# the per-item vectors are stacked into a single array.
items_processed_text = np.array([
    doc2vec_model.infer_vector(tokenize(raw_text))
    for raw_text in tqdm(items_text)
])

100%|██████████| 328050/328050 [14:36<00:00, 374.36it/s]


In [21]:
# Shape of the stacked inferred text vectors.
items_processed_text.shape

(328050, 96)

In [22]:
# Convert each entry of the 'image' column to an array, then stack them all.
items_processed_images = np.array([
    np.array(image_entry) for image_entry in items_df['image'].values
])

In [23]:
# Shape of the array built from the 'image' column.
items_processed_images.shape

(328050, 96)

In [24]:
# items_matrix = np.concatenate((als_model.item_factors, items_processed_text, items_processed_images), axis=1)
# items_matrix2 = np.concatenate((items_processed_text, items_processed_images), axis=1)

In [32]:
# Item-side features: keep a handle on the ALS item factors.
items_factors = als_model.item_factors
# NOTE(review): a bare name in the middle of a cell is evaluated but not shown;
# only the last expression below is echoed, and it prints a large array.
# Displaying the three `.shape` values may be what was intended - confirm.
items_processed_text
items_processed_images

array([[-0.169,  0.129,  0.067, ..., -0.428, -0.128,  0.145],
       [-0.158, -0.112, -0.325, ...,  0.277,  0.042,  0.149],
       [ 0.084, -0.181,  0.008, ..., -0.108, -0.359, -0.127],
       ...,
       [ 0.027, -0.148, -0.054, ...,  0.209, -0.024,  0.093],
       [ 0.176,  0.205, -0.061, ..., -0.002, -0.187,  0.001],
       [ 0.062,  0.004, -0.299, ...,  0.187,  0.119,  0.299]])

## Ratings

In [8]:
def get_user_item_rating_matrix(users_df):
    """Flatten per-user interaction lists into one integer array of triplets.

    Parameters
    ----------
    users_df : pandas.DataFrame
        Must provide the columns ``'userId'`` (one id per row), ``'userItems'``
        and ``'userRatings'`` (per-row sequences of equal length).

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_interactions, 3)`` whose columns are
        ``(user id, item id, rating)``. Rows follow the order of ``users_df``
        and, within a user, the order of that user's item list. An empty frame
        yields an array of shape ``(0, 3)``.
    """
    # np.concatenate refuses an empty sequence, so handle "no users" up front.
    if len(users_df) == 0:
        return np.empty((0, 3), dtype=int)

    # Repeat every user id once per interaction recorded for that user.
    interactions_per_user = users_df['userItems'].apply(len).values
    user_ids = np.repeat(users_df['userId'].values, interactions_per_user).astype(int)

    items_ids = np.concatenate(users_df['userItems'].values).astype(int)

    # Read the ratings from the frame that was passed in, not from the global
    # train_df, so the function also works for other frames (e.g. a test split).
    # The builtin `int` replaces the `np.int` alias removed in NumPy 1.24.
    ratings = np.concatenate(users_df['userRatings'].values).astype(int)

    return np.column_stack((user_ids, items_ids, ratings))

In [9]:
# ratings_train_matrix = get_user_item_rating_matrix(train_df)

# ratings_train_matrix.shape

# ratings_test_matrix = get_user_item_rating_matrix(test_df)

# ratings_test_matrix.shape

# ratings_test_matrix.shape[0]/(ratings_train_matrix.shape[0] + ratings_test_matrix.shape[0])

In [10]:
# Flatten the training interactions into (user id, item id, rating) rows.
ratings_matrix = get_user_item_rating_matrix(users_df=train_df)

In [11]:
# Shape check: one row per interaction, with the three columns assembled by
# get_user_item_rating_matrix (user id, item id, rating).
ratings_matrix.shape

(67780168, 3)

In [12]:
# Keep only the interactions whose rating column (index 2) equals 0.
has_zero_rating = ratings_matrix[:, 2] == 0
ratings_0_matrix = ratings_matrix[has_zero_rating]

In [13]:
# Keep only the interactions whose rating column (index 2) equals 1.
has_one_rating = ratings_matrix[:, 2] == 1
ratings_1_matrix = ratings_matrix[has_one_rating]

In [14]:
# Display the rows whose rating column equals 1.
ratings_1_matrix

array([[     0,  93250,      1],
       [     0, 304018,      1],
       [     0, 213209,      1],
       ...,
       [ 42976,  97896,      1],
       [ 42976,  15659,      1],
       [ 42976, 273343,      1]])

## Save users, items and ratings matrices

In [15]:
# Output directory for the preprocessed arrays, located under DATA_DIR.
PREPROC_DIR = f'{DATA_DIR}preproc/'

# exist_ok=True creates the directory (and any missing parents) when needed and
# is a no-op when it already exists, so there is no separate existence check
# that could race with another process creating the same directory.
os.makedirs(PREPROC_DIR, exist_ok=True)

In [16]:
# Echo the resolved output directory.
PREPROC_DIR

'/mnt/E/Projects/Content-based-Neural-Recommender-Systems/data/zen/preproc/'

In [17]:
# np.save(f'{PREPROC_DIR}users_matrix.npy', users_matrix)
# np.save(f'{PREPROC_DIR}items_matrix.npy', items_matrix)
# np.save(f'{PREPROC_DIR}ratings_train_matrix.npy', ratings_train_matrix)
# np.save(f'{PREPROC_DIR}ratings_test_matrix.npy', ratings_test_matrix)
# np.save(f'{PREPROC_DIR}als_model.user_factors.npy', als_model.user_factors)
# np.save(f'{PREPROC_DIR}items_matrix2.npy', items_matrix2)

In [18]:
# # users
# np.save(f'{PREPROC_DIR}users_factors.npy', users_factors)
# np.save(f'{PREPROC_DIR}users_mean_ratings.npy', users_mean_ratings)

# # items
# np.save(f'{PREPROC_DIR}items_factors.npy', items_factors)
# np.save(f'{PREPROC_DIR}items_processed_text.npy', items_processed_text)
# np.save(f'{PREPROC_DIR}items_processed_images.npy', items_processed_images)

# ratings
# Persist the full ratings matrix and its two per-label subsets as .npy files.
ratings_outputs = (
    (f'{PREPROC_DIR}ratings_matrix.npy', ratings_matrix),
    (f'{PREPROC_DIR}ratings_0_matrix.npy', ratings_0_matrix),
    (f'{PREPROC_DIR}ratings_1_matrix.npy', ratings_1_matrix),
)
for output_path, output_matrix in ratings_outputs:
    np.save(output_path, output_matrix)

In [46]:
# Write the two per-label matrices again; these are the same target files that
# the ratings-save cell directly above already writes.
label_outputs = (
    (f'{PREPROC_DIR}ratings_0_matrix.npy', ratings_0_matrix),
    (f'{PREPROC_DIR}ratings_1_matrix.npy', ratings_1_matrix),
)
for output_path, output_matrix in label_outputs:
    np.save(output_path, output_matrix)