In [None]:
import numpy as np
import pandas as pd
from os import path
from collections import OrderedDict
from tqdm import tqdm
from typing import Dict

import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Embedding, Flatten, Input, Lambda
from tensorflow.keras.optimizers import Adam, SGD
import tensorflow.keras.backend as K
from tensorflow.keras.utils import plot_model

from sklearn.metrics import roc_auc_score

from itertools import islice

In [None]:
# Single seed shared by every random number generator the notebook uses.
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)

In [None]:
# columns = ['user_id', 'item_id', 'rating', 'timestamp']
# ratings = pd.read_csv('../ml-100k/u.data', sep='\t', names=columns)
# ratings.drop('timestamp', axis=1, inplace=True)
#
# columns = ['item_id', 'movie title', 'release date', 'video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
# movies = pd.read_csv('../ml-100k/u.item', sep='|', names=columns, encoding='latin-1')
# movies = movies[['item_id', 'movie title']]

In [None]:
# Load the ratings, drop the surrogate 'id' column and rename 'artwork_id' to the
# 'item_id' name used by the rest of the notebook. (The former self-assignments of
# 'user_id' and 'artwork_id' were no-ops and have been removed.)
ratings = (
    pd.read_csv("test_rating.csv")
    .drop(columns=['id'])
    .rename(columns={"artwork_id": "item_id"})
)
ratings

In [None]:
# Show the ten largest item ids present in the ratings.
sorted(ratings.item_id.unique())[-10:]

In [None]:
# Show the ten smallest user ids present in the ratings.
sorted(ratings.user_id.unique())[:10]

In [None]:
# Map every raw id to its position (as int32) in order of first appearance.
unique_users = ratings['user_id'].unique()
user_ids = {raw_id: np.int32(position) for position, raw_id in enumerate(unique_users)}

unique_items = ratings['item_id'].unique()
item_ids = {raw_id: np.int32(position) for position, raw_id in enumerate(unique_items)}

In [None]:
# Per user: how many items were rated above 4 (the count lands in the 'item_id' column).
tmp_test = (
    ratings.loc[ratings.rating > 4]
    .groupby('user_id')['item_id']
    .count()
    .reset_index()
)

In [None]:
# Hold out the first two >4 ratings of every user who has more than 20 such ratings.
conditions = (
    ratings.user_id.isin(tmp_test.loc[tmp_test.item_id > 20, 'user_id'])
    & (ratings.rating > 4)
)
df_test = (
    ratings[conditions]
    .groupby('user_id')
    .head(2)
    .reset_index(drop=True)  # discard the original row labels
)
df_test.shape

In [None]:
# One row per user: the list of held-out item ids.
ground_truth_test = (
    df_test
    .groupby('user_id')['item_id']
    .agg(list)
    .reset_index()
)
ground_truth_test.head()

In [None]:
# Training set = ratings minus the held-out rows: after stacking both frames, every
# row that occurs more than once is removed entirely (keep=False).
df_train = (
    pd.concat([ratings, df_test])
    .drop_duplicates(keep=False)
)
df_train.shape

In [None]:
# One row per user: the list of training items rated above 3.
ground_truth_train = (
    df_train.loc[df_train.rating > 3]
    .groupby('user_id')['item_id']
    .agg(list)
    .reset_index()
)

ground_truth_train.head()

In [None]:
# Empty frame declaring the (user, positive item, negative item) triple schema.
df_triples = pd.DataFrame(columns=['user_id', 'positive_id', 'negative_id'], dtype=np.int32)
df_triples

In [None]:
# Build the training triples: for every user, pair each item rated > 3 (positive)
# with each item rated <= 3 (negative).
data = []
users_without_data = []  # users missing either a positive or a negative item

# A single groupby pass replaces two full-frame scans per user; sort=False keeps
# users in order of first appearance, as iterating user_id.unique() did.
for user_id, user_ratings in tqdm(df_train.groupby('user_id', sort=False)):
    positive_items = user_ratings.loc[user_ratings.rating > 3, 'item_id'].values
    negative_items = user_ratings.loc[user_ratings.rating <= 3, 'item_id'].values

    if negative_items.shape[0] == 0 or positive_items.shape[0] == 0:
        users_without_data.append(user_id)
        continue

    data.extend(
        {'user_id': user_id, "positive_id": positive_item, "negative_id": negative_item}
        for positive_item in positive_items
        for negative_item in negative_items
    )

# DataFrame.append was removed in pandas 2.0; build the frame in one go instead.
df_triples = pd.DataFrame(data, columns=['user_id', 'positive_id', 'negative_id'])

In [None]:
# Number of generated triples (rows) and columns.
df_triples.shape

In [None]:
# Size of the training ratings, for comparison with the triple count.
df_train.shape

In [None]:
# Table sizes are "largest raw id + 1" so that a raw id can be used directly as a row index.
n_users = unique_users.max() + 1
n_items = unique_items.max() + 1

n_users, n_items

In [None]:
# Every distinct item id in the ratings, as a list (passed as `items` to the metric functions).
unique_item_ids = list(ratings.item_id.unique())

In [None]:
def bpr_predict(model: Model, user_id: int, item_ids: list, user_layer="user_embedding", item_layer="item_embedding"):
    """
    Score items for one user as the dot product of their embedding vectors.

    :param model: object exposing ``get_layer(name).get_weights()``; the first weight
        array of each named layer is used as the embedding table
    :param user_id: row index into the user embedding table
    :param item_ids: row indices into the item embedding table
    :param user_layer: name of the layer holding the user embeddings
    :param item_layer: name of the layer holding the item embeddings
    :return: array with one score per entry of ``item_ids``, in the same order
    """
    user_table = model.get_layer(user_layer).get_weights()[0]
    user_vector = user_table[user_id]

    item_table = model.get_layer(item_layer).get_weights()[0]
    candidate_vectors = item_table[item_ids]

    return np.dot(user_vector, candidate_vectors.T)

In [None]:
@tf.function
def identity_loss(_, y_pred):
    """
    Loss that ignores the targets and returns the mean of the model output.

    :param _: target values; unused
    :param y_pred: model output tensor
    :return: scalar mean over all elements of ``y_pred``
    """
    mean_output = tf.math.reduce_mean(y_pred)
    return mean_output

In [None]:
@tf.function
def bpr_triples_loss(X: dict):
    """
    Pairwise ranking loss for one (positive item, negative item, user) triple of embeddings.

    :param X: sequence unpacked as ``(positive_item_latent, negative_item_latent, user_latent)``
        (despite the ``dict`` annotation, the value is unpacked positionally)
    :return: ``1 - sigmoid(user·positive - user·negative)``, with the dot products taken
        over the last axis and that axis kept with size 1
    """
    positive_item_latent, negative_item_latent, user_latent = X

    # Dot product of the user vector with each item vector.
    positive_score = tf.math.reduce_sum(user_latent * positive_item_latent, axis=-1, keepdims=True)
    negative_score = tf.math.reduce_sum(user_latent * negative_item_latent, axis=-1, keepdims=True)

    margin = positive_score - negative_score
    return tf.constant(1.0) - tf.sigmoid(margin)

In [None]:
def out_shape(shapes):
    """
    Return the first entry of ``shapes``.

    :param shapes: indexable collection of shapes
    :return: ``shapes[0]``
    """
    first_shape = shapes[0]
    return first_shape

In [None]:
def build_model(num_users: int, num_items: int, latent_dim: int) -> Model:
    """
    Assemble the triple model: three id inputs, two embedding tables and a loss output.

    The positive and negative item inputs share one embedding layer ("item_embedding");
    the user input has its own ("user_embedding"). The model output is the value of
    ``bpr_triples_loss`` computed from the three flattened embeddings.

    :param num_users: number of rows in the user embedding table
    :param num_items: number of rows in the item embedding table
    :param latent_dim: width of every embedding vector
    :return: uncompiled model with inputs ordered (positive item, negative item, user)
    """
    def id_input(name):
        # One id per sample.
        return Input(shape=(1, ), name=name)

    user_input = id_input('user_input')
    positive_item_input = id_input('positive_item_input')
    negative_item_input = id_input('negative_item_input')

    # A single table serves both item inputs so they live in the same latent space.
    item_table = Embedding(num_items, latent_dim, name="item_embedding", input_length=1)
    positive_vector = Flatten()(item_table(positive_item_input))
    negative_vector = Flatten()(item_table(negative_item_input))

    user_table = Embedding(num_users, latent_dim, name="user_embedding", input_length=1)
    user_vector = Flatten()(user_table(user_input))

    loss_output = Lambda(bpr_triples_loss, output_shape=out_shape)(
        [positive_vector, negative_vector, user_vector]
    )

    return Model(
        inputs=[positive_item_input, negative_item_input, user_input],
        outputs=loss_output,
    )

In [None]:
# Hyperparameters: embedding width, batch size, epochs, learning rate.
latent_dim, batch_size, num_epochs, lr = 20, 256, 50, 0.001

# The model's output already is the per-sample loss, so compile with identity_loss.
model = build_model(n_users, n_items, latent_dim)
model.compile(optimizer=Adam(learning_rate=lr), loss=identity_loss)

In [None]:
# Builtin sum keeps the counts as ints. np.sum over an empty list is the float 0.0,
# which made the non-trainable count and the total print as floats.
trainable_count = sum(K.count_params(w) for w in model.trainable_weights)
non_trainable_count = sum(K.count_params(w) for w in model.non_trainable_weights)

print('Total number of parameters: {:,}'.format(trainable_count + non_trainable_count))
print('Trainable number of parameters: {:,}'.format(trainable_count))
print('Non-trainable number of parameters: {:,}'.format(non_trainable_count))

print('Training data length: {:,}'.format(df_triples.shape[0]))

In [None]:
# Model inputs keyed by the Input layer names declared in build_model.
X = dict(
    user_input=tf.convert_to_tensor(df_triples['user_id']),
    positive_item_input=tf.convert_to_tensor(df_triples['positive_id']),
    negative_item_input=tf.convert_to_tensor(df_triples['negative_id']),
)

In [None]:
# identity_loss ignores the targets, so a vector of ones serves as a placeholder.
model.fit(
    x=X,
    y=tf.ones(df_triples.shape[0]),
    batch_size=batch_size,
    epochs=num_epochs,
    verbose=0,
)

In [None]:
# Persist the trained model as model.h5 in the current working directory.
model.save(path.join('./', "model.h5"))

In [None]:
def full_auc(model: Model, ground_truth: Dict[int, list], items: list) -> float:
    """
    Mean per-user ROC AUC of the model scores over all items.

    For every user, the items in that user's ground truth are labelled 1 and all other
    entries of ``items`` 0; the AUC of the ``bpr_predict`` scores against these labels
    is averaged over users.

    :param model: model passed through to ``bpr_predict``
    :param ground_truth: iterable of ``(user_id, relevant_item_ids)`` pairs, e.g. the
        ``.values`` of a two-column frame (it is iterated as pairs, not used as a mapping)
    :param items: list of all candidate item ids
    :return: average AUC over the users that can be scored; ``nan`` if there are none
    :raises ValueError: if a ground-truth item does not occur in ``items``
    """
    number_of_items = len(items)

    # Position of each item's first occurrence (same answer as list.index, O(1) per lookup).
    item_position = {}
    for position, item in enumerate(items):
        item_position.setdefault(item, position)

    scores = []

    for user_id, true_item_ids in ground_truth:
        if len(true_item_ids) == 0:
            continue  # nothing relevant: AUC is undefined for this user

        grnd = np.zeros(number_of_items, dtype=np.int32)
        for p in true_item_ids:
            if p not in item_position:
                raise ValueError(f"{p!r} is not in list")
            grnd[item_position[p]] = 1

        # roc_auc_score needs both classes; skip users for whom every item is relevant.
        if grnd.min() == grnd.max():
            continue

        predictions = bpr_predict(model, user_id, items)
        scores.append(roc_auc_score(grnd, predictions))

    if not scores:
        return float('nan')

    return sum(scores) / len(scores)

In [None]:
def mean_average_precision_k(model: Model,
                           ground_truth: Dict[int, list],
                           items: list,
                           k=100) -> float:
    """
    Calculate the mean average precision at k over users.

    For every user the ``k`` highest-scoring entries of ``items`` (scores from
    ``bpr_predict``, ties kept in ``items`` order) are compared against that user's
    ground truth. Users without any ground-truth item are skipped.

    :param model: model passed through to ``bpr_predict``
    :param ground_truth: iterable of ``(user_id, relevant_item_ids)`` pairs, e.g. the
        ``.values`` of a two-column frame (it is iterated as pairs, not used as a mapping)
    :param items: list of all candidate item ids (assumed to contain no duplicates)
    :param k: number of top recommendations evaluated per user
    :return: mean average precision; ``nan`` if no user could be scored
    """
    scores = []

    for user, actual in ground_truth:
        if len(actual) == 0:
            continue  # average precision is undefined without relevant items

        relevant = set(actual)  # O(1) membership tests in the ranking loop

        predictions = bpr_predict(model, user, items)
        # sorted() is stable, also with reverse=True, so equal scores keep `items` order.
        ranked = sorted(zip(items, predictions), key=lambda kv: kv[1], reverse=True)[:k]

        score = 0.0
        num_hits = 0.0

        for rank, (item, _) in enumerate(ranked, start=1):
            if item in relevant:
                num_hits += 1.0
                score += num_hits / rank  # precision at this cut-off

        scores.append(score / min(len(actual), k))

    if not scores:
        return float('nan')

    return float(np.mean(scores))

In [None]:
# AUC against the items each user rated above 3 in the training data.
print(f'AUC Train : {full_auc(model, ground_truth_train.values, unique_item_ids)}')

In [None]:
# AUC against the held-out items of each test user.
print(f'AUC Test : {full_auc(model, ground_truth_test.values, unique_item_ids)}')

In [None]:
# Mean average precision (default k) against the held-out items of each test user.
print(f'Mean Average precision test : {mean_average_precision_k(model, ground_truth_test.values, unique_item_ids)}')

In [None]:
def _recommend_user( model, ratings, n_items, user, N):
    """
    Return up to ``N`` item indices with the highest scores for ``user``, excluding
    the items this user rated above 3 in ``ratings``.

    :param model: model passed through to ``bpr_predict``
    :param ratings: frame with ``user_id``, ``item_id`` and ``rating`` columns
    :param n_items: number of item indices to score (``0 .. n_items - 1``)
    :param user: user index
    :param N: number of recommendations wanted
    :return: list of item indices, best first
    """
    scores = bpr_predict(model, user, np.arange(0, n_items))

    # Filter the frame that was passed in with a mask built from that same frame
    # (previously the global df_train was indexed with a mask derived from `ratings`).
    liked_mask = (ratings.user_id == user) & (ratings.rating > 3)
    liked = set(ratings.loc[liked_mask, 'item_id'].values)

    # Over-fetch so that N candidates remain after dropping liked items and index 0.
    count = N + len(liked) + 1
    if count < scores.shape[0]:
        # Partial selection of the `count` best, then order just those.
        ids = np.argpartition(scores, -count)[-count:]
        best = ids[np.argsort(scores[ids])[::-1]]
    else:
        best = np.argsort(scores)[::-1]

    # NOTE(review): the original filter (`if rec if rec not in liked`) also dropped
    # index 0; that behavior is preserved here - confirm index 0 is never a real item.
    top_n = list(islice((rec for rec in best if rec != 0 and rec not in liked), N))
    return top_n

def recommand(n_users, n_items, ratings, model, N=10):
    """
    Build the recommendation table for every user index ``0 .. n_users - 1``.

    :param n_users: number of user indices to process
    :param n_items: number of item indices scored per user
    :param ratings: ratings frame forwarded to ``_recommend_user``
    :param model: model forwarded to ``_recommend_user``
    :param N: recommendations per user
    :return: ``(n_users, N)`` uint32 array; row ``u`` holds the recommendations for user ``u``
    """
    recommendation = np.zeros((n_users, N), dtype=np.uint32)
    for user in range(n_users):
        recommendation[user] = _recommend_user(model, ratings, n_items, user, N)
    return recommendation

In [None]:
# Write the top-10 recommendations of every user index to prediction_score.json (one row per user).
pd.DataFrame(recommand(n_users, n_items, ratings, model, N=10)).to_json("prediction_score.json", orient="values", indent=4)

In [None]:
# Print the layer-by-layer summary of the trained model.
model.summary()

In [None]:
# Render the model graph, including tensor shapes, to model.png.
plot_model(model, to_file="model.png", show_shapes=True)