In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

In [3]:
# Column names are supplied explicitly through `names=` when reading the
# tab-separated ratings file.
header = ['userId', 'movieId', 'rating', 'timestamp']
ratings = pd.read_csv('./ml-100k/u.data', sep='\t', names=header)

# Map each distinct raw id to a contiguous index that starts at 1.
u_unique = ratings['userId'].unique()
user2Idx = {raw_id: idx for idx, raw_id in enumerate(u_unique, start=1)}

m_unique = ratings['movieId'].unique()
movie2Idx = {raw_id: idx for idx, raw_id in enumerate(m_unique, start=1)}

In [4]:
# Replace the raw ids with their contiguous 1-based indices.
# NOTE: this overwrites the very columns it reads, so running the cell a
# second time would apply the mapping to already-mapped values.
ratings['userId'] = ratings['userId'].map(lambda raw_id: user2Idx[raw_id])
ratings['movieId'] = ratings['movieId'].map(lambda raw_id: movie2Idx[raw_id])

In [5]:
# Positional split: the first n_split rows are held out for validation and
# the remaining rows are used for training.
n_split = 20000
ratings_val = ratings.iloc[:n_split]
ratings_train = ratings.iloc[n_split:]
(len(ratings_train), len(ratings_val))

(80000, 20000)

In [6]:
# Number of distinct users / movies in the full data set and in the
# training split.
n_users, n_movies = (
    int(ratings[col].nunique()) for col in ('userId', 'movieId')
)
n_users_train, n_movies_train = (
    int(ratings_train[col].nunique()) for col in ('userId', 'movieId')
)
print(n_users, n_movies, n_users_train, n_movies_train)

943 1682 943 1650


In [7]:
# Rating bounds and mean, computed on the training split only.
min_rating = ratings_train.rating.min()
max_rating = ratings_train.rating.max()
av_rating = ratings_train.rating.mean()
(max_rating, min_rating, av_rating)

(5, 1, 3.52835)

# Definición de la red

In [8]:
from keras.layers import Input, Embedding, Flatten, Dropout, Concatenate, Dense, Activation, Lambda
from keras import Model
from keras.optimizers import Adam

Using TensorFlow backend.


In [9]:
# Two-branch rating model: a movie embedding and a user embedding are
# concatenated, passed through one dense layer and mapped to a rating.

# Width of the embedding vector learned for each user / each movie.
n_latent_factors_user = 5
n_latent_factors_movie = 8

# Movie branch: one integer index -> embedding row -> flat vector.
# The table has n_movies + 1 rows because the movie indices start at 1.
movie_input = Input(shape=[1],name='Item')
movie_embedding = Embedding(n_movies + 1, n_latent_factors_movie, name='Movie-Embedding')(movie_input)
movie_vec = Flatten(name='FlattenMovies')(movie_embedding)
# (disabled) optional dropout on the movie vector
# movie_vec = Dropout(0.2)(movie_vec)


# User branch: same structure, with n_users + 1 rows for 1-based user indices.
user_input = Input(shape=[1],name='User')
user_vec = Flatten(name='FlattenUsers')(Embedding(n_users + 1, 
                                                  n_latent_factors_user,name='User-Embedding')(user_input))
# (disabled) optional dropout on the user vector
# user_vec = Dropout(0.2)(user_vec)


# Join both vectors (movie first, then user) into a single feature vector.
concat = Concatenate(name='Concat')([movie_vec, user_vec])
# (disabled) optional dropout on the concatenated vector
# concat = Dropout(0.2)(concat)

x = Dense(50,name='FullyConnected-1', activation='relu')(concat)
# (disabled) optional dropout / alternative dense layer
#x = Dropout(0.5)(x)
#x = Dense(50,name='FullyConnected-1', activation='relu')(concat)
#x = Dropout(0.5)(x)


## The next two lines can be removed to avoid forcing a sigmoid output.
## The sigmoid output is rescaled linearly to the [min_rating, max_rating] range.
x = Dense(1, activation='sigmoid',name='Activation')(x)
x = Lambda(lambda z: (max_rating - min_rating) * z + min_rating)(x)
##

# Inputs are ordered [user, movie]; callers must pass data in that order.
model = Model([user_input, movie_input], x)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Item (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
User (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
Movie-Embedding (Embedding)     (None, 1, 8)         13464       Item[0][0]                       
__________________________________________________________________________________________________
User-Embedding (Embedding)      (None, 1, 5)         4720        User[0][0]                       
__________________________________________________________________________________________________
FlattenMov

In [10]:
import keras.backend as K 
def root_mean_squared_error(y_true, y_pred):
    """Return the square root of the mean squared difference between y_pred and y_true.

    Both arguments are backend tensors; the mean is taken over all elements.
    """
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

In [11]:
# Restore previously saved weights into the model defined above.
model.load_weights(filepath='weights.hdf5')

In [12]:
# Mean squared error is the loss; RMSE is reported as an extra metric.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=[root_mean_squared_error],
)

In [13]:
# Loss and RMSE on the validation split; inputs follow the model's
# [user, movie] input order.
model.evaluate(
    x=[ratings_val['userId'], ratings_val['movieId']],
    y=ratings_val['rating'],
)



[0.8864154481887817, 0.9323507642745972]

In [14]:
# Loss and RMSE on the training split; inputs follow the model's
# [user, movie] input order.
model.evaluate(
    x=[ratings_train['userId'], ratings_train['movieId']],
    y=ratings_train['rating'],
)



[0.7083571329832077, 0.8335200972318649]

# Obtengo embeddings

In [16]:
# Fetch the embedding layers by the names they were given when the model
# was built, rather than by their position in model.layers, so the lookup
# keeps working if layers are added or reordered.
movie_embeddings_layer = model.get_layer('Movie-Embedding')
user_embeddings_layer = model.get_layer('User-Embedding')

In [180]:
# Sanity check: show the names of the two layers that were picked.
tuple(layer.name for layer in (movie_embeddings_layer, user_embeddings_layer))

('Movie-Embedding', 'User-Embedding')

In [19]:
# The first weight array of each embedding layer is its lookup table
# (one row per index).
movie_embeddings_matrix, user_embeddings_matrix = (
    layer.get_weights()[0]
    for layer in (movie_embeddings_layer, user_embeddings_layer)
)
(movie_embeddings_matrix.shape, user_embeddings_matrix.shape)

((1683, 8), (944, 5))

In [20]:
# Display the user embedding table (one row per user index, row 0 included).
user_embeddings_matrix

array([[-0.0014859 , -0.04829219,  0.0136039 ,  0.01360333,  0.03192383],
       [-0.10639615,  0.0577417 ,  0.03024676,  0.06648126, -0.00818836],
       [ 0.00644937,  0.05087709, -0.1267155 ,  0.07583879,  0.00484064],
       ...,
       [-0.04635098, -0.01193763, -0.03389605,  0.19602741, -0.05608499],
       [ 0.10910267, -0.07419512,  0.03692925,  0.08315375,  0.02582754],
       [ 0.07779792,  0.1508785 ,  0.00313076, -0.06561682,  0.11070072]],
      dtype=float32)

In [21]:
# Display the embedding rows for movie indices 1 and 2.
movie_embeddings_matrix[1:3]

array([[-0.14363556, -0.11988138,  0.15550709, -0.10945612,  0.04878137,
        -0.07205785,  0.06160824, -0.01302086],
       [-0.07785374, -0.11487766,  0.06631942, -0.01807967,  0.08839604,
        -0.08385522,  0.1278895 , -0.16648261]], dtype=float32)

# Los puedo obtener definiendo un modelo nuevo

In [183]:
# Sub-model that shares the movie input and returns the raw (unflattened)
# movie embedding for it.
model_test_emb = Model(inputs=[movie_input], outputs=[movie_embedding])

In [184]:
# Look up the embedding of movie index 2 through the sub-model.
# NOTE(review): the recorded output differs from movie_embeddings_matrix[2]
# displayed earlier; presumably this cell was executed against different
# weights (execution counts are out of order) -- re-run to confirm.
model_test_emb.predict([2])

array([[[-0.12120856,  0.14663969,  0.19157135,  0.09968016,
          0.14670104,  0.12186183,  0.04760471,  0.09871168]]],
      dtype=float32)

# Nearest Neighbors 

In [23]:
from sklearn.neighbors import NearestNeighbors

In [24]:
# Cosine-distance neighbour index over every row of the movie embedding table.
# NOTE(review): row 0 is included although movie indices start at 1 --
# confirm whether it should be part of the search.
nbrs = NearestNeighbors(n_neighbors=10, metric='cosine')
nbrs = nbrs.fit(movie_embeddings_matrix)

In [25]:
# Query the 10 nearest rows to movie index 2; the slice keeps the query
# two-dimensional (one row, all embedding columns).
neighbors = nbrs.kneighbors(movie_embeddings_matrix[2:3])

In [27]:
# Display the (distances, row indices) pair returned by the neighbour query.
neighbors

(array([[0.        , 0.03244066, 0.03463501, 0.03577471, 0.04553968,
         0.04791397, 0.05075437, 0.05386627, 0.05777127, 0.05930173]],
       dtype=float32),
 array([[   2,  688,  724, 1047,   46, 1272,  755,  242, 1553,  992]]))

In [28]:
# Embedding row 688, the closest neighbour (after the query row itself)
# in the result displayed above.
movie_embeddings_matrix[688]

array([-0.06508597, -0.07309216,  0.05873515, -0.03105282,  0.04879123,
       -0.04319927,  0.05170346, -0.08999793], dtype=float32)

# Ordenar por ratings de películas no vistas

In [225]:
# All the movies rated by the user with index 1

ratings.loc[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,3,881250949
940,1,529,4,881251863
1133,1,378,4,881251728
1812,1,523,3,881251274
1896,1,432,5,881251793
2374,1,835,5,881252017
6910,1,381,4,881251021
7517,1,330,4,881251820
7842,1,551,5,881251911
10017,1,84,4,881251793


In [29]:
import numpy as np

In [31]:
# Candidate inputs for scoring every movie for the user with index 1.
# Movie indices are the contiguous integers 1..n_movies, so np.arange builds
# them directly; np.linspace(..., dtype=int) computes floats and truncates
# them, which is a fragile way to produce integer ids.
all_movie_idxs = np.arange(1, n_movies + 1)
print(all_movie_idxs)
# One copy of user index 1 per candidate movie.
user_idxs = np.ones(n_movies, dtype=int)
print(user_idxs)

[   1    2    3 ... 1680 1681 1682]
[1 1 1 ... 1 1 1]


In [32]:
# Predicted rating of every candidate movie for the chosen user; inputs
# follow the model's [user, movie] input order.
predictions = model.predict(x=[user_idxs, all_movie_idxs])

In [259]:
# Range of the predicted ratings.
(np.max(predictions), np.min(predictions))

(4.7978897, 1.0406588)

In [267]:
# Display the predictions array (one row per candidate movie, single column).
predictions

array([[3.5254776],
       [3.745766 ],
       [2.0018735],
       ...,
       [3.0657752],
       [3.2810183],
       [3.0640361]], dtype=float32)

In [269]:
# Positions of the predictions sorted from highest to lowest.
# These are 0-based offsets into all_movie_idxs, so position p refers to
# movie index p + 1.
# NOTE(review): movies the user has already rated are not excluded here.
np.flip(np.argsort(predictions[:, 0]))

array([1436, 1619, 1038, ..., 1173, 1626,  850])

In [270]:
# Predicted rating at position 1436, the first position in the ranking above.
predictions[1436]

array([4.7978897], dtype=float32)