In [18]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.layers import Input, Embedding, Flatten, Dropout, Concatenate, Dense, Activation, Lambda
from keras import Model
from keras.regularizers import l2
from keras.optimizers import Adam

In [20]:
ratings = pd.read_csv('./data/train.csv', sep='\t')

items = pd.read_csv('./data/item.csv', sep='|', encoding = "ISO-8859-1")

user = pd.read_csv('./data/user.csv', sep='|', encoding = "ISO-8859-1")

occupation = pd.read_csv('./data/occupation.csv', sep='|', encoding = "ISO-8859-1")

genre = pd.read_csv('./data/genre.csv', sep='\t', encoding = "ISO-8859-1")

In [21]:
u_unique = ratings.userId.unique()
user2Idx = {o:i+1 for i,o in enumerate(u_unique)}

m_unique = ratings.movieId.unique()
movie2Idx = {o:i+1 for i,o in enumerate(m_unique)}

idx2Movie = {v:k for k,v in movie2Idx.items()}

In [22]:
ratings.userId = ratings.userId.apply(lambda x: user2Idx[x])
ratings.movieId = ratings.movieId.apply(lambda x: movie2Idx[x])
ratings['timestamp'] = ratings['timestamp']/max(ratings['timestamp'])

ratings_train, ratings_val = train_test_split(ratings, test_size=0.2)

In [23]:
n_users = int(ratings.userId.nunique())
n_movies = int(ratings.movieId.nunique())
n_users_train = int(ratings_train.userId.nunique())
n_movies_train = int(ratings_train.movieId.nunique())
max_rating = ratings_train['rating'].max()
min_rating = ratings_train['rating'].min()
av_rating = ratings_train['rating'].mean()
n_latent_factors_user = 8
n_latent_factors_movie = 8
print(n_users, n_movies, n_users_train, n_movies_train)

943 1650 943 1626


In [24]:
#timestamp_input = Input(shape=[1],name='timestamp')
movie_input = Input(shape=[1],name='Item')
movie_embedding = Embedding(n_movies + 1, n_latent_factors_movie, name='Movie-Embedding', embeddings_regularizer = l2(0.001))(movie_input)
movie_vec = Flatten(name='FlattenMovies')(movie_embedding)
movie_vec = Dropout(0.2)(movie_vec)


user_input = Input(shape=[1],name='User')
user_vec = Flatten(name='FlattenUsers')(Embedding(n_users + 1, n_latent_factors_user,name='User-Embedding', embeddings_regularizer = l2(0.001))(user_input))
user_vec = Dropout(0.2)(user_vec)


concat = Concatenate(name='Concat')([movie_vec, user_vec])
concat = Dropout(0.2)(concat)

x = Dense(50,name='FullyConnected-1', activation='relu', kernel_regularizer=l2(0.001))(concat)
x = Dropout(0.5)(x)
x = Dense(50,name='FullyConnected-1', activation='relu')(concat)
x = Dropout(0.5)(x)


## Se pueden sacar las siguientes dos lineas para no forzar a sigmoidea
x = Dense(1, activation='sigmoid',name='Activation')(x)
x = Lambda(lambda z: (max_rating - min_rating) * z + min_rating)(x)
##

model = Model([user_input, movie_input], x)
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Item (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
User (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
Movie-Embedding (Embedding)     (None, 1, 8)         13208       Item[0][0]                       
__________________________________________________________________________________________________
User-Embedding (Embedding)      (None, 1, 8)         7552        User[0][0]                       
____________________________________________________________________________________________

In [25]:
import keras.backend as K 
def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true))) 

In [26]:
adam = Adam(lr=0.001)
model.compile(optimizer=adam,loss= 'mean_squared_error', metrics=[root_mean_squared_error])

In [27]:
from keras.callbacks import ModelCheckpoint
checkpointer = ModelCheckpoint(filepath='weights1.hdf5', verbose=1, save_best_only=True, monitor='val_root_mean_squared_error')

In [28]:
history = model.fit([ratings_train.userId, ratings_train.movieId], 
                    ratings_train.rating, 
                    validation_data=([ratings_val.userId, ratings_val.movieId], ratings_val.rating), 
                    batch_size = 160,
                    callbacks = [checkpointer],
                    epochs=100,
                    verbose=2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 64000 samples, validate on 16000 samples
Epoch 1/100
 - 6s - loss: 1.1217 - root_mean_squared_error: 1.0504 - val_loss: 0.9188 - val_root_mean_squared_error: 0.9482

Epoch 00001: val_root_mean_squared_error improved from inf to 0.94824, saving model to weights1.hdf5
Epoch 2/100
 - 4s - loss: 0.9585 - root_mean_squared_error: 0.9669 - val_loss: 0.8974 - val_root_mean_squared_error: 0.9334

Epoch 00002: val_root_mean_squared_error improved from 0.94824 to 0.93338, saving model to weights1.hdf5
Epoch 3/100
 - 4s - loss: 0.9410 - root_mean_squared_error: 0.9553 - val_loss: 0.8934 - val_root_mean_squared_error: 0.9294

Epoch 00003: val_root_mean_squared_error improved from 0.93338 to 0.92943, saving model to weights1.hdf5
Epoch 4/100
 - 3s - loss: 0.9344 - root_mean_squared_error: 0.9504 - val_loss: 0.8933 - val_root_mean_squared_error: 0.9283

Epoch 00004: val_root_mean_squared_error improved from 0.92943 to 0.92829, saving model to weights1.hdf5
Epoch 5/100
 - 4s - loss: 0.9285 -

In [29]:
model.evaluate([ratings_val.userId, ratings_val.movieId], ratings_val.rating)



[0.8861045098900795, 0.9141870737075806]

In [30]:
ratings_test = pd.read_csv('./data/test.csv', sep=',')
ratings_test

Unnamed: 0,Id,userId,movieId,rating
0,0,1,6,3
1,1,1,10,3
2,2,1,12,3
3,3,1,14,2
4,4,1,17,4
...,...,...,...,...
19995,19995,458,648,0
19996,19996,458,1101,0
19997,19997,459,934,2
19998,19998,460,10,0


In [31]:
model.load_weights('weights1.hdf5')
y_pred_test = model.predict([ratings_test.userId, ratings_test.movieId], verbose=1)



In [32]:
sample_csv = pd.read_csv('data/test.csv')
sample_csv['rating'] = y_pred_test
sample_csv.head()

sample_csv.to_csv(f'submision1.csv', index=False)