In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
links = pd.read_csv('../../data/ml-latest-small/links.csv')
movies = pd.read_csv('../../data/ml-latest-small/movies.csv')
ratings = pd.read_csv('../../data/ml-latest-small/ratings.csv')
tags = pd.read_csv('../../data/ml-latest-small/tags.csv')

In [3]:
df = ratings.join(movies, rsuffix='_r', how='inner', on='userId').drop(['title', 'movieId_r'], axis=1)

In [4]:
df.info()
le = LabelEncoder()
genres = le.fit_transform(df.genres)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
 4   genres     100836 non-null  object 
dtypes: float64(1), int64(3), object(1)
memory usage: 4.6+ MB


In [5]:
df = df.drop('genres', axis=1)

In [7]:
df['genres_le'] = pd.Series(genres, index=df.index)


In [8]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres_le
0,1,1,4.0,964982703,73
1,1,3,4.0,964981247,73
2,1,6,4.0,964982224,73
3,1,47,5.0,964983815,73
4,1,50,5.0,964982931,73


In [9]:
user_enc = LabelEncoder()
df['user'] = user_enc.fit_transform(df['userId'].values)
n_users = df['user'].nunique()
item_enc = LabelEncoder()
df['movie'] = item_enc.fit_transform(df['movieId'].values)
n_movies = df['movie'].nunique()
df['rating'] = df['rating'].values.astype(np.float32)
min_rating = min(df['rating'])
max_rating = max(df['rating'])
n_users, n_movies, min_rating, max_rating

(610, 9724, 0.5, 5.0)

In [10]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres_le,user,movie
0,1,1,4.0,964982703,73,0,0
1,1,3,4.0,964981247,73,0,2
2,1,6,4.0,964982224,73,0,5
3,1,47,5.0,964983815,73,0,43
4,1,50,5.0,964982931,73,0,46


In [11]:
X = df[['user', 'movie', 'genres_le', 'timestamp']].values
y = df['rating'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((90752, 4), (10084, 4), (90752,), (10084,))

In [12]:
n_factors = 50
X_train_array = [X_train[:, 0], X_train[:, 1]]
X_test_array = [X_test[:, 0], X_test[:, 1]]

In [13]:
from keras.models import Model
from keras.layers import Input, Reshape, Dot
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.regularizers import l2

In [14]:
from keras.layers import Add, Activation, Lambda
class EmbeddingLayer:
    def __init__(self, n_items, n_factors):
        self.n_items = n_items
        self.n_factors = n_factors
    
    def __call__(self, x):
        x = Embedding(self.n_items, self.n_factors, embeddings_initializer='he_normal',
                      embeddings_regularizer=l2(1e-6))(x)
        x = Reshape((self.n_factors,))(x)
        return x

In [15]:
from keras.layers import Concatenate, Dense, Dropout
def RecommenderNet(n_users, n_movies, n_factors, min_rating, max_rating):
    user = Input(shape=(1,))
    u = EmbeddingLayer(n_users, n_factors)(user)
    
    movie = Input(shape=(1,))
    m = EmbeddingLayer(n_movies, n_factors)(movie)
    
    x = Concatenate()([u, m])
    x = Dropout(0.05)(x)
    
    x = Dense(10, kernel_initializer='he_normal')(x)
    x = Activation('relu')(x)
    x = Dropout(0.5)(x)
    
    x = Dense(1, kernel_initializer='he_normal')(x)
    x = Activation('sigmoid')(x)
    x = Lambda(lambda x: x * (max_rating - min_rating) + min_rating)(x)
    model = Model(inputs=[user, movie], outputs=x)
    opt = Adam(lr=0.001)
    model.compile(loss='mean_squared_error', optimizer=opt)
    return model

In [16]:
model = RecommenderNet(n_users, n_movies, n_factors, min_rating, max_rating)

In [17]:
history = model.fit(x=X_train_array, y=y_train, batch_size=64, epochs=5,
                    verbose=1, validation_data=(X_test_array, y_test))

Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [18]:
test_predictions = model.predict(X_test_array)

In [19]:
from sklearn.metrics import mean_squared_error
from math import sqrt

rms = sqrt(mean_squared_error(y_test, test_predictions))

In [20]:
rms

0.8814945003513225

In [None]:
# This is not better than the alternating least squares model