In [0]:
# Request the TensorFlow 2.x runtime before TensorFlow is imported below.
# NOTE(review): `tensorflow_version` looks like a Google Colab line magic — confirm it exists in the target environment.
% tensorflow_version 2.x

In [33]:
# Imports

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD, Adam

from sklearn.utils import shuffle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

tf.__version__

'2.1.0'

In [34]:
# Download the MovieLens 20M archive; -nc skips the download when the file is already present.
!wget -nc http://files.grouplens.org/datasets/movielens/ml-20m.zip

File ‘ml-20m.zip’ already there; not retrieving.



In [35]:
# Extract the archive; -n never overwrites files that were already extracted.
!unzip -n ml-20m.zip

Archive:  ml-20m.zip


In [36]:
# List the working directory to confirm the extracted ml-20m folder is there.
!ls

ml-20m	ml-20m.zip  sample_data


In [37]:
# Load the ratings table (columns: userId, movieId, rating, timestamp)
# and preview its first rows.
df = pd.read_csv('ml-20m/ratings.csv')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [38]:
# Preview the last rows of the ratings table.
df.tail()

Unnamed: 0,userId,movieId,rating,timestamp
20000258,138493,68954,4.5,1258126920
20000259,138493,69526,4.5,1259865108
20000260,138493,69644,3.0,1260209457
20000261,138493,70286,5.0,1258126944
20000262,138493,71619,2.5,1255811136


In [0]:
# The raw userId values are not guaranteed to run 0...N-1,
# so derive our own contiguous ids instead.

df['userId'] = df['userId'].astype('category')

# The category codes are integer ids that start from zero,
# one per distinct user.
df['new_user_id'] = df['userId'].cat.codes

In [40]:
# Preview the frame again; it now carries the new_user_id column.
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,new_user_id
0,1,2,3.5,1112486027,0
1,1,29,3.5,1112484676,0
2,1,32,3.5,1112484819,0
3,1,47,3.5,1112484727,0
4,1,50,3.5,1112484580,0


In [0]:
# Same treatment for movies: make movieId categorical and use its
# zero-based category codes as the new contiguous movie id.
df['movieId'] = df['movieId'].astype('category')
df['new_movie_id'] = df['movieId'].cat.codes

In [42]:
# Preview the frame again; it now carries the new_movie_id column.
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,new_user_id,new_movie_id
0,1,2,3.5,1112486027,0,1
1,1,29,3.5,1112484676,0,28
2,1,32,3.5,1112484819,0,31
3,1,47,3.5,1112484727,0,46
4,1,50,3.5,1112484580,0,49


In [0]:
# Pull the re-indexed user ids, movie ids and the ratings out of the frame
# as three separate arrays (.values returns the column as a NumPy array).

user_ids, movie_ids, ratings = (
    df[column].values for column in ('new_user_id', 'new_movie_id', 'rating')
)

In [52]:
# Get the number of users and the number of movies.
#
# Both id arrays hold the zero-based category codes created earlier, so the
# ids are contiguous and "largest id + 1" equals the number of distinct ids.
# That is also exactly the table size the Embedding layers need.
# Taking the max stays inside NumPy; the previous len(set(...)) boxed every
# one of the ~20M elements into a Python object just to count them.

N = int(user_ids.max()) + 1
M = int(movie_ids.max()) + 1

# Set the embedding dimensions
K = 10

print('N:', N, 'M:', M, 'K:', K)

N: 138493 M: 26744 K: 10


In [0]:
# Build the network: two id inputs -> embeddings -> dense regression head

# Each sample carries a single integer id per input
u = Input(shape=(1,))  # user id
m = Input(shape=(1,))  # movie id

# Look up a K-dimensional vector for every id, then flatten away the
# length-1 axis: (num_samples, 1, K) -> (num_samples, K)
u_emb = Flatten()(Embedding(N, K)(u))
m_emb = Flatten()(Embedding(M, K)(m))

# Join the user and movie vectors into one (num_samples, 2K) feature vector
x = Concatenate()([u_emb, m_emb])

# From here on it is an ordinary fully connected network
for n_units in (1024, 400, 400):
    x = Dense(n_units, activation='relu')(x)

# Single linear output unit (regression target)
x = Dense(1)(x)

In [0]:
# Wrap the graph into a model: (user id, movie id) in, one value out
model = Model([u, m], x)

In [0]:
# Minimise mean squared error with SGD plus momentum
model.compile(optimizer=SGD(learning_rate=0.08, momentum=0.9), loss='mse')

In [0]:
# Split the data 80/20 into train and test.
# shuffle() applies one common permutation to all three arrays, so every
# (user, movie, rating) triple stays aligned. A fixed random_state makes the
# permutation, and therefore the split, the same on every fresh run;
# without it each run trained and validated on different rows.
user_ids, movie_ids, ratings = shuffle(
    user_ids, movie_ids, ratings, random_state=42
)
Ntrain = int(0.8 * len(ratings))

# First 80% of the shuffled rows -> training set
train_user = user_ids[:Ntrain]
train_movie = movie_ids[:Ntrain]
train_ratings = ratings[:Ntrain]

# Remaining 20% -> held-out test set
test_user = user_ids[Ntrain:]
test_movie = movie_ids[Ntrain:]
test_ratings = ratings[Ntrain:]


In [0]:
# Center the ratings on the mean of the TRAINING ratings; the test ratings
# are shifted by that same training mean.
# NOTE: this cell rebinds train_ratings/test_ratings from themselves, so run
# it exactly once per split. A second run would recompute avg_rating from
# already-centered data (about 0) and lose the value needed later to shift
# predictions back.
avg_rating = np.mean(train_ratings)
train_ratings, test_ratings = train_ratings - avg_rating, test_ratings - avg_rating

In [65]:
# Train on the training split and evaluate on the held-out split after
# every epoch; the returned history object is kept in r.
r = model.fit(
    [train_user, train_movie],
    train_ratings,
    validation_data=([test_user, test_movie], test_ratings),
    batch_size=1024,
    epochs=25,
)

Train on 16000210 samples, validate on 4000053 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [101]:
# Predict how user 0 would rate movies 61 and 81.
# The model was trained on centered ratings, so these are offsets from the mean.
query_users = np.array([0, 0])
query_movies = np.array([61, 81])
yp = model.predict([query_users, query_movies])
yp

array([[0.08660349],
       [0.13332261]], dtype=float32)

In [0]:
# Shift the predictions back onto the original rating scale by adding the
# training mean that was subtracted before fitting.
# One vectorized in-place add replaces the per-row Python loop; yp is
# modified in place and keeps its dtype, as before.
# Run once only: every execution adds avg_rating again.
yp += avg_rating


In [103]:
# Show the predictions after avg_rating has been added back.
yp

array([[3.61196  ],
       [3.6586792]], dtype=float32)