In [1]:
# We used code from this tutorial:
# https://github.com/susanli2016/Machine-Learning-with-Python/blob/master/Collaborative%20Filtering%20Model%20with%20TensorFlow.ipynb
# And we also used code from this tutorial:
# https://medium.com/@connectwithghosh/recommender-system-on-the-movielens-using-an-autoencoder-using-tensorflow-in-python-f13d3e8d600d
# Then, we integrated these two tutorials and edited the code from each of them in order to create a recommender that allows us to recommend a top 10 list of movies without needing to retrain for each new user

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

In [3]:
rating = pd.read_csv('/Users/blakemyers/Desktop/Jupyter/ml-1m/ratings.csv', sep = "::", error_bad_lines=False, encoding='latin-1', engine='python')

In [4]:
rating.rename({"1": "userId", "1193": "movieId", "5": "rating", "978300760": "timestamp"}, axis=1, inplace=True)

In [5]:
movie = pd.read_csv("/Users/blakemyers/Desktop/Jupyter/ml-1m/movies.csv", sep = "::", error_bad_lines=False, encoding='latin-1', engine='python')

In [6]:
movie.rename({"1": "movieId", "Toy Story (1995)": "title", "Animation|Children's|Comedy": "genre"}, axis=1, inplace=True)

In [7]:
movie_rating = pd.merge(rating, movie, on = 'movieId')

In [8]:
cols = ['timestamp']

In [9]:
movie_rating.drop(cols, axis=1, inplace=True)

In [10]:
numrate_movie = movie_rating.groupby("title")["rating"].count().reset_index()

In [11]:
numrate_movie.rename({"rating": "ratecount_movie"}, axis=1, inplace=True)

In [12]:
numrate_movie = numrate_movie.query("ratecount_movie >= 20")

In [13]:
ratings20plus = pd.merge(numrate_movie, movie_rating, on = 'title', how = 'inner')

In [14]:
numrate_user = ratings20plus.groupby("userId")["rating"].count().reset_index()

In [15]:
numrate_user.rename({"rating": "ratecount_user"}, axis=1, inplace=True)

In [16]:
numrate_user = numrate_user.query("ratecount_user >= 20")

In [17]:
ur20plus = pd.merge(ratings20plus, numrate_user, on = "userId", how = "inner")

In [18]:
scaler = MinMaxScaler()
ur20plus['rating'] = ur20plus['rating'].values.astype(float) #set rating values as float
rating_scaled = pd.DataFrame(scaler.fit_transform(ur20plus['rating'].values.reshape(-1,1)))
#place the rating values on a scale from -1 to 1
ur20plus['rating'] = rating_scaled

In [19]:
ur20plus = ur20plus.drop_duplicates(['userId', 'title']) #drop duplicates
user_movie_matrix = ur20plus.pivot(index='userId', columns='title', values='rating')
#create matrix (see below)
user_movie_matrix.fillna(0, inplace=True)

In [20]:
X_train, X_test = train_test_split(user_movie_matrix, train_size=0.8)
#split the training data (80%) from the testing data

In [21]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [22]:
num_input = ur20plus['title'].nunique()
# Deciding how many nodes each layer should have
n_nodes_inpl = num_input
n_nodes_hl1  = 256
n_nodes_outl = num_input  
# first hidden layer has num_input*32 weights and 32 biases
hidden_1_layer_vals = {'weights':tf.Variable(tf.random_normal([n_nodes_inpl+1,n_nodes_hl1]))}
# first hidden layer has 784*32 weights and 32 biases
output_layer_vals = {'weights':tf.Variable(tf.random_normal([n_nodes_hl1+1,n_nodes_outl]))}

In [23]:
input_layer = tf.placeholder('float', [None, num_input])
# add a constant node to the first layer
# it needs to have the same shape as the input layer to be able to concatinate it later
input_layer_const = tf.fill( [tf.shape(input_layer)[0], 1] ,1.0  )
input_layer_concat =  tf.concat([input_layer, input_layer_const], 1)
# multiply output of input_layer wth a weight matrix 
layer_1 = tf.nn.sigmoid(tf.matmul(input_layer_concat,\
hidden_1_layer_vals['weights']))
# adding one bias node to the hidden layer
layer1_const = tf.fill( [tf.shape(layer_1)[0], 1] ,1.0  )
layer_concat =  tf.concat([layer_1, layer1_const], 1)
# multiply output of hidden with a weight matrix to get final output
output_layer = tf.matmul( layer_concat,output_layer_vals['weights'])
# output_true shall have the original shape for error calculations
output_true = tf.placeholder('float', [None, num_input])
# define cost function
meansq =    tf.reduce_mean(tf.square(output_layer - output_true))
# define optimizer
learn_rate = 0.1   # how fast the model should learn
optimizer = tf.train.AdagradOptimizer(learn_rate).minimize(meansq)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [24]:
# initialising variables and starting the session
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
# defining batch size, number of epochs and learning rate
batch_size = 100  # how many images to use together for training
hm_epochs =200    # how many times to go through the entire dataset
tot_images = X_train.shape[0] # total number of images

In [25]:
# running the model for a 200 epochs taking 100 users in batches
# total improvement is printed out after each epoch
for epoch in range(hm_epochs):
    epoch_loss = 0    # initializing loss (error) as 0
    
    for i in range(int(tot_images/batch_size)):
        epoch_x = X_train[ i*batch_size : (i+1)*batch_size ]
        _, c = sess.run([optimizer, meansq],\
               feed_dict={input_layer: epoch_x, \
               output_true: epoch_x})
        epoch_loss += c
        
    output_train = sess.run(output_layer,\
               feed_dict={input_layer:X_train})
    output_test = sess.run(output_layer,\
                   feed_dict={input_layer:X_test})
        
    if epoch <= 10 or epoch >= 190:
        print('MSE train', MSE(output_train, X_train),'MSE test', MSE(output_test, X_test))
        print('Epoch', epoch, '/', hm_epochs, 'loss:',epoch_loss)
    elif epoch == 11:
        print('.......................')
    else:
        continue

MSE train 45.47096944200303 MSE test 45.45663493292782
Epoch 0 / 200 loss: 3287.859016418457
MSE train 29.232484009715073 MSE test 29.29490657091357
Epoch 1 / 200 loss: 1755.8993587493896
MSE train 20.576133857685836 MSE test 20.594698386897182
Epoch 2 / 200 loss: 1184.195053100586
MSE train 15.494754437907725 MSE test 15.478001259624163
Epoch 3 / 200 loss: 860.0043792724609
MSE train 12.341900352537731 MSE test 12.296746591071667
Epoch 4 / 200 loss: 665.832594871521
MSE train 10.254885766881918 MSE test 10.202365266814267
Epoch 5 / 200 loss: 541.5718011856079
MSE train 8.778459425985455 MSE test 8.732516252264462
Epoch 6 / 200 loss: 456.80153942108154
MSE train 7.679155550612467 MSE test 7.643732852477211
Epoch 7 / 200 loss: 395.26916551589966
MSE train 6.8296415662770835 MSE test 6.808334383332125
Epoch 8 / 200 loss: 348.615195274353
MSE train 6.152705596198394 MSE test 6.148642863717057
Epoch 9 / 200 loss: 312.04456615448
MSE train 5.601255730696967 MSE test 5.614200911404191
Epoch 

In [26]:
# a top 10 ranking for a new user (9999991) who rates "Chariots of Fire (1981)" with a 5
userCol = ur20plus["userId"]
max_value = userCol.max()
new_userId = max_value + 1
ur20plus = ur20plus.append(pd.DataFrame([["Chariots of Fire (1981)",1,new_userId,1,5,"genre",1]], columns =ur20plus.columns), ignore_index=True)
scaler = MinMaxScaler()
ur20plus['rating'] = ur20plus['rating'].values.astype(float)
rating_scaled = pd.DataFrame(scaler.fit_transform(ur20plus['rating'].values.reshape(-1,1)))
ur20plus['rating'] = rating_scaled
ur20plus = ur20plus.drop_duplicates(['userId', 'title'])
user_movie_matrix = ur20plus.pivot(index='userId', columns='title', values='rating')
user_movie_matrix.fillna(0, inplace=True)
pred_data = pd.DataFrame()
preds = sess.run(output_layer, feed_dict={input_layer: user_movie_matrix})
pred_data = pred_data.append(pd.DataFrame(preds))
pred_data = pred_data.stack().reset_index(name='rating')
pred_data.columns = ['userId', 'title', 'rating']
users = user_movie_matrix.index.tolist()
movies = user_movie_matrix.columns.tolist()
pred_data['userId'] = pred_data['userId'].map(lambda value: users[value])
pred_data['title'] = pred_data['title'].map(lambda value: movies[value])
keys = ['userId', 'title']
index_1 = pred_data.set_index(keys).index
index_2 = ur20plus.set_index(keys).index
top_ten_ranked = pred_data[~index_1.isin(index_2)]
top_ten_ranked = top_ten_ranked.sort_values(['userId', 'rating'], ascending=[True, False])
top_ten_ranked = top_ten_ranked.groupby('userId').head(10)
top_ten_ranked.loc[top_ten_ranked['userId'] == new_userId]

Unnamed: 0,userId,title,rating
18300507,6041,Urban Legends: Final Cut (2000),3.702182
18299477,6041,Mr. Wrong (1996),3.448936
18299026,6041,"Incredible Journey, The (1963)",3.312068
18300617,6041,Willow (1988),3.297467
18299885,6041,Raise the Red Lantern (1991),3.180922
18298189,6041,"Ciao, Professore! (Io speriamo che me la cavo ...",3.03559
18298576,6041,Fear and Loathing in Las Vegas (1998),2.992504
18297688,6041,"African Queen, The (1951)",2.956947
18299736,6041,"Perfect Storm, The (2000)",2.928829
18299135,6041,"Kid, The (2000)",2.895118


In [27]:
# a top 10 ranking for a new user (9999992) who rates "Hurricane, The (1999)" with a 3
userCol = ur20plus["userId"]
max_value = userCol.max()
new_userId = max_value + 1
ur20plus = ur20plus.append(pd.DataFrame([["Hurricane, The (1999)",1,new_userId,1,3,"genre",1]], columns =ur20plus.columns), ignore_index=True)
scaler = MinMaxScaler()
ur20plus['rating'] = ur20plus['rating'].values.astype(float)
rating_scaled = pd.DataFrame(scaler.fit_transform(ur20plus['rating'].values.reshape(-1,1)))
ur20plus['rating'] = rating_scaled
ur20plus = ur20plus.drop_duplicates(['userId', 'title'])
user_movie_matrix = ur20plus.pivot(index='userId', columns='title', values='rating')
user_movie_matrix.fillna(0, inplace=True)
pred_data = pd.DataFrame()
preds = sess.run(output_layer, feed_dict={input_layer: user_movie_matrix})
pred_data = pred_data.append(pd.DataFrame(preds))
pred_data = pred_data.stack().reset_index(name='rating')
pred_data.columns = ['userId', 'title', 'rating']
users = user_movie_matrix.index.tolist()
movies = user_movie_matrix.columns.tolist()
pred_data['userId'] = pred_data['userId'].map(lambda value: users[value])
pred_data['title'] = pred_data['title'].map(lambda value: movies[value])
keys = ['userId', 'title']
index_1 = pred_data.set_index(keys).index
index_2 = ur20plus.set_index(keys).index
top_ten_ranked = pred_data[~index_1.isin(index_2)]
top_ten_ranked = top_ten_ranked.sort_values(['userId', 'rating'], ascending=[True, False])
top_ten_ranked = top_ten_ranked.groupby('userId').head(10)
top_ten_ranked.loc[top_ten_ranked['userId'] == new_userId]

Unnamed: 0,userId,title,rating
18301173,6042,Casablanca (1942),4.020238
18301843,6042,Grateful Dead (1995),3.280342
18303159,6042,Singles (1992),3.231777
18302325,6042,Love Stinks (1999),3.206244
18302960,6042,"Relic, The (1997)",3.145146
18301448,6042,Dirty Work (1998),3.013652
18301866,6042,Grumpier Old Men (1995),3.002073
18302446,6042,Mickey Blue Eyes (1999),2.978279
18302931,6042,Rambo III (1988),2.897503
18301879,6042,Hackers (1995),2.81605
