In [1]:
# We used code from this tutorial:
# https://github.com/susanli2016/Machine-Learning-with-Python/blob/master/Collaborative%20Filtering%20Model%20with%20TensorFlow.ipynb
# And we also used code from this tutorial:
# https://medium.com/@connectwithghosh/recommender-system-on-the-movielens-using-an-autoencoder-using-tensorflow-in-python-f13d3e8d600d
# Then, we integrated these two tutorials and edited the code from each of them in order to create a recommender that allows us to recommend a top 10 list of movies without needing to retrain for each new user

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

In [3]:
rating = pd.read_csv('/Users/blakemyers/Desktop/data/ratings.csv', error_bad_lines=False, encoding='latin-1')

In [4]:
movie = pd.read_csv("/Users/blakemyers/Desktop/data/movies.csv", error_bad_lines=False, encoding='latin-1')

In [5]:
movie_rating = pd.merge(rating, movie, on = 'movieId')

In [6]:
cols = ['timestamp']

In [7]:
movie_rating.drop(cols, axis=1, inplace=True)

In [8]:
numrate_movie = movie_rating.groupby("title")["rating"].count().reset_index()

In [9]:
numrate_movie.rename({"rating": "ratecount_movie"}, axis=1, inplace=True)

In [10]:
numrate_movie = numrate_movie.query("ratecount_movie >= 20")

In [11]:
ratings20plus = pd.merge(numrate_movie, movie_rating, on = 'title', how = 'inner')

In [12]:
numrate_user = ratings20plus.groupby("userId")["rating"].count().reset_index()

In [13]:
numrate_user.rename({"rating": "ratecount_user"}, axis=1, inplace=True)

In [14]:
numrate_user = numrate_user.query("ratecount_user >= 20")

In [15]:
ur20plus = pd.merge(ratings20plus, numrate_user, on = "userId", how = "inner")

In [16]:
ur20plus.head()

Unnamed: 0,title,ratecount_movie,userId,movieId,rating,genres,ratecount_user
0,(500) Days of Summer (2009),42,15,69757,4.0,Comedy|Drama|Romance,122
1,101 Dalmatians (One Hundred and One Dalmatians...,44,15,2085,1.5,Adventure|Animation|Children,122
2,28 Days Later (2002),58,15,6502,3.5,Action|Horror|Sci-Fi,122
3,A.I. Artificial Intelligence (2001),56,15,4370,4.0,Adventure|Drama|Sci-Fi,122
4,"Adjustment Bureau, The (2011)",21,15,84954,4.5,Romance|Sci-Fi|Thriller,122


In [17]:
scaler = MinMaxScaler()
ur20plus['rating'] = ur20plus['rating'].values.astype(float)
rating_scaled = pd.DataFrame(scaler.fit_transform(ur20plus['rating'].values.reshape(-1,1)))
ur20plus['rating'] = rating_scaled

In [18]:
ur20plus = ur20plus.drop_duplicates(['userId', 'title'])
user_movie_matrix = ur20plus.pivot(index='userId', columns='title', values='rating')
user_movie_matrix.fillna(0, inplace=True)

In [19]:
user_movie_matrix.head()

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),13 Going on 30 (2004),"13th Warrior, The (1999)",1408 (2007),2001: A Space Odyssey (1968),2012 (2009),...,Young Frankenstein (1974),Young Guns (1988),Zack and Miri Make a Porno (2008),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),Â¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.777778,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.777778
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.555556,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
X_train, X_test = train_test_split(user_movie_matrix, train_size=0.8)

In [21]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [22]:
num_input = ur20plus['title'].nunique()
n_nodes_inpl = num_input  
n_nodes_hl1  = 256  
n_nodes_outl = num_input  
hidden_1_layer_vals = {'weights':tf.Variable(tf.random_normal([n_nodes_inpl+1,n_nodes_hl1]))}
output_layer_vals = {'weights':tf.Variable(tf.random_normal([n_nodes_hl1+1,n_nodes_outl]))}

In [23]:
input_layer = tf.placeholder('float', [None, num_input])
input_layer_const = tf.fill( [tf.shape(input_layer)[0], 1] ,1.0  )
input_layer_concat =  tf.concat([input_layer, input_layer_const], 1)
layer_1 = tf.nn.sigmoid(tf.matmul(input_layer_concat,hidden_1_layer_vals['weights']))
layer1_const = tf.fill( [tf.shape(layer_1)[0], 1] ,1.0  )
layer_concat =  tf.concat([layer_1, layer1_const], 1)
output_layer = tf.matmul( layer_concat,output_layer_vals['weights'])
output_true = tf.placeholder('float', [None, num_input])
meansq =    tf.reduce_mean(tf.square(output_layer - output_true))
learn_rate = 0.1
optimizer = tf.train.AdagradOptimizer(learn_rate).minimize(meansq)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [24]:
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
batch_size = 100
hm_epochs =200
tot_images = X_train.shape[0]

In [25]:
for epoch in range(hm_epochs):
    epoch_loss = 0
    
    for i in range(int(tot_images/batch_size)):
        epoch_x = X_train[ i*batch_size : (i+1)*batch_size ]
        _, c = sess.run([optimizer, meansq],\
               feed_dict={input_layer: epoch_x, \
               output_true: epoch_x})
        epoch_loss += c
        
    output_train = sess.run(output_layer,\
               feed_dict={input_layer:X_train})
    output_test = sess.run(output_layer,\
                   feed_dict={input_layer:X_test})
        
    if epoch <= 5 or epoch >= 195:
        print('MSE train', MSE(output_train, X_train),'MSE test', MSE(output_test, X_test))
        print('Epoch', epoch, '/', hm_epochs, 'loss:',epoch_loss)
    elif epoch == 11:
        print('.......................')
    else:
        continue

MSE train 87.34355290382348 MSE test 86.37646972355003
Epoch 0 / 200 loss: 405.09436798095703
MSE train 73.10932081556949 MSE test 72.60598824352172
Epoch 1 / 200 loss: 326.6635513305664
MSE train 63.74165924423135 MSE test 63.49703470397264
Epoch 2 / 200 loss: 277.68482971191406
MSE train 57.204455328663876 MSE test 57.11945233391163
Epoch 3 / 200 loss: 244.72882461547852
MSE train 52.43244102331223 MSE test 52.461706353249355
Epoch 4 / 200 loss: 221.36968994140625
MSE train 48.78942939100675 MSE test 48.923050424157466
Epoch 5 / 200 loss: 204.05298233032227
.......................
MSE train 5.633646386121445 MSE test 7.9431145346746055
Epoch 195 / 200 loss: 21.751840114593506
MSE train 5.600238363667592 MSE test 7.90787141683635
Epoch 196 / 200 loss: 21.61825942993164
MSE train 5.567122958464125 MSE test 7.872885559592932
Epoch 197 / 200 loss: 21.485960960388184
MSE train 5.5342413051826025 MSE test 7.838118507973324
Epoch 198 / 200 loss: 21.354723930358887
MSE train 5.5015292164014 

In [26]:
ur20plus = ur20plus.append(pd.DataFrame([["Hulk (2003)",1,1001,4370,5,"genre",1]], columns =ur20plus.columns), ignore_index=True)
scaler = MinMaxScaler()
ur20plus['rating'] = ur20plus['rating'].values.astype(float)
rating_scaled = pd.DataFrame(scaler.fit_transform(ur20plus['rating'].values.reshape(-1,1)))
ur20plus['rating'] = rating_scaled
ur20plus = ur20plus.drop_duplicates(['userId', 'title'])
user_movie_matrix = ur20plus.pivot(index='userId', columns='title', values='rating')
user_movie_matrix.fillna(0, inplace=True)

In [27]:
pred_data = pd.DataFrame()
preds = sess.run(output_layer, feed_dict={input_layer: user_movie_matrix})
pred_data = pred_data.append(pd.DataFrame(preds))
pred_data = pred_data.stack().reset_index(name='rating')
pred_data.columns = ['userId', 'title', 'rating']
users = user_movie_matrix.index.tolist()
movies = user_movie_matrix.columns.tolist()
pred_data['userId'] = pred_data['userId'].map(lambda value: users[value])
pred_data['title'] = pred_data['title'].map(lambda value: movies[value])
keys = ['userId', 'title']
index_1 = pred_data.set_index(keys).index
index_2 = ur20plus.set_index(keys).index
top_ten_ranked = pred_data[~index_1.isin(index_2)]
top_ten_ranked = top_ten_ranked.sort_values(['userId', 'rating'], ascending=[True, False])
top_ten_ranked = top_ten_ranked.groupby('userId').head(10)

In [28]:
top_ten_ranked.loc[top_ten_ranked['userId'] == 1001] #an example of the top 10 recommendations for the above new user who only rated one movie, i.e., "Hulk (2003)", with a 5/5.

Unnamed: 0,userId,title,rating
734683,1001,In the Line of Fire (1993),10.229039
735161,1001,Space Cowboys (2000),9.845463
734199,1001,"Aviator, The (2004)",9.694229
734869,1001,"Money Pit, The (1986)",9.029591
734196,1001,Avatar (2009),8.624407
734808,1001,LÃ©on: The Professional (a.k.a. The Profession...,8.063248
734679,1001,Identity (2003),7.752854
734138,1001,Aeon Flux (2005),7.686942
734496,1001,"Family Man, The (2000)",7.426023
734751,1001,L.A. Confidential (1997),6.99631


In [29]:
ur20plus = ur20plus.append(pd.DataFrame([["(500) Days of Summer (2009)",1,1002,4370,5,"genre",1]], columns =ur20plus.columns), ignore_index=True)
ur20plus = ur20plus.append(pd.DataFrame([["Aliens (1986)",1,1002,4370,3,"genre",1]], columns =ur20plus.columns), ignore_index=True)
ur20plus = ur20plus.append(pd.DataFrame([["Brave (2012)",1,1002,4370,4,"genre",1]], columns =ur20plus.columns), ignore_index=True)
scaler = MinMaxScaler()
ur20plus['rating'] = ur20plus['rating'].values.astype(float)
rating_scaled = pd.DataFrame(scaler.fit_transform(ur20plus['rating'].values.reshape(-1,1)))
ur20plus['rating'] = rating_scaled
ur20plus = ur20plus.drop_duplicates(['userId', 'title'])
user_movie_matrix = ur20plus.pivot(index='userId', columns='title', values='rating')
user_movie_matrix.fillna(0, inplace=True)

In [30]:
pred_data = pd.DataFrame()
preds = sess.run(output_layer, feed_dict={input_layer: user_movie_matrix})
pred_data = pred_data.append(pd.DataFrame(preds))
pred_data = pred_data.stack().reset_index(name='rating')
pred_data.columns = ['userId', 'title', 'rating']
users = user_movie_matrix.index.tolist()
movies = user_movie_matrix.columns.tolist()
pred_data['userId'] = pred_data['userId'].map(lambda value: users[value])
pred_data['title'] = pred_data['title'].map(lambda value: movies[value])
keys = ['userId', 'title']
index_1 = pred_data.set_index(keys).index
index_2 = ur20plus.set_index(keys).index
top_ten_ranked = pred_data[~index_1.isin(index_2)]
top_ten_ranked = top_ten_ranked.sort_values(['userId', 'rating'], ascending=[True, False])
top_ten_ranked = top_ten_ranked.groupby('userId').head(10)

In [31]:
top_ten_ranked.loc[top_ten_ranked['userId'] == 1002] #an example of the top 10 recommendations for the above new user who rated three movies, i.e., "(500) Days of Summer (2009)" with a 5/5, "Aliens (1986)" with a 3/5, and "Brave (2012)" with a 4/5.

Unnamed: 0,userId,title,rating
735435,1002,Aeon Flux (2005),9.359016
736365,1002,Rounders (1998),9.23794
735774,1002,Equilibrium (2002),8.566386
736166,1002,"Money Pit, The (1986)",8.497048
735523,1002,Before Sunrise (1995),8.444674
736014,1002,Jay and Silent Bob Strike Back (2001),8.441698
735761,1002,Ed Wood (1994),8.192542
735696,1002,Dave (1993),8.178222
735700,1002,Dazed and Confused (1993),8.137302
735843,1002,Friends with Benefits (2011),8.075338
