In [1]:
# We used this tutorial:
# https://github.com/susanli2016/Machine-Learning-with-Python/blob/master/Collaborative%20Filtering%20Model%20with%20TensorFlow.ipynb
# And we also used code from this tutorial:
# https://medium.com/@connectwithghosh/recommender-system-on-the-movielens-using-an-autoencoder-using-tensorflow-in-python-f13d3e8d600d
# Then, we integrated these two tutorials and edited the code from each of them in order to create
# a recommender that allows us to recommend a top 10 list of movies
# without needing to retrain for each new user

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

In [3]:
rating = pd.read_csv('/Users/blakemyers/Desktop/data/ratings.csv', error_bad_lines=False, encoding='latin-1')

In [4]:
movie = pd.read_csv("/Users/blakemyers/Desktop/data/movies.csv", error_bad_lines=False, encoding='latin-1')

In [5]:
movie_rating = pd.merge(rating, movie, on = 'movieId')

In [6]:
cols = ['timestamp']

In [7]:
movie_rating.drop(cols, axis=1, inplace=True)

In [8]:
numrate_movie = movie_rating.groupby("title")["rating"].count().reset_index()

In [9]:
numrate_movie.rename({"rating": "ratecount_movie"}, axis=1, inplace=True)

In [10]:
numrate_movie = numrate_movie.query("ratecount_movie >= 20")

In [11]:
ratings20plus = pd.merge(numrate_movie, movie_rating, on = 'title', how = 'inner')

In [12]:
numrate_user = ratings20plus.groupby("userId")["rating"].count().reset_index()

In [13]:
numrate_user.rename({"rating": "ratecount_user"}, axis=1, inplace=True)

In [14]:
numrate_user = numrate_user.query("ratecount_user >= 20")

In [15]:
ur20plus = pd.merge(ratings20plus, numrate_user, on = "userId", how = "inner")

In [16]:
ur20plus = ur20plus.append(pd.DataFrame([["A.I. Artificial Intelligence (2001)",1,9999999,4370,5,"genre",1]], columns =ur20plus.columns), ignore_index=True)

In [17]:
scaler = MinMaxScaler()
ur20plus['rating'] = ur20plus['rating'].values.astype(float)
rating_scaled = pd.DataFrame(scaler.fit_transform(ur20plus['rating'].values.reshape(-1,1)))
ur20plus['rating'] = rating_scaled

In [18]:
ur20plus = ur20plus.drop_duplicates(['userId', 'title'])
user_movie_matrix = ur20plus.pivot(index='userId', columns='title', values='rating')
user_movie_matrix.fillna(0, inplace=True)

In [19]:
user_movie_matrix.head()

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),13 Going on 30 (2004),"13th Warrior, The (1999)",1408 (2007),2001: A Space Odyssey (1968),2012 (2009),...,Young Frankenstein (1974),Young Guns (1988),Zack and Miri Make a Porno (2008),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),Â¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.777778,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.777778
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.555556,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
X_train, X_test = train_test_split(user_movie_matrix, train_size=0.8)

In [21]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [22]:
num_input = ur20plus['title'].nunique()
n_nodes_inpl = num_input  
n_nodes_hl1  = 256  
n_nodes_outl = num_input  
hidden_1_layer_vals = {'weights':tf.Variable(tf.random_normal([n_nodes_inpl+1,n_nodes_hl1]))}
output_layer_vals = {'weights':tf.Variable(tf.random_normal([n_nodes_hl1+1,n_nodes_outl]))}

In [23]:
input_layer = tf.placeholder('float', [None, num_input])
input_layer_const = tf.fill( [tf.shape(input_layer)[0], 1] ,1.0  )
input_layer_concat =  tf.concat([input_layer, input_layer_const], 1)
layer_1 = tf.nn.sigmoid(tf.matmul(input_layer_concat,hidden_1_layer_vals['weights']))
layer1_const = tf.fill( [tf.shape(layer_1)[0], 1] ,1.0  )
layer_concat =  tf.concat([layer_1, layer1_const], 1)
output_layer = tf.matmul( layer_concat,output_layer_vals['weights'])
output_true = tf.placeholder('float', [None, num_input])
meansq =    tf.reduce_mean(tf.square(output_layer - output_true))
learn_rate = 0.1
optimizer = tf.train.AdagradOptimizer(learn_rate).minimize(meansq)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [24]:
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
batch_size = 100
hm_epochs =200
tot_images = X_train.shape[0]

In [25]:
for epoch in range(hm_epochs):
    epoch_loss = 0
    
    for i in range(int(tot_images/batch_size)):
        epoch_x = X_train[ i*batch_size : (i+1)*batch_size ]
        _, c = sess.run([optimizer, meansq],\
               feed_dict={input_layer: epoch_x, \
               output_true: epoch_x})
        epoch_loss += c
        
    output_train = sess.run(output_layer,\
               feed_dict={input_layer:X_train})
    output_test = sess.run(output_layer,\
                   feed_dict={input_layer:X_test})
        
    print('MSE train', MSE(output_train, X_train),'MSE test', MSE(output_test, X_test))      
    print('Epoch', epoch, '/', hm_epochs, 'loss:',epoch_loss)

MSE train 91.82437661412699 MSE test 92.10847704744378
Epoch 0 / 200 loss: 427.97928619384766
MSE train 76.20612836340655 MSE test 76.80663097421993
Epoch 1 / 200 loss: 341.9962387084961
MSE train 65.95440004303283 MSE test 66.72087202261335
Epoch 2 / 200 loss: 288.3486785888672
MSE train 58.7957680528673 MSE test 59.65506569845345
Epoch 3 / 200 loss: 252.37351989746094
MSE train 53.55177847259794 MSE test 54.47840036702652
Epoch 4 / 200 loss: 226.79779052734375
MSE train 49.5754278476019 MSE test 50.55931642177745
Epoch 5 / 200 loss: 207.84182357788086
MSE train 46.470221322685575 MSE test 47.50893099301113
Epoch 6 / 200 loss: 193.3106689453125
MSE train 43.99391545289107 MSE test 45.08504024363708
Epoch 7 / 200 loss: 181.86090850830078
MSE train 41.979998188160735 MSE test 43.125833081700016
Epoch 8 / 200 loss: 172.66121292114258
MSE train 40.3065883587741 MSE test 41.51318499842421
Epoch 9 / 200 loss: 165.11857986450195
MSE train 38.88831852444306 MSE test 40.162882398828394
Epoch 1

MSE train 13.283282561645464 MSE test 16.009164926403045
Epoch 86 / 200 loss: 52.68117809295654
MSE train 13.123648770389066 MSE test 15.845164044195277
Epoch 87 / 200 loss: 52.0341796875
MSE train 12.966517082794113 MSE test 15.683654919334026
Epoch 88 / 200 loss: 51.39748477935791
MSE train 12.811893650015275 MSE test 15.524749099204422
Epoch 89 / 200 loss: 50.77082347869873
MSE train 12.659799238307475 MSE test 15.368600173126678
Epoch 90 / 200 loss: 50.15429973602295
MSE train 12.510174642266781 MSE test 15.21531119883911
Epoch 91 / 200 loss: 49.5479621887207
MSE train 12.362916592013345 MSE test 15.064925840944651
Epoch 92 / 200 loss: 48.951504707336426
MSE train 12.217931085943686 MSE test 14.91744855062708
Epoch 93 / 200 loss: 48.364545822143555
MSE train 12.075152845288061 MSE test 14.772836509883255
Epoch 94 / 200 loss: 47.786810874938965
MSE train 11.934489560466295 MSE test 14.630966755713683
Epoch 95 / 200 loss: 47.218079566955566
MSE train 11.795756834696496 MSE test 14.49

MSE train 5.595101437313499 MSE test 8.21803770953921
Epoch 174 / 200 loss: 21.53531312942505
MSE train 5.551058119451032 MSE test 8.171859928014547
Epoch 175 / 200 loss: 21.35715961456299
MSE train 5.507711777907041 MSE test 8.126277190492733
Epoch 176 / 200 loss: 21.181525707244873
MSE train 5.46506697193629 MSE test 8.081298714624003
Epoch 177 / 200 loss: 21.00864267349243
MSE train 5.423027701443615 MSE test 8.036888156696826
Epoch 178 / 200 loss: 20.838512420654297
MSE train 5.381471418589931 MSE test 7.993000854081835
Epoch 179 / 200 loss: 20.67074966430664
MSE train 5.340311329988758 MSE test 7.94961247967766
Epoch 180 / 200 loss: 20.504913330078125
MSE train 5.299500629874961 MSE test 7.906715363452576
Epoch 181 / 200 loss: 20.340735912322998
MSE train 5.259002201334662 MSE test 7.8643018496424135
Epoch 182 / 200 loss: 20.17808198928833
MSE train 5.218757646855353 MSE test 7.822359361064428
Epoch 183 / 200 loss: 20.016815185546875
MSE train 5.178695944567854 MSE test 7.78088924

In [26]:
pred_data = pd.DataFrame()
preds = sess.run(output_layer, feed_dict={input_layer: user_movie_matrix})
pred_data = pred_data.append(pd.DataFrame(preds))
pred_data = pred_data.stack().reset_index(name='rating')
pred_data.columns = ['userId', 'title', 'rating']
users = user_movie_matrix.index.tolist()
movies = user_movie_matrix.columns.tolist()
pred_data['userId'] = pred_data['userId'].map(lambda value: users[value])
pred_data['title'] = pred_data['title'].map(lambda value: movies[value])
keys = ['userId', 'title']
index_1 = pred_data.set_index(keys).index
index_2 = ur20plus.set_index(keys).index
top_ten_ranked = pred_data[~index_1.isin(index_2)]
top_ten_ranked = top_ten_ranked.sort_values(['userId', 'rating'], ascending=[True, False])
top_ten_ranked = top_ten_ranked.groupby('userId').head(10)

In [27]:
top_ten_ranked.loc[top_ten_ranked['userId'] == 9999999]

Unnamed: 0,userId,title,rating
735134,9999999,"Simple Plan, A (1998)",7.316847
735371,9999999,Witness (1985),7.061924
734766,9999999,Leaving Las Vegas (1995),7.045127
735301,9999999,Tropic Thunder (2008),6.941782
734854,9999999,Mighty Aphrodite (1995),6.583347
734103,9999999,10 Things I Hate About You (1999),6.565654
734985,9999999,Planet of the Apes (1968),6.465528
734708,9999999,"Island of Dr. Moreau, The (1996)",6.461706
735285,9999999,Top Gun (1986),6.397721
734233,9999999,Better Off Dead... (1985),6.371764


In [28]:
ur20plus = ur20plus.append(pd.DataFrame([["Hulk (2003)",1,9999991,4370,5,"genre",1]], columns =ur20plus.columns), ignore_index=True)
scaler = MinMaxScaler()
ur20plus['rating'] = ur20plus['rating'].values.astype(float)
rating_scaled = pd.DataFrame(scaler.fit_transform(ur20plus['rating'].values.reshape(-1,1)))
ur20plus['rating'] = rating_scaled
ur20plus = ur20plus.drop_duplicates(['userId', 'title'])
user_movie_matrix = ur20plus.pivot(index='userId', columns='title', values='rating')
user_movie_matrix.fillna(0, inplace=True)

In [29]:
pred_data = pd.DataFrame()
preds = sess.run(output_layer, feed_dict={input_layer: user_movie_matrix})
pred_data = pred_data.append(pd.DataFrame(preds))
pred_data = pred_data.stack().reset_index(name='rating')
pred_data.columns = ['userId', 'title', 'rating']
users = user_movie_matrix.index.tolist()
movies = user_movie_matrix.columns.tolist()
pred_data['userId'] = pred_data['userId'].map(lambda value: users[value])
pred_data['title'] = pred_data['title'].map(lambda value: movies[value])
keys = ['userId', 'title']
index_1 = pred_data.set_index(keys).index
index_2 = ur20plus.set_index(keys).index
top_ten_ranked = pred_data[~index_1.isin(index_2)]
top_ten_ranked = top_ten_ranked.sort_values(['userId', 'rating'], ascending=[True, False])
top_ten_ranked = top_ten_ranked.groupby('userId').head(10)

In [30]:
top_ten_ranked.loc[top_ten_ranked['userId'] == 9999991] #an example of the top 10 recommendations for the above
# new user who only rated one movie, i.e., "Hulk (2003)", with a 5/5.

Unnamed: 0,userId,title,rating
734950,9999991,Outbreak (1995),7.819874
734502,9999991,"Fast and the Furious, The (2001)",7.736551
734310,9999991,Cape Fear (1991),7.396494
734341,9999991,Cinderella (1950),7.045932
734110,9999991,2001: A Space Odyssey (1968),6.947413
734125,9999991,A.I. Artificial Intelligence (2001),6.777423
735383,9999991,X-Men: Days of Future Past (2014),6.678552
734673,9999991,I Heart Huckabees (2004),6.576962
734901,9999991,My Fair Lady (1964),6.494349
734207,9999991,Backdraft (1991),6.404796


In [31]:
ur20plus = ur20plus.append(pd.DataFrame([["Hulk (2003)",1,9999992,4370,5,"genre",1]], columns =ur20plus.columns), ignore_index=True)
ur20plus = ur20plus.append(pd.DataFrame([["Aliens (1986)",1,9999992,4370,3,"genre",1]], columns =ur20plus.columns), ignore_index=True)
ur20plus = ur20plus.append(pd.DataFrame([["Brave (2012)",1,9999992,4370,4,"genre",1]], columns =ur20plus.columns), ignore_index=True)
scaler = MinMaxScaler()
ur20plus['rating'] = ur20plus['rating'].values.astype(float)
rating_scaled = pd.DataFrame(scaler.fit_transform(ur20plus['rating'].values.reshape(-1,1)))
ur20plus['rating'] = rating_scaled
ur20plus = ur20plus.drop_duplicates(['userId', 'title'])
user_movie_matrix = ur20plus.pivot(index='userId', columns='title', values='rating')
user_movie_matrix.fillna(0, inplace=True)

In [32]:
pred_data = pd.DataFrame()
preds = sess.run(output_layer, feed_dict={input_layer: user_movie_matrix})
pred_data = pred_data.append(pd.DataFrame(preds))
pred_data = pred_data.stack().reset_index(name='rating')
pred_data.columns = ['userId', 'title', 'rating']
users = user_movie_matrix.index.tolist()
movies = user_movie_matrix.columns.tolist()
pred_data['userId'] = pred_data['userId'].map(lambda value: users[value])
pred_data['title'] = pred_data['title'].map(lambda value: movies[value])
keys = ['userId', 'title']
index_1 = pred_data.set_index(keys).index
index_2 = ur20plus.set_index(keys).index
top_ten_ranked = pred_data[~index_1.isin(index_2)]
top_ten_ranked = top_ten_ranked.sort_values(['userId', 'rating'], ascending=[True, False])
top_ten_ranked = top_ten_ranked.groupby('userId').head(10)

In [33]:
top_ten_ranked.loc[top_ten_ranked['userId'] == 9999992] #an example of the top 10 recommendations for the above
# new user who rated three movies, i.e., "Hulk (2003)" with a 5/5, "Aliens (1986)" with a 3/5, and "Brave (2012)"
# with a 4/5.

Unnamed: 0,userId,title,rating
735422,9999992,A.I. Artificial Intelligence (2001),10.394133
735799,9999992,"Fast and the Furious, The (2001)",10.382468
736247,9999992,Outbreak (1995),9.191101
736059,9999992,"Lawnmower Man, The (1992)",8.967257
736390,9999992,School of Rock (2003),8.936007
736358,9999992,Rogue One: A Star Wars Story (2016),8.628779
735880,9999992,Goldfinger (1964),8.338116
736063,9999992,Leaving Las Vegas (1995),8.09357
736151,9999992,Mighty Aphrodite (1995),8.018227
735561,9999992,Blue Velvet (1986),7.955853
