In [1]:
# We used code from this tutorial:
# https://github.com/susanli2016/Machine-Learning-with-Python/blob/master/Collaborative%20Filtering%20Model%20with%20TensorFlow.ipynb
# And we also used code from this tutorial:
# https://medium.com/@connectwithghosh/recommender-system-on-the-movielens-using-an-autoencoder-using-tensorflow-in-python-f13d3e8d600d
# Then, we integrated these two tutorials and edited the code from each of them in order to create a recommender that allows us to recommend a top 10 list of movies without needing to retrain for each new user

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

In [3]:
rating = pd.read_csv('/Users/blakemyers/Desktop/data/ratings.csv', error_bad_lines=False, encoding='latin-1')

In [4]:
movie = pd.read_csv("/Users/blakemyers/Desktop/data/movies.csv", error_bad_lines=False, encoding='latin-1')

In [5]:
movie_rating = pd.merge(rating, movie, on = 'movieId')

In [6]:
cols = ['timestamp']

In [7]:
movie_rating.drop(cols, axis=1, inplace=True)

In [8]:
numrate_movie = movie_rating.groupby("title")["rating"].count().reset_index()

In [9]:
numrate_movie.rename({"rating": "ratecount_movie"}, axis=1, inplace=True)

In [10]:
numrate_movie = numrate_movie.query("ratecount_movie >= 20")

In [11]:
ratings20plus = pd.merge(numrate_movie, movie_rating, on = 'title', how = 'inner')

In [12]:
numrate_user = ratings20plus.groupby("userId")["rating"].count().reset_index()

In [13]:
numrate_user.rename({"rating": "ratecount_user"}, axis=1, inplace=True)

In [14]:
numrate_user = numrate_user.query("ratecount_user >= 20")

In [15]:
ur20plus = pd.merge(ratings20plus, numrate_user, on = "userId", how = "inner")

In [16]:
ur20plus = ur20plus.append(pd.DataFrame([["A.I. Artificial Intelligence (2001)",1,9999999,4370,5,"genre",1]], columns =ur20plus.columns), ignore_index=True)

In [17]:
scaler = MinMaxScaler()
ur20plus['rating'] = ur20plus['rating'].values.astype(float)
rating_scaled = pd.DataFrame(scaler.fit_transform(ur20plus['rating'].values.reshape(-1,1)))
ur20plus['rating'] = rating_scaled

In [18]:
ur20plus = ur20plus.drop_duplicates(['userId', 'title'])
user_movie_matrix = ur20plus.pivot(index='userId', columns='title', values='rating')
user_movie_matrix.fillna(0, inplace=True)

In [19]:
user_movie_matrix.head()

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),13 Going on 30 (2004),"13th Warrior, The (1999)",1408 (2007),2001: A Space Odyssey (1968),2012 (2009),...,Young Frankenstein (1974),Young Guns (1988),Zack and Miri Make a Porno (2008),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),Â¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.777778,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.777778
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.555556,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
X_train, X_test = train_test_split(user_movie_matrix, train_size=0.8)

In [21]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [22]:
num_input = ur20plus['title'].nunique()
n_nodes_inpl = num_input  
n_nodes_hl1  = 256  
n_nodes_outl = num_input  
hidden_1_layer_vals = {'weights':tf.Variable(tf.random_normal([n_nodes_inpl+1,n_nodes_hl1]))}
output_layer_vals = {'weights':tf.Variable(tf.random_normal([n_nodes_hl1+1,n_nodes_outl]))}

In [23]:
input_layer = tf.placeholder('float', [None, num_input])
input_layer_const = tf.fill( [tf.shape(input_layer)[0], 1] ,1.0  )
input_layer_concat =  tf.concat([input_layer, input_layer_const], 1)
layer_1 = tf.nn.sigmoid(tf.matmul(input_layer_concat,hidden_1_layer_vals['weights']))
layer1_const = tf.fill( [tf.shape(layer_1)[0], 1] ,1.0  )
layer_concat =  tf.concat([layer_1, layer1_const], 1)
output_layer = tf.matmul( layer_concat,output_layer_vals['weights'])
output_true = tf.placeholder('float', [None, num_input])
meansq =    tf.reduce_mean(tf.square(output_layer - output_true))
learn_rate = 0.1
optimizer = tf.train.AdagradOptimizer(learn_rate).minimize(meansq)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [24]:
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
batch_size = 100
hm_epochs =200
tot_images = X_train.shape[0]

In [25]:
for epoch in range(hm_epochs):
    epoch_loss = 0
    
    for i in range(int(tot_images/batch_size)):
        epoch_x = X_train[ i*batch_size : (i+1)*batch_size ]
        _, c = sess.run([optimizer, meansq],\
               feed_dict={input_layer: epoch_x, \
               output_true: epoch_x})
        epoch_loss += c
        
    output_train = sess.run(output_layer,\
               feed_dict={input_layer:X_train})
    output_test = sess.run(output_layer,\
                   feed_dict={input_layer:X_test})
        
    print('MSE train', MSE(output_train, X_train),'MSE test', MSE(output_test, X_test))      
    print('Epoch', epoch, '/', hm_epochs, 'loss:',epoch_loss)

MSE train 87.14590365887507 MSE test 88.89720264489704
Epoch 0 / 200 loss: 402.74925994873047
MSE train 72.87188677151673 MSE test 74.65397533155947
Epoch 1 / 200 loss: 324.65380859375
MSE train 63.374221012655966 MSE test 65.20928233860539
Epoch 2 / 200 loss: 275.6084518432617
MSE train 56.70541653532451 MSE test 58.59570008118859
Epoch 3 / 200 loss: 242.34528732299805
MSE train 51.823586804881366 MSE test 53.73769278739335
Epoch 4 / 200 loss: 218.63084411621094
MSE train 48.11270792537035 MSE test 50.044162212087286
Epoch 5 / 200 loss: 201.04117965698242
MSE train 45.193681947029965 MSE test 47.1491398282726
Epoch 6 / 200 loss: 187.49439239501953
MSE train 42.8586923176389 MSE test 44.83127947136522
Epoch 7 / 200 loss: 176.76210403442383
MSE train 40.95200999349856 MSE test 42.93942922996855
Epoch 8 / 200 loss: 168.11508560180664
MSE train 39.367306011441265 MSE test 41.370400206278454
Epoch 9 / 200 loss: 161.0049934387207
MSE train 38.026129038473286 MSE test 40.04796910011531
Epoch

MSE train 13.11923849574475 MSE test 15.574907673445454
Epoch 87 / 200 loss: 52.23862075805664
MSE train 12.98050317729262 MSE test 15.432573527859075
Epoch 88 / 200 loss: 51.6784029006958
MSE train 12.843924703169913 MSE test 15.292449071210905
Epoch 89 / 200 loss: 51.12625694274902
MSE train 12.709574430496744 MSE test 15.154585090053773
Epoch 90 / 200 loss: 50.58271789550781
MSE train 12.577418436290909 MSE test 15.019042979800311
Epoch 91 / 200 loss: 50.04801273345947
MSE train 12.447345602061423 MSE test 14.885771604280762
Epoch 92 / 200 loss: 49.521888732910156
MSE train 12.319320895447712 MSE test 14.75453486962732
Epoch 93 / 200 loss: 49.00375938415527
MSE train 12.193539249417071 MSE test 14.625140773481732
Epoch 94 / 200 loss: 48.49358081817627
MSE train 12.070272402319883 MSE test 14.497595131591833
Epoch 95 / 200 loss: 47.99240970611572
MSE train 11.949528691292752 MSE test 14.37195354952945
Epoch 96 / 200 loss: 47.501376152038574
MSE train 11.831025841800935 MSE test 14.24

MSE train 6.0508655220432646 MSE test 8.163596663766532
Epoch 174 / 200 loss: 23.530885219573975
MSE train 6.005875313662692 MSE test 8.118422764655906
Epoch 175 / 200 loss: 23.34753131866455
MSE train 5.961040137302342 MSE test 8.073722463685941
Epoch 176 / 200 loss: 23.165541172027588
MSE train 5.9164363070066655 MSE test 8.029444485216244
Epoch 177 / 200 loss: 22.98439359664917
MSE train 5.8724092441685345 MSE test 7.985704628226658
Epoch 178 / 200 loss: 22.80437660217285
MSE train 5.829217053321369 MSE test 7.942639906136846
Epoch 179 / 200 loss: 22.626726627349854
MSE train 5.786820864639881 MSE test 7.900267641036092
Epoch 180 / 200 loss: 22.452214241027832
MSE train 5.7451111359286235 MSE test 7.858547443627094
Epoch 181 / 200 loss: 22.280601501464844
MSE train 5.704011675547893 MSE test 7.817447743517046
Epoch 182 / 200 loss: 22.111519813537598
MSE train 5.663465714830038 MSE test 7.776942816483443
Epoch 183 / 200 loss: 21.944735050201416
MSE train 5.623445183094406 MSE test 7.

In [26]:
pred_data = pd.DataFrame()
preds = sess.run(output_layer, feed_dict={input_layer: user_movie_matrix})
pred_data = pred_data.append(pd.DataFrame(preds))
pred_data = pred_data.stack().reset_index(name='rating')
pred_data.columns = ['userId', 'title', 'rating']
users = user_movie_matrix.index.tolist()
movies = user_movie_matrix.columns.tolist()
pred_data['userId'] = pred_data['userId'].map(lambda value: users[value])
pred_data['title'] = pred_data['title'].map(lambda value: movies[value])
keys = ['userId', 'title']
index_1 = pred_data.set_index(keys).index
index_2 = ur20plus.set_index(keys).index
top_ten_ranked = pred_data[~index_1.isin(index_2)]
top_ten_ranked = top_ten_ranked.sort_values(['userId', 'rating'], ascending=[True, False])
top_ten_ranked = top_ten_ranked.groupby('userId').head(10)

In [27]:
top_ten_ranked.loc[top_ten_ranked['userId'] == 9999999]

Unnamed: 0,userId,title,rating
735089,9999999,Scary Movie 2 (2001),10.734091
734378,9999999,Crimson Tide (1995),9.42856
735156,9999999,Solaris (2002),7.677377
734358,9999999,Collateral (2004),7.309671
734459,9999999,Dune (1984),7.200686
735351,9999999,"Wedding Singer, The (1998)",6.868629
734836,9999999,Maverick (1994),6.576529
735385,9999999,X-Men: The Last Stand (2006),6.514274
734159,9999999,American Gangster (2007),6.435902
735387,9999999,Yes Man (2008),6.374254


In [28]:
ur20plus = ur20plus.append(pd.DataFrame([["Hulk (2003)",1,9999991,4370,5,"genre",1]], columns =ur20plus.columns), ignore_index=True)
scaler = MinMaxScaler()
ur20plus['rating'] = ur20plus['rating'].values.astype(float)
rating_scaled = pd.DataFrame(scaler.fit_transform(ur20plus['rating'].values.reshape(-1,1)))
ur20plus['rating'] = rating_scaled
ur20plus = ur20plus.drop_duplicates(['userId', 'title'])
user_movie_matrix = ur20plus.pivot(index='userId', columns='title', values='rating')
user_movie_matrix.fillna(0, inplace=True)

In [29]:
pred_data = pd.DataFrame()
preds = sess.run(output_layer, feed_dict={input_layer: user_movie_matrix})
pred_data = pred_data.append(pd.DataFrame(preds))
pred_data = pred_data.stack().reset_index(name='rating')
pred_data.columns = ['userId', 'title', 'rating']
users = user_movie_matrix.index.tolist()
movies = user_movie_matrix.columns.tolist()
pred_data['userId'] = pred_data['userId'].map(lambda value: users[value])
pred_data['title'] = pred_data['title'].map(lambda value: movies[value])
keys = ['userId', 'title']
index_1 = pred_data.set_index(keys).index
index_2 = ur20plus.set_index(keys).index
top_ten_ranked = pred_data[~index_1.isin(index_2)]
top_ten_ranked = top_ten_ranked.sort_values(['userId', 'rating'], ascending=[True, False])
top_ten_ranked = top_ten_ranked.groupby('userId').head(10)

In [30]:
top_ten_ranked.loc[top_ten_ranked['userId'] == 9999991] #an example of the top 10 recommendations for the above new user who only rated one movie, i.e., "Hulk (2003)", with a 5/5.

Unnamed: 0,userId,title,rating
734836,9999991,Maverick (1994),8.735404
734944,9999991,"Omen, The (1976)",7.697131
734536,9999991,Four Rooms (1995),7.527423
735227,9999991,Talladega Nights: The Ballad of Ricky Bobby (2...,7.48517
735020,9999991,Rain Man (1988),7.321448
734750,9999991,Kung Fu Panda (2008),7.252985
735328,9999991,"Virgin Suicides, The (1999)",6.782855
735025,9999991,Reality Bites (1994),6.664783
734997,9999991,Predator 2 (1990),6.609117
734137,9999991,"Adventures of Priscilla, Queen of the Desert, ...",6.509538


In [31]:
ur20plus = ur20plus.append(pd.DataFrame([["Hulk (2003)",1,9999992,4370,5,"genre",1]], columns =ur20plus.columns), ignore_index=True)
ur20plus = ur20plus.append(pd.DataFrame([["Aliens (1986)",1,9999992,4370,3,"genre",1]], columns =ur20plus.columns), ignore_index=True)
ur20plus = ur20plus.append(pd.DataFrame([["Brave (2012)",1,9999992,4370,4,"genre",1]], columns =ur20plus.columns), ignore_index=True)
scaler = MinMaxScaler()
ur20plus['rating'] = ur20plus['rating'].values.astype(float)
rating_scaled = pd.DataFrame(scaler.fit_transform(ur20plus['rating'].values.reshape(-1,1)))
ur20plus['rating'] = rating_scaled
ur20plus = ur20plus.drop_duplicates(['userId', 'title'])
user_movie_matrix = ur20plus.pivot(index='userId', columns='title', values='rating')
user_movie_matrix.fillna(0, inplace=True)

In [32]:
pred_data = pd.DataFrame()
preds = sess.run(output_layer, feed_dict={input_layer: user_movie_matrix})
pred_data = pred_data.append(pd.DataFrame(preds))
pred_data = pred_data.stack().reset_index(name='rating')
pred_data.columns = ['userId', 'title', 'rating']
users = user_movie_matrix.index.tolist()
movies = user_movie_matrix.columns.tolist()
pred_data['userId'] = pred_data['userId'].map(lambda value: users[value])
pred_data['title'] = pred_data['title'].map(lambda value: movies[value])
keys = ['userId', 'title']
index_1 = pred_data.set_index(keys).index
index_2 = ur20plus.set_index(keys).index
top_ten_ranked = pred_data[~index_1.isin(index_2)]
top_ten_ranked = top_ten_ranked.sort_values(['userId', 'rating'], ascending=[True, False])
top_ten_ranked = top_ten_ranked.groupby('userId').head(10)

In [33]:
top_ten_ranked.loc[top_ten_ranked['userId'] == 9999992] #an example of the top 10 recommendations for the above new user who rated three movies, i.e., "Hulk (2003)" with a 5/5, "Aliens (1986)" with a 3/5, and "Brave (2012)" with a 4/5.

Unnamed: 0,userId,title,rating
735833,9999992,Four Rooms (1995),13.103753
736047,9999992,Kung Fu Panda (2008),8.568954
736682,9999992,X-Men: The Last Stand (2006),8.504925
736524,9999992,Talladega Nights: The Ballad of Ricky Bobby (2...,8.497384
735724,9999992,"Dirty Dozen, The (1967)",8.466221
735420,9999992,"6th Day, The (2000)",8.337077
735625,9999992,Charlie's Angels: Full Throttle (2003),8.259453
736257,9999992,"Peacemaker, The (1997)",8.158257
735484,9999992,Armageddon (1998),7.902801
736317,9999992,Rain Man (1988),7.900194
