In [1]:
# We used code from this tutorial:
# https://github.com/susanli2016/Machine-Learning-with-Python/blob/master/Collaborative%20Filtering%20Model%20with%20TensorFlow.ipynb
# And we also used code from this tutorial:
# https://medium.com/@connectwithghosh/recommender-system-on-the-movielens-using-an-autoencoder-using-tensorflow-in-python-f13d3e8d600d
# Then, we integrated these two tutorials and edited the code from each of them in order to create a recommender that allows us to recommend a top 10 list of movies without needing to retrain for each new user

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
import json

In [3]:
rating = pd.read_csv('big_data/ratings.csv', sep = "::", error_bad_lines=False, encoding='latin-1', engine='python')

In [4]:
rating.rename({"1": "userId", "1193": "movieId", "5": "rating", "978300760": "timestamp"}, axis=1, inplace=True)

In [5]:
movie = pd.read_csv("big_data/movies.csv", sep = "::", error_bad_lines=False, encoding='latin-1', engine='python')

In [6]:
movie.rename({"1": "movieId", "Toy Story (1995)": "title", "Animation|Children's|Comedy": "genre"}, axis=1, inplace=True)

In [7]:
movie_rating = pd.merge(rating, movie, on = 'movieId')

In [8]:
cols = ['timestamp']

In [9]:
movie_rating.drop(cols, axis=1, inplace=True)

In [10]:
numrate_movie = movie_rating.groupby("title")["rating"].count().reset_index()

In [11]:
numrate_movie.rename({"rating": "ratecount_movie"}, axis=1, inplace=True)

In [12]:
numrate_movie = numrate_movie.query("ratecount_movie >= 20")

In [13]:
ratings20plus = pd.merge(numrate_movie, movie_rating, on = 'title', how = 'inner')

In [14]:
numrate_user = ratings20plus.groupby("userId")["rating"].count().reset_index()

In [15]:
numrate_user.rename({"rating": "ratecount_user"}, axis=1, inplace=True)

In [16]:
numrate_user = numrate_user.query("ratecount_user >= 20")

In [17]:
ur20plus = pd.merge(ratings20plus, numrate_user, on = "userId", how = "inner")

In [18]:
scaler = MinMaxScaler()
ur20plus['rating'] = ur20plus['rating'].values.astype(float) #set rating values as float
rating_scaled = pd.DataFrame(scaler.fit_transform(ur20plus['rating'].values.reshape(-1,1)))
#place the rating values on a scale from -1 to 1
ur20plus['rating'] = rating_scaled

In [19]:
ur20plus = ur20plus.drop_duplicates(['userId', 'title']) #drop duplicates
user_movie_matrix = ur20plus.pivot(index='userId', columns='title', values='rating')
#create matrix (see below)
user_movie_matrix.fillna(0, inplace=True)

In [20]:
X_train, X_test = train_test_split(user_movie_matrix, train_size=0.8)
#split the training data (80%) from the testing data

In [21]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [22]:
num_input = ur20plus['title'].nunique()
# Deciding how many nodes each layer should have
n_nodes_inpl = num_input
n_nodes_hl1  = 256
n_nodes_outl = num_input  
# first hidden layer has num_input*32 weights and 32 biases
hidden_1_layer_vals = {'weights':tf.Variable(tf.random_normal([n_nodes_inpl+1,n_nodes_hl1]))}
# first hidden layer has 784*32 weights and 32 biases
output_layer_vals = {'weights':tf.Variable(tf.random_normal([n_nodes_hl1+1,n_nodes_outl]))}

In [23]:
input_layer = tf.placeholder('float', [None, num_input])
# add a constant node to the first layer
# it needs to have the same shape as the input layer to be able to concatinate it later
input_layer_const = tf.fill( [tf.shape(input_layer)[0], 1] ,1.0  )
input_layer_concat =  tf.concat([input_layer, input_layer_const], 1)
# multiply output of input_layer wth a weight matrix 
layer_1 = tf.nn.sigmoid(tf.matmul(input_layer_concat,\
hidden_1_layer_vals['weights']))
# adding one bias node to the hidden layer
layer1_const = tf.fill( [tf.shape(layer_1)[0], 1] ,1.0  )
layer_concat =  tf.concat([layer_1, layer1_const], 1)
# multiply output of hidden with a weight matrix to get final output
output_layer = tf.matmul( layer_concat,output_layer_vals['weights'])
# output_true shall have the original shape for error calculations
output_true = tf.placeholder('float', [None, num_input])
# define cost function
meansq =    tf.reduce_mean(tf.square(output_layer - output_true))
# define optimizer
learn_rate = 0.1   # how fast the model should learn
optimizer = tf.train.AdagradOptimizer(learn_rate).minimize(meansq)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [24]:
# initialising variables and starting the session
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
# defining batch size, number of epochs and learning rate
batch_size = 100  # how many images to use together for training
hm_epochs =200    # how many times to go through the entire dataset
tot_images = X_train.shape[0] # total number of images

In [25]:
# running the model for a 200 epochs taking 100 users in batches
# total improvement is printed out after each epoch
for epoch in range(hm_epochs):
    epoch_loss = 0    # initializing loss (error) as 0
    
    for i in range(int(tot_images/batch_size)):
        epoch_x = X_train[ i*batch_size : (i+1)*batch_size ]
        _, c = sess.run([optimizer, meansq],\
               feed_dict={input_layer: epoch_x, \
               output_true: epoch_x})
        epoch_loss += c
        
    output_train = sess.run(output_layer,\
               feed_dict={input_layer:X_train})
    output_test = sess.run(output_layer,\
                   feed_dict={input_layer:X_test})
        

MSE train 46.380179793167535 MSE test 46.672074497196505
Epoch 0 / 200 loss: 3301.287986755371
MSE train 30.904770200256028 MSE test 31.105200648208758
Epoch 1 / 200 loss: 1819.0695171356201
MSE train 22.309516237515027 MSE test 22.48869904320725
Epoch 2 / 200 loss: 1268.3550605773926
MSE train 16.831422192343634 MSE test 16.94413875880739
Epoch 3 / 200 loss: 935.2982873916626
MSE train 13.181665714983968 MSE test 13.31123114869411
Epoch 4 / 200 loss: 718.2910089492798
MSE train 10.799190714793072 MSE test 10.939400832763225
Epoch 5 / 200 loss: 574.7171277999878
MSE train 9.150585528615292 MSE test 9.283602543038224
Epoch 6 / 200 loss: 478.8486623764038
MSE train 7.95469703292151 MSE test 8.079505713872148
Epoch 7 / 200 loss: 411.0414843559265
MSE train 7.039133074376187 MSE test 7.159159045794751
Epoch 8 / 200 loss: 360.5341911315918
MSE train 6.318384890399996 MSE test 6.43394682044787
Epoch 9 / 200 loss: 321.3250422477722
MSE train 5.734630749832954 MSE test 5.84617739570224
Epoch 1

Unnamed: 0,userId,title,rating
18300256,6041,"Story of G.I. Joe, The (1945)",2.522219
18298416,6041,Dog Park (1998),2.341502
18298077,6041,"Broadway Melody, The (1929)",2.334806
18299844,6041,"Professional, The (a.k.a. Leon: The Profession...",2.283562
18299467,6041,Mr. Death: The Rise and Fall of Fred A. Leucht...,2.281995
18300148,6041,Smoke (1995),2.261795
18300435,6041,"Toxic Avenger, The (1985)",2.22079
18299436,6041,"Misérables, Les (1995)",2.210887
18299482,6041,Mrs. Parker and the Vicious Circle (1994),2.175649
18298689,6041,From the Hip (1987),2.110963


In [27]:
# POST /get_recommended
req = json.loads(REQUEST)
movie_name = req['body']['movie'][0]
movie_rating = req['body']['rating'][0]

# a top 10 ranking for a new user (9999992) who rates "Hurricane, The (1999)" with a 3
userCol = ur20plus["userId"]
max_value = userCol.max()
new_userId = max_value + 1
ur20plus = ur20plus.append(pd.DataFrame([[movie_name,1,new_userId,1,movie_rating,"genre",1]], columns =ur20plus.columns), ignore_index=True)
scaler = MinMaxScaler()
ur20plus['rating'] = ur20plus['rating'].values.astype(float)
rating_scaled = pd.DataFrame(scaler.fit_transform(ur20plus['rating'].values.reshape(-1,1)))
ur20plus['rating'] = rating_scaled
ur20plus = ur20plus.drop_duplicates(['userId', 'title'])
user_movie_matrix = ur20plus.pivot(index='userId', columns='title', values='rating')
user_movie_matrix.fillna(0, inplace=True)
pred_data = pd.DataFrame()
preds = sess.run(output_layer, feed_dict={input_layer: user_movie_matrix})
pred_data = pred_data.append(pd.DataFrame(preds))
pred_data = pred_data.stack().reset_index(name='rating')
pred_data.columns = ['userId', 'title', 'rating']
users = user_movie_matrix.index.tolist()
movies = user_movie_matrix.columns.tolist()
pred_data['userId'] = pred_data['userId'].map(lambda value: users[value])
pred_data['title'] = pred_data['title'].map(lambda value: movies[value])
keys = ['userId', 'title']
index_1 = pred_data.set_index(keys).index
index_2 = ur20plus.set_index(keys).index
top_ten_ranked = pred_data[~index_1.isin(index_2)]
top_ten_ranked = top_ten_ranked.sort_values(['userId', 'rating'], ascending=[True, False])
top_ten_ranked = top_ten_ranked.groupby('userId').head(10)
top_ten_ranked.loc[top_ten_ranked['userId'] == new_userId]
print(top_ten_ranked.loc[top_ten_ranked['userId'] == new_userId])



Unnamed: 0,userId,title,rating
18301239,6042,Citizen Ruth (1996),3.035116
18301616,6042,"Favor, The (1994)",2.532531
18301905,6042,"Hard Day's Night, A (1964)",2.528854
18301709,6042,French Kiss (1995),2.450675
18301409,6042,Deconstructing Harry (1997),2.422072
18301743,6042,G.I. Jane (1997),2.402671
18301064,6042,Boiler Room (2000),2.402489
18302361,6042,"Maltese Falcon, The (1941)",2.380704
18302033,6042,I Know What You Did Last Summer (1997),2.344082
18302660,6042,Nothing to Lose (1994),2.333413


Unnamed: 0,userId,title,rating
