In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
import json
import time

In [2]:
rating = pd.read_csv('data/ratings.csv', error_bad_lines=False, encoding='latin-1')

In [3]:
movie = pd.read_csv("data/movies.csv", error_bad_lines=False, encoding='latin-1')

In [4]:
movie_rating = pd.merge(rating, movie, on = 'movieId')

In [5]:
cols = ['timestamp']

In [6]:
movie_rating.drop(cols, axis=1, inplace=True)

In [7]:
numrate_movie = movie_rating.groupby("title")["rating"].count().reset_index()

In [8]:
numrate_movie.rename({"rating": "ratecount_movie"}, axis=1, inplace=True)

In [9]:
numrate_movie = numrate_movie.query("ratecount_movie >= 20")

In [10]:
ratings20plus = pd.merge(numrate_movie, movie_rating, on = 'title', how = 'inner')

In [11]:
numrate_user = ratings20plus.groupby("userId")["rating"].count().reset_index()

In [12]:
numrate_user.rename({"rating": "ratecount_user"}, axis=1, inplace=True)

In [13]:
numrate_user = numrate_user.query("ratecount_user >= 20")

In [14]:
ur20plus = pd.merge(ratings20plus, numrate_user, on = "userId", how = "inner")

In [15]:
ur20plus = ur20plus.append(pd.DataFrame([["A.I. Artificial Intelligence (2001)",1,9999999,4370,5,"genre",1]], columns =ur20plus.columns), ignore_index=True)

In [16]:
scaler = MinMaxScaler()
ur20plus['rating'] = ur20plus['rating'].values.astype(float)
rating_scaled = pd.DataFrame(scaler.fit_transform(ur20plus['rating'].values.reshape(-1,1)))
ur20plus['rating'] = rating_scaled

In [17]:
ur20plus = ur20plus.drop_duplicates(['userId', 'title'])
user_movie_matrix = ur20plus.pivot(index='userId', columns='title', values='rating')
user_movie_matrix.fillna(0, inplace=True)

In [18]:
user_movie_matrix.head()

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),13 Going on 30 (2004),"13th Warrior, The (1999)",1408 (2007),2001: A Space Odyssey (1968),2012 (2009),...,Young Frankenstein (1974),Young Guns (1988),Zack and Miri Make a Porno (2008),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),Â¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.777778,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.777778
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.555556,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
X_train, X_test = train_test_split(user_movie_matrix, train_size=0.8)

In [20]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [21]:
num_input = ur20plus['title'].nunique()
n_nodes_inpl = num_input  
n_nodes_hl1  = 256  
n_nodes_outl = num_input  
hidden_1_layer_vals = {'weights':tf.Variable(tf.random_normal([n_nodes_inpl+1,n_nodes_hl1]))}
output_layer_vals = {'weights':tf.Variable(tf.random_normal([n_nodes_hl1+1,n_nodes_outl]))}

In [22]:
input_layer = tf.placeholder('float', [None, num_input])
# add a constant node to the first layer
# it needs to have the same shape as the input layer for me to be
# able to concatinate it later
input_layer_const = tf.fill( [tf.shape(input_layer)[0], 1] ,1.0  )
input_layer_concat =  tf.concat([input_layer, input_layer_const], 1)
# multiply output of input_layer wth a weight matrix 
layer_1 = tf.nn.sigmoid(tf.matmul(input_layer_concat,hidden_1_layer_vals['weights']))
# adding one bias node to the hidden layer
layer1_const = tf.fill( [tf.shape(layer_1)[0], 1] ,1.0  )
layer_concat =  tf.concat([layer_1, layer1_const], 1)
# multiply output of hidden with a weight matrix to get final output
output_layer = tf.matmul( layer_concat,output_layer_vals['weights'])
# output_true shall have the original shape for error calculations
output_true = tf.placeholder('float', [None, num_input])
# define our cost function
meansq =    tf.reduce_mean(tf.square(output_layer - output_true))
# define our optimizer
learn_rate = 0.1   # how fast the model should learn
optimizer = tf.train.AdagradOptimizer(learn_rate).minimize(meansq)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [23]:
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
# defining batch size, number of epochs and learning rate
batch_size = 100  # how many images to use together for training
hm_epochs =200    # how many times to go through the entire dataset
tot_images = X_train.shape[0] # total number of images

In [24]:
for epoch in range(hm_epochs):
    epoch_loss = 0    # initializing error as 0
    
    for i in range(int(tot_images/batch_size)):
        epoch_x = X_train[ i*batch_size : (i+1)*batch_size ]
        _, c = sess.run([optimizer, meansq],\
               feed_dict={input_layer: epoch_x, \
               output_true: epoch_x})
        epoch_loss += c
        
    output_train = sess.run(output_layer,\
               feed_dict={input_layer:X_train})
    output_test = sess.run(output_layer,\
                   feed_dict={input_layer:X_test})
    if not 10 < epoch < 190: # print out some trainings 
        print('MSE train', MSE(output_train, X_train),'MSE test', MSE(output_test, X_test))      
        print('Epoch', epoch, '/', hm_epochs, 'loss:',epoch_loss)
        if epoch == 10:
          print('--------------------------------------------------------')  

MSE train 95.10114464588546 MSE test 95.71423191887929
Epoch 0 / 200 loss: 447.06604766845703
MSE train 78.3595896478405 MSE test 79.2784042925872
Epoch 1 / 200 loss: 353.4072952270508
MSE train 67.52651021187249 MSE test 68.59390309045111
Epoch 2 / 200 loss: 296.1738052368164
MSE train 60.05357083256738 MSE test 61.186572598361906
Epoch 3 / 200 loss: 258.3053970336914
MSE train 54.62473810797974 MSE test 55.80427098467618
Epoch 4 / 200 loss: 231.69047927856445
MSE train 50.51436788576035 MSE test 51.74682050079096
Epoch 5 / 200 loss: 212.05007934570312
MSE train 47.31729799575009 MSE test 48.60827400569783
Epoch 6 / 200 loss: 197.01358032226562
MSE train 44.776470412496565 MSE test 46.119645271416275
Epoch 7 / 200 loss: 185.22546005249023
MSE train 42.71066964753074 MSE test 44.10330004925144
Epoch 8 / 200 loss: 175.78204727172852
MSE train 40.996348445045335 MSE test 42.43856705544009
Epoch 9 / 200 loss: 168.04206466674805
MSE train 39.547370050759405 MSE test 41.039692320708426
Epoc

In [25]:
pred_data = pd.DataFrame()
preds = sess.run(output_layer, feed_dict={input_layer: user_movie_matrix})
pred_data = pred_data.append(pd.DataFrame(preds))
pred_data = pred_data.stack().reset_index(name='rating')
pred_data.columns = ['userId', 'title', 'rating']
users = user_movie_matrix.index.tolist()
movies = user_movie_matrix.columns.tolist()
pred_data['userId'] = pred_data['userId'].map(lambda value: users[value])
pred_data['title'] = pred_data['title'].map(lambda value: movies[value])
keys = ['userId', 'title']
index_1 = pred_data.set_index(keys).index
index_2 = ur20plus.set_index(keys).index
top_ten_ranked = pred_data[~index_1.isin(index_2)]
top_ten_ranked = top_ten_ranked.sort_values(['userId', 'rating'], ascending=[True, False])
top_ten_ranked = top_ten_ranked.groupby('userId').head(10)

In [26]:
top_ten_ranked.loc[top_ten_ranked['userId'] == 9999999]

Unnamed: 0,userId,title,rating
735224,9999999,Swordfish (2001),8.513034
734813,9999999,Madagascar (2005),7.878016
735124,9999999,Short Circuit (1986),7.764729
735095,9999999,Scott Pilgrim vs. the World (2010),7.606499
734833,9999999,"Matrix Reloaded, The (2003)",7.415253
734602,9999999,"Great Escape, The (1963)",7.203551
734676,9999999,I.Q. (1994),7.199822
734738,9999999,Kids (1995),6.776124
734492,9999999,Eyes Wide Shut (1999),6.467609
734682,9999999,In Bruges (2008),6.330659


In [27]:
ur20plus = ur20plus.append(pd.DataFrame([["Hulk (2003)",1,9999991,4370,5,"genre",1]], columns =ur20plus.columns), ignore_index=True)
scaler = MinMaxScaler()
ur20plus['rating'] = ur20plus['rating'].values.astype(float)
rating_scaled = pd.DataFrame(scaler.fit_transform(ur20plus['rating'].values.reshape(-1,1)))
ur20plus['rating'] = rating_scaled
ur20plus = ur20plus.drop_duplicates(['userId', 'title'])
user_movie_matrix = ur20plus.pivot(index='userId', columns='title', values='rating')
user_movie_matrix.fillna(0, inplace=True)

In [28]:
pred_data = pd.DataFrame()
preds = sess.run(output_layer, feed_dict={input_layer: user_movie_matrix})
pred_data = pred_data.append(pd.DataFrame(preds))
pred_data = pred_data.stack().reset_index(name='rating')
pred_data.columns = ['userId', 'title', 'rating']
users = user_movie_matrix.index.tolist()
movies = user_movie_matrix.columns.tolist()
pred_data['userId'] = pred_data['userId'].map(lambda value: users[value])
pred_data['title'] = pred_data['title'].map(lambda value: movies[value])
keys = ['userId', 'title']
index_1 = pred_data.set_index(keys).index
index_2 = ur20plus.set_index(keys).index
top_ten_ranked = pred_data[~index_1.isin(index_2)]
top_ten_ranked = top_ten_ranked.sort_values(['userId', 'rating'], ascending=[True, False])
top_ten_ranked = top_ten_ranked.groupby('userId').head(10)

In [29]:
top_ten_ranked.loc[top_ten_ranked['userId'] == 9999991]

Unnamed: 0,userId,title,rating
735156,9999991,Solaris (2002),8.836129
734822,9999991,Man on the Moon (1999),7.92761
734934,9999991,"Nutty Professor, The (1996)",7.574513
735169,9999991,Spider-Man (2002),7.383693
734821,9999991,Man on Fire (2004),7.227942
734407,9999991,Death Proof (2007),7.00806
734469,9999991,Elizabeth (1998),6.842804
734594,9999991,"Goonies, The (1985)",6.777796
735121,9999991,Shine (1996),6.661472
734363,9999991,Congo (1995),6.50445


In [30]:
user_movie_matrix.tail()

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),13 Going on 30 (2004),"13th Warrior, The (1999)",1408 (2007),2001: A Space Odyssey (1968),2012 (2009),...,Young Frankenstein (1974),Young Guns (1988),Zack and Miri Make a Porno (2008),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),Â¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,...,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.177778,0.133333,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
610,0.133333,0.0,0.0,0.0,0.0,0.0,0.0,0.155556,0.177778,0.111111,...,0.0,0.0,0.133333,0.2,0.133333,0.155556,0.155556,0.0,0.066667,0.0
9999991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9999999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
ur20plus = ur20plus.append(pd.DataFrame([["Wild Wild West (1999)",1,9999995,900000000,5,"genre",1]], columns =ur20plus.columns), ignore_index=True)
scaler = MinMaxScaler()
ur20plus['rating'] = ur20plus['rating'].values.astype(float)
rating_scaled = pd.DataFrame(scaler.fit_transform(ur20plus['rating'].values.reshape(-1,1)))
ur20plus['rating'] = rating_scaled
ur20plus = ur20plus.drop_duplicates(['userId', 'title'])
user_movie_matrix = ur20plus.pivot(index='userId', columns='title', values='rating')
user_movie_matrix.fillna(0, inplace=True)
pred_data = pd.DataFrame()
preds = sess.run(output_layer, feed_dict={input_layer: user_movie_matrix})
pred_data = pred_data.append(pd.DataFrame(preds))
pred_data = pred_data.stack().reset_index(name='rating')
pred_data.columns = ['userId', 'title', 'rating']
users = user_movie_matrix.index.tolist()
movies = user_movie_matrix.columns.tolist()
pred_data['userId'] = pred_data['userId'].map(lambda value: users[value])
pred_data['title'] = pred_data['title'].map(lambda value: movies[value])
keys = ['userId', 'title']
index_1 = pred_data.set_index(keys).index
index_2 = ur20plus.set_index(keys).index
top_ten_ranked = pred_data[~index_1.isin(index_2)]
top_ten_ranked = top_ten_ranked.sort_values(['userId', 'rating'], ascending=[True, False])
top_ten_ranked = top_ten_ranked.groupby('userId').head(10)

In [32]:
top_ten_ranked.loc[top_ten_ranked['userId'] == 9999995]

Unnamed: 0,userId,title,rating
735562,9999995,"Blues Brothers, The (1980)",9.230131
735667,9999995,Coraline (2009),9.076329
736238,9999995,Office Space (1999),8.315799
736526,9999995,Tank Girl (1995),8.308948
736466,9999995,Spider-Man (2002),8.154663
735647,9999995,Click (2006),8.134357
736392,9999995,Scott Pilgrim vs. the World (2010),8.051975
735815,9999995,"Firm, The (1993)",7.91922
735858,9999995,Gattaca (1997),7.845191
735540,9999995,"Big Short, The (2015)",7.792197


In [36]:
# POST /get_price
req = json.loads(REQUEST)
movie_name = req['body']['movie'][0]
movie_rating = req['body']['rating'][0]
print(movie_name)
print(movie_rating)


ur20plus = ur20plus.append(pd.DataFrame([[movie_name,1,9999996,900000000,movie_rating,"genre",1]], columns =ur20plus.columns), ignore_index=True)
# ur20plus = ur20plus.append(pd.DataFrame([["Wild Wild West (1999)",1,9999996,900000000,5,"genre",1]], columns =ur20plus.columns), ignore_index=True)
scaler = MinMaxScaler()
ur20plus['rating'] = ur20plus['rating'].values.astype(float)
rating_scaled = pd.DataFrame(scaler.fit_transform(ur20plus['rating'].values.reshape(-1,1)))
ur20plus['rating'] = rating_scaled
ur20plus = ur20plus.drop_duplicates(['userId', 'title'])
user_movie_matrix = ur20plus.pivot(index='userId', columns='title', values='rating')
user_movie_matrix.fillna(0, inplace=True)

pred_data = pd.DataFrame()
preds = sess.run(output_layer, feed_dict={input_layer: user_movie_matrix})
pred_data = pred_data.append(pd.DataFrame(preds))
pred_data = pred_data.stack().reset_index(name='rating')
pred_data.columns = ['userId', 'title', 'rating']
users = user_movie_matrix.index.tolist()
movies = user_movie_matrix.columns.tolist()
pred_data['userId'] = pred_data['userId'].map(lambda value: users[value])
pred_data['title'] = pred_data['title'].map(lambda value: movies[value])
keys = ['userId', 'title']
index_1 = pred_data.set_index(keys).index
index_2 = ur20plus.set_index(keys).index
top_ten_ranked = pred_data[~index_1.isin(index_2)]
top_ten_ranked = top_ten_ranked.sort_values(['userId', 'rating'], ascending=[True, False])
top_ten_ranked = top_ten_ranked.groupby('userId').head(10)
print('==============')
print(top_ten_ranked.loc[top_ten_ranked['userId'] == 9999996])

NameError: name 'json' is not defined

In [34]:
top_ten_ranked.loc[top_ten_ranked['userId'] == 9999996]

Unnamed: 0,userId,title,rating
737890,9999996,Transformers (2007),10.6872
737811,9999996,Superman II (1980),9.621454
737683,9999996,Scary Movie 2 (2001),8.841021
736784,9999996,"Arrival, The (1996)",8.521485
737939,9999996,Waterworld (1995),8.385056
737468,9999996,Monty Python's Life of Brian (1979),7.610262
736912,9999996,Cars (2006),7.33776
737511,9999996,"Negotiator, The (1998)",7.289282
737381,9999996,"Little Princess, A (1995)",6.990422
736708,9999996,25th Hour (2002),6.906772
