In [1]:
# If an import below fails with "ModuleNotFoundError", install the missing module with:
# %pip install <module-name>

In [2]:
import numpy as np
import numpy.ma as ma
from numpy import genfromtxt
from collections import defaultdict
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from tabulate import tabulate
from recsysNN_utils import *
pd.set_option("display.precision", 1)

In [3]:
# Load the data set and define the notebook-wide configuration values.
(
    item_train,
    user_train,
    y_train,
    item_features,
    user_features,
    item_vecs,
    movie_dict,
    user_to_genre,
) = load_data()

# Column offsets into the raw user/item matrices.
u_s = 3  # first user column used in training (skips user id, rating count, rating ave)
i_s = 1  # first item column used in training (skips movie id)
uvs = 3  # user genre vector start
ivs = 3  # item genre vector start

# Network input widths: every column from the training start offset onwards.
num_user_features = user_train.shape[1] - u_s
num_item_features = item_train.shape[1] - i_s

scaledata = True  # apply the standard scaler to the user/item data when True
print(f"Number of training vectors: {len(item_train)}")

Number of training vectors: 58187


In [4]:
# Show a preview of the raw user training matrix (maxcount=10 rows are printed).
# NOTE(review): pprint_train is not defined in this notebook; presumably it
# comes from the recsysNN_utils star import — confirm.
pprint_train(user_train, user_features, uvs, u_s, maxcount=10)

[user id],[rating count],[rating ave],Act ion,Adve nture,Anim ation,Chil dren,Com edy,Crime,Docum entary,Drama,Fan tasy,Hor ror,Mys tery,Rom ance,Sci -Fi,Thri ller
2,16,4.1,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.2,3.9
2,16,4.1,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.2,3.9
2,16,4.1,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.2,3.9
2,16,4.1,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.2,3.9
2,16,4.1,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.2,3.9
2,16,4.1,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.2,3.9
2,16,4.1,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.2,3.9
2,16,4.1,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.2,3.9
2,16,4.1,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.2,3.9
2,16,4.1,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.2,3.9


In [9]:
# Count the training rows whose first column (the user id) equals 2.
(user_train[:, 0] == 2).sum()

39

In [10]:
# Show a preview of the raw item training matrix (maxcount=5 rows are printed).
# NOTE(review): user=True is passed although this is item data — check
# pprint_train's signature to confirm this is intended.
pprint_train(item_train, item_features, ivs, i_s, maxcount=5, user=True)

[movie id],year,ave rating,Act ion,Adve nture,Anim ation,Chil dren,Com edy,Crime,Docum entary,Drama,Fan tasy,Hor ror,Mys tery,Rom ance,Sci -Fi,Thri ller
6874,2003,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6874,2003,4.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6874,2003,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8798,2004,3.8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8798,2004,3.8,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Sanity check: targets, item rows and user rows are expected to have equal
# length, since the three arrays are later indexed and split row-for-row.
len(y_train), len(item_train), len(user_train)

(58187, 58187, 58187)

In [12]:
# Peek at the first ten target values.
y_train[:10]

array([4. , 4. , 4. , 3.5, 3.5, 3.5, 3.5, 4. , 4. , 4. ])

In [13]:
if scaledata:
    # Keep handles on the unscaled arrays for the round-trip check below.
    item_train_save, user_train_save = item_train, user_train

    # Standardise every column of the item and user matrices.
    # NOTE(review): both scalers are fit on the complete data set, i.e. before
    # the train/test split that follows, so test rows influence the statistics.
    # NOTE(review): scalerItem / scalerUser only exist when scaledata is True,
    # yet later cells reference them unconditionally.
    scalerItem = StandardScaler()
    item_train = scalerItem.fit_transform(item_train)

    scalerUser = StandardScaler()
    user_train = scalerUser.fit_transform(user_train)

    # Inverting the transform must reproduce the original values.
    print(np.allclose(item_train_save, scalerItem.inverse_transform(item_train)))
    print(np.allclose(user_train_save, scalerUser.inverse_transform(user_train)))

True
True


In [14]:
# Split items, users and targets in ONE call: train_test_split then applies the
# same shuffled indices to all three arrays, so rows stay aligned by
# construction (and it rejects inputs whose lengths differ). Three separate
# calls only stay aligned as long as every call repeats the same random_state
# and the arrays happen to have equal length.
# NOTE: this cell overwrites item_train / user_train / y_train with their 80%
# training portion, so it must not be run twice without re-running the cells above.
(item_train, item_test,
 user_train, user_test,
 y_train, y_test) = train_test_split(
    item_train, user_train, y_train,
    train_size=0.80, shuffle=True, random_state=1,
)

print(f"movie/item training data shape: {item_train.shape}")
print(f"movie/item test data shape: {item_test.shape}")

movie/item training data shape: (46549, 17)
movie/item test data shape: (11638, 17)


In [15]:
# Preview the user training rows again (maxcount=5). When scaledata is True the
# values shown are the standardised ones, so ids and counts are no longer
# human-readable.
pprint_train(user_train, user_features, uvs, u_s, maxcount=5)

[user id],[rating count],[rating ave],Act ion,Adve nture,Anim ation,Chil dren,Com edy,Crime,Docum entary,Drama,Fan tasy,Hor ror,Mys tery,Rom ance,Sci -Fi,Thri ller
1,0,0.6,0.7,0.6,0.6,0.7,0.7,0.5,0.7,0.2,0.3,0.3,0.5,0.5,0.8,0.5
0,0,1.6,1.5,1.7,0.9,1.0,1.4,0.8,-1.2,1.2,1.2,1.6,0.9,1.4,1.2,1.0
0,0,0.8,0.6,0.7,0.5,0.6,0.6,0.3,-1.2,0.7,0.8,0.9,0.6,0.2,0.6,0.6
1,0,-0.1,0.2,-0.1,0.3,0.7,0.3,0.2,1.0,-0.5,-0.7,-2.1,0.5,0.7,0.3,0.0
-1,0,-1.3,-0.8,-0.8,0.1,-0.1,-1.1,-0.9,-1.2,-1.5,-0.6,-0.5,-0.6,-0.9,-0.4,-0.9


In [16]:
# Rescale the targets to the range [-1, 1]. The scaler is fit on the training
# targets only and then reused, unchanged, for the test targets.
scaler = MinMaxScaler(feature_range=(-1, 1))
ynorm_train = scaler.fit_transform(y_train.reshape(-1, 1))
ynorm_test = scaler.transform(y_test.reshape(-1, 1))
print(ynorm_train.shape, ynorm_test.shape)

(46549, 1) (11638, 1)


In [18]:
num_outputs = 32  # width of the user and item embedding vectors
tf.random.set_seed(1)


def _make_tower(output_dim):
    """Return a new 256 -> 128 -> `output_dim` dense network (ReLU, ReLU, linear)."""
    return tf.keras.models.Sequential([
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(output_dim, activation='linear'),
    ])


# Two independent towers with identical architecture (user first, then item).
user_NN = _make_tower(num_outputs)
item_NN = _make_tower(num_outputs)

# User tower: features -> embedding -> unit-length embedding.
# `shape` must be a tuple; `(n)` without the trailing comma is just the int n.
input_user = tf.keras.layers.Input(shape=(num_user_features,))
vu = user_NN(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1)

# Item tower, same pipeline.
input_item = tf.keras.layers.Input(shape=(num_item_features,))
vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)

# The prediction is the dot product of the two normalised embeddings.
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# Use the fully qualified class: a bare `Model` is never imported explicitly in
# this notebook and would only resolve through a star import.
model = tf.keras.Model([input_user, input_item], output)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 14)]         0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 16)]         0           []                               
                                                                                                  
 sequential_2 (Sequential)      (None, 32)           40864       ['input_3[0][0]']                
                                                                                                  
 sequential_3 (Sequential)      (None, 32)           41376       ['input_4[0][0]']                
                                                                                              

In [19]:
tf.random.set_seed(1)
# Adam optimiser minimising the mean squared error of the predictions.
opt = keras.optimizers.Adam(learning_rate=0.01)
cost_fn = tf.keras.losses.MeanSquaredError()
model.compile(loss=cost_fn, optimizer=opt)

In [20]:
tf.random.set_seed(1)
# Train on the feature columns only: the leading u_s user columns and i_s item
# columns are sliced off before the data reaches the network.
model.fit(
    x=[user_train[:, u_s:], item_train[:, i_s:]],
    y=ynorm_train,
    epochs=30,
)

Epoch 1/30


2022-11-07 18:25:32.271906: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-11-07 18:25:32.476707: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x16dc02cd0>

In [22]:
# Loss on the held-out split. The value is the compiled MSE measured against
# the min-max-scaled targets (ynorm_test), not against raw ratings.
model.evaluate([user_test[:, u_s:], item_test[:, i_s:]], ynorm_test)

 54/364 [===>..........................] - ETA: 0s - loss: 0.1025

2022-11-07 18:30:13.679982: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




0.10468854010105133

In [23]:
# Hand-written profile of a new user, listed in the same order as the columns
# of the user vector built below.
new_user_id = 5000
new_rating_count = 3
new_rating_ave = 1.0

# Per-genre ratings.
new_action = 1.0
new_adventure = 1
new_animation = 1
new_childrens = 1
new_comedy = 5
new_crime = 1
new_documentary = 1
new_drama = 1
new_fantasy = 1
new_horror = 1
new_mystery = 1
new_romance = 5
new_scifi = 5
new_thriller = 1

# Single-row 2-D array: id, rating count, rating average, then the genres.
user_vec = np.array([[
    new_user_id, new_rating_count, new_rating_ave,
    new_action, new_adventure, new_animation, new_childrens,
    new_comedy, new_crime, new_documentary,
    new_drama, new_fantasy, new_horror, new_mystery,
    new_romance, new_scifi, new_thriller,
]])

In [24]:
# Dimensions of item_vecs, the item array used for the predictions below.
item_vecs.shape

(1883, 17)

In [29]:
# Score every movie in item_vecs for the hand-written new user and print the
# top 10 results.
# NOTE(review): gen_user_vecs, predict_uservec and print_pred_movies are not
# defined in this notebook (presumably recsysNN_utils). Judging by the call,
# gen_user_vecs repeats user_vec once per item row and predict_uservec returns
# its results sorted by predicted rating — confirm against the helper source.
# NOTE(review): scalerUser / scalerItem are passed unconditionally but are only
# created when scaledata is True.
user_vecs = gen_user_vecs(user_vec, len(item_vecs))
sorted_index, sorted_ypu, sorted_items, sorted_user = predict_uservec(user_vecs,
                                        item_vecs, model, u_s, i_s, scaler,
                                        scalerUser, scalerItem, scaledata=scaledata)
print_pred_movies(sorted_ypu, sorted_user, sorted_items, movie_dict,
                 maxcount=10)



2022-11-07 18:55:05.902361: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


y_p,movie id,rating ave,title,genres
4.8349,4979,3.68382,"Royal Tenenbaums, The (2001)",Comedy|Drama
4.83162,5013,3.7037,Gosford Park (2001),Comedy|Drama|Mystery
4.82237,4246,3.62308,Bridget Jones's Diary (2001),Comedy|Drama|Romance
4.81813,5110,3.60417,Super Troopers (2001),Comedy|Crime|Mystery
4.7963,5577,3.66667,Igby Goes Down (2002),Comedy|Drama
4.79527,5218,3.68824,Ice Age (2002),Adventure|Animation|Children|Comedy
4.79355,5673,3.62121,Punch-Drunk Love (2002),Comedy|Drama|Romance
4.78804,6003,3.6,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Thriller
4.78577,5377,3.71591,About a Boy (2002),Comedy|Drama|Romance


In [32]:
# Compare predicted and actual ratings for an existing user (id 36).
uid = 36
# user_train holds only the training split at this point and is standardised,
# so it is mapped back to raw values before being handed to get_user_vecs.
# NOTE(review): scalerUser exists only when scaledata is True; this call (and
# the scaler arguments below) fail otherwise.
# NOTE(review): get_user_vecs / predict_uservec / print_existing_user are not
# defined in this notebook (presumably recsysNN_utils) — confirm their contracts.
user_vecs, y_vecs = get_user_vecs(uid, scalerUser.inverse_transform(user_train),
                                item_vecs, user_to_genre)
sorted_index, sorted_ypu, sorted_items, sorted_user = predict_uservec(
            user_vecs, item_vecs, model, u_s, i_s, scaler, scalerUser,
            scalerItem, scaledata=scaledata)
# Reorder the true ratings with the same index used for the predictions.
sorted_y = y_vecs[sorted_index]

print_existing_user(sorted_ypu, sorted_y.reshape(-1,1), sorted_user,
                   sorted_items, item_features, ivs, uvs, movie_dict,
                   maxcount=10)



y_p,y,user,user genre ave,movie rating ave,title,genres
3.2,3.0,36,3.0,2.86,"Time Machine, The (2002)",Adventure
3.1,3.0,36,3.0,2.86,"Time Machine, The (2002)",Action
2.9,3.0,36,3.0,2.86,"Time Machine, The (2002)",Sci-Fi
2.0,1.5,36,1.75,3.52,Road to Perdition (2002),Crime
2.0,2.0,36,1.75,3.52,Gangs of New York (2002),Crime
1.9,1.0,36,1.5,4.0,"Beautiful Mind, A (2001)",Drama
1.9,1.0,36,1.0,4.0,"Beautiful Mind, A (2001)",Romance
1.8,2.0,36,1.5,3.52,Gangs of New York (2002),Drama
1.8,1.5,36,1.5,3.52,Road to Perdition (2002),Drama


In [33]:
def sq_dist(a, b):
    """Return the squared Euclidean distance between two vectors.

    Args:
        a: array-like of numbers (ndarray, list or tuple).
        b: array-like of numbers, broadcast-compatible with ``a``.

    Returns:
        Scalar: the sum of the squared element-wise differences of ``a`` and ``b``.
    """
    # asanyarray converts plain lists/tuples (which do not support `-`) and
    # passes ndarrays and their subclasses through unchanged.
    a = np.asanyarray(a)
    b = np.asanyarray(b)
    return np.sum((a - b)**2)

In [35]:
# Run the provided unit test against sq_dist.
# NOTE(review): this is a star import in the middle of the notebook;
# test_sq_dist presumably comes from public_tests — consider importing it by
# name in the import cell at the top.
from public_tests import *
test_sq_dist(sq_dist)

[92mAll tests passed!


In [36]:
# Pair 1: identical vectors.
a1 = np.array([1.0, 2.0, 3.0])
b1 = np.array([1.0, 2.0, 3.0])

# Pair 2: every component differs by 0.1.
a2 = np.array([1.1, 2.1, 3.1])
b2 = np.array([1.0, 2.0, 3.0])

# Pair 3: integer vectors that differ in two positions.
a3 = np.array([0, 1, 0])
b3 = np.array([1, 0, 0])

print(f"squared distance between a1 and b1: {sq_dist(a1, b1)}")
print(f"squared distance between a2 and b2: {sq_dist(a2, b2)}")
print(f"squared distance between a3 and b3: {sq_dist(a3, b3)}")

squared distance between a1 and b1: 0.0
squared distance between a2 and b2: 0.030000000000000054
squared distance between a3 and b3: 2


In [37]:
# Stand-alone item model: it reuses the item_NN network that is part of the
# rating model above and outputs the L2-normalised item embedding.
# `shape` must be a tuple; `(n)` without the trailing comma is just the int n.
input_item_m = tf.keras.layers.Input(shape=(num_item_features,))
vm_m = item_NN(input_item_m)
vm_m = tf.linalg.l2_normalize(vm_m, axis=1)
# Fully qualified class: a bare `Model` is never imported explicitly in this
# notebook and would only resolve through a star import.
model_m = tf.keras.Model(input_item_m, vm_m)
model_m.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 16)]              0         
                                                                 
 sequential_3 (Sequential)   (None, 32)                41376     
                                                                 
 tf.math.l2_normalize_4 (TFO  (None, 32)               0         
 pLambda)                                                        
                                                                 
Total params: 41,376
Trainable params: 41,376
Non-trainable params: 0
_________________________________________________________________


In [38]:
# Compute an embedding for every row of item_vecs.
# Apply the item scaler only when scaling is enabled: scalerItem is created
# only if scaledata is True, and with scaledata False the network was trained
# on unscaled features, so the raw vectors are the matching input.
scaled_item_vecs = scalerItem.transform(item_vecs) if scaledata else item_vecs
# Drop the leading i_s column(s), exactly as was done for the training data.
vms = model_m.predict(scaled_item_vecs[:, i_s:])
print(f"size of all predicted movie feature vectors: {vms.shape}")

size of all predicted movie feature vectors: (1883, 32)


2022-11-07 23:07:29.822898: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
