## Deep Learning for Content-Based Filtering

In [1]:
import numpy as np
import numpy.ma as ma
from numpy import genfromtxt
from collections import defaultdict
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Lambda

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tabulate
from recsysNN_utils import *
pd.set_option("display.precision", 1)

## Training Data

In [2]:
# Pull in the prepared data set together with its lookup tables.
(
    item_train,
    user_train,
    y_train,
    item_features,
    user_features,
    item_vecs,
    movie_dict,
    user_to_genre,
) = load_data()

# Column offsets into the user / item matrices.
u_s = 3  # first user column used for training (skips user id, rating count, rating ave)
i_s = 1  # first item column used for training (skips movie id)
uvs = 3  # first column of the genre vector in a user row
ivs = 3  # first column of the genre vector in an item row

# Width of each network input: every column from the training offset onwards.
num_user_features = user_train.shape[1] - u_s
num_item_features = item_train.shape[1] - i_s

print(f"Number of training vectors: {len(item_train)}")

Number of training vectors: 58187


In [51]:
# Preview the user training matrix before any scaling is applied.
# pprint_train is not defined in this notebook; maxcount presumably caps the rows shown.
pprint_train(user_train, user_features, uvs, u_s, maxcount=5)

[user id],[rating count],[rating ave],Act ion,Adve nture,Anim ation,Chil dren,Com edy,Crime,Docum entary,Drama,Fan tasy,Hor ror,Mys tery,Rom ance,Sci -Fi,Thri ller
2,16,4.1,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.2,3.9
2,16,4.1,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.2,3.9
2,16,4.1,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.2,3.9
2,16,4.1,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.2,3.9
2,16,4.1,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.2,3.9


In [55]:
# Preview the item (movie) training matrix before any scaling is applied.
# user=False presumably switches the helper to item-style formatting — confirm in the helper.
pprint_train(item_train, item_features, ivs, i_s, maxcount=5, user=False)

[movie id],year,ave rating,Act ion,Adve nture,Anim ation,Chil dren,Com edy,Crime,Docum entary,Drama,Fan tasy,Hor ror,Mys tery,Rom ance,Sci -Fi,Thri ller
6874,2003,4.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
6874,2003,4.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
6874,2003,4.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
8798,2004,3.8,1,0,0,0,0,0,0,0,0,0,0,0,0,0
8798,2004,3.8,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [57]:
# Peek at the first five target ratings (not yet scaled at this point in the notebook).
print(f"y_train[:5]: {y_train[:5]}")

y_train[:5]: [4.  4.  4.  3.5 3.5]


## Preparing the training data

In [3]:
# Keep handles on the raw arrays so the scaling can be checked (and undone) later.
item_train_unscaled = item_train
user_train_unscaled = user_train
y_train_unscaled = y_train

# Standardize the item and user feature columns.
scalerItem = StandardScaler()
item_train = scalerItem.fit_transform(item_train)

scalerUser = StandardScaler()
user_train = scalerUser.fit_transform(user_train)

# Squash the ratings into [-1, 1]. The model output is a dot product of two
# L2-normalized vectors, which lies in that same range. The scaler needs a
# 2-D column, hence the reshape.
scalerTarget = MinMaxScaler((-1, 1))
y_train = scalerTarget.fit_transform(y_train.reshape(-1, 1))

# NOTE(review): the scalers are fit on the full data set, before the
# train/test split in the next cell, so test rows contribute to the scaling
# statistics. Confirm this is intended.

# Sanity check: undoing the transform must reproduce the raw features.
print(np.allclose(item_train_unscaled, scalerItem.inverse_transform(item_train)))
print(np.allclose(user_train_unscaled, scalerUser.inverse_transform(user_train)))

True
True


## Create Training and Test Data Set

In [4]:
# Split all three arrays in a single call so that item rows, user rows and
# ratings are shuffled with the same permutation and stay aligned row by row.
(
    item_train, item_test,
    user_train, user_test,
    y_train, y_test,
) = train_test_split(
    item_train, user_train, y_train,
    train_size=0.80, shuffle=True, random_state=1,
)

print(f"movie/item training data shape: {item_train.shape}")
print(f"movie/item test data shape: {item_test.shape}")

movie/item training data shape: (46549, 17)
movie/item test data shape: (11638, 17)


The scaled, shuffled data now has a mean of zero.

In [63]:
# Preview the user training rows again, now that they have been scaled and shuffled.
pprint_train(user_train, user_features, uvs, u_s, maxcount=5)

[user id],[rating count],[rating ave],Act ion,Adve nture,Anim ation,Chil dren,Com edy,Crime,Docum entary,Drama,Fan tasy,Hor ror,Mys tery,Rom ance,Sci -Fi,Thri ller
1,0,0.6,0.7,0.6,0.6,0.7,0.7,0.5,0.7,0.2,0.3,0.3,0.5,0.5,0.8,0.5
0,0,1.6,1.5,1.7,0.9,1.0,1.4,0.8,-1.2,1.2,1.2,1.6,0.9,1.4,1.2,1.0
0,0,0.8,0.6,0.7,0.5,0.6,0.6,0.3,-1.2,0.7,0.8,0.9,0.6,0.2,0.6,0.6
1,0,-0.1,0.2,-0.1,0.3,0.7,0.3,0.2,1.0,-0.5,-0.7,-2.1,0.5,0.7,0.3,0.0
-1,0,-1.3,-0.8,-0.8,0.1,-0.1,-1.1,-0.9,-1.2,-1.5,-0.6,-0.5,-0.6,-0.9,-0.4,-0.9


## Neural Network for content-based filtering

In [9]:
num_outputs = 32  # length of the user / movie vectors produced by each network
tf.random.set_seed(1)

# User network: maps the user feature columns to a num_outputs-long vector.
user_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs, activation='linear'),
])

# Item network: same architecture, applied to the item feature columns.
item_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs, activation='linear'),
])

# create the user input and point to the base network
input_user = tf.keras.layers.Input(shape=(num_user_features,))
vu = user_NN(input_user)
# L2-normalize inside a Lambda layer so the op is part of the Keras graph
vu = Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(vu)

# create the item input and point to the base network
input_item = tf.keras.layers.Input(shape=(num_item_features,))
vm = item_NN(input_item)
vm = Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(vm)

# compute the dot product of the two unit vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# specify the inputs and output of the model.
# Referenced as tf.keras.Model: the bare name `Model` is not imported by the
# notebook's import cell and would only resolve if the star import from
# recsysNN_utils happens to expose it.
model = tf.keras.Model([input_user, input_item], output)

model.summary()




In [11]:
tf.random.set_seed(1)
# Adam with a fixed learning rate, minimizing the mean squared error between
# the model output and the scaled ratings.
opt = keras.optimizers.Adam(learning_rate=0.01)
cost_fn = tf.keras.losses.MeanSquaredError()
model.compile(loss=cost_fn, optimizer=opt)

In [13]:
tf.random.set_seed(1)
# Train on the feature columns only: the u_s / i_s offsets skip the leading
# id / statistics columns of the user and item matrices. Targets are the
# scaled ratings.
model.fit([user_train[:, u_s:], item_train[:, i_s:]], y_train, epochs=30)

Epoch 1/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 0.1303
Epoch 2/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.1174
Epoch 3/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.1151
Epoch 4/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.1139
Epoch 5/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.1125
Epoch 6/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.1112
Epoch 7/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.1102
Epoch 8/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.1093
Epoch 9/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.1083
Epoch 10/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

<keras.src.callbacks.history.History at 0x2020ee066f0>

In [17]:
# Mean-squared-error loss on the held-out test split, using the same column slicing as training.
model.evaluate([user_test[:, u_s:], item_test[:, i_s:]], y_test)

[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.1062


0.10660506784915924

The test loss is comparable to the training loss, indicating that the model has not overfit the training data.

## Predictions for a New User

In [19]:
# Describe a brand-new user by hand: id and rating statistics first ...
new_user_id = 5000
new_rating_count = 3
new_rating_ave = 1.0

# ... followed by one average rating per genre, in the column order of the user matrix.
new_action = 1.0
new_adventure = 1
new_animation = 1
new_children = 1
new_comedy = 5
new_crime = 1
new_documentary = 1
new_drama = 1
new_fantasy = 1
new_horror = 1
new_mystery = 1
new_romance = 5
new_scifi = 5
new_thriller = 1

# Single-row 2-D array, laid out exactly like a row of the user training matrix.
user_vec = np.array([[
    new_user_id, new_rating_count, new_rating_ave,
    new_action, new_adventure, new_animation, new_children,
    new_comedy, new_crime, new_documentary, new_drama,
    new_fantasy, new_horror, new_mystery, new_romance,
    new_scifi, new_thriller,
]])

Let's look at the top-rated movies for the new user. Recall that the user vector had genre ratings that favored Comedy, Romance, and Sci-Fi. Below, we use a set of movie/item vectors, item_vecs, that has a vector for each movie in the training/test set. These are paired with the user vector above, and the scaled vectors are used to predict ratings for all the movies for our new user.

In [21]:
# Replicate the single new-user row so there is one copy per movie vector.
user_vecs = gen_user_vecs(user_vec, len(item_vecs))

# Apply the scalers that were fit on the training data.
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

# Predict a (scaled) rating for every movie, feeding only the feature columns.
y_p = model.predict([suser_vecs[:, u_s:], sitem_vecs[:, i_s:]])

# Map the predictions back onto the original rating scale.
y_pu = scalerTarget.inverse_transform(y_p)

# Rank movies from highest to lowest predicted rating: argsort is ascending,
# so sort the negated predictions.
sorted_index = np.argsort(-y_pu, axis=0).ravel().tolist()
sorted_ypu = y_pu[sorted_index]
sorted_items = item_vecs[sorted_index]  # unscaled vectors, used for display

print_pred_movies(sorted_ypu, user_vecs, sorted_items, movie_dict, maxcount=10)

[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


y_p,movie id,rating ave,title,genres
4.9743,99007,3.5,Warm Bodies (2013),Comedy|Horror|Romance
4.93482,86882,3.56,Midnight in Paris (2011),Comedy|Fantasy|Romance
4.89931,69406,3.5,"Proposal, The (2009)",Comedy|Romance
4.87756,64957,3.59459,"Curious Case of Benjamin Button, The (2008)",Drama|Fantasy|Mystery|Romance
4.86672,60756,3.55357,Step Brothers (2008),Comedy
4.86296,52287,3.54545,Meet the Robinsons (2007),Action|Adventure|Animation|Children|Comedy|Sci-Fi
4.86129,102407,3.375,"Great Gatsby, The (2013)",Drama
4.85826,68135,3.55,17 Again (2009),Comedy|Drama
4.85762,54259,3.60294,Stardust (2007),Adventure|Comedy|Fantasy|Romance
4.85706,61323,3.48718,Burn After Reading (2008),Comedy|Crime|Drama


## Prediction for an existing user

In [25]:
uid = 36
# Build one user row per movie vector (the same user vector, repeated),
# together with the matching rating values returned by the helper.
user_vecs, y_vecs = get_user_vecs(uid, user_train_unscaled, item_vecs, user_to_genre)

# Apply the scalers that were fit on the training data.
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

# Predict a (scaled) rating for every movie, feeding only the feature columns.
y_p = model.predict([suser_vecs[:, u_s:], sitem_vecs[:, i_s:]])

# Map the predictions back onto the original rating scale.
y_pu = scalerTarget.inverse_transform(y_p)

# Rank from highest to lowest predicted rating (argsort of the negated values)
# and reorder every companion array with the same index.
sorted_index = np.argsort(-y_pu, axis=0).ravel().tolist()
sorted_ypu = y_pu[sorted_index]
sorted_items = item_vecs[sorted_index]  # unscaled vectors, used for display
sorted_user = user_vecs[sorted_index]
sorted_y = y_vecs[sorted_index]

# Print the sorted predictions for movies rated by the user.
print_existing_user(
    sorted_ypu, sorted_y.reshape(-1, 1), sorted_user, sorted_items,
    item_features, ivs, uvs, movie_dict, maxcount=10,
)

[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


y_p,y,user,user genre ave,movie rating ave,title,genres
2.9,3.0,36,3.0,2.86,"Time Machine, The (2002)",Adventure
2.7,3.0,36,3.0,2.86,"Time Machine, The (2002)",Action
2.6,3.0,36,3.0,2.86,"Time Machine, The (2002)",Sci-Fi
1.6,1.5,36,1.75,3.52,Road to Perdition (2002),Crime
1.6,2.0,36,1.75,3.52,Gangs of New York (2002),Crime
1.6,1.0,36,1.0,4.0,"Beautiful Mind, A (2001)",Romance
1.6,1.0,36,1.5,4.0,"Beautiful Mind, A (2001)",Drama
1.4,1.5,36,1.5,3.52,Road to Perdition (2002),Drama
1.4,2.0,36,1.5,3.52,Gangs of New York (2002),Drama


### Finding similar items

In [27]:
def sq_dist(a, b):
    """
    Returns the squared Euclidean distance between two vectors.

    Args:
      a (array_like (n,)): vector with n features
      b (array_like (n,)): vector with n features
    Returns:
      d (float) : sum of the squared element-wise differences
    """
    # asarray lets plain sequences (lists/tuples) be passed as well as ndarrays.
    diff = np.asarray(a) - np.asarray(b)
    # np.sum reduces over every element in one vectorized call; the builtin
    # sum iterates element by element in Python and only collapses the first axis.
    d = np.sum(np.square(diff))

    return d

Use the trained item_NN to build a small model that lets us run the movie vectors through it to generate vm.

In [29]:
# Wrap the already-trained item network in its own model so that movie
# feature rows can be mapped to their normalized vectors vm.
input_item_m = tf.keras.layers.Input(shape=(num_item_features,))   # input layer
vm_m = item_NN(input_item_m)
vm_m = Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(vm_m)

# specify the inputs and output of the model.
# Referenced as tf.keras.Model: the bare name `Model` is not imported by the
# notebook's import cell and would only resolve if the star import from
# recsysNN_utils happens to expose it.
model_m = tf.keras.Model(input_item_m, vm_m)
model_m.summary()

Create a set of movie feature vectors by using the model to predict using a set of item/movie vectors as input. item_vecs is a set of all of the movie vectors. Recall that the same movie will appear as a separate vector for each of its genres. It must be scaled to use with the trained model. The result of the prediction is a 32 entry feature vector for each movie.

In [31]:
# item_vecs must pass through the same scaler that was used for training.
scaled_item_vec = scalerItem.transform(item_vecs)

# Feed the feature columns through the item model to get one vector per row of item_vecs.
vms = model_m.predict(scaled_item_vec[:, i_s:])
print(f"size of all predicted movie feature vectors: {vms.shape}")

[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
size of all predicted movie feature vectors: (1883, 32)


Find the closest movie by finding the minimum along each row. To avoid selecting the same movie, we use numpy masked arrays: the masked values along the diagonal are not included in the computation.

In [33]:
count = 50  # number of movies (leading rows of item_vecs) to list in the table
dim = len(vms)

# Squared distance between every pair of learned movie vectors.
# Row i is filled in one vectorized step and holds the distance from movie i
# to every movie. This replaces a Python-level loop over all dim * dim pairs.
dist = np.zeros((dim, dim))
for i in range(dim):
    dist[i, :] = np.sum(np.square(vms - vms[i]), axis=1)

# Mask the diagonal so a movie is never reported as its own nearest neighbour.
m_dist = ma.masked_array(dist, mask=np.identity(dist.shape[0]))

disp = [["movie1", "genres", "movie2", "genres"]]
for i in range(min(count, dim)):  # never index past the available movies
    min_idx = np.argmin(m_dist[i])  # closest other movie; the masked diagonal is skipped
    movie1_id = int(item_vecs[i, 0])
    movie2_id = int(item_vecs[min_idx, 0])
    genre1, _ = get_item_genre(item_vecs[i, :], ivs, item_features)
    genre2, _ = get_item_genre(item_vecs[min_idx, :], ivs, item_features)

    disp.append([movie_dict[movie1_id]['title'], genre1,
                 movie_dict[movie2_id]['title'], genre2])

table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow", floatfmt=[".1f", ".1f", ".0f", ".2f", ".2f"])
table

movie1,genres,movie2,genres.1
Save the Last Dance (2001),Drama,John Q (2002),Drama
Save the Last Dance (2001),Romance,Saving Silverman (Evil Woman) (2001),Romance
"Wedding Planner, The (2001)",Comedy,Spy Kids (2001),Comedy
"Wedding Planner, The (2001)",Romance,Mr. Deeds (2002),Romance
Hannibal (2001),Horror,Final Destination 2 (2003),Horror
Hannibal (2001),Thriller,"Sum of All Fears, The (2002)",Thriller
Saving Silverman (Evil Woman) (2001),Comedy,Cats & Dogs (2001),Comedy
Saving Silverman (Evil Woman) (2001),Romance,Save the Last Dance (2001),Romance
Down to Earth (2001),Comedy,Joe Dirt (2001),Comedy
Down to Earth (2001),Fantasy,"Haunted Mansion, The (2003)",Fantasy


The results show the model will generally suggest a movie with similar genres.