In [34]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tabulate
pd.set_option("display.precision", 1)
from IPython.display import HTML

## 2 - Movie ratings dataset
The data set is derived from the [MovieLens ml-latest-small](https://grouplens.org/datasets/movielens/latest/) dataset.

The original dataset consisted of around 9,000 films rated by 600 users, with ratings ranging from 0.5 to 5 in increments of 0.5. For analysis purposes, the dataset was trimmed to include only movies released after the year 2000 and belonging to popular genres. The refined version includes $n_u = 397$ users, $n_m = 847$ movies, and a total of 25,521 ratings. Each movie entry includes the title, release year, and one or more associated genres. For instance, Toy Story 3, released in 2010, is categorized under several genres such as "Adventure|Animation|Children|Comedy|Fantasy". While the dataset contains extensive information about movies, it holds minimal data about the users apart from their ratings. This dataset is utilized to construct training vectors for the neural network models discussed below.
Below is a table listing the top 10 most-rated movies, which also tend to have high average ratings. How many of these have you seen?



In [35]:
# Load the two summary tables used in this section: the ten most-rated movies
# and the per-genre statistics (displayed in the following cell).
top10_df = pd.read_csv("/content/content_top10_df.csv")
bygenre_df = pd.read_csv("/content/content_bygenre_df.csv")
# Leaving the frame as the last expression renders it as a rich table.
top10_df

Unnamed: 0,movie id,num ratings,ave rating,title,genres
0,4993,198,4.1,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
1,5952,188,4.0,"Lord of the Rings: The Two Towers, The",Adventure|Fantasy
2,7153,185,4.1,"Lord of the Rings: The Return of the King, The",Action|Adventure|Drama|Fantasy
3,4306,170,3.9,Shrek,Adventure|Animation|Children|Comedy|Fantasy|Ro...
4,58559,149,4.2,"Dark Knight, The",Action|Crime|Drama
5,6539,149,3.8,Pirates of the Caribbean: The Curse of the Bla...,Action|Adventure|Comedy|Fantasy
6,79132,143,4.1,Inception,Action|Crime|Drama|Mystery|Sci-Fi|Thriller
7,6377,141,4.0,Finding Nemo,Adventure|Animation|Children|Comedy
8,4886,132,3.9,"Monsters, Inc.",Adventure|Animation|Children|Comedy|Fantasy
9,7361,131,4.2,Eternal Sunshine of the Spotless Mind,Drama|Romance|Sci-Fi


In [3]:
# Per-genre summary loaded alongside top10_df: movie count, average rating, and number of ratings.
bygenre_df

Unnamed: 0,genre,num movies,ave rating/genre,ratings per genre
0,Action,321,3.4,10377
1,Adventure,234,3.4,8785
2,Animation,76,3.6,2588
3,Children,69,3.4,2472
4,Comedy,326,3.4,8911
5,Crime,139,3.5,4671
6,Documentary,13,3.8,280
7,Drama,342,3.6,10201
8,Fantasy,124,3.4,4468
9,Horror,56,3.2,1345


## 3 - Content-based filtering with a neural network


Content-based filtering also creates feature vectors for both users and movies. However, it acknowledges that incorporating additional details about users or movies could enhance prediction accuracy. This extra information is fed into a neural network, which then produces the user and movie vectors as illustrated below.



<a name="3.1"></a>
### 3.1 Training Data
The input features for movies combine original dataset attributes with additional 'engineered features'. One such feature is the average rating, calculated from existing user ratings.

For users, the input features are entirely engineered. These include the average rating each user has given across different genres. Although user ID, total number of ratings, and overall average rating are part of the dataset, they are excluded from the training and prediction phases. These fields are retained solely for the purpose of data interpretation.

The training dataset comprises all user-provided ratings. To improve representation of less common genres, some ratings are intentionally duplicated, increasing the number of relevant training samples. The dataset is divided into two equal-sized arrays: one representing users and the other representing movies/items.

Let’s now load and examine a portion of this data.

In [36]:
import csv
import numpy as np
from numpy import genfromtxt
import pickle
import tabulate
from collections import defaultdict

def load_data():
    ''' Load the pre-prepared lab data from /content.

    Returns a tuple of:
        item_train, user_train, y_train : arrays parsed from the training CSVs
        item_features, user_features    : column-name lists read from the header files
        item_vecs                       : array parsed from content_item_vecs.csv
        movie_dict                      : movie id -> {"title": ..., "genres": ...}
        user_to_genre                   : object unpickled from content_user_to_genre.pickle
    '''
    def _first_csv_row(path):
        # csv reader handles quoted strings better than a plain split
        with open(path, newline='') as fh:
            return list(csv.reader(fh))[0]

    item_train = genfromtxt('/content/content_item_train.csv', delimiter=',')
    user_train = genfromtxt('/content/content_user_train.csv', delimiter=',')
    y_train = genfromtxt('/content/content_y_train.csv', delimiter=',')
    item_features = _first_csv_row('/content/content_item_train_header.txt')
    user_features = _first_csv_row('/content/content_user_train_header.txt')
    item_vecs = genfromtxt('/content/content_item_vecs.csv', delimiter=',')

    # Build the movie lookup table; the first row of the file is a header.
    movie_dict = defaultdict(dict)
    with open('/content/content_movie_list.csv', newline='') as csvfile:
        rows = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(rows, None)  # skip header
        for row in rows:
            movie_id = int(row[0])
            movie_dict[movie_id]["title"] = row[1]
            movie_dict[movie_id]["genres"] = row[2]

    # NOTE(review): pickle.load executes arbitrary code from the file; only use
    # this with a pickle that comes from a trusted source.
    with open('/content/content_user_to_genre.pickle', 'rb') as fh:
        user_to_genre = pickle.load(fh)

    return (item_train, user_train, y_train, item_features, user_features,
            item_vecs, movie_dict, user_to_genre)

In [37]:
def split_str(ifeatures, smax):
    ''' Break long, space-free feature names in half so table headers wrap.

    A name is split only when it contains no space and is longer than `smax`
    characters; a single space is inserted at its midpoint. All other names
    are returned unchanged. A new list is returned; the input is not modified.
    '''
    def _wrap(name):
        if ' ' in name or len(name) <= smax:
            return name
        half = len(name) // 2
        return name[:half] + " " + name[half:]

    return [_wrap(name) for name in ifeatures]


def pprint_train(x_train, features, vs, u_s, maxcount=5, user=True):
    """
    Nicely formats the user_train or item_train array as an HTML table.

    Args:
        x_train (ndarray): The input training array to print. The first two
            columns are shown as integers, the third with one decimal.
        features (list): List of feature names, one per column.
        vs (int): Index where vector features begin.
        u_s (int): Number of leading columns whose header is wrapped in
            brackets (columns not used for training).
        maxcount (int): Number of rows to display.
        user (bool): True for user data (vector columns shown with one
            decimal), False for item data (vector columns shown as integers).

    Returns:
        str: An HTML-formatted string representing the data table.
    """
    # Per-column number formats, sized to the actual table width rather than a
    # fixed 17 columns so wider or narrower arrays are formatted consistently.
    vec_fmt = ".1f" if user else ".0f"
    n_cols = max(x_train.shape[1], len(features))
    flist = [".0f", ".0f", ".1f"] + [vec_fmt] * max(n_cols - 3, 0)

    head = features[:vs]  # slice copy: the caller's list is not modified
    if vs < u_s:
        # f-string so the actual values are reported
        print(f"error, vector start {vs} should be greater than user start {u_s}")
    # Bracket only headers that exist, so a bad vs/u_s pair reports the error
    # above instead of failing with an IndexError.
    for i in range(min(u_s, len(head))):
        head[i] = "[" + head[i] + "]"
    genres = features[vs:]
    hdr = head + genres

    disp = [split_str(hdr, 5)]
    for i in range(x_train.shape[0]):
        if len(disp) - 1 == maxcount:  # header row is not counted
            break
        disp.append([
            x_train[i, 0].astype(int),
            x_train[i, 1].astype(int),
            x_train[i, 2].astype(float),
            *x_train[i, 3:].astype(float)
        ])
    table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow", floatfmt=flist, numalign='center')
    return table


In [38]:
# Load the prepared arrays, feature-name lists, and lookup tables.
(item_train, user_train, y_train,
 item_features, user_features,
 item_vecs, movie_dict, user_to_genre) = load_data()

# Column layout of the user and item arrays.
u_s = 3  # start of columns to use in training, user (skips userid, rating count, ave rating)
i_s = 1  # start of columns to use in training, items (skips movie id)
uvs = 3  # user genre vector start
ivs = 3  # item genre vector start

# Network input widths: everything from the training start column onward.
num_user_features = user_train.shape[1] - u_s
num_item_features = item_train.shape[1] - i_s
print(f"Number of training vectors: {len(item_train)}")

Number of training vectors: 50884


  user_to_genre = pickle.load(f)


In [7]:
# Inspect the first raw (unscaled) user row.
user_train[0]

array([ 2.  , 22.  ,  4.  ,  3.95,  4.25,  0.  ,  0.  ,  4.  ,  4.12,
        4.  ,  4.04,  0.  ,  3.  ,  4.  ,  0.  ,  3.88,  3.89])

In [8]:
from IPython.display import HTML
# Render the first 7 rows of the user training array as an HTML table.
html_code = pprint_train(user_train, user_features, uvs,  u_s, maxcount=7)
display(HTML(html_code))


[user id],[rating count],[rating ave],Act ion,Adve nture,Anim ation,Chil dren,Com edy,Crime,Docum entary,Drama,Fan tasy,Hor ror,Mys tery,Rom ance,Sci -Fi,Thri ller
2,22,4.0,4.0,4.2,0.0,0.0,4.0,4.1,4.0,4.0,0.0,3.0,4.0,0.0,3.9,3.9
2,22,4.0,4.0,4.2,0.0,0.0,4.0,4.1,4.0,4.0,0.0,3.0,4.0,0.0,3.9,3.9
2,22,4.0,4.0,4.2,0.0,0.0,4.0,4.1,4.0,4.0,0.0,3.0,4.0,0.0,3.9,3.9
2,22,4.0,4.0,4.2,0.0,0.0,4.0,4.1,4.0,4.0,0.0,3.0,4.0,0.0,3.9,3.9
2,22,4.0,4.0,4.2,0.0,0.0,4.0,4.1,4.0,4.0,0.0,3.0,4.0,0.0,3.9,3.9
2,22,4.0,4.0,4.2,0.0,0.0,4.0,4.1,4.0,4.0,0.0,3.0,4.0,0.0,3.9,3.9
2,22,4.0,4.0,4.2,0.0,0.0,4.0,4.1,4.0,4.0,0.0,3.0,4.0,0.0,3.9,3.9


In [9]:
# Render the first 5 rows of the item training array; user=False selects integer
# formatting for the genre indicator columns.
html_code_ = pprint_train(item_train, item_features, ivs, i_s, maxcount=5, user=False)
display(HTML(html_code_))

[movie id],year,ave rating,Act ion,Adve nture,Anim ation,Chil dren,Com edy,Crime,Docum entary,Drama,Fan tasy,Hor ror,Mys tery,Rom ance,Sci -Fi,Thri ller
6874,2003,4.0,1,0,0,0,0,1,0,0,0,0,0,0,0,1
8798,2004,3.8,1,0,0,0,0,1,0,1,0,0,0,0,0,1
46970,2006,3.2,1,0,0,0,1,0,0,0,0,0,0,0,0,0
48516,2006,4.3,0,0,0,0,0,1,0,1,0,0,0,0,0,1
58559,2008,4.2,1,0,0,0,0,1,0,1,0,0,0,0,0,0


Above, the movie array contains the year the film was released, the average rating and an indicator for each potential genre. The indicator is one for each genre that applies to the movie. The movie id is not used in training but is useful when interpreting the data.

In [10]:
# Peek at the first five target ratings.
print(f"y_train[:5]: {y_train[:5]}")

y_train[:5]: [4.  3.5 4.  4.  4.5]


The target, y, is the movie rating given by the user.

<a name="3.2"></a>
### 3.2 Preparing the training data
We'll scale the input features using the [scikit learn StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html).
Below, the inverse_transform is also shown to produce the original inputs. We'll scale the target ratings using a Min Max Scaler which scales the target to be between -1 and 1. [scikit learn MinMaxScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)

In [39]:
# Keep references to the unscaled arrays under separate names. The names below are
# rebound to the transformed arrays, and the allclose checks at the end of this
# cell confirm the unscaled references still hold the original values.
# NOTE(review): this cell is not idempotent. Re-running it without reloading the
# data would rebind the *_unscaled names to already-scaled arrays and scale twice.
item_train_unscaled = item_train
user_train_unscaled = user_train
y_train_unscaled    = y_train

# Normalize item feature vectors using standard scaling (mean=0, std=1).
# NOTE(review): the scalers are fit on all rows before the train/test split in
# the next cell, so the held-out rows contribute to the scaling statistics.
scalerItem = StandardScaler()
scalerItem.fit(item_train)
item_train = scalerItem.transform(item_train)

# Normalize user feature vectors similarly. All columns are scaled here, including
# the id/count columns that are later sliced off before training.
scalerUser = StandardScaler()
scalerUser.fit(user_train)
user_train = scalerUser.transform(user_train)

# Scale the target ratings to the range [-1, 1] using Min-Max normalization.
# The scalers expect 2-D input, hence the reshape to a single column.
scalerTarget = MinMaxScaler((-1, 1))
scalerTarget.fit(y_train.reshape(-1, 1))
y_train = scalerTarget.transform(y_train.reshape(-1, 1))
# ynorm_test = scalerTarget.transform(y_test.reshape(-1, 1))  # Uncomment if needed for test data

# Check that the inverse transformation restores the original data
print(np.allclose(item_train_unscaled, scalerItem.inverse_transform(item_train)))
print(np.allclose(user_train_unscaled, scalerUser.inverse_transform(user_train)))


True
True


In [40]:
# Split the data into training and test sets (80% train, 20% test).
# All three arrays go through a single call so each row of item, user, and y
# receives the same shuffled index by construction, and a length mismatch
# between the arrays raises an error instead of silently misaligning rows.
# random_state makes the split reproducible.
item_train, item_test, user_train, user_test, y_train, y_test = train_test_split(
    item_train, user_train, y_train,
    train_size=0.80, shuffle=True, random_state=1,
)

# Display the shapes of the resulting training and test sets for items
print(f"movie/item training data shape: {item_train.shape}")
print(f"movie/item test data shape: {item_test.shape}")


movie/item training data shape: (40707, 17)
movie/item test data shape: (10177, 17)


The scaled, shuffled training data now has a mean of approximately zero for each feature.

In [13]:
# Show the first 5 scaled training rows. Every column was scaled, including the
# bracketed id/count columns, so those no longer display readable ids.
html_code3 = pprint_train(user_train, user_features, uvs, u_s, maxcount=5)
display(HTML(html_code3))

[user id],[rating count],[rating ave],Act ion,Adve nture,Anim ation,Chil dren,Com edy,Crime,Docum entary,Drama,Fan tasy,Hor ror,Mys tery,Rom ance,Sci -Fi,Thri ller
1,0,-1.0,-0.8,-0.7,0.1,-0.0,-1.2,-0.4,0.6,-0.5,-0.5,-0.1,-0.6,-0.6,-0.7,-0.7
0,1,-0.7,-0.5,-0.7,-0.1,-0.2,-0.6,-0.2,0.7,-0.5,-0.8,0.1,-0.0,-0.6,-0.5,-0.4
-1,-1,-0.2,0.3,-0.4,0.4,0.5,1.0,0.6,-1.2,-0.3,-0.6,-2.3,-0.1,0.0,0.4,-0.0
0,-1,0.6,0.5,0.5,0.2,0.6,-0.1,0.5,-1.2,0.9,1.2,-2.3,-0.1,0.0,0.2,0.3
-1,0,0.7,0.6,0.5,0.3,0.5,0.4,0.6,1.0,0.6,0.3,0.8,0.8,0.4,0.7,0.7


<a name="4"></a>
## 4 - Neural Network for Content-Based Filtering

In this section, we'll build two neural networks for content-based recommendation.

### Model Architecture

We'll use a Keras Sequential model with the following structure:

- A Dense layer with 256 units and ReLU activation.
- Followed by another Dense layer with 128 units and ReLU activation.
- The final Dense layer has `num_outputs` units, using a linear activation (or no activation function).
- An L2-normalization layer follows, so each output vector has unit length.

This setup will be used to learn both user and item embeddings based on content features.


In [41]:
# Number of outputs from the neural networks
num_outputs = 32

# Set the random seed for reproducibility
tf.random.set_seed(1)


def _build_tower(num_outputs):
    """Return one feature tower: Dense(256, relu) -> Dense(128, relu) ->
    Dense(num_outputs, linear) -> L2 normalization of each output row."""
    return tf.keras.models.Sequential([
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(num_outputs),
        tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))  # Normalize the output vectors
    ])


# The user and item networks share the same architecture but have separate weights.
user_NN = _build_tower(num_outputs)
item_NN = _build_tower(num_outputs)

# Create the user input layer and apply the user network.
# The tower already ends in an L2-normalization layer, so vu is unit length;
# normalizing it a second time would be a no-op and is omitted.
input_user = tf.keras.layers.Input(shape=(num_user_features, ))  # User feature input
vu = user_NN(input_user)

# Create the item input layer and apply the item network (output likewise unit length)
input_item = tf.keras.layers.Input(shape=(num_item_features, ))  # Item feature input
vm = item_NN(input_item)

# Compute the dot product of the user and item vectors (prediction).
# Both vectors are unit length, so the prediction lies in [-1, 1], matching
# the range the target ratings were scaled to.
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# Create the model specifying the inputs and the output
model = tf.keras.Model([input_user, input_item], output)

# Show the model summary to verify the architecture
model.summary()


In [20]:
# Print the layer-by-layer summary of each tower.
user_NN.summary()
item_NN.summary()


In [21]:
# Confirm the user input width (num_user_features) expected by the model.
print(input_user.shape)

(None, 14)


In [22]:
# Re-seed so results are reproducible from this cell onward.
tf.random.set_seed(1)

# Adam optimizer minimizing mean squared error between the predicted and
# scaled target ratings.
opt = keras.optimizers.Adam(learning_rate=0.01)
cost_fn = tf.keras.losses.MeanSquaredError()
model.compile(loss=cost_fn, optimizer=opt)

In [23]:
# Shapes of the arrays actually fed to the model: the leading columns before
# u_s / i_s are sliced off.
print(user_train[:, u_s:].shape)
print(item_train[:, i_s:].shape)

(40707, 14)
(40707, 16)


In [24]:
tf.random.set_seed(1)
# Train on the feature columns only (user columns from u_s onward, item columns
# from i_s onward) against the scaled target ratings.
model.fit([user_train[:, u_s:], item_train[:, i_s:]], y_train, epochs=30)

Epoch 1/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - loss: 0.1306
Epoch 2/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - loss: 0.1125
Epoch 3/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - loss: 0.1075
Epoch 4/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.1042
Epoch 5/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 0.1011
Epoch 6/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 0.0979
Epoch 7/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.0954
Epoch 8/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.0935
Epoch 9/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - loss: 0.0919
Epoch 10/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

<keras.src.callbacks.history.History at 0x790fccb52750>

In [25]:
# Loss on the held-out test split, using the same column slices as in training.
model.evaluate([user_test[:, u_s:], item_test[:, i_s:]], y_test)

[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.0898


0.08603718876838684

<a name="5"></a>
## 5 - Predictions
<a name="5.1"></a>
### 5.1 - Predictions for a new user
First, we'll create a new user and have the model suggest movies for that user.

In [43]:
# Bookkeeping columns for the new user (sliced off before prediction).
new_user_id = 5000
new_rating_count = 3
new_rating_ave = 0.0

# Per-genre average ratings: this user has rated only adventure and fantasy.
new_action = 0.0
new_adventure = 5.0
new_animation = 0.0
new_childrens = 0.0
new_comedy = 0.0
new_crime = 0.0
new_documentary = 0.0
new_drama = 0.0
new_fantasy = 5.0
new_horror = 0.0
new_mystery = 0.0
new_romance = 0.0
new_scifi = 0.0
new_thriller = 0.0

# Assemble a single-row array in the same column order as user_train:
# id, rating count, rating average, then the genre averages.
_new_genre_aves = [
    new_action, new_adventure, new_animation, new_childrens,
    new_comedy, new_crime, new_documentary,
    new_drama, new_fantasy, new_horror, new_mystery,
    new_romance, new_scifi, new_thriller,
]
user_vec = np.array([[new_user_id, new_rating_count, new_rating_ave, *_new_genre_aves]])




In [47]:
def print_pred_movies(y_p, item, movie_dict, maxcount=10):
    """ Build an HTML table of predicted ratings for a new user.

    Inputs are expected to be in sorted order and unscaled. Column 0 of `y_p`
    holds the prediction; columns 0 and 2 of `item` hold the movie id and the
    movie's average rating. At most `maxcount` rows are included.
    Returns the table as an HTML string. """
    rows = [["y_p", "movie id", "rating ave", "title", "genres"]]

    for idx in range(y_p.shape[0]):
        if len(rows) - 1 == maxcount:  # header row is not counted
            break
        movie_id = item[idx, 0].astype(int)
        rows.append([
            np.around(y_p[idx, 0], 1),
            item[idx, 0].astype(int),
            np.around(item[idx, 2].astype(float), 1),
            movie_dict[movie_id]['title'],
            movie_dict[movie_id]['genres'],
        ])

    return tabulate.tabulate(rows, tablefmt='html', headers="firstrow")

def gen_user_vecs(user_vec, num_items):
    """ Given a user vector, return a user prediction matrix in which that
        vector is repeated `num_items` times along the row axis, so it can be
        paired row-by-row with item_vecs. """
    return np.tile(user_vec, (num_items, 1))

The new user enjoys movies from the adventure and fantasy genres. Let's find the top-rated movies for the new user.  
Below, we'll use a set of movie/item vectors, `item_vecs` that have a vector for each movie in the training/test set. This is matched with the new user vector above and the scaled vectors are used to predict ratings for all the movies.

In [49]:
# generate and replicate the user vector to match the number movies in the data set.
# Replicate the new user's vector so there is one row per movie in item_vecs.
user_vecs = gen_user_vecs(user_vec,len(item_vecs))

# Scale the user and item vectors with the scalers fitted on the training data
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

# Predict on the feature columns only (leading id/statistics columns are sliced off)
y_p = model.predict([suser_vecs[:, u_s:], sitem_vecs[:, i_s:]])

# Map the predictions from the scaled target range back to rating units
y_pu = scalerTarget.inverse_transform(y_p)

# sort the results, highest prediction first
sorted_index = np.argsort(-y_pu,axis=0).reshape(-1).tolist()  #negate to get largest rating first
sorted_ypu   = y_pu[sorted_index]
sorted_items = item_vecs[sorted_index]  #using unscaled vectors for display

# Show the 10 movies with the highest predicted rating
html_code4 = print_pred_movies(sorted_ypu, sorted_items, movie_dict, maxcount = 10)
display(HTML(html_code4))

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


y_p,movie id,rating ave,title,genres
3.2,111759,4.0,Edge of Tomorrow (2014),Action|Sci-Fi
3.2,168252,4.3,Logan (2017),Action|Sci-Fi
3.2,96737,3.7,Dredd (2012),Action|Sci-Fi
3.2,164179,4.0,Arrival (2016),Sci-Fi
3.2,176371,3.8,Blade Runner 2049 (2017),Sci-Fi
3.2,104841,3.6,Gravity (2013),Action|Sci-Fi
3.1,109487,4.0,Interstellar (2014),Sci-Fi
3.1,56145,3.4,"Mist, The (2007)",Horror|Sci-Fi
3.1,54995,3.8,Planet Terror (2007),Action|Horror|Sci-Fi
3.1,97752,3.7,Cloud Atlas (2012),Drama|Sci-Fi


<a name="5.2"></a>
### 5.2 - Predictions for an existing user.
Let's look at the predictions for "user 2", one of the users in the data set. We can compare the model's predicted ratings with the ratings the user actually gave.

In [30]:

def get_user_vecs(user_id, user_train, item_vecs, user_to_genre):
    """ Build prediction inputs and known ratings for an existing user.

    Args:
        user_id: id to look up; compared against column 0 of `user_train`.
        user_train (ndarray): 2-D array of (unscaled) user rows, user id in column 0.
        item_vecs (ndarray): 2-D array of item rows, movie id in column 0.
        user_to_genre (dict): maps user id -> dict whose 'movies' entry maps
            movie id -> rating given by that user.

    Returns:
        (user_vecs, y) where user_vecs is the user's row repeated once per row
        of item_vecs, and y holds the user's rating for each movie in
        item_vecs (0 where the user has not rated it).
        Returns None (after printing an error) if user_id is not in user_to_genre.

    Raises:
        ValueError: if user_id is in user_to_genre but has no row in user_train.
            (Previously this case printed a message and then failed with an
            UnboundLocalError.)
    """
    if user_id not in user_to_genre:
        print("error: unknown user id")
        return None

    # Locate the first row belonging to this user without a Python-level scan.
    matches = np.flatnonzero(user_train[:, 0] == user_id)
    if matches.size == 0:
        raise ValueError("error in get_user_vecs, did not find uid in user_train")
    user_vec = user_train[matches[0]]

    num_items = len(item_vecs)
    user_vecs = np.tile(user_vec, (num_items, 1))

    # Walk the movies in item_vecs order and record the user's rating, 0 if unrated.
    rated = user_to_genre[user_id]['movies']
    y = np.zeros(num_items)
    for i in range(num_items):
        y[i] = rated.get(item_vecs[i, 0], 0)
    return (user_vecs, y)


def print_existing_user(y_p, y, user, items, ivs, uvs, movie_dict, maxcount=10):
    """ Build an HTML table comparing predictions with an existing user's ratings.

        Inputs are expected to be in sorted order, unscaled. Only movies the
        user has rated (non-zero entry in column 0 of `y`) are listed, up to
        `maxcount` rows. For each movie, the user's average ratings for the
        genres flagged on that movie are shown alongside the prediction.
        Returns the table as an HTML string.
    """
    rows = [["y_p", "y", "user", "user genre ave", "movie rating ave", "movie id", "title", "genres"]]
    one_decimal = {'float_kind': lambda x: "%.1f" % x}

    for idx in range(y.shape[0]):
        if y[idx, 0] == 0:  # zero means not rated
            continue
        if len(rows) - 1 == maxcount:  # header row is not counted
            break
        movie_id = items[idx, 0].astype(int)

        # Genre columns set to 1 for this movie, mapped onto the matching
        # columns of the user's per-genre averages.
        genre_offsets = np.nonzero(items[idx, ivs:] == 1)[0]
        genre_ratings = user[idx, uvs + genre_offsets]
        genre_text = np.array2string(genre_ratings,
                                     formatter=one_decimal,
                                     separator=',', suppress_small=True)
        rows.append([
            y_p[idx, 0],
            y[idx, 0],
            user[idx, 0].astype(int),       # userid
            genre_text,
            items[idx, 2].astype(float),    # movie average rating
            movie_id,
            movie_dict[movie_id]['title'],
            movie_dict[movie_id]['genres'],
        ])

    return tabulate.tabulate(rows, tablefmt='html', headers="firstrow",
                             floatfmt=[".1f", ".1f", ".0f", ".2f", ".1f"])



In [33]:
uid = 2
# Form a set of user vectors (the user's unscaled row repeated once per movie)
# together with the user's known ratings, 0 where the movie is unrated.
# NOTE(review): get_user_vecs returns None for an unknown uid, which would make
# this tuple unpacking fail.
user_vecs, y_vecs = get_user_vecs(uid, user_train_unscaled, item_vecs, user_to_genre)

# Scale the user and item vectors with the scalers fitted on the training data
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

# Predict on the feature columns only (leading id/statistics columns are sliced off)
y_p = model.predict([suser_vecs[:, u_s:], sitem_vecs[:, i_s:]])

# Map the predictions from the scaled target range back to rating units
y_pu = scalerTarget.inverse_transform(y_p)

# Sort the results, highest prediction first, and apply the same order to every
# array so the rows stay aligned.
sorted_index = np.argsort(-y_pu,axis=0).reshape(-1).tolist()  #negate to get largest rating first
sorted_ypu   = y_pu[sorted_index]
sorted_items = item_vecs[sorted_index]  #using unscaled vectors for display
sorted_user  = user_vecs[sorted_index]
sorted_y     = y_vecs[sorted_index]

# Print sorted predictions for movies rated by the user; y is reshaped to a
# column because print_existing_user indexes it as y[i, 0].
html_code5 = print_existing_user(sorted_ypu, sorted_y.reshape(-1,1), sorted_user, sorted_items, ivs, uvs, movie_dict, maxcount = 50)
display(HTML(html_code5))

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


y_p,y,user,user genre ave,movie rating ave,movie id,title,genres
4.6,5.0,2,[4.0],4.3,80906,Inside Job (2010),Documentary
4.5,4.0,2,"[4.0,4.1,3.9]",4.0,6874,Kill Bill: Vol. 1 (2003),Action|Crime|Thriller
4.3,4.0,2,"[4.0,4.1,4.0,4.0,3.9,3.9]",4.1,79132,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller
4.2,4.5,2,"[4.0,4.1,4.0]",4.2,58559,"Dark Knight, The (2008)",Action|Crime|Drama
4.2,3.5,2,"[4.0,4.1,4.0,3.9]",3.8,8798,Collateral (2004),Action|Crime|Drama|Thriller
4.2,4.5,2,"[4.0,4.0]",4.1,68157,Inglourious Basterds (2009),Action|Drama
4.1,4.0,2,"[4.1,4.0,3.9]",4.3,48516,"Departed, The (2006)",Crime|Drama|Thriller
4.0,3.5,2,"[4.0,4.0]",3.9,99114,Django Unchained (2012),Action|Drama
4.0,5.0,2,"[4.0,4.1,4.0]",3.9,106782,"Wolf of Wall Street, The (2013)",Comedy|Crime|Drama
4.0,5.0,2,"[4.0,4.2,3.9,3.9]",3.8,122882,Mad Max: Fury Road (2015),Action|Adventure|Sci-Fi|Thriller
