### Import Libraries

In [100]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from numpy import genfromtxt
from collections import defaultdict
import csv
import pickle

### Dataset

In [101]:
top10_df = pd.read_csv("data1/content_top10_df.csv")
bygenre_df = pd.read_csv("data1/content_bygenre_df.csv")

In [102]:
top10_df

Unnamed: 0,movie id,num ratings,ave rating,title,genres
0,4993,198,4.106061,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
1,5952,188,4.021277,"Lord of the Rings: The Two Towers, The",Adventure|Fantasy
2,7153,185,4.118919,"Lord of the Rings: The Return of the King, The",Action|Adventure|Drama|Fantasy
3,4306,170,3.867647,Shrek,Adventure|Animation|Children|Comedy|Fantasy|Ro...
4,58559,149,4.238255,"Dark Knight, The",Action|Crime|Drama
5,6539,149,3.778523,Pirates of the Caribbean: The Curse of the Bla...,Action|Adventure|Comedy|Fantasy
6,79132,143,4.066434,Inception,Action|Crime|Drama|Mystery|Sci-Fi|Thriller
7,6377,141,3.960993,Finding Nemo,Adventure|Animation|Children|Comedy
8,4886,132,3.871212,"Monsters, Inc.",Adventure|Animation|Children|Comedy|Fantasy
9,7361,131,4.160305,Eternal Sunshine of the Spotless Mind,Drama|Romance|Sci-Fi


In [103]:
bygenre_df

Unnamed: 0,genre,num movies,ave rating/genre,ratings per genre
0,Action,321,3.37,10377
1,Adventure,234,3.42,8785
2,Animation,76,3.63,2588
3,Children,69,3.44,2472
4,Comedy,326,3.36,8911
5,Crime,139,3.54,4671
6,Documentary,13,3.81,280
7,Drama,342,3.61,10201
8,Fantasy,124,3.37,4468
9,Horror,56,3.2,1345


In [104]:
item_train = genfromtxt('data1/content_item_train.csv', delimiter=',')
user_train = genfromtxt('data1/content_user_train.csv', delimiter=',')
y_train = genfromtxt('data1/content_y_train.csv', delimiter=',')

with open('data1/content_item_train_header.txt', newline='') as f:
    movie_features = np.array(list(csv.reader(f))[0])
with open('data1/content_user_train_header.txt', newline='') as f:
    user_features = np.array(list(csv.reader(f))[0])

item_vecs = genfromtxt('data1/content_item_vecs.csv', delimiter=',')
movie_dict = defaultdict(dict)
count = 0
with open('data1/content_movie_list.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    for line in reader:
        if count == 0:
            count += 1
        else:
            count += 1
            movie_id = int(line[0])
            movie_dict[movie_id]["title"] = line[1]
            movie_dict[movie_id]["genres"] = line[2]
            
with open('data1/content_user_to_genre.pickle', 'rb') as f:
    user_to_genre = pickle.load(f)

In [105]:
user_data_df = pd.DataFrame(user_train, columns=user_features).drop_duplicates().reset_index(drop=True)
movie_data_df = pd.DataFrame(item_train, columns=movie_features).drop_duplicates().reset_index(drop=True)

In [106]:
user_data_df

Unnamed: 0,user id,rating count,rating ave,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,2.0,22.0,4.00,3.95,4.25,0.00,0.00,4.00,4.12,4.00,4.04,0.00,3.00,4.00,0.00,3.88,3.89
1,3.0,1.0,0.50,0.50,0.00,0.00,0.00,0.00,0.00,0.00,0.50,0.00,0.00,0.00,0.00,0.50,0.50
2,4.0,8.0,3.38,0.00,4.00,0.00,4.00,2.50,4.00,0.00,3.29,4.00,4.00,0.00,2.50,0.00,4.00
3,7.0,74.0,3.05,3.03,3.13,3.20,2.95,3.12,3.39,0.00,2.98,3.19,2.75,3.14,2.33,2.64,3.24
4,9.0,10.0,3.90,2.75,4.00,4.00,4.00,4.67,3.00,0.00,4.50,5.00,0.00,4.00,5.00,3.50,2.67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
392,605.0,68.0,3.20,3.10,3.35,3.28,3.25,3.02,2.88,0.00,2.96,3.35,2.86,4.00,3.32,2.81,2.67
393,606.0,177.0,3.59,3.18,3.50,3.85,3.72,3.32,3.92,3.75,3.85,3.55,2.60,3.81,3.67,3.13,3.40
394,607.0,2.0,3.00,0.00,0.00,0.00,0.00,3.00,0.00,0.00,3.00,0.00,0.00,0.00,3.00,0.00,0.00
395,608.0,195.0,3.81,3.99,3.76,3.87,3.73,3.55,4.05,2.80,3.84,3.87,4.15,3.88,3.40,3.98,4.05


In [107]:
movie_data_df

Unnamed: 0,movie id,year,ave rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,6874.0,2003.0,3.961832,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,8798.0,2004.0,3.761364,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,46970.0,2006.0,3.250000,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,48516.0,2006.0,4.252336,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,58559.0,2008.0,4.238255,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
842,51412.0,2007.0,2.821429,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
843,85510.0,2011.0,3.125000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
844,93363.0,2012.0,3.090909,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
845,111364.0,2014.0,2.615385,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [108]:
len(y_train)

50884

In [109]:
num_user_features = user_data_df.shape[1] - 3
num_item_features = movie_data_df.shape[1] - 1
uvs = 3
ivs = 3
u_s = 3
i_s = 1

print(f"Number of training vectors: {len(movie_data_df)}")

Number of training vectors: 847


### Training Data

In [110]:
item_train_unscaled = item_train
user_train_unscaled = user_train
y_train_unscaled    = y_train

scalerItem = StandardScaler()
scalerItem.fit(item_train)
item_train = scalerItem.transform(item_train)

scalerUser = StandardScaler()
scalerUser.fit(user_train)
user_train = scalerUser.transform(user_train)

scalerTarget = MinMaxScaler((-1, 1))
scalerTarget.fit(y_train.reshape(-1, 1))
y_train = scalerTarget.transform(y_train.reshape(-1, 1))

item_train, item_test = train_test_split(item_train, train_size=0.80, shuffle=True, random_state=1)
user_train, user_test = train_test_split(user_train, train_size=0.80, shuffle=True, random_state=1)
y_train, y_test       = train_test_split(y_train,    train_size=0.80, shuffle=True, random_state=1)
print(f"movie/item training data shape: {item_train.shape}")
print(f"movie/item test data shape: {item_test.shape}")

movie/item training data shape: (40707, 17)
movie/item test data shape: (10177, 17)


### Model Training

In [111]:
num_outputs = 32
tf.random.set_seed(1)
user_NN = tf.keras.models.Sequential([
  tf.keras.layers.Dense(256, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(num_outputs),
])

item_NN = tf.keras.models.Sequential([
  tf.keras.layers.Dense(256, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(num_outputs),
])

input_user = tf.keras.layers.Input(shape=(num_user_features))
vu = user_NN(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1)

input_item = tf.keras.layers.Input(shape=(num_item_features))
vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)

output = tf.keras.layers.Dot(axes=1)([vu, vm])

model = tf.keras.Model([input_user, input_item], output)

model.summary()

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_13 (InputLayer)          [(None, 14)]         0           []                               
                                                                                                  
 input_14 (InputLayer)          [(None, 16)]         0           []                               
                                                                                                  
 sequential_12 (Sequential)     (None, 32)           40864       ['input_13[0][0]']               
                                                                                                  
 sequential_13 (Sequential)     (None, 32)           41376       ['input_14[0][0]']               
                                                                                            

In [112]:
tf.random.set_seed(1)
cost_fn = tf.keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt,loss=cost_fn)

tf.random.set_seed(1)
model.fit([user_train[:, u_s:], item_train[:, i_s:]], y_train, epochs=30)

Epoch 1/30




KeyboardInterrupt: 

### New User

In [None]:
new_user_id = 5000
new_rating_ave = 0.0
new_action = 0.0
new_adventure = 5.0
new_animation = 0.0
new_childrens = 0.0
new_comedy = 0.0
new_crime = 0.0
new_documentary = 0.0
new_drama = 0.0
new_fantasy = 5.0
new_horror = 0.0
new_mystery = 0.0
new_romance = 0.0
new_scifi = 0.0
new_thriller = 0.0
new_rating_count = 3

user_vec = np.array([[new_user_id, new_rating_count, new_rating_ave,
                      new_action, new_adventure, new_animation, new_childrens,
                      new_comedy, new_crime, new_documentary,
                      new_drama, new_fantasy, new_horror, new_mystery,
                      new_romance, new_scifi, new_thriller]])