In [2]:
import numpy as np
import pandas as pd

In [3]:
# here is a handy function from fast.ai
def proc_col(col):
    """Encodes a pandas column with continous ids. 
    """
    uniq = col.unique()
    name2idx = {o:i for i,o in enumerate(uniq)}
    return name2idx, np.array([name2idx[x] for x in col]), len(uniq)
    

def encode_data(df):
    """Encodes rating data with continous user and movie ids using 
    the helpful fast.ai function from above.
    
    Arguments:
      train_csv: a csv file with columns user_id,movie_id,rating 
    
    Returns:
      df: a dataframe with the encode data
      num_users
      num_movies
      
    """
    # call this function for userId and movieId column of the dataframe
    user2id, userArray, num_users= proc_col(df.userId)
    movie2id, movieArray, num_movies= proc_col(df.movieId)
    df['userId']= userArray
    df['movieId']= movieArray
    return df, num_users, num_movies

In [4]:
def create_embedings(n, K):
    """ Creating a numpy random matrix of shape n, K initialized with uniform values in (0, 6/K)
    Arguments:
    
    Inputs:
    n: number of items/users
    K: number of factors in the embeding 
    
    Returns:
    emb: numpy array of shape (n, num_factors)
    """
    np.random.seed(3)
    emb = 6*np.random.random((n, K)) / K
    return emb

In [5]:
## Encoding Y as a sparse matrix
from scipy import sparse
def df2matrix(df, nrows, ncols, column_name="rating"):
    """ Returns a sparse matrix constructed from a dataframe
    This code assumes the df has columns: MovieID,UserID,Rating
    """
    values = df[column_name].values
    ind_movie = df['movieId'].values
    ind_user = df['userId'].values
    return sparse.csc_matrix((values,(ind_user, ind_movie)),shape=(nrows, ncols))

In [6]:
def predict(df, emb_user, emb_movie):
    """ This function computes df["prediction"] without doing (U*V^T)
    """
 
    df['prediction']=np.sum(np.multiply(emb_user[df['userId']],emb_movie[df['movieId']]),axis=1)
    return df

In [7]:
def cost(df, emb_user, emb_movie):
    """ Computes mean square error
    Prediction for user i and movie j is emb_user[i]*emb_movie[j]
    
    Arguments:
      df: dataframe with all data or a subset of the data
      emb_user: embedings for users
      emb_movie: embedings for movies
      
    Returns:
      error(float): this is the MSE
    """
    predict(df,emb_user, emb_movie)
    error = sum((df['prediction']-df['rating'])**2)/len(df)
    return error

In [8]:
def gradient(df, emb_user, emb_movie):
    """ Computes the gradient.
    Arguments:
      df: dataframe with all data or a subset of the data
      Y: sparse representation of df
      emb_user: embedings for users
      emb_movie: embedings for movies
      
    Returns:
      d_emb_user
      d_emb_movie
    """
    df = predict(df,emb_movie=emb_movie, emb_user=emb_user)
    Y = df2matrix(df,emb_user.shape[0], emb_movie.shape[0])
    Y_hat = df2matrix(df,emb_user.shape[0], emb_movie.shape[0], column_name='prediction')
    R = Y > 0
    e = Y - Y_hat
    grad = e.multiply(R) 
    d_emb_user= (-2/len(df))*(grad.dot(emb_movie))
    d_emb_movie= (-2/len(df))* (grad.transpose().dot(emb_user))
    return d_emb_user, d_emb_movie

In [9]:
def gradient_descent(df, emb_user, emb_movie, iterations=100, learning_rate=0.01, df_val=None):
    """ Computes gradient descent with momentum (0.9) for a number of iterations.
    Prints training cost and validation cost (if df_val is not None) every 50 iterations.
    
    Returns:
    emb_user: the trained user embedding
    emb_movie: the trained movie embedding
    """
    Y = df2matrix(df, emb_user.shape[0], emb_movie.shape[0])
    beta=0.9
    for i in range(iterations):
        d_emb_user, d_emb_movie = gradient(df,emb_movie=emb_movie, emb_user=emb_user)
        momentum_user= d_emb_user
        momentum_movie= d_emb_movie
        momentum_user=beta*momentum_user + (1-beta)*d_emb_user
        momentum_movie=beta*momentum_movie + (1-beta)*d_emb_movie
        
        emb_user= emb_user - learning_rate*momentum_user
        emb_movie= emb_movie - learning_rate*momentum_movie
        
        if (i+1)%50==0:
            print('training loss: '+ str(cost(df,emb_user,emb_movie)))
            if df_val is not None: 
                print('validation cost: '+ str(cost(df_val,emb_user,emb_movie)))
    return emb_user, emb_movie

### Predicting on new data
Now we should write a function that given new data is able to predict ratings. First we write a function that encodes new data. If a new user or item is present that row should be remove. Collaborative Filtering is not good at handling new users or new items.

In [10]:
def proc_col_new(col, mapping):
    new_mapping=list()
    for o in col:
        if o in mapping.keys(): new_mapping.append(mapping[o])
        else: new_mapping.append(None)
    return new_mapping

In [11]:
def encode_new_data(df_val, df_train):
    """ Encodes df_val with the same encoding as df_train.
    Returns:
    df_val: dataframe with the same encoding as df_train
    """
    user2id, userArray, num_users= proc_col(df_train.userId)
    movie2id, movieArray, num_movies= proc_col(df_train.movieId)
    df_train['userId']= userArray
    df_train['movieId']= movieArray

    df_val['userId']=proc_col_new(df_val['userId'],user2id)
    df_val['movieId']=proc_col_new(df_val['movieId'],movie2id)

    df_val=df_val.dropna()
    return df_val

### Load the data

In [None]:
#! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o movielens-ml-latest.zip

In [14]:
path = "../ml-latest-small/"
data = pd.read_csv(path + "ratings.csv")
# sorting by timestamp take as validation data the most recent data doesn't work so let's just take 20%
# at random
np.random.seed(3)
msk = np.random.rand(len(data)) < 0.8
train = data[msk].copy()
val = data[~msk].copy()
df_train, num_users, num_movies = encode_data(train.copy())
df_val = encode_new_data(val.copy(), train.copy())
df_val.movieId=df_val.movieId.astype('int64')
print(len(val), len(df_val))

20386 19591


### Training the embeddings

In [15]:
K = 50
emb_user = create_embedings(num_users, K)
emb_movie = create_embedings(num_movies, K)
emb_user, emb_movie = gradient_descent(df_train, emb_user, emb_movie, iterations=2000, learning_rate=1, df_val=df_val)

training loss: 9.396604197054799
validation cost: 9.522214163589737
training loss: 6.571786860324157
validation cost: 6.7001849104095115
training loss: 4.731420690900383
validation cost: 4.849609948607673
training loss: 3.707740929307275
validation cost: 3.813395864951322
training loss: 3.065219657270483
validation cost: 3.1587983632545846
training loss: 2.621268689059941
validation cost: 2.7060732677805333
training loss: 2.2980596955361214
validation cost: 2.377359634836783
training loss: 2.053985734006945
validation cost: 2.1300940459911653
training loss: 1.8641522365835246
validation cost: 1.9385780990036197
training loss: 1.7128195272542859
validation cost: 1.786545355717238
training loss: 1.589645154487459
validation cost: 1.6633225619448928
training loss: 1.4876078081218607
validation cost: 1.5616785240222497
training loss: 1.4018012176887606
validation cost: 1.4765711191093203
training loss: 1.3287079625615101
validation cost: 1.404393377525737
training loss: 1.265748051772432
v

In [16]:
train_mse = cost(df_train, emb_user, emb_movie)
val_mse = cost(df_val, emb_user, emb_movie)
print(train_mse, val_mse)

0.7384205759576179 0.8563815484250422


Now the embeddings are trained. We can use them for calculating predicted ratings, for computing item-item similarity user-user similarity using cosine similarity.