In [1]:
import numpy as np
import pandas as pd
from scipy import sparse

# Load Data

In [2]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
# Read the cleaned review data; the three submission-time columns are parsed as
# datetimes. low_memory=False makes pandas read the file in one pass instead of
# in chunks, which avoids mixed-dtype inference on columns.
df = pd.read_csv('../data/cleaned_review_data.csv', 
                 parse_dates=['SubmissionTime', 'FirstSubmissionTime', 'LastSubmissionTime'], 
                 low_memory=False)
# Print column names, non-null counts and dtypes as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1364686 entries, 0 to 1364685
Data columns (total 24 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   pd_id                 1364686 non-null  object        
 1   brand                 1364686 non-null  object        
 2   Name                  1364686 non-null  object        
 3   Description           1364686 non-null  object        
 4   AverageOverallRating  1364686 non-null  float64       
 5   love_count            1364686 non-null  float64       
 6   reviews_count         1364686 non-null  float64       
 7   Price                 1364686 non-null  float64       
 8   category_1            1364686 non-null  object        
 9   category_2            1006831 non-null  object        
 10  category_3            957356 non-null   object        
 11  FirstSubmissionTime   1364686 non-null  datetime64[ns]
 12  LastSubmissionTime    1364686 non-null  da

# Collaborative Filtering (CF) without packages

## Data Cleaning & Encoding

In [3]:
# Keep only the columns the collaborative-filtering code uses:
# product id, reviewer id and the rating itself.
cf_df = df[['pd_id', 'AuthorId', 'Rating']]
cf_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1364686 entries, 0 to 1364685
Data columns (total 3 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   pd_id     1364686 non-null  object
 1   AuthorId  1364686 non-null  object
 2   Rating    1364686 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 31.2+ MB


In [4]:
def encode_data(df):
    """Replace user and product ids with contiguous integer codes.

    Each distinct value of ``AuthorId`` and ``pd_id`` is numbered 0, 1, 2, ...
    in order of first appearance. The input frame is left untouched.

    Arguments:
      df: dataframe with at least the columns 'AuthorId' and 'pd_id'

    Returns:
      (encoded copy of df, number of distinct users, number of distinct products)
    """
    encoded = df.copy()
    n_distinct = {}

    for col in ('AuthorId', 'pd_id'):
        labels = df[col].unique()
        # label -> position in order of first appearance
        lookup = dict(zip(labels, range(len(labels))))
        encoded[col] = np.array([lookup[value] for value in df[col]])
        n_distinct[col] = len(labels)

    return encoded, n_distinct['AuthorId'], n_distinct['pd_id']

## Encoding Y as a sparse matrix
This code encodes the rating matrix $Y$ as a sparse matrix built from the dataframe.

In [6]:
def df2matrix(df, nrows, ncols, column_name='Rating'):
    """Build a CSC sparse matrix (users x products) from an encoded dataframe.

    Arguments:
      df: dataframe with integer-encoded 'AuthorId' and 'pd_id' columns
      nrows: number of rows of the matrix (users)
      ncols: number of columns of the matrix (products)
      column_name: column of df that supplies the matrix values

    Returns:
      scipy.sparse.csc_matrix of shape (nrows, ncols); entries that share the
      same (user, product) position are summed by the sparse constructor.
    """
    row_idx = df['AuthorId'].values
    col_idx = df['pd_id'].values
    entries = df[column_name].values
    return sparse.csc_matrix((entries, (row_idx, col_idx)), shape=(nrows, ncols))

## Predicting Ratings

In [7]:
def create_embedings(n, K):
    """Return a reproducible random embedding matrix of shape (n, K).

    Entries are uniform in [0, 6/K). The global NumPy RNG is reseeded with 3
    on every call, so two calls with the same shape return identical matrices.

    Arguments:
      n: number of items/users (rows)
      K: number of latent factors (columns)

    Returns:
      numpy array of shape (n, K)
    """
    np.random.seed(3)
    uniform_01 = np.random.random(size=(n, K))
    return 6 * uniform_01 / K

In [8]:
def predict(df, emb_user, emb_pd):
    """Add a 'prediction' column holding u_i . v_j for every row of df.

    The embedding rows for each (AuthorId, pd_id) pair are gathered, multiplied
    elementwise and summed over the factor axis, so the dense matrix U V^T is
    never formed. df is modified in place and also returned.
    """
    user_vecs = emb_user[df['AuthorId']]
    pd_vecs = emb_pd[df['pd_id']]
    df['prediction'] = np.sum(user_vecs * pd_vecs, axis=1)
    return df

## Calculating the cost function

In [15]:
# Use vectorized computation for this function. No loops!
def cost(df, emb_user, emb_pd):
    """Mean squared error between df['Rating'] and the model's predictions.

    Predictions come from predict(), which writes a 'prediction' column into
    df; the prediction for user i and product j is emb_user[i] . emb_pd[j].

    Arguments:
      df: dataframe with all data or a subset of the data
      emb_user: embeddings for users
      emb_pd: embeddings for products

    Returns:
      error(float): the MSE over the rows of df
    """
    scored = predict(df, emb_user, emb_pd)
    residual = scored['Rating'] - scored['prediction']
    return np.sum(residual ** 2) / df.shape[0]

## Calculating gradient

In [16]:
def finite_difference(df, emb_user, emb_pd, ind_user=None, ind_pd=None, k=None):
    """Forward finite-difference estimate of one partial derivative of MSE(U, V).

    Used to check the analytic gradient. Exactly one entry is perturbed by a
    small step e: emb_user[ind_user, k] when ind_user is given, otherwise
    emb_pd[ind_pd, k].

    Arguments:
      df: dataframe with all data or a subset of the data
      emb_user: embeddings for users
      emb_pd: embeddings for products
      ind_user: row of emb_user to perturb (None to perturb emb_pd instead)
      ind_pd: row of emb_pd to perturb (used only when ind_user is None)
      k: factor (column) index to perturb

    Returns:
      (cost(perturbed) - cost(original)) / e

    Note: cost() writes a 'prediction' column into df as a side effect.
    """
    e = 0.000000001
    c1 = cost(df, emb_user, emb_pd)
    x = np.zeros_like(emb_user)
    # Perturbation for the product embeddings must match emb_pd's shape
    # (the original referenced an undefined name here).
    y = np.zeros_like(emb_pd)
    if ind_user is not None:
        # Single tuple index writes straight into x, also for array indices.
        x[ind_user, k] = e
    else:
        y[ind_pd, k] = e
    c2 = cost(df, emb_user + x, emb_pd + y)
    return (c2 - c1)/e

In [17]:
def gradient(df, emb_user, emb_pd):
    """Computes the gradient of the MSE with respect to both embedding matrices.

    First computes df["prediction"], then uses df2matrix to build the sparse
    matrices Y (ratings) and Y_hat (predictions). All work stays sparse: the
    residual Y - Y_hat is only non-zero where a rating exists, so no explicit
    mask and no dense (users x products) array is needed.

    Arguments:
      df: dataframe with all data or a subset of the data; 'AuthorId' and
          'pd_id' must already be integer-encoded row indices into emb_user
          and emb_pd. A 'prediction' column is written into df.
      emb_user: embeddings for users, shape (num_users, K)
      emb_pd: embeddings for products, shape (num_products, K)

    Returns:
      d_emb_user: array with the same shape as emb_user
      d_emb_pd: array with the same shape as emb_pd
    """
    # predict() adds the 'prediction' column in place and returns the frame.
    df = predict(df, emb_user, emb_pd)

    # Shapes come from the embeddings, not from re-encoding df: re-encoding a
    # subset would renumber ids and no longer line up with the embedding rows.
    num_users, num_products = emb_user.shape[0], emb_pd.shape[0]

    Y = df2matrix(df, num_users, num_products)
    Y_hat = df2matrix(df, num_users, num_products, column_name='prediction')

    # Sparse residual; repeated (user, product) rows contribute the sum of
    # their individual residuals, which is what the MSE gradient requires.
    residual = Y - Y_hat

    grad_user = (residual @ emb_pd) * (-2) / df.shape[0]
    grad_pd = (residual.T @ emb_user) * (-2) / df.shape[0]

    return grad_user, grad_pd

In [18]:
def gradient_descent(df, emb_user, emb_pd, iterations=100, learning_rate=0.01, df_val=None):
    """ Computes gradient descent with momentum (0.9) for a number of iterations.

    The velocity is initialised with the gradient at the starting embeddings
    and then updated as v = beta*v + (1-beta)*grad before each step.

    Prints training cost and validation cost (if df_val is not None) every 50 iterations.

    Arguments:
      df: training dataframe with integer-encoded 'AuthorId' and 'pd_id'
      emb_user: initial user embeddings (not modified; new arrays are returned)
      emb_pd: initial product embeddings (not modified; new arrays are returned)
      iterations: number of update steps
      learning_rate: step size applied to the velocity
      df_val: optional validation dataframe

    Returns:
    emb_user: the trained user embedding
    emb_pd: the trained product embedding
    """
    beta = 0.9
    v_user, v_pd = gradient(df, emb_user, emb_pd)

    for i in range(iterations):
        grad_user, grad_pd = gradient(df, emb_user, emb_pd)
        v_user = beta*v_user + (1-beta)*grad_user
        v_pd = beta*v_pd + (1-beta)*grad_pd

        emb_user = emb_user-learning_rate*v_user
        emb_pd = emb_pd-learning_rate*v_pd

        if (i+1) % 50 == 0:
            print(cost(df, emb_user, emb_pd))
            if df_val is not None:
                print(cost(df_val, emb_user, emb_pd))

    return emb_user, emb_pd

## Putting all together

In [19]:
# Replace raw user/product ids with contiguous integer indices so they can be
# used directly as row indices into the embedding matrices.
cf_df, num_users, num_products = encode_data(cf_df)

# Number of latent factors per embedding.
K = 6
# NOTE: create_embedings reseeds NumPy's global RNG with the same fixed seed on
# every call, so both matrices are drawn from the same random stream.
emb_user = create_embedings(num_users, K)
emb_pd = create_embedings(num_products, K)

# One gradient evaluation at the initial embeddings.
grad_user, grad_pd = gradient(cf_df, emb_user, emb_pd)

In [21]:
# Train for 200 iterations; gradient_descent prints the training cost every 50
# iterations. emb_user / emb_pd are overwritten, so re-running this cell
# continues training from the current embeddings rather than starting over.
emb_user, emb_pd = gradient_descent(cf_df, emb_user, emb_pd, iterations=200, learning_rate=0.01)

9.477081249365579
9.354073663334098
9.236192428568275
9.123138873888143


In [63]:
# Predicted score of every product for every user (dense users x products).
prd = emb_user @ emb_pd.T
# Product indices per user, sorted by ascending predicted score.
sorted_pd = np.argsort(prd, axis=1)
top_n = 10
# The top_n best-scoring products of each user (best one last).
recommend_matrix = [ranking[-top_n:] for ranking in sorted_pd]

recommend_df = pd.DataFrame({'AuthorId': cf_df['AuthorId'].unique()})
for i in range(1, top_n+1):
    # Column -i of the ascending ranking is each user's i-th best product.
    recommend_df[f'recommend_{i}'] = sorted_pd[:, -i]

recommend_df.sample(5)

Unnamed: 0,AuthorId,recommend_1,recommend_2,recommend_3,recommend_4,recommend_5,recommend_6,recommend_7,recommend_8,recommend_9,recommend_10
184739,184739,1036,1428,2283,1676,1431,1273,821,2025,1516,1394
2221,2221,1428,1036,1273,2186,1961,149,757,1692,503,526
478062,478062,1428,2186,1273,149,1036,2051,1350,1189,392,757
22288,22288,1273,1961,1428,1036,1453,211,821,1204,314,1165
317993,317993,1245,1961,2220,1516,730,1428,1036,2231,1720,1684
