In [1]:
import importlib
import os
import scipy.io
import numpy as np
import scipy.linalg
import matplotlib.pyplot as plt

In [2]:
# Load training data from MAT file
R = scipy.io.loadmat('movie_data/movie_train.mat')['train']

# Load validation data from CSV
val_data = np.loadtxt('movie_data/movie_validate.txt', dtype=int, delimiter=',')

# Helper method to get training accuracy
def get_train_acc(R, user_vecs, movie_vecs):
    num_correct, total = 0, 0
    for i in range(R.shape[0]):
        for j in range(R.shape[1]):
            if not np.isnan(R[i, j]):
                total += 1
                if np.dot(user_vecs[i], movie_vecs[j])*R[i, j] > 0:
                    num_correct += 1
    return num_correct/total

# Helper method to get validation accuracy
def get_val_acc(val_data, user_vecs, movie_vecs):
    num_correct = 0
    for val_pt in val_data:
        user_vec = user_vecs[val_pt[0]-1]
        movie_vec = movie_vecs[val_pt[1]-1]
        est_rating = np.dot(user_vec, movie_vec)
        if est_rating*val_pt[2] > 0:
            num_correct += 1
    return num_correct/val_data.shape[0]

# Helper method to get indices of all rated movies for each user,
# and indices of all users who have rated that title for each movie
def get_rated_idxs(R):
    user_rated_idxs, movie_rated_idxs = [], []
    for i in range(R.shape[0]):
        user_rated_idxs.append(np.argwhere(~np.isnan(R[i, :])).reshape(-1))
    for j in range(R.shape[1]):
        movie_rated_idxs.append(np.argwhere(~np.isnan(R[:, j])).reshape(-1))
    return np.array(user_rated_idxs), np.array(movie_rated_idxs)

# Part (c): SVD to learn low-dimensional vector representations
def svd_lfm(R):
    
    r = np.copy(R)
    
    # Fill in the missing values in R
    r[np.isnan(r)] = 0

    # Compute the SVD of R
    U, s, Vh = scipy.linalg.svd(r, full_matrices = False)
    
    # Construct user and movie representations
    user = U
    movie = np.diag(s) @ Vh
    
    user_vecs = user
    movie_vecs = movie.T
    return user_vecs, movie_vecs

def get_train_mse_old(R, user_vecs, movie_vecs):

    # Compute the training MSE loss
    mse_loss = 0
    count = 0
    for i in range(user_vecs.shape[0]):
        for j in range(movie_vecs.shape[0]):
            if not np.isnan(R[i, j]):
                dots = user_vecs[i].dot(movie_vecs[j])
                mse_loss += (dots - R[i, j]) ** 2
                count += 1
    return mse_loss / count


# Part (d): Compute the training MSE loss of a given vectorization
def get_train_mse(R, user_vecs, movie_vecs):
    
    UDVT = user_vecs @ movie_vecs.T
    
    r = np.copy(R)
    k = np.isnan(r)
    r[k] = 0
    UDVT[k] = 0
    print(np.sum(~k))
    mse_loss = np.linalg.norm(r - UDVT) ** 2 / np.sum(~k)
    
    return mse_loss 


In [4]:
user_rated_idxs, movie_rated_idxs = get_rated_idxs(np.copy(R))



In [5]:
a = user_rated_idxs

In [9]:
b = movie_rated_idxs

In [12]:
b[0]

array([    1,     5,     8, ..., 24977, 24980, 24982])

In [13]:
a[0]

array([ 5,  6, 10, 12, 13, 14, 15, 17, 20, 26, 31, 33, 34, 37, 38, 39, 41,
       44, 45, 47, 52, 53, 54, 64, 65, 68, 69, 85])

In [265]:
R.shape

(24983, 100)

In [260]:
k = np.isnan(R)

In [261]:
np.sum(k)

1593944

In [3]:
a, b = svd_lfm(R)

In [23]:
a[:, :5].T @ a[:, :5]

array([[ 1.00000000e+00, -7.34275921e-17, -6.12845278e-17,
         1.78025997e-16,  1.03900449e-16],
       [-7.34275921e-17,  1.00000000e+00,  7.25737829e-18,
         2.66713734e-17, -1.57100895e-16],
       [-6.12845278e-17,  7.25737829e-18,  1.00000000e+00,
        -2.78992324e-16, -9.35259899e-17],
       [ 1.78025997e-16,  2.66713734e-17, -2.78992324e-16,
         1.00000000e+00, -1.62223750e-17],
       [ 1.03900449e-16, -1.57100895e-16, -9.35259899e-17,
        -1.62223750e-17,  1.00000000e+00]])

In [263]:
get_train_mse(np.copy(R), a[:, :5], b[:, :5])

904356


18.06079068441721

In [264]:
get_train_mse_old(np.copy(R), a[:, :5], b[:, :5])

18.060790684417398

In [248]:
get_train_acc(R, a, b)

0.9970166615801742

In [133]:
a.shape

(24983, 100)

In [118]:
b.shape

(100, 100)

In [119]:
R.shape

(24983, 100)

In [121]:
a[:, :5].shape

(24983, 5)

In [123]:
b[:,:5].shape

(100, 5)

In [5]:
np.outer(a[0], a[0]).shape

(100, 100)