In [14]:
import importlib
import os
import scipy.io
import numpy as np
import scipy.linalg
import matplotlib.pyplot as plt

In [137]:
# Load training data from MAT file.
# R is the array stored under the 'train' key of the file; the helpers below
# index it as R[user, movie] and treat NaN entries as "no rating".
R = scipy.io.loadmat('movie_data/movie_train.mat')['train']

# Load validation data from CSV (comma-delimited, parsed as integers).
# get_val_acc reads each row as (user index, movie index, rating), subtracts 1
# from both indices before looking up vectors, and only uses the rating's sign.
val_data = np.loadtxt('movie_data/movie_validate.txt', dtype=int, delimiter=',')

# Helper method to get training accuracy
def get_train_acc(R, user_vecs, movie_vecs):
    """Return the fraction of observed ratings whose sign is predicted correctly.

    A rating R[i, j] counts as correct when the predicted rating
    ``user_vecs[i] . movie_vecs[j]`` multiplied by R[i, j] is strictly
    positive, so a zero rating or a zero prediction is never correct.

    Parameters
    ----------
    R : 2-D array-like of ratings, indexed [user, movie]; NaN marks an
        unobserved rating and is skipped.
    user_vecs : 2-D array-like with one row per user (at least R.shape[0] rows).
    movie_vecs : 2-D array-like with one row per movie (at least R.shape[1]
        rows), with the same number of columns as ``user_vecs``.

    Returns
    -------
    float
        ``num_correct / num_observed``, or NaN when R has no observed rating
        (the accuracy is undefined in that case).
    """
    R = np.asarray(R, dtype=float)
    observed = ~np.isnan(R)
    total = int(observed.sum())
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return float('nan')

    n_users, n_movies = R.shape
    # One matrix product yields every predicted rating at once, replacing a
    # Python-level loop over each (user, movie) cell of R.
    preds = np.asarray(user_vecs)[:n_users] @ np.asarray(movie_vecs)[:n_movies].T

    num_correct = int(np.count_nonzero(preds[observed] * R[observed] > 0))
    return num_correct / total

# Helper method to get validation accuracy
def get_val_acc(val_data, user_vecs, movie_vecs):
    """Return the fraction of validation rows whose rating sign is predicted correctly.

    Each row of ``val_data`` is read as (user index, movie index, rating).
    Both indices have 1 subtracted before they are used to look up a row of
    ``user_vecs`` / ``movie_vecs``. A row counts as correct when the dot
    product of the two looked-up vectors times the rating is strictly positive.
    The result is the number of correct rows divided by ``val_data.shape[0]``.
    """
    def _sign_matches(row):
        # Predicted rating for this (user, movie) pair.
        predicted = np.dot(user_vecs[row[0] - 1], movie_vecs[row[1] - 1])
        return predicted * row[2] > 0

    hits = sum(1 for row in val_data if _sign_matches(row))
    return hits / val_data.shape[0]

# Helper method to get indices of all rated movies for each user,
# and indices of all users who have rated that title for each movie
def get_rated_idxs(R):
    """Return, per user and per movie, the indices of the non-NaN entries of R.

    Parameters
    ----------
    R : 2-D array-like of ratings, indexed [user, movie]; NaN marks an
        unobserved rating.

    Returns
    -------
    (user_rated_idxs, movie_rated_idxs) : tuple of 1-D object arrays
        ``user_rated_idxs[i]`` is a 1-D integer array of the column indices j
        where R[i, j] is not NaN; ``movie_rated_idxs[j]`` is a 1-D integer
        array of the row indices i where R[i, j] is not NaN.
    """
    observed = ~np.isnan(np.asarray(R))

    def _pack(index_arrays):
        # Fill a pre-allocated object array element by element.
        # np.array(list_of_arrays) cannot be used here: with unequal lengths it
        # raises ValueError on recent NumPy (and warned before that), and with
        # equal lengths it silently builds a 2-D integer array instead.
        packed = np.empty(len(index_arrays), dtype=object)
        for k, idxs in enumerate(index_arrays):
            packed[k] = idxs
        return packed

    # Iterating a 2-D array yields its rows; the transpose yields its columns.
    user_rated_idxs = [np.flatnonzero(row) for row in observed]
    movie_rated_idxs = [np.flatnonzero(col) for col in observed.T]
    return _pack(user_rated_idxs), _pack(movie_rated_idxs)

# Part (c): SVD to learn low-dimensional vector representations
def svd_lfm(R):
    """Learn user and movie vectors from the SVD of the zero-filled ratings matrix.

    NaN entries of ``R`` are replaced by 0 in a separate array (``R`` itself is
    not modified), and the reduced SVD ``U @ diag(s) @ Vh`` of that array is
    computed.

    Returns
    -------
    (user_vecs, movie_vecs)
        ``user_vecs`` is ``U``; ``movie_vecs`` is ``(diag(s) @ Vh).T``, so
        ``user_vecs @ movie_vecs.T`` reproduces the zero-filled matrix.
    """
    # Zero-fill the missing ratings without touching the caller's array.
    filled = np.where(np.isnan(R), 0, R)

    # Reduced SVD: filled == left @ diag(sing_vals) @ right_h
    left, sing_vals, right_h = scipy.linalg.svd(filled, full_matrices=False)

    # The singular values are folded into the movie side only; transposing
    # puts one movie per row, matching the one-user-per-row layout of `left`.
    scaled_right = np.diag(sing_vals) @ right_h
    return left, scaled_right.T

import time

# Part (d): Compute the training MSE loss of a given vectorization
def get_train_mse(R, user_vecs, movie_vecs):
    """Return the mean squared error of the predicted ratings on observed entries.

    The predicted rating for (i, j) is ``user_vecs[i] . movie_vecs[j]``. The
    squared differences to R[i, j] are summed over every entry of R that is
    not NaN and divided by the number of such entries.

    Parameters
    ----------
    R : 2-D array-like of ratings, indexed [user, movie]; NaN marks an
        unobserved rating and is excluded from the loss.
    user_vecs : 2-D array-like with one row per user (at least R.shape[0] rows).
    movie_vecs : 2-D array-like with one row per movie (at least R.shape[1]
        rows), with the same number of columns as ``user_vecs``.

    Returns
    -------
    float
        The mean squared error, or NaN when R has no observed rating.
    """
    R = np.asarray(R, dtype=float)

    # Score the entries that HAVE a rating. Selecting the NaN entries instead
    # would make every term, and therefore the loss, NaN.
    observed = ~np.isnan(R)
    count = int(observed.sum())
    if count == 0:
        # Nothing to average over; avoid dividing by zero.
        return float('nan')

    n_users, n_movies = R.shape
    # All predictions in one matrix product; the boolean mask then picks the
    # matching (i, j) pairs, keeping row and column indices paired.
    preds = np.asarray(user_vecs)[:n_users] @ np.asarray(movie_vecs)[:n_movies].T

    residuals = preds[observed] - R[observed]
    return float(np.sum(residuals ** 2) / count)


(array([    0,     0,     0, ..., 24982, 24982, 24982]),
 array([ 0,  1,  2, ..., 97, 98, 99]))

In [139]:
R

array([[  nan,   nan,   nan, ...,   nan,   nan,   nan],
       [-0.15,  1.02,   nan, ...,  1.84,   nan,   nan],
       [  nan,   nan,   nan, ...,   nan,   nan,   nan],
       ...,
       [ 3.25, -7.57,   nan, ...,  2.09,   nan,   nan],
       [  nan,  5.53,   nan, ...,  7.57,   nan,   nan],
       [ 5.97,  0.63,  3.2 , ...,   nan,   nan,   nan]])

In [140]:
# Factorize R with svd_lfm: `a` receives the user vectors, `b` the movie vectors.
a, b = svd_lfm(R)

In [141]:
# Training loss when only the first 5 columns of the user and movie vectors are
# kept; a copy of R is passed so the original array cannot be altered by the call.
get_train_mse(np.copy(R), a[:, :5], b[:, :5])

4.385138988494873


6.537799471719562

In [142]:
# Training sign accuracy using every column of the user and movie vectors.
get_train_acc(R, a, b)

0.9970166615801742

In [133]:
a.shape

(24983, 100)

In [118]:
b.shape

(100, 100)

In [119]:
R.shape

(24983, 100)

In [121]:
a[:, :5].shape

(24983, 5)

In [123]:
b[:,:5].shape

(100, 5)