# Mini Project 2


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import requests
from surprise import SVD
from surprise import accuracy, Dataset, Reader

In [3]:
# Common prefix shared by every remote data file used in this notebook.
_DATA_BASE_URL = 'https://caltech-cs155.s3.us-east-2.amazonaws.com/sets/miniprojects/project2/data/'

# Maps each local file name to the URL it is downloaded from.
url_dict = {
    file_name: _DATA_BASE_URL + file_name
    for file_name in ('data.csv', 'movies.csv', 'train.csv', 'test.csv')
}

def download_file(file_path, chunk_size=1024 * 1024, timeout=60):
    """Download one of the known data files into the working directory.

    Parameters
    ----------
    file_path : str
        Key into ``url_dict``; also used as the local path the response
        body is written to (opened in binary write mode, so an existing
        file is overwritten).
    chunk_size : int, optional
        Number of bytes requested per streamed chunk. Defaults to 1 MiB so
        that the body is actually streamed instead of being buffered in a
        single very large read.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection raises instead
        of blocking the notebook forever.

    Raises
    ------
    KeyError
        If ``file_path`` is not a key of ``url_dict``.
    requests.HTTPError
        If the server answers with an error status (via
        ``raise_for_status``).
    """
    url = url_dict[file_path]
    print('Start downloading...')
    with requests.get(url, stream=True, timeout=timeout) as r:
        # Fail before creating/truncating the local file on HTTP errors.
        r.raise_for_status()
        with open(file_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
    print('Complete')

# Fetch every CSV the notebook needs, in the same order as before.
for csv_name in ('data.csv', 'movies.csv', 'train.csv', 'test.csv'):
    download_file(csv_name)

Start downloading...
Complete
Start downloading...
Complete
Start downloading...
Complete
Start downloading...
Complete


In [9]:
# Parse each downloaded CSV with pandas and keep a standalone NumPy copy
# of its contents under the matching variable name.
data, movies, train, test = (
    pd.read_csv(csv_name).to_numpy(copy=True)
    for csv_name in ('data.csv', 'movies.csv', 'train.csv', 'test.csv')
)

## Surprise

In [28]:
# --- Problem size and hyperparameters ---
# The largest ID seen in either split is used as the matrix dimension.
# NOTE(review): this (and the row placement further down) assumes user and
# movie IDs are 1-based integers -- the assertion below guards that.
M = int(max(train[:, 0].max(), test[:, 0].max()))  # users
N = int(max(train[:, 1].max(), test[:, 1].max()))  # movies
assert train[:, :2].min() >= 1, "user/movie IDs are expected to be 1-based"
print("Factorizing with ", M, " users, ", N, " movies.")
K = 20            # number of latent factors
RANDOM_STATE = 0  # fixed seed so a fresh run reproduces the same factors/RMSE

# reader for range of ratings
reader = Reader(rating_scale=(1, 5))

# convert train numpy array to data frame
train_df = pd.DataFrame(train, columns=['User ID', 'Movie ID', 'Rating'])

# create surprise data object (columns must be in user, item, rating order)
train_ds = Dataset.load_from_df(train_df[['User ID', 'Movie ID', 'Rating']], reader)

# convert to train set
# The trainset's n_users / n_items are deliberately left untouched: they
# must stay consistent with its raw<->inner id maps. The (M, K) and (N, K)
# output matrices are sized explicitly below instead.
train_ts = train_ds.build_full_trainset()

# use SVD algorithm (unbiased, K latent factors, seeded for reproducibility)
algo = SVD(n_factors=K, biased=False, random_state=RANDOM_STATE)
algo.fit(train_ts)
# `test` rows are unpacked as (user id, movie id, rating) by algo.test.
predictions = algo.test(test)

# compute RMSE (accuracy.rmse prints the value)
accuracy.rmse(predictions)

# extract matrix
# Surprise stores factors by *inner* id (assigned in order of first
# appearance in the training data), so algo.pu / algo.qi rows are not in
# raw-ID order. Re-index them so that row (ID - 1) belongs to that ID.
# Users/movies that never occur in the training set keep an all-zero row.
U = np.zeros((M, K))
for inner_uid in train_ts.all_users():
    U[int(train_ts.to_raw_uid(inner_uid)) - 1] = algo.pu[inner_uid]
V = np.zeros((N, K))
for inner_iid in train_ts.all_items():
    V[int(train_ts.to_raw_iid(inner_iid)) - 1] = algo.qi[inner_iid]
print("U is matrix of size ",U.shape[0]," by ",U.shape[1])
print("V is matrix of size ",V.shape[0]," by ",V.shape[1])

Factorizing with  943  users,  1682  movies.
RMSE: 0.9369
U is matrix of size  943  by  20
V is matrix of size  1682  by  20
