In [5]:
import numpy as np
import pandas as pd

test_ratings = pd.read_csv('./data/ratings_test.csv')
train_ratings = pd.read_csv('./data/ratings_train.csv')

In [67]:
def build_movie_repres(train_ratings):
    movie_user = train_ratings.pivot('movieId', 'userId', 'rating')
    movie_means = movie_user.mean(axis=1)
    for i, col in enumerate(movie_user):
        # using i allows for duplicate columns
        # inplace *may* not always work here, so IMO the next line is preferred
        # df.iloc[:, i].fillna(m, inplace=True)
        movie_user.iloc[:, i] = movie_user.iloc[:, i].fillna(movie_means)
    return movie_user
def get_factors(movie_user, K):
    u, s, vh = np.linalg.svd(movie_user, full_matrices=True)
    # take k factors
    #K = 400
    U = u[:,:K]
    S = np.diag(s[:K])
    print(S[:5,:5])
    VH = vh[:K, :]
    P = U.dot(S)
    return P, VH

def get_prediction_svd(P, VH, movie_user, userId, movieId):
    if not int(userId) in list(movie_user.columns):
        print("Cannon predict for userId=" + str(userId))
        return 'unknown'
    elif not int(movieId) in list(movie_user.index):
        print("Cannot predict for movieId="+ str(movieId))
        return 'unknown'
    else:
        user = movie_user.columns.get_loc(int(userId))
        movie = movie_user.index.get_loc(int(movieId))
        user_predicted_ratings = P.dot(VH[:,user])
        return '{:.4f}'.format(user_predicted_ratings[movie])

def calc_rmse_svd(test_ratings, train_raitings, movie_user, P, VH, userId):
    test_user_ratings = test_ratings[test_ratings.userId == userId]
    train_user_ratings = train_ratings[train_ratings.userId == userId]
    # leave only movies that we can grade
    unique = []
    for i in test_user_ratings.movieId:
        if i in movie_user.index:
            unique.append(i)
    test_user_ratings = test_user_ratings[test_user_ratings.movieId.isin(unique)]
    movieIndexes = []
    for i in test_user_ratings.movieId:
        try:
            movieIndexes.append(movie_user.index.get_loc(i))
        except KeyError:
            pass
    # TODO: calc prediction rationgs fot the userId
    user = movie_user.columns.get_loc(int(userId))
    user_predicted_ratings = P.dot(VH[:,user])
    predicted = np.take(user_predicted_ratings, movieIndexes)
    m = len(predicted)
    print(m)
    err_square = (predicted - test_user_ratings.rating)**2
    return (err_square.sum()/m)**(1/2)

In [68]:
userId = 73
p, vh = get_factors(movie_user, 363)
movie_user = build_movie_repres(train_ratings)
calc_rmse_svd(test_ratings, train_ratings, movie_user, p, vh, userId)

[[6830.75934698    0.            0.            0.            0.        ]
 [   0.           49.76301992    0.            0.            0.        ]
 [   0.            0.           43.23012408    0.            0.        ]
 [   0.            0.            0.           39.72511021    0.        ]
 [   0.            0.            0.            0.           37.76308621]]
205


0.7851174615775267

In [56]:
min = 55
optimal = 999
for i in range(450, 0, -1):
    userId = 73
    p, vh = get_factors(movie_user, i)
    movie_user = build_movie_repres(train_ratings)
    temp = calc_rmse_svd(test_ratings, train_ratings, movie_user, p, vh, userId)
    if temp < min:
        min = temp
        optimal = i
    if(i%10==0):
        print('i= ' + str(i) + ' optimal= ' + str(optimal))
print(optimal)

i= 450 optimal= 450
i= 440 optimal= 440
i= 430 optimal= 439
i= 420 optimal= 439
i= 410 optimal= 439
i= 400 optimal= 439
i= 390 optimal= 439
i= 380 optimal= 439
i= 370 optimal= 373
i= 360 optimal= 363
i= 350 optimal= 363
i= 340 optimal= 363
i= 330 optimal= 363
i= 320 optimal= 363
i= 310 optimal= 363
i= 300 optimal= 363
i= 290 optimal= 363
i= 280 optimal= 363
i= 270 optimal= 363
i= 260 optimal= 363
i= 250 optimal= 363
i= 240 optimal= 363
i= 230 optimal= 363
i= 220 optimal= 363
i= 210 optimal= 363
i= 200 optimal= 363
i= 190 optimal= 363
i= 180 optimal= 363
i= 170 optimal= 173
i= 160 optimal= 163
i= 150 optimal= 150
i= 140 optimal= 150
i= 130 optimal= 132
i= 120 optimal= 132
i= 110 optimal= 132
i= 100 optimal= 132
i= 90 optimal= 132
i= 80 optimal= 85
i= 70 optimal= 73
i= 60 optimal= 73
i= 50 optimal= 73
i= 40 optimal= 73
i= 30 optimal= 73
i= 20 optimal= 73
i= 10 optimal= 73
73


In [38]:
ids = [['1', '31'], ['2', '10'], ['3', '1235'], ['4', '10']]
movie_user = build_movie_repres(train_ratings)
P, VH = get_factors(movie_user)
for i in ids:
    rate = get_prediction_svd(P, VH, movie_user, i[0], i[1])
    print('{},{},{}'.format(i[0], i[1], rate))



1,31,2.4925
2,10,4.0696
3,1235,3.8405
4,10,4.021


In [177]:
userId = 1
test_user_ratings = test_ratings[test_ratings.userId == userId]
train_user_ratings = train_ratings[train_ratings.userId == userId]
count = 0
unique = []
for i in test_user_ratings.movieId:
    if i not in train_user_ratings.movieId:
        unique.append(i)
# remove movies that are in train_raitings        
test_user_ratings = test_user_ratings[test_user_ratings.movieId.isin(unique)]
# test_user_ratings now has movies that he has not graded in train_set
# but this movies might not be in movie_user representating. Thus, drop those movies from test_user_ratings that are not
# in movie_users representation
unique = []
for i in test_user_ratings.movieId:
    if i in movie_user.index:
        unique.append(i)
test_user_ratings = test_user_ratings[test_user_ratings.movieId.isin(unique)]

movieIndexes = []
for i in test_user_ratings.movieId:
    try:
        movieIndexes.append(movie_user.index.get_loc(i))
    except KeyError:
        pass

predicted = np.take(user_predicted_ratings, movieIndexes)
err_square = (predicted - test_user_ratings.rating)**2
err_square.sum()/197
user_predicted_ratings.std()

0.6295180759671316

In [48]:
5//4

1