In [1]:
import numpy as np
import pandas as pd
from time import time
import math
import data_handler as dh
import paths

## Make all definitions here

In [2]:
def calculate_item_mean(ratings: np.ndarray, item_data: np.ndarray):
    """ Calculate a mean based of data variance and offsets

    :param ratings: all ratings in the training data
    :param item_data: data related to the item whose mean we are calculating
    :return:
    """
    overall_mean = np.mean(ratings)
    overall_variance = np.var(ratings)

    item_sum = np.sum(item_data)
    item_variance = np.var(item_data)
    item_size = len(item_data)
    var_ratio = item_variance / overall_variance

    item_mean = (overall_mean * var_ratio + item_sum) / (var_ratio + item_size)
    return item_mean


def calculate_all_means(df_data: pd.DataFrame):
    """ Calculate the mean rating per movie as well as the average user offset

    :param df_data: All training data
    :return: Arrays containing the calculated values for each user and movie
    """
    user_ids = (df_data['row_id'] - 1).values
    movie_ids = (df_data['col_id'] - 1).values
    ratings = df_data['Prediction'].values

    print("Calculating movie average ratings")
    tic = time()
    # find the average ratings for each movie
    movie_ratings = []
    for m in range(0, paths.num_movies):
        # find entries and then ratings for movie m
        m_entries = np.where(np.equal(movie_ids, m))[0]
        m_ratings = ratings[m_entries]
        movie_mean = calculate_item_mean(ratings, m_ratings)
        movie_ratings.append(movie_mean)
    toc = time()
    print(toc - tic)

    print("Calculating rating offsets")
    tic = time()
    # find the offset of each rating
    rating_offsets = []
    for r in range(0, len(ratings)):
        offset = ratings[r] - movie_ratings[movie_ids[r]]
        rating_offsets.append(offset)
    toc = time()
    print(toc - tic)

    print("Calculating user average offsets")
    tic = time()
    rating_offsets = np.asarray(rating_offsets)
    user_offsets = []
    # calculate the mean offset for each user
    for u in range(0, paths.num_users):
        # find all entries for user u and then all their ratings
        u_entries = np.where(np.equal(user_ids, u))[0]
        u_offsets = rating_offsets[u_entries]
        user_mean = calculate_item_mean(ratings, u_offsets)
        user_offsets.append(user_mean)
    toc = time()
    print(toc - tic)

    return {"mean_movie_rating" : movie_ratings, "mean_user_offsets" : user_offsets}


def predict_initial_rating(mean_predictions: dict, user: int, movie: int):
    """ Get a prediction rating for a user-movie pair
    based purely on the movie average rating and user offset

    :param mean_predictions: data with all the averages
    :param user: user index
    :param movie: movie index
    :return: the predicted rating
    """
    movie_ratings = mean_predictions["mean_movie_rating"]
    user_offsets = mean_predictions["mean_user_offsets"]
    rating = movie_ratings[movie] + user_offsets[user]
    rating = min(paths.max_rating, rating)
    rating = max(paths.min_rating, rating)

    return rating


# make a prediction for a single user-movie pair
def make_prediction(mean_predictions, user_features, movie_features, bu, bm, user, movie):
    rating = predict_initial_rating(mean_predictions, user, movie)
    rating += sum(user_features[user] * movie_features[:, movie]) + bu[user] + bm[movie]

    return rating


def calculate_rmse(mean_predictions, user_features, movie_features, bu, bm, test_samples):
    """ Calculate the rmse w.r.t. every sample in the testing set """
    errors = []

    for sample in test_samples:
        rating = sample[paths.rating_id]
        predicted_rating = make_prediction(mean_predictions, user_features, movie_features,
                                           bu, bm, sample[paths.user_id], sample[paths.movie_id])
        err = rating - predicted_rating
        errors.append(err*err)

    rmse = math.sqrt(np.mean(errors))
    return rmse


def train(k, mean_predictions, user_features, movie_features, bu, bm, train_data, test_data):
    rmse: float = calculate_rmse(mean_predictions, user_features, movie_features, bu, bm, test_data)
    prev_rmse: float = rmse
    print("Starting rmse: {0}".format(rmse))
    
    for feature in range(0, k):
        print("Training feature {0}".format(feature))
        user_features[feature] = 0.1
        movie_features[:, feature] = 0.1

        tic = time()
        # train the feature
        for i in range(1, 1000):
            for sample in train_data:
                user = sample[paths.user_id]
                movie = sample[paths.movie_id]

                mf = movie_features[:, movie][feature]
                uf = user_features[user][feature]

                predicted_rating = make_prediction(mean_predictions, user_features, movie_features,
                                                   bu, bm, user, movie)
                err = sample[paths.rating_id] - predicted_rating

                # update the features
                user_features[user][feature] += paths.learning_rate * (err * mf - paths.lambda_term * uf)
                movie_features[:, movie][feature] += paths.learning_rate * (err * uf - paths.lambda_term * mf)

                bu[user] += paths.learning_rate * (err - paths.lambda_term * bu[user])
                bm[movie] += paths.learning_rate * (err - paths.lambda_term * bm[movie])

            rmse = calculate_rmse(mean_predictions, user_features, movie_features, bu, bm, test_data)
            toc = time()
            iter_time = (toc - tic) / i
            print('Iteration: %d, Misfit: %.8f, Improvement: %.8f, Time: %.3f'
                  % (i, rmse, prev_rmse - rmse, iter_time))
            if (rmse > prev_rmse and i > 5):
                break
            prev_rmse = rmse

# Write stuff we want to test here (train function)

In [3]:
print("Processing data")
df_data: pd.DataFrame = dh.read_data(paths.total_dataset_location)
data_dict: dict = dh.split_original_data(df_data, 0.1)

df_train_data: pd.DataFrame = data_dict["train_data"]
df_test_data: pd.DataFrame = data_dict["test_data"]

train_samples: np.ndarray = dh.df_as_array(df_train_data)
test_samples: np.ndarray = dh.df_as_array(df_test_data)

Processing data


In [4]:
print("Calculating mean predictions")
tic = time()
mean_predictions = calculate_all_means(df_train_data)
toc = time()
print(toc - tic)

Calculating mean predictions
Calculating movie average ratings
6.848834037780762
Calculating rating offsets
4.6001904010772705
Calculating user average offsets
66.53981423377991
77.99502110481262


In [None]:
# initialize variables needed for training
k = 10
bu = np.zeros(paths.num_users)
bm = np.zeros(paths.num_movies)
user_features = np.zeros((paths.num_users, k))
movie_features = np.zeros((k, paths.num_movies))

In [None]:
train(k, mean_predictions, user_features, movie_features, bu, bm, train_samples, test_samples)

Starting rmse: 0.9993717095178094
Training feature 0
Iteration: 1, Misfit: 0.99880673, Improvement: 0.00056498, Time: 23.327
Iteration: 2, Misfit: 0.99869303, Improvement: 0.00011369, Time: 23.410
Iteration: 3, Misfit: 0.99864143, Improvement: 0.00005160, Time: 23.392
Iteration: 4, Misfit: 0.99860971, Improvement: 0.00003172, Time: 23.424
Iteration: 5, Misfit: 0.99858764, Improvement: 0.00002207, Time: 23.402
Iteration: 6, Misfit: 0.99857147, Improvement: 0.00001617, Time: 23.430
Iteration: 7, Misfit: 0.99855936, Improvement: 0.00001212, Time: 23.429
Iteration: 8, Misfit: 0.99855018, Improvement: 0.00000917, Time: 23.441
Iteration: 9, Misfit: 0.99854322, Improvement: 0.00000696, Time: 23.434
Iteration: 10, Misfit: 0.99853795, Improvement: 0.00000527, Time: 23.450
Iteration: 11, Misfit: 0.99853399, Improvement: 0.00000396, Time: 23.472
Iteration: 12, Misfit: 0.99853104, Improvement: 0.00000295, Time: 23.471
Iteration: 13, Misfit: 0.99852890, Improvement: 0.00000215, Time: 23.454
Iterati