In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
import copy
from IPython.display import display
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)

In [2]:
def get_data_list(filename):
    """
    Load a tab-separated ratings file and also return it in per-user list form.

    The file is read with the columns ``user``, ``movie``, ``rating`` and
    ``timestamp``. The list form has one entry per user id from 1 up to the
    highest user id present in the file; entry ``u - 1`` holds the ratings of
    user ``u`` as ``(movie_id - 1, rating)`` tuples, in file order. Users that
    do not occur in the file get an empty list.

    :param filename: filename of the dataset under consideration
    :return: (ratings, ratings_contents) - the per-user list of
             (movie_idx, rating) tuples and the raw DataFrame as read from disk.
             For a file without any rows the list is empty.
    """
    ratings_contents = pd.read_table(filename, names=["user", "movie", "rating", "timestamp"])
    if ratings_contents.empty:
        # max() of an empty column is NaN, which cannot size the list below
        return [], ratings_contents

    highest_user_id = int(ratings_contents["user"].max())
    ratings = [[] for _ in range(highest_user_id)]

    # itertuples avoids building one Series per row (as iterrows does)
    rows = ratings_contents[["user", "movie", "rating"]].itertuples(index=False, name=None)
    for user, movie, rating in rows:
        # subtract 1 from the ids to match 0-based indexing
        ratings[user - 1].append((movie - 1, rating))
    return ratings, ratings_contents

def get_items_data(filename="data/u.item"):
    """
    Load the movie (item) table from a '|'-separated, latin-1 encoded file.

    The columns are, in order: movie id, title, release date, video release
    date, IMDb URL, followed by one column per genre flag.

    :param filename: path of the item file; defaults to the location used
                     throughout this notebook
    :return: DataFrame with one row per line of the file
    """
    column_names = ["movie_id", "movie_title", "release_date", "video_release_date",
                    "IMDb URL", "unknown", "Action", "Adventure", "Animation",
                    "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
                    "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi",
                    "Thriller", "War", "Western"]
    item_contents = pd.read_table(filename, names=column_names, sep='|', encoding='latin-1')
    return item_contents


def get_users_data(filename="data/u.user"):
    """
    Load the user table from a '|'-separated file.

    :param filename: path of the user file; defaults to the location used
                     throughout this notebook
    :return: DataFrame with the columns user_id, age, gender, occupation, zip
    """
    user_contents = pd.read_table(filename,
                                  names=["user_id", "age", "gender", "occupation", "zip"],
                                  sep='|')
    return user_contents


def make_random_data_like(data):
    """
    Build a dataset with the same structure as ``data`` but random ratings.

    Helper for demonstrating ``calc_rmse`` only - please ignore this function
    otherwise. Every entry keeps its first element (the movie index) and gets
    a rating drawn uniformly from the integers 1..5 using numpy's global
    random state.

    :param data: list (one entry per user) of lists of (movie_idx, rating) tuples
    :return: list of the same shape holding (movie_idx, random_rating) tuples
    """
    # Draw a scalar rather than a 1-element array: converting an array with
    # ndim > 0 to int is deprecated in recent NumPy releases.
    return [
        [(entry[0], int(np.random.randint(1, 6))) for entry in row]
        for row in data
    ]


def calc_rmse(data1, data2):
    """
    Calculates the Root Mean Square Error between the two datasets.

    Both datasets are lists with one entry per user, each entry being a list
    of (movie, rating) tuples. Every rating in ``data2`` is compared with the
    rating of the same movie for the same user in ``data1``. If ``data1`` has
    no such rating, a message is printed and -1 is used in its place, so the
    missing rating is penalised in the error.

    :param data1: first dataset (looked up by movie)
    :param data2: second dataset (defines which ratings are evaluated)
    :return: the RMSE over all ratings in ``data2`` for the users covered by
             ``data1``; NaN if there is not a single rating to evaluate
    """
    squared_error = 0  # the accumulated Squared Error
    num_total = 0      # the accumulated number of ratings evaluated
    for i in range(len(data1)):
        # Make one of the datasets into a dictionary to make the search more efficient
        data1_dict = dict(data1[i])
        for movie, rating2 in data2[i]:
            rating1 = data1_dict.get(movie, -1)
            if rating1 == -1:
                print('Could not find rating for movie %i at user %i in data1'%(movie, i))
            squared_error += (rating1 - rating2) ** 2
            num_total += 1

    if num_total == 0:
        # Nothing to average over; avoid dividing by zero
        return float('nan')
    return np.sqrt(squared_error / num_total)

In [3]:
# Load the three splits. Each call returns the per-user list form
# (list of (movie_idx, rating) tuples per user) and the raw DataFrame.
train_data, train_df = get_data_list("data/u_train.data")
val_data, val_df = get_data_list("data/u_val.data")
test_data, test_df = get_data_list("data/u_test.data")

# Ignore the next line, we just need some random data to show the rmse calculation.
# NOTE(review): no random seed is set, so the printed value changes from run to run.
val_random = make_random_data_like(val_data)

# Example calculation of the rmse: validation ratings vs. random ratings
rmse = calc_rmse(val_data, val_random)
print('The RMSE is %5.3f  (for random data, this should be around 2)'%rmse)

The RMSE is 1.870  (for random data, this should be around 2)


In [4]:
class Recommender:
    """
    User-based collaborative-filtering recommender.

    On construction the training ratings are pivoted into a movies-by-users
    matrix and the Pearson correlation between every pair of users is computed.
    For each user, the users whose correlation exceeds ``similarity_threshold``
    are cached (this includes the user itself whenever its self-correlation is
    defined). A prediction for (user, movie) is the similarity-weighted average
    of the ratings those similar users gave the movie, rounded to an integer.

    Attributes:
        existing_ratings: dict movie id -> one-column DataFrame
            ('existing_ratings') indexed by the users that rated the movie.
        high_similarities: dict user id -> one-column DataFrame
            ('high_similarities') indexed by the sufficiently similar users.
        default_rating: value returned when no prediction can be computed.
    """

    def __init__(self, train_df, similarity_threshold=0.5, verbose=False, default_rating=2):
        """
        :param train_df: DataFrame with at least the columns 'user', 'movie', 'rating'
        :param similarity_threshold: only users with a correlation strictly
                                     above this value count as similar
        :param verbose: if True, display the rating and similarity matrices
        :param default_rating: fallback prediction used when a movie or user is
                               unknown or no similar user rated the movie
        """
        self.default_rating = default_rating

        # Create movies by users rating matrix (NaN where a user did not rate a movie)
        ratings = pd.pivot_table(train_df, values='rating', index='movie', columns=['user'])
        if verbose:
            display(ratings)

        # Cache existing ratings per movie
        self.existing_ratings = {
            movie: rats.dropna().rename('existing_ratings').to_frame()
            for movie, rats in ratings.iterrows()
        }

        # Create users by users similarity matrix
        # by calculating pearson correlations
        similarities = ratings.corr(method='pearson')
        if verbose:
            display(similarities)

        # Cache highest similarities per user; NaN correlations never pass the comparison
        self.high_similarities = dict()
        for user in similarities.columns:
            ser = similarities.loc[user, similarities[user] > similarity_threshold]
            self.high_similarities[user] = ser.rename('high_similarities').to_frame()

    def _fallback(self, user, movie, reason, verbose):
        """Report (if verbose) why no prediction is possible and return the default rating."""
        if verbose:
            print('user', user,
                  '\tmovie', movie,
                  '\t--- ' + reason + ' --- "predicting"', self.default_rating)
        return self.default_rating

    def predict_single(self, user, movie, verbose=False):
        """
        Predict the rating ``user`` would give ``movie``.

        :param user: user id as it appears in the training DataFrame
        :param movie: movie id as it appears in the training DataFrame
        :param verbose: if True, display the intermediate tables and explain fallbacks
        :return: the rounded weighted-average rating as int, or ``default_rating``
                 when the movie or the user is unknown, or when none of the
                 user's similar users rated the movie
        """
        movie_ratings = self.existing_ratings.get(movie)
        if movie_ratings is None:
            return self._fallback(user, movie, 'Nobody rated the movie before  ', verbose)

        neighbours = self.high_similarities.get(user)
        if neighbours is None:
            return self._fallback(user, movie, 'User not in the training data  ', verbose)

        if verbose:
            display(movie_ratings)
            display(neighbours)

        # Keep only the users that are both similar and have rated the movie
        df = pd.concat([movie_ratings, neighbours], axis=1, join='inner')
        if verbose:
            display(df)

        rats = df['existing_ratings'].to_numpy()
        sims = df['high_similarities'].to_numpy()

        # An empty overlap sums to 0; checking up front avoids a 0/0 division
        # (and a division by zero for weights that cancel out)
        weight_sum = np.sum(sims)
        if df.empty or weight_sum == 0:
            return self._fallback(user, movie, 'No similar user rated the movie', verbose)

        pred_rat = np.dot(rats, sims) / weight_sum
        return int(round(pred_rat))

    def predict_dataset(self, dataset):
        """
        Predict a rating for every (movie_idx, rating) entry of a list-form dataset.

        ``dataset[i]`` is taken to belong to user id ``i + 1`` and the 0-based
        movie indices are shifted back to 1-based ids before predicting. The
        ratings contained in ``dataset`` are not used.

        :param dataset: list (one entry per user) of lists of (movie_idx, rating) tuples
        :return: list of the same shape holding (movie_idx, predicted_rating) tuples
        """
        return [
            [(movie, self.predict_single(user, movie + 1)) for movie, _ in user_scores]
            for user, user_scores in enumerate(dataset, 1)
        ]

# Fit the recommender on the training ratings; verbose=True displays the
# movies-by-users rating matrix and the user-by-user similarity matrix.
rec = Recommender(train_df, similarity_threshold=0.5, verbose=True)

user,1,2,3,4,5,...,939,940,941,942,943
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,5.0,,,,4.0,...,,,5.0,,
2,3.0,,,,3.0,...,,,,,
3,4.0,,,,,...,,,,,
4,,,,,,...,,2.0,,,
5,,,,,,...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
1672,,,,,,...,,,,,
1674,,,,,,...,,,,,
1675,,,,,,...,,,,,
1678,,,,,,...,,,,,


user,1,2,3,4,5,...,939,940,941,942,943
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1.000000,0.356034,-0.960769,0.802181,0.323459,...,0.517294,-0.071985,-0.408248,-0.171446,0.023818
2,0.356034,1.000000,,,,...,-0.375000,0.155902,-1.000000,0.000000,0.852803
3,-0.960769,,1.000000,-0.400000,,...,,-0.816497,,0.500000,
4,0.802181,,-0.400000,1.000000,,...,,0.866025,,0.454545,
5,0.323459,,,,1.000000,...,1.000000,0.015173,0.500000,0.821584,0.218503
...,...,...,...,...,...,...,...,...,...,...,...
939,0.517294,-0.375000,,,1.000000,...,1.000000,,,,-0.316228
940,-0.071985,0.155902,-0.816497,0.866025,0.015173,...,,1.000000,,0.000000,0.132453
941,-0.408248,-1.000000,,,0.500000,...,,,1.000000,,-0.866025
942,-0.171446,0.000000,0.500000,0.454545,0.821584,...,,0.000000,,1.000000,0.149071


In [5]:
# Single prediction with the intermediate tables shown. `user` and `movie` are
# the ids from train_df (predict_dataset adds 1 to the 0-based movie index
# before calling this method).
rec.predict_single(user=1, movie=11, verbose=True)

Unnamed: 0_level_0,existing_ratings
user,Unnamed: 1_level_1
1,2.0
7,3.0
8,3.0
11,2.0
13,1.0
...,...
903,2.0
913,4.0
916,4.0
933,4.0


Unnamed: 0_level_0,high_similarities
user,Unnamed: 1_level_1
1,1.000000
4,0.802181
8,0.855724
17,0.577121
21,0.582196
...,...
920,1.000000
922,0.515219
926,0.866025
928,0.654654


Unnamed: 0_level_0,existing_ratings,high_similarities
user,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2.0,1.000000
8,3.0,0.855724
57,3.0,0.528990
122,1.0,0.562388
177,4.0,0.561733
...,...,...
329,3.0,0.787726
414,5.0,0.727607
592,5.0,0.612887
619,2.0,0.624680


3

In [6]:
# Same call without the verbose tables: only the predicted rating is returned.
rec.predict_single(user=5, movie=15)

4

In [7]:
# Predict a rating for every (user, movie) pair of the validation set;
# the result has the same list-of-lists shape as val_data.
val_predictions = rec.predict_dataset(val_data)



In [8]:
# Check the validation score of the built recommender
rmse = calc_rmse(val_data, val_predictions)
print('The RMSE of my recommender on validation set is %5.3f'%rmse)

The RMSE of my recommender on validation set is 1.173
