In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split
from surprise import SVD, accuracy

In [3]:
raw_df = pd.read_csv('data/ml-latest-small/ratings.csv')

In [4]:
raw_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
raw_df.drop('timestamp',axis=1,inplace=True)

In [6]:
raw_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


$R_{ij}$

In [7]:
np.unique(raw_df['rating'])

array([0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. ])

In [8]:
#!/usr/bin/env python

"""
http://surprise.readthedocs.io/en/stable/building_custom_algo.html
"""

import sys
import numpy as np
from surprise import AlgoBase, Dataset
from surprise.model_selection.validation import cross_validate

class GlobalMean(AlgoBase):
    def __init__(self):

        # Always call base method before doing anything.
        AlgoBase.__init__(self)

    def fit(self, trainset):

        # Here again: call base method before doing anything.
        AlgoBase.fit(self, trainset)

        # Compute the average rating. We might as well use the
        # trainset.global_mean attribute ;)
        self.the_mean = np.mean([r for (_, _, r) in
                                 self.trainset.all_ratings()])

        return self

    def estimate(self, u, i):

        return self.the_mean


class MeanofMeans(AlgoBase):
    def __init__(self):

    # Always call base method before doing anything.
        AlgoBase.__init__(self)


    def fit(self, trainset):

        # Here again: call base method before doing anything.
        AlgoBase.fit(self, trainset)

        users = np.array([u for (u, _, _) in self.trainset.all_ratings()])
        items = np.array([i for (_, i, _) in self.trainset.all_ratings()])
        ratings = np.array([r for (_, _, r) in self.trainset.all_ratings()])

        user_means,item_means = {},{}
        for user in np.unique(users):
            user_means[user] = ratings[users==user].mean()
        for item in np.unique(items):
            item_means[item] = ratings[items==item].mean()

        self.global_mean = ratings.mean()
        self.user_means = user_means
        self.item_means = item_means

    def estimate(self, u, i):
        """
        return the mean of means estimate
        """

        if u not in self.user_means:
            return(np.mean([self.global_mean,
                            self.item_means[i]]))

        if i not in self.item_means:
            return(np.mean([self.global_mean,
                            self.user_means[u]]))

        return(np.mean([self.global_mean,
                        self.user_means[u],
                        self.item_means[i]]))


if __name__ == "__main__":
    reader = Reader()
    data = Dataset.load_from_df(raw_df, reader)
    print("\nGlobal Mean...")
    algo = GlobalMean()
    ga_scores = cross_validate(algo, data)

    print("\nMeanOfMeans...")
    algo = MeanofMeans()
    mom_scores = cross_validate(algo, data)



Global Mean...

MeanOfMeans...


In [9]:
reader = Reader()
data = Dataset.load_from_df(raw_df, reader)
print("\nGlobal Mean...")
algo = GlobalMean()
ga_scores = cross_validate(algo, data)

print("\nMeanOfMeans...")
algo = MeanofMeans()
mom_scores = cross_validate(algo, data)


Global Mean...

MeanOfMeans...


In [10]:
trainset, testset = train_test_split(data, test_size=0.25)

In [11]:
to_beat = np.mean(mom_scores['test_rmse'])

In [12]:
to_beat

0.9305143763117677

In [13]:
recs = []
user = 572
for movie in range(len(algo.item_means)):
    recs.append(algo.estimate(user,movie))

In [14]:
indices = np.argsort(recs)[:-10:-1]

In [15]:
for idx in indices:
    print(idx)
    print(algo.item_means[idx])

3759
5.0
8141
5.0
8138
5.0
8493
5.0
4527
5.0
8781
5.0
1186
5.0
3126
5.0
5624
5.0


In [53]:
movie_df = pd.read_csv('data/ml-latest-small/movies.csv')

In [17]:
movie_recs = []
for movie in indices:
    movie_recs.append(movie_df['title'].iloc[movie])

In [18]:
movie_recs

['Shogun Assassin (1980)',
 'Disconnect (2012)',
 'Dark Tide (2012)',
 'Giver, The (2014)',
 'Once Upon a Time in Mexico (2003)',
 'Northmen - A Viking Saga (2014)',
 'Simple Wish, A (1997)',
 'Mermaids (1990)',
 'Kirikou and the Sorceress (Kirikou et la sorcière) (1998)']

In [19]:
movie_df[movie_df.index==6569]

Unnamed: 0,movieId,title,genres
6569,55112,Shanghai Kiss (2007),Comedy|Drama|Romance


In [20]:
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [21]:
grouped_df = raw_df.groupby(['userId','movieId','rating'])

In [22]:
pres = data.build_full_trainset()

In [23]:
alberto = pres.ur[2]

In [52]:
pres.ir[264]

[(2, 0.5),
 (3, 5.0),
 (19, 4.5),
 (28, 4.0),
 (32, 5.0),
 (49, 3.0),
 (83, 4.0),
 (99, 4.0),
 (103, 4.0),
 (104, 4.0),
 (131, 3.0),
 (139, 4.0),
 (168, 5.0),
 (176, 4.0),
 (185, 4.0),
 (187, 5.0),
 (291, 2.0),
 (314, 3.0),
 (327, 4.0),
 (356, 4.0),
 (397, 5.0),
 (473, 4.5),
 (482, 4.0),
 (487, 4.5),
 (488, 5.0),
 (516, 4.5),
 (524, 4.0),
 (551, 3.5),
 (553, 4.0),
 (562, 4.0),
 (589, 4.0),
 (592, 4.0),
 (599, 4.5),
 (602, 5.0),
 (605, 5.0)]

In [40]:
alberto

[(259, 0.5),
 (28, 0.5),
 (260, 0.5),
 (261, 0.5),
 (262, 0.5),
 (263, 5.0),
 (264, 0.5),
 (265, 0.5),
 (266, 0.5),
 (267, 0.5),
 (268, 0.5),
 (86, 3.5),
 (269, 0.5),
 (270, 3.0),
 (102, 4.5),
 (120, 0.5),
 (271, 0.5),
 (128, 0.5),
 (133, 2.0),
 (272, 4.0),
 (273, 0.5),
 (274, 5.0),
 (275, 4.5),
 (276, 0.5),
 (224, 5.0),
 (277, 0.5),
 (278, 5.0),
 (279, 0.5),
 (280, 5.0),
 (281, 5.0),
 (282, 4.5),
 (283, 5.0),
 (284, 0.5),
 (285, 5.0),
 (286, 4.5),
 (287, 5.0),
 (288, 4.5),
 (289, 5.0),
 (290, 0.5)]

In [55]:
indices = [750, 1204,  318, 1193,   50,  912, 1136, 1221,  904, 1276]

In [60]:
for movie in indices:
    print(movie_df.loc[movie].title)

Pompatus of Love, The (1996)
Leave It to Beaver (1997)
I Love Trouble (1994)
Event Horizon (1997)
Georgia (1995)
Wings of Desire (Himmel über Berlin, Der) (1987)
Selena (1997)
Soul Food (1997)
Withnail & I (1987)
Amistad (1997)


In [30]:
movie_df.set_index('movieId', inplace=True)

In [49]:
movie_df.loc[259].title

'Kiss of Death (1995)'

In [47]:
movie_df[movie_df.index==259]['title']

movieId
259    Kiss of Death (1995)
Name: title, dtype: object