# This Notebook contains code for Building Collaborative Filtering based Recommendation Engine

In [1]:
import boto3
s3 = boto3.resource('s3')

In [2]:
s3.Bucket('recommendationsystemshubham').download_file('dataset/movies.dat', 'datasets/movies.dat')

In [3]:
s3.Bucket('recommendationsystemshubham').download_file('dataset/ratings.dat', 'datasets/ratings.dat')


In [4]:
s3.Bucket('recommendationsystemshubham').download_file('dataset/users.dat', 'datasets/users.dat')

In [5]:
DATASET_NAME = "datasets/movies.csv"

In [6]:
import pandas as pd

unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('datasets/users.dat', sep='::', header=None, names=unames)



In [8]:
rnames=['user_id', 'movie_id', 'rating', 'timestamp']
rnames = pd.read_table('datasets/ratings.dat', sep='::', names = rnames, header=None)
rnames[:5]

  from ipykernel import kernelapp as app


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [9]:
mnames = ['movie_id', 'title', 'genres']
mnames = pd.read_table('datasets/movies.dat', sep='::', names = mnames, header=None)
mnames[:5]

  from ipykernel import kernelapp as app


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
movielens = pd.merge(pd.merge(rnames, users), mnames)

In [11]:
movielens[:5]

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama


## Following will be the procedure to perform collaborative filtering.

### Performing non-personalized recommendation

In [13]:
meanRatings = movielens.pivot_table('rating', index='title', columns='gender', aggfunc='mean')

In [14]:
meanRatings[:5]

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.375,2.761905
'Night Mother (1986),3.388889,3.352941
'Til There Was You (1997),2.675676,2.733333
"'burbs, The (1989)",2.793478,2.962085
...And Justice for All (1979),3.828571,3.689024


In [15]:
ratings_by_title = movielens.groupby('title').size()

In [17]:
moviesWithGreaterRatings = ratings_by_title.index[ratings_by_title > 200]

In [18]:
moviesWithGreaterRatings

Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
       '101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
       '13th Warrior, The (1999)', '2 Days in the Valley (1996)',
       '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
       '2010 (1984)',
       ...
       'Year of Living Dangerously (1982)', 'Yellow Submarine (1968)',
       'Yojimbo (1961)', 'You've Got Mail (1998)', 'Young Frankenstein (1974)',
       'Young Guns (1988)', 'Young Guns II (1990)',
       'Young Sherlock Holmes (1985)', 'Zero Effect (1998)',
       'eXistenZ (1999)'],
      dtype='object', name='title', length=1420)

In [19]:
meanRatingsForSuchMovies = meanRatings.loc[moviesWithGreaterRatings,:]

In [21]:
meanRatingsForSuchMovies[:10]

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"'burbs, The (1989)",2.793478,2.962085
10 Things I Hate About You (1999),3.646552,3.311966
101 Dalmatians (1961),3.791444,3.5
101 Dalmatians (1996),3.24,2.911215
12 Angry Men (1957),4.184397,4.328421
"13th Warrior, The (1999)",3.112,3.168
2 Days in the Valley (1996),3.488889,3.244813
"20,000 Leagues Under the Sea (1954)",3.670103,3.709205
2001: A Space Odyssey (1968),3.825581,4.129738
2010 (1984),3.446809,3.413712


In [27]:
top_female_ratings = meanRatings.sort_values(by='F', ascending=False)
top_female_ratings[:10]

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Clean Slate (Coup de Torchon) (1981),5.0,3.857143
"Ballad of Narayama, The (Narayama Bushiko) (1958)",5.0,3.428571
Raw Deal (1948),5.0,3.307692
Bittersweet Motel (2000),5.0,
Skipped Parts (2000),5.0,4.0
Lamerica (1994),5.0,4.666667
"Gambler, The (A J�t�kos) (1997)",5.0,3.166667
"Brother, Can You Spare a Dime? (1975)",5.0,3.642857
Ayn Rand: A Sense of Life (1997),5.0,4.0
24 7: Twenty Four Seven (1997),5.0,3.75


In [25]:
rating_std = movielens.groupby('title')['rating'].std()

In [26]:
rating_std[:5].sort_values()

title
...And Justice for All (1979)    0.878110
'Til There Was You (1997)        1.020159
$1,000,000 Duck (1971)           1.092563
'burbs, The (1989)               1.107760
'Night Mother (1986)             1.118636
Name: rating, dtype: float64

In [29]:
rating_std = rating_std.loc[moviesWithGreaterRatings]


In [30]:
rating_std

title
'burbs, The (1989)                                                    1.107760
10 Things I Hate About You (1999)                                     0.989815
101 Dalmatians (1961)                                                 0.982103
101 Dalmatians (1996)                                                 1.098717
12 Angry Men (1957)                                                   0.812731
13th Warrior, The (1999)                                              1.140421
2 Days in the Valley (1996)                                           0.921592
20,000 Leagues Under the Sea (1954)                                   0.869685
2001: A Space Odyssey (1968)                                          1.042504
2010 (1984)                                                           0.946618
28 Days (2000)                                                        0.920278
39 Steps, The (1935)                                                  0.853501
54 (1998)                                     

## Evaluation Mechanism

### -> taking smaller sample from the full dataset

In [31]:
import numpy as np
movielens = movielens.iloc[np.random.choice(movielens.index, size=10000, replace=False),:]
print(movielens.shape)
print(movielens.user_id.nunique())
print(movielens.movie_id.nunique())

(10000, 10)
3707
2279


In [32]:
movielens.iloc[:5,: ]

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
562119,3485,1288,5,967094696,M,25,0,94121,This Is Spinal Tap (1984),Comedy|Drama|Musical
133604,475,1198,5,976218884,F,25,2,55421,Raiders of the Lost Ark (1981),Action|Adventure
755282,4865,2132,5,962818748,F,35,14,2130,Who's Afraid of Virginia Woolf? (1966),Drama
539532,3642,1220,3,966473693,F,25,7,98188,"Blues Brothers, The (1980)",Action|Comedy|Musical
922842,6036,2672,1,956710231,F,25,15,32603,"Thirteenth Floor, The (1999)",Drama|Sci-Fi|Thriller


In [33]:
condition = lambda x: len(x.index) > 1

In [34]:
movielens = movielens.groupby('user_id').filter(condition)

In [35]:
print (movielens.shape)
np.all(movielens.user_id.value_counts() > 1)

(8443, 10)


True

### Generating train and test subsets of dataset

In [39]:
def assign_to_set(df):
    sampled_ids = np.random.choice(df.index,
                                   size=np.int64(np.ceil(df.index.size * 0.2)),
                                   replace=False)
    df.loc[sampled_ids, 'for_testing'] = True
    return df

movielens['for_testing'] = False
grouped = movielens.groupby('user_id', group_keys=False).apply(assign_to_set)
movielens_train = movielens[grouped.for_testing == False]
movielens_test = movielens[grouped.for_testing == True]
print (movielens_train.shape)
print (movielens_test.shape)
print (movielens_train.index & movielens_test.index)

(5797, 11)
(2646, 11)
Int64Index([], dtype='int64')


In [40]:
movielens_train[:5]

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
133604,475,1198,5,976218884,F,25,2,55421,Raiders of the Lost Ark (1981),Action|Adventure,False
755282,4865,2132,5,962818748,F,35,14,2130,Who's Afraid of Virginia Woolf? (1966),Drama,False
539532,3642,1220,3,966473693,F,25,7,98188,"Blues Brothers, The (1980)",Action|Comedy|Musical,False
922842,6036,2672,1,956710231,F,25,15,32603,"Thirteenth Floor, The (1999)",Drama|Sci-Fi|Thriller,False
685094,4682,1094,4,964566489,M,25,7,5346,"Crying Game, The (1992)",Drama|Romance|War,False


In [45]:
movielens_train.to_csv('datasets/movielens_train.csv')
movielens_test.to_csv('datasets/movielens_test.csv')
movielens.to_csv('datasets/movielens.csv')

## Evaluation Model

#### RMSE
#### Precision/ Recall/ F-score

In [46]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

In [47]:
def evaluate(estimate_f):
    """ RMSE-based predictive performance evaluation with pandas. """
    ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id)
    estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])
    real = movielens_test.rating.values
    return compute_rmse(estimated, real)

In [52]:
def estimate1(user_id, item_id):
    """ Simple content-filtering based on mean ratings. """
    return movielens_train.loc[movielens_train.user_id == user_id, 'rating'].mean()

print ('RMSE for estimate1: %s' % evaluate(estimate1))

RMSE for estimate1: 1.227093899694165


In [51]:
def estimate2(user_id, movie_id):
    """ Simple collaborative filter based on mean ratings. """
    ratings_by_others = movielens_train[movielens_train.movie_id == movie_id]
    if ratings_by_others.empty: return 3.0
    return ratings_by_others.rating.mean()

print ('RMSE for estimate2: %s' % evaluate(estimate2))

RMSE for estimate2: 1.1253049972938671


## The above RMSE is cool but can we do better? let's try.

#### Let's do pivoting.

In [53]:
ratings_mtx_df = movielens_train.pivot_table(values='rating',
                                             index='user_id',
                                             columns='movie_id')

In [57]:
ratings_mtx_df.iloc[-15:, :15]

movie_id,1,2,3,4,5,6,7,9,10,11,12,14,19,21,22
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
6000,,,,,,,,,,,,,,,
6002,,,,,,,,,,,,,,,
6003,,,,,,,,,,,,,,,
6007,,,,,,,,,,,,,,,
6010,,,,,,,,,,,,,,,
6011,,,,,,,,,,,,,,,
6016,,,,,,,,,,,,,,,
6022,5.0,,,,,,,,,,,,,,
6023,,,,,,,,,,,,,,,
6030,,,,,,,,,,,,,,,


In [58]:
def pearson(s1, s2):
    """Take two pd.Series objects and return a pearson correlation."""
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    return np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2))

In [72]:
class CollabFiltering:
    """ Collaborative filtering using a custom sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        self.all_user_profiles = movielens.pivot_table('rating', index='movie_id', columns='user_id')

    def estimate(self, user_id, movie_id):
        """ Ratings weighted by correlation similarity. """
        ratings_by_others = movielens_train[movielens_train.movie_id == movie_id]
        if ratings_by_others.empty: return 3.0
        ratings_by_others.set_index('user_id', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: pearson(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        ratings_sims = ratings_sims[ ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        else:
            return np.average(ratings_sims.rating, weights=ratings_sims.sim)
        


In [73]:
reco = CollabFiltering()
reco.learn()
print ('RMSE for CollabFiltering: %s' % evaluate(reco.estimate) )



RMSE for CollabFiltering: 1.0783979099323924


## This was the collaborative Filtering based Recommendation system. We can further optimize this using genres as features.