# Movie Recommendation System (Collaborative filtering)

In [1]:
import os
import urllib.request
import zipfile
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

# About the data
This dataset (ml-latest) describes 5-star rating and free-text tagging activity from [MovieLens](http://movielens.org), a movie recommendation service. It contains 22884377 ratings and 586994 tag applications across 34208 movies. These data were created by 247753 users between January 09, 1995 and January 29, 2016. This dataset was generated on January 29, 2016.

Source: https://grouplens.org/datasets/movielens/

#### Download

In [2]:
DOWNLOAD_ROOT = "https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/"
DATASETS_PATH = os.path.join("Datasets")
FILENAME = "moviedataset.zip"
DOWNLOAD_URL = DOWNLOAD_ROOT + FILENAME

def fetch_data():
    if not os.path.isdir(DATASETS_PATH):
        os.makedirs(DATASETS_PATH)
    zip_path = os.path.join(DATASETS_PATH, FILENAME)
    urllib.request.urlretrieve(DOWNLOAD_URL, zip_path)
    with zipfile.ZipFile(zip_path,"r") as zip_ref:
        zip_ref.extractall(DATASETS_PATH)
        
def load_data():
    movies_path = os.path.join(DATASETS_PATH, "ml-latest", "movies.csv")
    ratings_path = os.path.join(DATASETS_PATH, "ml-latest", "ratings.csv")
    movies = pd.read_csv(movies_path)
    ratings = pd.read_csv(ratings_path)
    return movies, ratings

In [3]:
#fetch_data()

In [4]:
movies, ratings = load_data()

#### Quick view

In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34208 entries, 0 to 34207
Data columns (total 3 columns):
movieId    34208 non-null int64
title      34208 non-null object
genres     34208 non-null object
dtypes: int64(1), object(2)
memory usage: 801.9+ KB


In [7]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [8]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22884377 entries, 0 to 22884376
Data columns (total 4 columns):
userId       int64
movieId      int64
rating       float64
timestamp    int64
dtypes: float64(1), int64(3)
memory usage: 698.4 MB


# Preprocessing
#### Size compression
The size of *ratings* is unnecessarily big. It is possible to save extra space by changing the data type. 

You can see below that int64 numbers are too big:
* Int16 -- (-32,768 to +32,767) 

* Int32 -- (-2,147,483,648 to +2,147,483,647) 

* Int64 -- (-9,223,372,036,854,775,808 to +9,223,372,036,854,775,807) 

Int32 and float16 would be enough for this dataset.

In [9]:
class DfCompressor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        if 'timestamp' in X:
            X.drop('timestamp', axis=1, inplace=True)
        X = X.astype({'userId': 'int32', 'movieId':'int32', 'rating':'float16'})
        return X


In [10]:
compressor = DfCompressor()
ratings = compressor.fit_transform(ratings)
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22884377 entries, 0 to 22884376
Data columns (total 3 columns):
userId     int32
movieId    int32
rating     float16
dtypes: float16(1), int32(2)
memory usage: 218.2 MB


Excellent, changing the data type saved around 500 MB

#### Representative ratings
To be sure in movies quality it is good to count the *total amount* of ratings of each film.

In [11]:
class MovieTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, countRatings=None, showYear=False, deleteGenres=False):
        self.countRatings = countRatings        
        self.showYear = showYear
        self.deleteGenres = deleteGenres
    
    def fit(self, X, y=None):
        return self
        
    def transform(self, X, y=None):
        if self.countRatings is not None:
            countedRatings = pd.merge(self.countRatings, X, on="movieId")
            countedRatings = countedRatings.groupby('movieId').rating.count().reset_index()
            countedRatings.columns = ['movieId', 'countedRatings']
            X = X.merge(countedRatings)
        if self.showYear and 'year' not in X:
                X['year'] = X.title.str.extract(r'(\(\d\d\d\d\))', expand=False)
                X['year'] = X.year.str.extract(r'(\d\d\d\d)', expand=False)
                X['year'] = pd.to_numeric(X['year'], errors='raise')                
                X = X.dropna()
                X = X.astype({'year':'int16'})
        if self.deleteGenres and 'genres' in X:
            X = X.drop('genres', axis=1)        
        return X
    

In [12]:
transformer = MovieTransformer(countRatings=ratings, showYear=True)
movies = transformer.fit_transform(movies)

print('Top-5 most rated movies:')
movies.sort_values(by="countedRatings", ascending=False).head()

Top-5 most rated movies:


Unnamed: 0,movieId,title,genres,countedRatings,year
352,356,Forrest Gump (1994),Comedy|Drama|Romance|War,81296,1994
293,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,79091,1994
315,318,"Shawshank Redemption, The (1994)",Crime|Drama,77887,1994
587,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,76271,1991
476,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,69545,1993


In [13]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33602 entries, 0 to 33669
Data columns (total 5 columns):
movieId           33602 non-null int64
title             33602 non-null object
genres            33602 non-null object
countedRatings    33602 non-null int64
year              33602 non-null int16
dtypes: int16(1), int64(2), object(2)
memory usage: 1.3+ MB


Okay, now it's time to build a recommending system.

# Colaborative filtering

#### Current user's input

In [81]:
userInput = [
            {'title':'(500) Days of Summer (2009)', 'rating':5}, # Max rating is 5.0
            {'title':'Boy in the Striped Pajamas, The (Boy in the Striped Pyjamas, The) (2008)', 'rating':5},
            {'title':'Into the Wild (2007)', 'rating':5},
            {'title':'Fight Club (1999)', 'rating':3.5},
            {'title':'Back to the Future (1985)', 'rating':2.5},
            {'title':'Professional, The (Le professionnel) (1981)', 'rating':2},     
            {'title':'Dead Alive (Braindead) (1992)', 'rating':1},   
            {'title':'Harry Potter and the Sorcerer\'s Stone (a.k.a. Harry Potter and the Philosopher\'s Stone) (2001)', 'rating':3},   
            {'title':'Movie 43 (2013)', 'rating':2},   
            {'title':'American History X (1998)', 'rating':4.5},   
            {'title':'Grand Budapest Hotel, The (2014)', 'rating':5},   
            {'title':'Basketball Diaries, The (1995)', 'rating':4},   
            {'title':'Scott Pilgrim vs. the World (2010)', 'rating':4.5}, 
         ] 
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,(500) Days of Summer (2009),5.0
1,"Boy in the Striped Pajamas, The (Boy in the St...",5.0
2,Into the Wild (2007),5.0
3,Fight Club (1999),3.5
4,Back to the Future (1985),2.5
5,"Professional, The (Le professionnel) (1981)",2.0
6,Dead Alive (Braindead) (1992),1.0
7,Harry Potter and the Sorcerer's Stone (a.k.a. ...,3.0
8,Movie 43 (2013),2.0
9,American History X (1998),4.5


In [82]:
# Use this tool to find a movie's full title:
print(list(movies[movies['title'].str.contains('Scott Pilgrim')]['title'].values))

['Scott Pilgrim vs. the World (2010)']


In [83]:
# Adding movieId
inputMovies = pd.merge(inputMovies, movies)
inputMovies = inputMovies[['movieId', 'title', 'genres', 'countedRatings', 'rating']]
inputMovies = inputMovies.sort_values(by='movieId').reset_index(drop=True)
inputMovies

Unnamed: 0,movieId,title,genres,countedRatings,rating
0,147,"Basketball Diaries, The (1995)",Drama,4692,4.0
1,1241,Dead Alive (Braindead) (1992),Comedy|Fantasy|Horror,2740,1.0
2,1270,Back to the Future (1985),Adventure|Comedy|Sci-Fi,49420,2.5
3,2329,American History X (1998),Crime|Drama,27261,4.5
4,2959,Fight Club (1999),Action|Crime|Drama|Thriller,48879,3.5
5,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,Adventure|Children|Fantasy,19977,3.0
6,5782,"Professional, The (Le professionnel) (1981)",Action|Drama|Thriller,4773,2.0
7,55247,Into the Wild (2007),Action|Adventure|Drama,6930,5.0
8,64034,"Boy in the Striped Pajamas, The (Boy in the St...",Drama|War,1700,5.0
9,69757,(500) Days of Summer (2009),Comedy|Drama|Romance,8159,5.0


#### The users who have seen the same movies

In [84]:
userSubset = ratings[ratings['movieId'].isin(inputMovies['movieId'])]
userSubset

Unnamed: 0,userId,movieId,rating
118,4,2959,4.0
246,7,2959,4.0
287,9,69757,3.5
602,13,2959,4.0
867,15,1270,3.5
...,...,...,...
22883817,247741,2959,4.0
22884163,247751,1270,4.0
22884183,247751,2329,4.0
22884193,247751,2959,4.5


I separated users to different dataframes. So it will be easier to iterate through them and calculate their similarity with current user.

In [85]:
userSubsetGroup = userSubset.groupby(['userId'])
#userSubsetGroup.get_group(247753)

To save time on computations I will take only 100 users. 

But first, let's sort groups so the users that share the most movies in common will be in these 100.

In [86]:
userSubsetGroup = sorted(userSubsetGroup, key=lambda x: len(x[1]), reverse=True)
userSubsetGroup[0:3]

[(36946,          userId  movieId  rating
  3402711   36946      147     2.5
  3403155   36946     1270     5.0
  3403554   36946     2329     3.0
  3403811   36946     2959     5.0
  3404429   36946     4896     4.0
  3404642   36946     5782     3.0
  3405729   36946    55247     3.0
  3405922   36946    64034     1.5
  3406025   36946    69757     4.0
  3406189   36946    79702     5.0
  3406512   36946   100083     0.5
  3406646   36946   109374     4.0), (46750,          userId  movieId  rating
  4320815   46750      147     2.5
  4321539   46750     1241     4.0
  4321568   46750     1270     4.5
  4322265   46750     2329     3.5
  4322735   46750     2959     4.0
  4324112   46750     4896     4.5
  4324692   46750     5782     3.0
  4327487   46750    55247     4.0
  4327837   46750    64034     2.0
  4327946   46750    69757     4.0
  4328085   46750    79702     4.0
  4328224   46750   109374     4.5), (74551,          userId  movieId  rating
  6953282   74551      147     3

In [87]:
userSubsetGroup = userSubsetGroup[0:100]

<br>

#### Computing users similarity 

I will use Pearson Correlation Coefficient to compare users in userSubsetGroup to the current user. 

Pearson correlation is invariant to scaling, i.e. multiplying all elements by a nonzero constant or adding any constant to all elements. For example, if you have two vectors X and Y,then, pearson(X, Y) == pearson(X, 2 * Y + 3). This is a pretty important property in recommendation systems because for example two users might rate two series of items totally different in terms of absolute rates, but they would be similar users (i.e. with similar ideas) with similar rates in various scales .

Equation:
![image.png](https://wikimedia.org/api/rest_v1/media/math/render/svg/869d208d19ba2481a306aa5b2829d6f147215f22)
Where:
![image.png](https://wikimedia.org/api/rest_v1/media/math/render/svg/ac7289290243ac81a5db64d7ad3e75c72536941d)

In [88]:
pearsonCorrDict = {}
inputMovies = inputMovies.sort_values(by='movieId')

for name, group in userSubsetGroup:
    group = group.sort_values(by='movieId')
    matchingMovies = inputMovies[inputMovies['movieId'].isin(group['movieId'])]  # movies that BOTH users have watched
    tempRatingList = matchingMovies['rating'].tolist()
    tempGroupList = group['rating'].tolist()    
    
    n = len(group)    
    Sxy = sum([i*j for i, j in zip(tempRatingList, tempGroupList)]) - sum(tempRatingList)*sum(tempGroupList)/float(n)
    Sxx = sum([i**2 for i in tempRatingList]) - (sum(tempRatingList)**2)/float(n)
    Syy = sum([i**2 for i in tempGroupList]) - (sum(tempGroupList)**2)/float(n)   
    
    if Sxx!=0 and Syy!=0:
        pearsonCorrDict[name] = Sxy/((Sxx*Syy)**0.5)
    else:
        pearsonCorrDict[name] = 0

In [89]:
pearson_df = pd.DataFrame.from_dict(pearsonCorrDict, orient="index")
pearson_df.columns = ['similarityIndex']
pearson_df['userId'] = pearson_df.index
pearson_df = pearson_df.reset_index(drop=True)
pearson_df.head()

Unnamed: 0,similarityIndex,userId
0,0.121023,36946
1,-0.154969,46750
2,-0.270501,74551
3,0.391516,84039
4,0.027354,204165


In [90]:
pearson_df.shape

(100, 2)

#### Most similar users

In [158]:
# Change similarityIndex filter to get more accurate predictions but less users. But it should be always > 0
topUsers = pearson_df[pearson_df.similarityIndex > 0.5].sort_values(by="similarityIndex", ascending=False)
topUsers.head()

Unnamed: 0,similarityIndex,userId
97,0.679786,4280
80,0.615134,177048
5,0.55301,215174
9,0.540277,54133


In [159]:
topUsers.shape

(4, 2)

#### Movies that similar users rated

In [169]:
topUsersRating = pd.merge(topUsers, ratings, left_on='userId', right_on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,0.679786,4280,1,3.5
1,0.679786,4280,2,4.0
2,0.679786,4280,10,3.5
3,0.679786,4280,19,3.0
4,0.679786,4280,29,4.0


In [170]:
topUsersRating.shape

(6228, 4)

In [171]:
topUsersRating['weightedRating'] = topUsersRating.similarityIndex * topUsersRating.rating
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,0.679786,4280,1,3.5,2.379252
1,0.679786,4280,2,4.0,2.719145
2,0.679786,4280,10,3.5,2.379252
3,0.679786,4280,19,3.0,2.039359
4,0.679786,4280,29,4.0,2.719145


In [172]:
topUsersRating = topUsersRating.groupby('movieId').sum()
topUsersRating = topUsersRating[['similarityIndex', 'weightedRating']]
topUsersRating = topUsersRating.sort_values(by='weightedRating')
topUsersRating.head()

Unnamed: 0_level_0,similarityIndex,weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
96448,0.540277,0.270139
2555,0.55301,0.276505
47810,0.55301,0.55301
3754,0.55301,0.55301
33499,0.55301,0.55301


In [173]:
topUsersRating.shape

(4049, 2)

# Building recommendations

In [174]:
movieScore = pd.DataFrame()
movieScore['average_score'] = topUsersRating['weightedRating'] / topUsersRating['similarityIndex']
movieScore['movieId'] = movieScore.index
movieScore = movieScore.reset_index(drop=True).sort_values(by='average_score', ascending=False)

movieScore.head()

Unnamed: 0,average_score,movieId
3868,5.0,3147
2460,5.0,90474
2514,5.0,26064
2525,5.0,26052
2524,5.0,1952


In [175]:
recommendations_df.shape

(15127, 2)

In [178]:
recommendations = pd.merge(movies, movieScore)
recommendations = recommendations[~recommendations['movieId'].isin(inputMovies['movieId'])]  # exclude watched movies
recommendations = recommendations.sort_values(by='average_score', ascending=False)

# Feel free to use different combinations of filtering by age, average_score, num of ratings, etc.
recommendations = recommendations[(recommendations.countedRatings > 1000)&(recommendations.year > 1990)][:50]
recommendations

Unnamed: 0,movieId,title,genres,countedRatings,year,average_score
847,3147,"Green Mile, The (1999)",Crime|Drama,24849,1999,5.0
2721,61236,Waltz with Bashir (Vals im Bashir) (2008),Animation|Documentary|Drama|War,1310,2008,5.0
2760,63072,"Road, The (2009)",Adventure|Drama|Thriller,2024,2009,5.0
2751,62511,"Synecdoche, New York (2008)",Comedy|Drama,1108,2008,5.0
872,3246,Malcolm X (1992),Drama,5113,1992,5.0
879,3260,Howards End (1992),Drama,3036,1992,5.0
2708,60766,Man on Wire (2008),Documentary,1532,2008,5.0
798,2966,"Straight Story, The (1999)",Adventure|Drama,3984,1999,5.0
840,3108,"Fisher King, The (1991)",Comedy|Drama|Fantasy|Romance,8450,1991,5.0
811,3006,"Insider, The (1999)",Drama|Thriller,9994,1999,5.0
