# Popularity Based Recommender

## Imports

In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
%matplotlib inline

In [2]:
import os

# Sanity check: show which files are available in the data directory.
data_dir_listing = os.listdir("./data")
print(data_dir_listing)

['movie.csv', 'rating.csv']


## Data Preprocessing

In [3]:
# Load the movie metadata table and preview its first five rows.
movies = pd.read_csv(filepath_or_buffer='./data/movie.csv')
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Split the raw "Title (YYYY)" strings into a clean title and a separate year
# column, and turn the pipe-separated genre string into a list of genres.
#
# The guard keeps the cell safe to re-run: once the titles have been stripped,
# a second pass would find no "(YYYY)" to extract and would overwrite `year`
# (and the already-split `genres`) with missing values.
if 'year' not in movies.columns:
    # One capture group pulls the four digits out of the first "(YYYY)" match;
    # titles without such a pattern get NaN.
    movies['year'] = movies.title.str.extract(r'\((\d{4})\)', expand=False)

    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as a literal string by default, which would leave the year in the title.
    movies['title'] = (movies.title.str.replace(r'\(\d{4}\)', '', regex=True)
                                   .str.strip())

    movies['genres'] = movies.genres.str.split('|')

movies.head()

  


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [5]:
# Inspect dtypes and non-null counts; missing values in 'year' correspond to
# titles in which no "(YYYY)" pattern was found during preprocessing.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
 3   year     27256 non-null  object
dtypes: int64(1), object(3)
memory usage: 852.6+ KB


In [6]:
# Load only the three columns needed for the popularity rankings, using
# 32-bit dtypes to keep the in-memory footprint of the ratings table down.
ratings = pd.read_csv(
    './data/rating.csv',
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': np.int32, 'movieId': np.int32, 'rating': np.float32},
)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


To reduce memory usage further, we multiply the `rating` column by 2 so that every value becomes an integer (e.g. 3.5 → 7), and then cast the column to `np.int8`.

In [7]:
# Store ratings as small integers: doubling turns half-step values into whole
# numbers (3.5 -> 7), which are then cast to int8.
# The dtype guard makes the cell safe to re-run; without it, every
# re-execution would double the already-converted values again.
if ratings['rating'].dtype != np.int8:
    ratings['rating'] = (ratings['rating'] * 2).astype(np.int8)
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 3 columns):
 #   Column   Dtype
---  ------   -----
 0   userId   int32
 1   movieId  int32
 2   rating   int8 
dtypes: int32(2), int8(1)
memory usage: 171.7 MB


In [8]:
# Preview the converted ratings: values are now twice the original score,
# stored as int8.
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,2,7
1,1,29,7
2,1,32,7
3,1,47,7
4,1,50,7


## Popularity Based Approach

The popularity-based algorithm finds the most popular movies and recommends them to users.

### 1. Based on the number of users rating a movie (most rated)

In [11]:
# Rank movies by how many ratings they received.
# count() tallies the non-null ratings per movie; after sorting, the count
# column is only needed for ordering, so it is dropped once the movie metadata
# has been attached. The default inner merge keeps the left frame's row order,
# so the ranking survives the join.
most_voted = (
    ratings.groupby('movieId')[['rating']]
    .count()
    .sort_values('rating', ascending=False)
    .reset_index()
    .merge(movies, on='movieId')
    .drop(columns='rating')
)
most_voted.head()

Unnamed: 0,movieId,title,genres,year
0,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994
1,356,Forrest Gump,"[Comedy, Drama, Romance, War]",1994
2,318,"Shawshank Redemption, The","[Crime, Drama]",1994
3,593,"Silence of the Lambs, The","[Crime, Horror, Thriller]",1991
4,480,Jurassic Park,"[Action, Adventure, Sci-Fi, Thriller]",1993


### 2. Based on the total rating of each movie (sum of all its ratings)

In [10]:
# Rank movies by the sum of all ratings they received.
# The summed rating is used only for ordering and is dropped after the movie
# metadata has been joined on; the default inner merge keeps the left frame's
# row order, so the ranking is preserved.
total_vote = (
    ratings.groupby('movieId')[['rating']]
    .sum()
    .sort_values('rating', ascending=False)
    .reset_index()
    .merge(movies, on='movieId')
    .drop(columns='rating')
)
total_vote.head()

Unnamed: 0,movieId,title,genres,year
0,318,"Shawshank Redemption, The","[Crime, Drama]",1994
1,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994
2,356,Forrest Gump,"[Comedy, Drama, Romance, War]",1994
3,593,"Silence of the Lambs, The","[Crime, Horror, Thriller]",1991
4,260,Star Wars: Episode IV - A New Hope,"[Action, Adventure, Sci-Fi]",1977


### 3. Based on the average rating of the most-rated movies

To make the comparison fair, we only consider movies that have been rated by at least `n` users.

In [11]:
# Minimum number of ratings a movie needs before its average is considered.
n = 1000

# Aggregate once per movie. Previously the same groupby over the full ratings
# table was evaluated three separate times (sum, count, and count again for
# the filter below).
rating_stats = ratings.groupby('movieId')['rating'].agg(['sum', 'count'])

# Average rating of every movie, kept as a one-column frame indexed by movieId.
# The ratings were doubled earlier in the notebook, so the averages are on that
# doubled scale; this does not affect the ranking.
avg_vote = (rating_stats['sum'] / rating_stats['count']).to_frame('rating')

# Keep only movies rated at least n times; the boolean mask shares avg_vote's
# movieId index, so the selection aligns row by row.
avg_vote_n = avg_vote[rating_stats['count'] >= n]

# Sort by average, attach the movie metadata, and drop the helper column.
# merge() already returns a fresh 0..N-1 index, so no further reset is needed.
avg_vote_n = (avg_vote_n.sort_values('rating', ascending=False)
                        .reset_index()
                        .merge(movies, on='movieId')
                        .drop(columns='rating'))
avg_vote_n.head()

Unnamed: 0,movieId,title,genres,year
0,318,"Shawshank Redemption, The","[Crime, Drama]",1994
1,858,"Godfather, The","[Crime, Drama]",1972
2,50,"Usual Suspects, The","[Crime, Mystery, Thriller]",1995
3,527,Schindler's List,"[Drama, War]",1993
4,1221,"Godfather: Part II, The","[Crime, Drama]",1974
