# Making Recommendations Based on Popularity

## WBSFLIX dataset 

In [2]:
import numpy as np
import pandas as pd

In [5]:
df_links = pd.read_csv(r'links.csv')
df_movies = pd.read_csv(r'movies.csv')
df_ratings = pd.read_csv(r'ratings.csv')
df_tags = pd.read_csv(r'tags.csv')

In [9]:
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [8]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [11]:
df_tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [12]:
movies =  df_movies[['movieId', 'title']]
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


## Popularity/Quality based recommmender system

Let's group movies by rating, and look at their average rating. This is an explicit rating given by users.

In [14]:
rating = pd.DataFrame(df_ratings.groupby('movieId')['rating'].mean())
rating.sort_values("rating", ascending=False).head()

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
88448,5.0
100556,5.0
143031,5.0
143511,5.0
143559,5.0


The top rated movies have a perfect score of 5/5. But how many reviews do these movies have?

In [17]:
df_ratings.query("movieId==100556")

Unnamed: 0,userId,movieId,rating,timestamp
71951,462,100556,5.0,1456150743


We can also look at how many times each movie has received a rating. The ratings count is an implicit rating.

In [19]:
rating['rating_count'] = df_ratings.groupby('movieId')['rating'].count()
rating.sort_values("rating_count", ascending=False).head()

Unnamed: 0_level_0,rating,rating_count
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
356,4.164134,329
318,4.429022,317
296,4.197068,307
593,4.16129,279
2571,4.192446,278


Let's locate the most popular movies, and get some info about it:

In [20]:
# movieId of most popular movie
top_popular_movieId = rating.sort_values('rating_count', ascending=False).head(1).index[0]

# name of the most popular movie
movies[movies['movieId']==top_popular_movieId]

Unnamed: 0,movieId,title
314,356,Forrest Gump (1994)


In [23]:
# Tags of the most popular movie
df_movies[df_movies['movieId']==top_popular_movieId]

Unnamed: 0,movieId,title,genres
314,356,Forrest Gump (1994),Comedy|Drama|Romance|War


The most popular movie is "Forrest Gump (1994)",geners of Comedy|Drama|Romance|War has received 356 rating_count and it has an average rating of 4.

Find a hybrid system to sort movies, so that you can recommend the "best" movies: geners that are both high rated and popular.

In [24]:
df_ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


In [56]:
n=5
new_ratings = (
df_ratings.groupby('movieId').agg(avg_rating=('rating','mean'),n_ratings=('userId','count'))
                        .query(f'n_ratings > {n}')
                        .nlargest(10,'avg_rating')
)
new_ratings.head(3)

Unnamed: 0_level_0,avg_rating,n_ratings
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
177593,4.75,8
2239,4.666667,6
1041,4.590909,11


In [26]:
df_movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [43]:
new_ratings.merge(df_movies,how='left',on='movieId').merge(df_links,how='left',on='movieId').head()


Unnamed: 0,movieId,avg_rating,n_ratings,title,genres,imdbId,tmdbId
0,177593,4.75,8,"Three Billboards Outside Ebbing, Missouri (2017)",Crime|Drama,5027774,359940.0
1,2239,4.666667,6,Swept Away (Travolti da un insolito destino ne...,Comedy|Drama,73817,37916.0
2,1041,4.590909,11,Secrets & Lies (1996),Drama,117589,11159.0
3,106642,4.571429,7,"Day of the Doctor, The (2013)",Adventure|Drama|Sci-Fi,2779318,253941.0
4,3451,4.545455,11,Guess Who's Coming to Dinner (1967),Drama,61735,1879.0


In [54]:
def n_top_movies(n,movie_names=movies, movie_ratings=df_ratings, movie_genres=movies):
  threshold = 4
  n_top=n
  temp_movie=(
 movie_ratings.groupby('movieId').agg(avg_rating=('rating','mean'),n_ratings=('userId','count'))
                        .query(f'n_ratings > {threshold}')
                        .nlargest(n_top-1,'avg_rating')
  )
  return temp_movie.merge(df_movies,how='left',on='movieId').merge(df_links,how='left',on='movieId')[['title','genres','avg_rating']]

In [57]:
n_top_movies(10)

Unnamed: 0,title,genres,avg_rating
0,"Trial, The (Procès, Le) (1962)",Drama,4.9
1,"Three Billboards Outside Ebbing, Missouri (2017)",Crime|Drama,4.75
2,Memories of Murder (Salinui chueok) (2003),Crime|Drama|Mystery|Thriller,4.7
3,Swept Away (Travolti da un insolito destino ne...,Comedy|Drama,4.666667
4,Yi Yi (2000),Drama,4.6
5,Secrets & Lies (1996),Drama,4.590909
6,"Day of the Doctor, The (2013)",Adventure|Drama|Sci-Fi,4.571429
7,Guess Who's Coming to Dinner (1967),Drama,4.545455
8,Paths of Glory (1957),Drama|War,4.541667
