In [50]:
import pandas as pd

## 1. Load the small MovieLens dataset

In [51]:
# Read the ratings file with its second column as the index, then move that
# column back to the front and discard the timestamp, leaving the columns
# movieId, userId, rating.
user_rating_init = (
    pd.read_csv("./data/ml-latest-small/ratings.csv", index_col=1)
    .reset_index()
    .drop(columns=['timestamp'])
)
user_rating_init.head(5)

Unnamed: 0,movieId,userId,rating
0,1,1,4.0
1,3,1,4.0
2,6,1,4.0
3,47,1,5.0
4,50,1,5.0


## 2. Edit data

### 2.1 Delete all movies with fewer than 20 votes

In [52]:
# Reshape the long ratings table into a wide matrix:
# one row per userId, one column per movieId, cell value = rating
# (NaN where the user did not rate the movie).
user_item_init = user_rating_init.pivot(
    index='userId', columns='movieId', values='rating'
)
user_item_init

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [53]:
# Boolean mask over the columns (movieIds): True where a movie has been
# rated by at least 20 users.
mask_at_least_20_evals = user_item_init.notna().sum() >= 20
# Keep only those movie columns. Selecting columns directly with the mask
# avoids transposing the whole matrix twice just to drop rows.
user_item_init = user_item_init.loc[:, mask_at_least_20_evals]
user_item_init.head(5)

movieId,1,2,3,5,6,7,10,11,16,17,...,122920,122922,134130,134853,139385,148626,152081,164179,166528,168252
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,4.0,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


### 2.2 Add movie titles

In [54]:
# Read movie titles and genres into a dataframe indexed by movieId
# (remaining columns: title, genres).
movie_genre = pd.read_csv("./data/ml-latest-small/movies.csv", index_col='movieId')
movie_genre.head(5)

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [55]:
# Join titles onto the movie-indexed (transposed) rating matrix via movieId,
# then substitute the movieId with the movie title and flip back so that
# users are rows and movie titles are columns.
user_item = (
    movie_genre
    .merge(user_item_init.T, left_index=True, right_index=True)
    .drop(columns='genres')
    .reset_index(drop=True)
    .set_index('title')
    .T
)
# the transposed index holds the user ids but lost its label in the merge
user_item.index.name = 'userId'
user_item.head(5)


title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),GoldenEye (1995),"American President, The (1995)",Casino (1995),Sense and Sensibility (1995),...,Captain America: Civil War (2016),Doctor Strange (2016),The Martian (2015),Inside Out (2015),The Revenant (2015),"Big Short, The (2015)",Zootopia (2016),Arrival (2016),Rogue One: A Star Wars Story (2016),Logan (2017)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,4.0,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


## 3. Create a simple movie recommender with collaborative filtering

In [56]:
# Example request passed to the recommender.
# Only 'userId' is read by recommend_popular; the other fields are carried along unused.
query = dict(
    userId=123,
    length='120',
    genre='thriller',
    age=34,
    time='20:45',
)
query

{'userId': 123,
 'length': '120',
 'genre': 'thriller',
 'age': 34,
 'time': '20:45'}

In [57]:
# collaborative filtering = look at ratings only!
def recommend_popular(query, ratings, k=10, min_ratings=100):
    """
    Recommend the k best-rated popular movies the querying user has not rated.

    Parameters
    ----------
    query : dict
        Request description; only ``query['userId']`` is used.
    ratings : pandas.DataFrame
        User-item matrix: one row per user, one column per movie,
        NaN where a user has not rated a movie.
    k : int, default 10
        Maximum number of recommendations to return.
    min_ratings : int, default 100
        A movie must have at least this many ratings to be a candidate.

    Returns
    -------
    pandas.DataFrame
        At most ``k`` rows, indexed by movie (index name ``'title'``), with a
        single column ``'avg_rating'`` sorted from highest to lowest.

    Raises
    ------
    KeyError
        If ``query`` has no ``'userId'`` or the user is not a row of ``ratings``.
    """
    user_id = query['userId']
    if user_id not in ratings.index:
        raise KeyError(f"userId {user_id!r} not found in ratings")

    # 1. candidate generation
    # movies the user has not rated yet ...
    unseen = ratings.loc[user_id].isna()
    # ... that were rated by at least `min_ratings` users
    popular = ratings.count() >= min_ratings

    # 2. scoring
    # average rating per movie, restricted to the candidates; masking on the
    # column labels works whether or not the column axis is named 'title'
    avg_rating = ratings.mean()[unseen & popular]

    # 3. ranking
    scoring = avg_rating.sort_values(ascending=False).to_frame('avg_rating')
    scoring.index.name = 'title'
    # return the top-k highest rated movies the user hasn't seen yet
    return scoring.head(k)

In [58]:
# Ask the recommender for the 15 top-rated popular movies this user has not rated
scoring = recommend_popular(query=query, ratings=user_item, k=15)
scoring


Unnamed: 0_level_0,avg_rating
title,Unnamed: 1_level_1
"Godfather, The (1972)",4.289062
"Godfather: Part II, The (1974)",4.25969
"Departed, The (2006)",4.252336
Goodfellas (1990),4.25
Casablanca (1942),4.24
"Dark Knight, The (2008)",4.238255
"Usual Suspects, The (1995)",4.237745
"Princess Bride, The (1987)",4.232394
Schindler's List (1993),4.225
Apocalypse Now (1979),4.219626
