# Recommender Systems
This notebook uses popularity ranking, user-based collaborative filtering, and item-based collaborative filtering to recommend movies from user or item inputs. The recommenders are scored with several metrics on an offline train-test split.

## Imports

In [1]:
%pip install lifelines

Collecting lifelines
  Downloading lifelines-0.27.6-py3-none-any.whl (409 kB)
     -------------------------------------- 409.4/409.4 kB 5.1 MB/s eta 0:00:00
Collecting autograd-gamma>=0.3
  Downloading autograd-gamma-0.5.0.tar.gz (4.0 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting formulaic>=0.2.2
  Downloading formulaic-0.6.0-py3-none-any.whl (82 kB)
     ---------------------------------------- 82.1/82.1 kB ? eta 0:00:00
Collecting autograd>=1.5
  Downloading autograd-1.5-py3-none-any.whl (48 kB)
     ---------------------------------------- 48.9/48.9 kB 2.4 MB/s eta 0:00:00
Collecting interface-meta>=1.2.0
  Downloading interface_meta-1.3.0-py3-none-any.whl (14 kB)
Collecting astor>=0.8
  Downloading astor-0.8.1-py2.py3-none-any.whl (27 kB)
Building wheels for collected packages: autograd-gamma
  Building wheel for autograd-gamma (setup.py): started
  Building wheel for autograd-gamma (setup.py): finished with st



In [1]:
# Importing Libraries

from movie_rec_utils import *

import pandas as pd
from ydata_profiling import ProfileReport
import math
import numpy as np
import matplotlib.pyplot as plt
import difflib

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import r2_score, mean_absolute_error
from lifelines.utils import concordance_index

In [2]:
# Reading Data
# Load the four CSV tables from ../Data in one pass; the paths are listed in the
# same order as the names they are unpacked into.
links_df, movies_df, ratings_df, tags_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/links.csv',
        '../Data/movies.csv',
        '../Data/ratings.csv',
        '../Data/tags.csv',
    )
)

## Data Exploration

In [None]:
# Creating y-data profiling reports as html file
# ProfileReport.to_file() writes the report to disk and returns None, so the report
# object is built first and exported in a separate step; chaining the two calls
# would leave movies_profile / ratings_profile bound to None.
movies_profile = ProfileReport(movies_df, minimal=True)
movies_profile.to_file('movies_report.html')

ratings_profile = ProfileReport(ratings_df, minimal=True)
ratings_profile.to_file('ratings_report.html')

# Recommenders

## Popularity Ranking

### Average Ranking, Filtered with Review Minimum

In [8]:
# Ten movies with the highest average rating, restricted to movies that have
# more ratings than review_thresh
pop_movies = get_pop_rev(
    ratings_df=ratings_df, movies_df=movies_df, n=10, review_thresh=30
)

pop_movies

0                     Shawshank Redemption, The (1994)
1                            Lawrence of Arabia (1962)
2                                Godfather, The (1972)
3                                    Fight Club (1999)
4                                Cool Hand Luke (1967)
5    Dr. Strangelove or: How I Learned to Stop Worr...
6                                   Rear Window (1954)
7                       Godfather: Part II, The (1974)
8                                 Departed, The (2006)
9                     Manchurian Candidate, The (1962)
Name: title, dtype: object

### Laplace Inspired Data Manipulation

In [5]:
# Every movie receives a few extra negative reviews before averaging, which penalises
# movies with few reviews; the top average ratings are then taken.
# Inspired by Laplace's Rule of Succession.
# (presumably `rat` is the value of each added review and `num_fake` how many are added;
#  confirm in movie_rec_utils)
pop_movies = get_pop_laplace(
    ratings_df=ratings_df, movies_df=movies_df, n=10, rat=0.5, num_fake=2
)
pop_movies

0                     Shawshank Redemption, The (1994)
1                                Godfather, The (1972)
2                                    Fight Club (1999)
3                       Godfather: Part II, The (1974)
4            Star Wars: Episode IV - A New Hope (1977)
5                           Usual Suspects, The (1995)
6    Dr. Strangelove or: How I Learned to Stop Worr...
7                              Schindler's List (1993)
8                                    Goodfellas (1990)
9                              Dark Knight, The (2008)
Name: title, dtype: object

### Cumulative Rating

In [3]:
# Rank by the sum of all ratings a movie received, which privileges both movies
# with more reviews and movies with higher reviews
pop_movies = get_pop_cumulative(
    ratings_df=ratings_df, movies_df=movies_df, n=10
)
pop_movies

0             Shawshank Redemption, The (1994)
1                          Forrest Gump (1994)
2                          Pulp Fiction (1994)
3                           Matrix, The (1999)
4             Silence of the Lambs, The (1991)
5    Star Wars: Episode IV - A New Hope (1977)
6                            Braveheart (1995)
7                            Fight Club (1999)
8                      Schindler's List (1993)
9                         Jurassic Park (1993)
Name: title, dtype: object

### Comparing Popularity Methods

In [5]:
# Side-by-side comparison of the three popularity rankings, one column per method.
n = 10

# Dict entries are evaluated top to bottom, so the three recommenders run in the
# order listed here.
pop_titles = {
    'Average Review with Review Count Threshold Method': get_pop_rev(
        ratings_df=ratings_df, movies_df=movies_df, n=n, review_thresh=30
    ),
    'Laplace Inspired Method': get_pop_laplace(
        ratings_df=ratings_df, movies_df=movies_df, n=n, rat=0.5, num_fake=2
    ),
    'Cumulative Total Ratings Method': get_pop_cumulative(
        ratings_df=ratings_df, movies_df=movies_df, n=n
    ),
}

# pop_movies ends this cell holding the cumulative-ratings ranking.
pop_movies = pop_titles['Cumulative Total Ratings Method']

pop_df = pd.DataFrame(pop_titles)
pop_df

Unnamed: 0,Average Review with Review Count Threshold Method,Laplace Inspired Method,Cumulative Total Ratings Method
0,"Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)","Shawshank Redemption, The (1994)"
1,Lawrence of Arabia (1962),"Godfather, The (1972)",Forrest Gump (1994)
2,"Godfather, The (1972)",Fight Club (1999),Pulp Fiction (1994)
3,Fight Club (1999),"Godfather: Part II, The (1974)","Matrix, The (1999)"
4,Cool Hand Luke (1967),Star Wars: Episode IV - A New Hope (1977),"Silence of the Lambs, The (1991)"
5,Dr. Strangelove or: How I Learned to Stop Worr...,"Usual Suspects, The (1995)",Star Wars: Episode IV - A New Hope (1977)
6,Rear Window (1954),Dr. Strangelove or: How I Learned to Stop Worr...,Braveheart (1995)
7,"Godfather: Part II, The (1974)",Schindler's List (1993),Fight Club (1999)
8,"Departed, The (2006)",Goodfellas (1990),Schindler's List (1993)
9,"Manchurian Candidate, The (1962)","Dark Knight, The (2008)",Jurassic Park (1993)


## Collaborative Filtering

### Item-Based Collaborative Filtering

In [5]:
# Seven item-based recommendations for a free-text movie title.
# (presumably shared_thresh / total_thresh are minimum rating counts used to filter
#  candidates; confirm in movie_rec_utils)
# NOTE(review): the stored output of this cell includes numpy warning lines that
# reference a cov() computation; worth checking how candidate pairs with very few
# shared ratings are handled.
similar_movies = item_based_rec(
    title='Nausicaä of the valley of the wind',
    ratings_df=ratings_df,
    movies_df=movies_df,
    n=7,
    shared_thresh=5,
    total_thresh=10,
    more_data=False,
)

similar_movies

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


0    Monty Python's And Now for Something Completel...
1                             Road to Perdition (2002)
2                                          Thor (2011)
3                      X-Men Origins: Wolverine (2009)
4                       Guardians of the Galaxy (2014)
5                            North by Northwest (1959)
6                                   Blue Velvet (1986)
Name: title, dtype: object

### User-Based Collaborative Filtering

In [12]:
# Five user-based recommendations for the user with id 25
user_recs = user_based_rec(
    user_id=25, ratings_df=ratings_df, movies_df=movies_df, n=5, more_data=False
)
user_recs

0                                   Matrix, The (1999)
1                     Shawshank Redemption, The (1994)
2                                  Forrest Gump (1994)
3    Lord of the Rings: The Fellowship of the Ring,...
4                                    Fight Club (1999)
Name: title, dtype: object

#### Evaluating with Offline Methods

In [4]:
# Splitting Training and Testing Data
# 10% of the (userId, movieId, rating) rows are held out; the fixed random_state
# keeps the split the same across runs.
train, test = train_test_split(
    ratings_df[['userId', 'movieId', 'rating']],
    test_size=0.1,
    random_state=42,
)

# Creating Training Data Frame
# (presumably a user-by-item matrix built from the training rows; confirm in movie_rec_utils)
useritem_train = create_train(ratings_df, train)

In [5]:
# Computing User Cosine Similarity
# With a single argument cosine_similarity compares every row of the matrix with
# every other row, so the result is square and is labelled on both axes with
# useritem_train's index.
cos_sim_df = pd.DataFrame(
    cosine_similarity(useritem_train),
    index=useritem_train.index,
    columns=useritem_train.index,
)

In [6]:
# Predicting/Estimating Ratings for Test Data Using Custom Function and Similarity Matrix
# `test` is the frame returned by train_test_split; writing a new column into it in
# place can raise pandas' SettingWithCopyWarning. .assign() returns a new frame with
# the extra column instead, and re-running this cell gives the same result.
test = test.assign(
    estimated_rating=test.apply(
        lambda row: estimate_rating(useritem_train, row['userId'], row['movieId'], cos_sim_df),
        axis=1,
    )
)

In [7]:
# Computing Performance Metrics
# Compares the held-out true ratings with the estimated ones.
# NOTE(review): the stored output shows a strongly negative R Squared (about -8.1),
# i.e. worse than always predicting the mean rating; verify estimate_rating
# before relying on these scores.
scores_df = score_est(
    test['rating'],
    test['estimated_rating'],
)
display(scores_df)

Unnamed: 0,Mean Absolute Error,Concordance Index,R Squared
0,2.99641,0.605972,-8.108931
