# Movie Recommender System

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

### Simple Recommender 
Based on popularity

In [None]:
# Load the movie metadata table from the working directory and preview it.
df = pd.read_csv("movies_metadata.csv")
df.head()

In [None]:
# (rows, columns) of the raw metadata frame.
df.shape

In [None]:
# `genres` holds a stringified Python literal; parse it and keep only the
# 'name' of each entry. Missing values and anything that does not parse to a
# list become an empty list.
df['genres'] = (
    df['genres']
    .fillna('[]')
    .apply(literal_eval)
    .apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
)

# Release year as a string (e.g. '1995'); missing or unparseable dates -> NaN.
# The missing check must use pd.notnull: `x != np.nan` is always True, so the
# earlier form turned every NaT into the literal string 'NaT'.
df['year'] = pd.to_datetime(df['release_date'], errors='coerce').apply(
    lambda x: str(x).split('-')[0] if pd.notnull(x) else np.nan
)

In [None]:
def get_recommendations_by_genre(genre, percentile=0.95):
    """Rank movies by a vote-count-weighted rating, optionally within one genre.

    Reads the module-level ``df`` (expects the columns ``title``, ``year``,
    ``vote_count``, ``vote_average``, ``popularity`` and a list-valued
    ``genres`` column).

    The weighted rating of a movie with ``v`` votes and mean vote ``R`` is
    ``v / (v + m) * R + m / (v + m) * C`` where ``C`` is the mean
    ``vote_average`` of the considered movies and ``m`` is the
    ``percentile`` quantile of their ``vote_count``. Only movies with at
    least ``m`` votes qualify.

    Args:
        genre: Genre name to restrict to, or ``'All'`` for every movie.
        percentile: Quantile of ``vote_count`` used as the qualification
            threshold ``m``.

    Returns:
        DataFrame of qualifying movies sorted by the added ``wr`` column,
        best first. For ``'All'`` the ``genres`` column is included as well.
        The frame is empty when nothing matches ``genre``.
    """
    t_df = df
    if genre != 'All':
        # One row per (movie, genre) pair so a movie can be matched on any of
        # its genres; movies with an empty genre list get NaN and drop out of
        # the equality filter below.
        t_df = t_df.explode('genres').rename(columns={'genres': 'genre'})
        t_df = t_df[t_df['genre'] == genre]

    has_count = t_df['vote_count'].notnull()
    has_average = t_df['vote_average'].notnull()

    vote_counts = t_df.loc[has_count, 'vote_count'].astype('int')
    # Keep the averages as floats: casting to int would truncate e.g. 8.5 to 8
    # and distort both C and the final ranking.
    vote_averages = t_df.loc[has_average, 'vote_average'].astype('float')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)

    cols = ['title', 'year', 'vote_count', 'vote_average', 'popularity']
    if genre == 'All':
        cols.append('genres')

    qualifies = has_count & has_average & (t_df['vote_count'] >= m)
    # .copy() so the column assignments below do not write into a view of df.
    qualified_df = t_df.loc[qualifies, cols].copy()
    qualified_df['vote_count'] = qualified_df['vote_count'].astype('int')
    qualified_df['vote_average'] = qualified_df['vote_average'].astype('float')

    # Vectorised weighted rating; also well-defined when no movie qualifies.
    v = qualified_df['vote_count']
    R = qualified_df['vote_average']
    qualified_df['wr'] = (v / (v + m)) * R + (m / (v + m)) * C

    return qualified_df.sort_values('wr', ascending=False)


In [None]:
# Top 10 across every genre, using the default 0.95 vote-count percentile.
get_recommendations_by_genre('All').head(10)

In [None]:
# Top 10 Romance movies with a lower (0.85) vote-count percentile threshold.
get_recommendations_by_genre('Romance', 0.85).head(10)

### Content Based Recommender
Based on movie description

In [None]:
# Load the links table (its `tmdbId` column is used below) and preview it.
links_df = pd.read_csv('links_small.csv')
links_df.head()

In [None]:
# Integer copy of tmdbId for the rows that have one.
# NOTE(review): the assignment aligns on the index, so rows without a tmdbId
# get NaN in `id`; if any such row exists the column is stored as float
# despite the astype('int').
links_df['id'] = links_df[links_df['tmdbId'].notnull()]['tmdbId'].astype('int')
links_df.head()

In [None]:
# Remove three rows by index label before casting `id` to int.
# NOTE(review): presumably these rows carry a non-numeric `id` that would
# make the cast fail - confirm against movies_metadata.csv.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the labels are already gone.
df = df.drop([19730, 29503, 35587], errors='ignore')
df['id'] = df['id'].astype('int')

In [None]:
# Keep only the movies whose id appears in the links table.
small_df = df[df['id'].isin(links_df['id'])]
small_df.head()

In [None]:
# (rows, columns) of the reduced frame.
small_df.shape

In [None]:
# Build the text used for content-based similarity: overview followed by
# tagline. Both parts are filled *before* concatenating so that a movie
# without an overview still keeps its tagline (NaN + str would be NaN), and a
# space separates them so the last word of the overview is not fused with the
# first word of the tagline.
# .assign returns a new frame, which avoids writing into a slice of `df`.
small_df = small_df.assign(
    tagline=small_df['tagline'].fillna(''),
    description=(
        small_df['overview'].fillna('') + ' ' + small_df['tagline'].fillna('')
    ).str.strip(),
)

In [None]:
# Renumber rows 0..n-1 so that row labels coincide with row positions; the
# previous labels are kept in a new 'index' column.
small_df = small_df.reset_index()

In [None]:
# TF-IDF over unigrams and bigrams of the descriptions, English stop words
# removed. min_df=1 keeps every term that occurs in at least one document,
# which is what min_df=0 meant; recent scikit-learn releases reject an
# integer min_df below 1 during parameter validation.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf = tf.fit_transform(small_df['description'])
tfidf.shape

In [None]:
# Pairwise dot products between all description vectors (n_movies x n_movies).
# NOTE(review): this equals cosine similarity only because TfidfVectorizer
# L2-normalises rows by default - revisit if `norm` is ever changed.
cosine_sim = linear_kernel(tfidf, tfidf)

In [None]:
def get_recommendations_by_movie(title):
    """Return every other movie ordered by description similarity to ``title``.

    Reads the module-level ``small_df`` and ``cosine_sim``; row ``i`` of
    ``cosine_sim`` must correspond to the ``i``-th row of ``small_df``.

    Args:
        title: Exact value of the ``title`` column. If several rows share the
            title, the first one is used.

    Returns:
        DataFrame with ``title``, ``vote_count``, ``vote_average``, ``year``
        and ``id`` for all movies except the query, most similar first.

    Raises:
        IndexError: If no row of ``small_df`` has this title.
    """
    # Work with row positions, not index labels: cosine_sim is positional.
    matches = np.flatnonzero((small_df['title'] == title).values)
    if matches.size == 0:
        raise IndexError('no movie titled {!r} in small_df'.format(title))
    pos = matches[0]

    scores = np.asarray(cosine_sim[pos])
    # Stable descending sort keeps ties in their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query movie explicitly. Dropping "the first result" is wrong
    # whenever the movie is not its own best match, e.g. an empty description
    # has similarity 0 with itself.
    ranked = ranked[ranked != pos]

    return small_df.iloc[ranked][['title', 'vote_count', 'vote_average', 'year', 'id']]

In [None]:
# Ten movies whose descriptions are most similar to 'The Godfather'.
get_recommendations_by_movie('The Godfather').head(10)

In [None]:
# Same query as the previous cell.
get_recommendations_by_movie('The Godfather').head(10)

In [None]:
# Keep the top ten for 'The Dark Knight' and show only their titles.
recom_df = get_recommendations_by_movie('The Dark Knight').head(10)
recom_df['title']

### Collaborative Filtering

In [None]:
# Load the user ratings and preview them.
ratings_df = pd.read_csv('ratings_small.csv')
ratings_df.head()

In [None]:
# Wrap the (user, item, rating) triples in a surprise Dataset.
# NOTE(review): Reader() uses surprise's default rating_scale - confirm it
# matches the actual range of `rating` in ratings_small.csv.
reader = Reader()
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

In [None]:
# 5-fold cross-validation of an SVD model, reporting RMSE and MAE.
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5)

In [None]:
# Refit the same model on all ratings; this fitted `svd` is what the hybrid
# recommender below queries.
trainset = data.build_full_trainset()
svd.fit(trainset)

In [None]:
# Estimate user 1's rating for movieId 302; the third positional argument is
# the known true rating (r_ui), passed along for reference.
svd.predict(1, 302, 3)

### Hybrid Recommender

In [None]:
# Load the links table again as a fresh frame for the id mapping built below.
id_df = pd.read_csv('links_small.csv')
id_df.head()

In [None]:
def convert_int(x):
    """Return ``int(x)``, or ``np.nan`` when ``x`` cannot be converted.

    Only conversion failures are mapped to NaN (wrong type, unparseable
    string, NaN, infinity). Other exceptions such as ``KeyboardInterrupt``
    propagate instead of being swallowed by a bare ``except``.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        return np.nan

In [None]:
# Drop imdbId, coerce tmdbId to int (NaN where that fails) and rename the two
# remaining columns to 'movieId' and 'id'.
# NOTE(review): the positional rename assumes the remaining column order is
# (movieId, tmdbId) - confirm against links_small.csv.
id_df = id_df.drop('imdbId', axis=1)
id_df['tmdbId'] = id_df['tmdbId'].apply(convert_int)
id_df.columns = ['movieId', 'id']

In [None]:
# Attach titles via the shared 'id' column (inner join) and index by title.
id_df = id_df.merge(small_df[['title', 'id']], on='id').set_index('title')
id_df.head()

In [None]:
# Same mapping indexed by 'id', used to look up the movieId for a given id.
indices_df = id_df.set_index('id')
indices_df.head()

In [None]:
def get_recommendations(userId, title, top_n=25):
    """Hybrid recommendation: content-based candidates re-ranked per user.

    Takes the ``top_n`` movies whose descriptions are most similar to
    ``title`` and orders them by the rating the fitted ``svd`` model
    estimates for ``userId``.

    Reads the module-level ``small_df``, ``cosine_sim``, ``indices_df`` and
    ``svd``; row ``i`` of ``cosine_sim`` must correspond to the ``i``-th row
    of ``small_df``.

    Args:
        userId: Raw user id as used in the ratings passed to ``svd``.
        title: Exact value of the ``title`` column. If several rows share the
            title, the first one is used.
        top_n: Number of content-based candidates to re-rank.

    Returns:
        DataFrame with ``title``, ``vote_count``, ``vote_average``, ``year``,
        ``id`` and the added ``estimate`` column, highest estimate first.

    Raises:
        IndexError: If no row of ``small_df`` has this title.
        KeyError: If a candidate's ``id`` is missing from ``indices_df``.
    """
    # Work with row positions, not index labels: cosine_sim is positional.
    matches = np.flatnonzero((small_df['title'] == title).values)
    if matches.size == 0:
        raise IndexError('no movie titled {!r} in small_df'.format(title))
    pos = matches[0]

    scores = np.asarray(cosine_sim[pos])
    # Stable descending sort keeps ties in their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query movie explicitly rather than assuming it sorts first.
    ranked = ranked[ranked != pos][:top_n]

    # .copy() so adding 'estimate' does not write into a slice of small_df.
    movies = small_df.iloc[ranked][['title', 'vote_count', 'vote_average', 'year', 'id']].copy()

    # id -> movieId lookup. Keep one entry per id so a lookup always yields a
    # scalar even if the mapping contains duplicate ids.
    movie_ids = indices_df['movieId']
    movie_ids = movie_ids[~movie_ids.index.duplicated(keep='first')]

    movies['estimate'] = movies['id'].apply(
        lambda x: svd.predict(userId, movie_ids.loc[x]).est
    )
    return movies.sort_values('estimate', ascending=False)

In [None]:
# Top 10 hybrid recommendations for user 1, seeded with 'The Godfather'.
get_recommendations(1, 'The Godfather').head(10)

In [None]:
# Top 10 hybrid recommendations for user 1, seeded with 'The Dark Knight'.
get_recommendations(1, 'The Dark Knight').head(10)

In [None]:
'''
Inspiration
1. https://www.kaggle.com/rounakbanik/movie-recommender-systems
'''