# Content-based recommender system
The system makes recommendations based on the movies metadata
---
# Used datasets
+ [credits.csv](https://www.kaggle.com/rounakbanik/the-movies-dataset/downloads/credits.csv/7): Consists of Cast and Crew Information for all our movies. Available in the form of a stringified JSON Object.
+ [keywords.csv](https://www.kaggle.com/rounakbanik/the-movies-dataset/downloads/keywords.csv/7): Contains the movie plot keywords for our MovieLens movies. Available in the form of a stringified JSON Object.
+ [movies_metadata.csv](https://www.kaggle.com/rounakbanik/the-movies-dataset/downloads/movies_metadata.csv/7): The main Movies Metadata file. Contains information on 45,000 movies featured in the Full MovieLens dataset. Features include posters, backdrops, budget, revenue, release dates, languages, production countries and companies.
+ [links_small.csv](https://www.kaggle.com/rounakbanik/the-movies-dataset/downloads/links_small.csv/7): Contains the TMDB and IMDB IDs of a small subset of 9,000 movies of the Full Dataset.

# Import libraries

In [96]:
import numpy as np
import pandas as pd
from ast import literal_eval # Transform str to python code
import sys # sys.maxsize
from nltk.stem.snowball import SnowballStemmer # Stem words' endings
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from operator import itemgetter

# Load datasets

In [97]:
credits = pd.read_csv('datasets/credits.csv')
keywords = pd.read_csv('datasets/keywords.csv')
movies = pd.read_csv('datasets/movies_metadata.csv', low_memory=False)
links = pd.read_csv('datasets/links_small.csv')

pd.set_option('precision',1000)

## Peek at the datasets

In [98]:
movies.head(1)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.70000000000000017763568394002504646778106689...,5415.0


In [99]:
keywords.head(1)

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."


In [100]:
credits.head(1)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862


In [101]:
links.head(1)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0


In [102]:
movies.shape

(45466, 24)

# Preprocess attributes 1

In [103]:
credits['id'] = credits.drop_duplicates(subset ='id')['id'].astype('int')
keywords['id'] = keywords.drop_duplicates(subset='id')['id'].astype('int')
movies.drop([19730,29503,35587],inplace=True) # bad rows
movies['id'] = movies[(movies['id'].str.isnumeric()) & (movies['id'].notnull())]['id'].astype('int')
links['tmdbId'] = links.drop_duplicates(subset='tmdbId').dropna(subset=['tmdbId'])['tmdbId'].astype('int')

## Merge data from credits and keywords into movies DataFrame

In [104]:
movies = movies.merge(
    right=credits, on='id', how='inner').merge(
        right=keywords, on='id', how='inner')

## Use subset of movies for better computational time

In [105]:
movies = movies[movies['id'].isin(links['tmdbId'])]

In [106]:
print('Subset shape: {}'.format(movies.shape))

Subset shape: (9099, 27)


# Preprocess attributes 2

In [107]:
movies['crew'] = movies['crew'].apply(literal_eval)
movies['cast'] = movies['cast'].apply(literal_eval)
movies['keywords'] = movies['keywords'].apply(literal_eval)
movies['genres'] = movies['genres'].apply(literal_eval).apply(
    lambda x: [i['name'] for i in x] if isinstance(x, list) else np.nan)

# Feature engineering

In [108]:
def get_director(x):
    for i in x:
        if i['job']=='Director':
            return i['name']
    return np.nan

def get_cast_keywords(x,n=sys.maxsize):
    res = []
    if isinstance(x,list):
        for elem in x:
            n = n-1
            res.append(elem['name'])
            if n == 0:
                break
    return res if len(res)!=0 else np.nan

movies['director'] = movies['crew'].apply(get_director)
movies['cast'] = movies['cast'].apply(get_cast_keywords,args=(3,))
movies['keywords'] = movies['keywords'].apply(get_cast_keywords)

movies.dropna(axis = 'index',subset=['keywords','cast','director'],how='all',inplace = True)

+ created director attribute
+ left only 3 top actors in the cast attribute
+ extracted keywords

## Preprocess cast and director attributes
+ cast to lower
+ delete spaces between words

In [109]:
movies['cast'] = movies['cast'].dropna().apply(
    lambda x: [str.lower(i.replace(' ', '')) for i in x])
movies['director'] = movies['director'].dropna().apply(
    lambda x: [str.lower(str(x).replace(' ', ''))] * 3)

## Drop keywords that occur only once

In [110]:
keywords_series = movies.apply(lambda x: pd.Series(x['keywords']),axis=1).stack()
keywords_series = keywords_series.value_counts()
keywords_series = keywords_series[keywords_series > 1]

+ stem keywords
+ delete spaces
+ cast to lower case

In [111]:
stemmer = SnowballStemmer('english')

In [112]:
def process_keywords(x):
    res = []
    for i in x:
        if i in keywords_series:
            res.append(str.lower(stemmer.stem(i)).replace(' ',''))
    return res

movies['keywords'] = movies['keywords'].dropna().apply(process_keywords)

# Create an attribute with all the needed data for the algorithm combined

In [113]:
movies[
    'soup'] = movies['genres'] + movies['director'] + movies['cast'] + movies['keywords']
movies['soup'] = movies['soup'].map(lambda x: ' '.join(x), na_action='ignore')
movies.dropna(axis='index',subset=['soup'],how='all',inplace=True)

## Create matrix with word counts

In [114]:
count = CountVectorizer(analyzer = 'word', ngram_range=(1,2), min_df=0.,stop_words='english')
count_matrix = count.fit_transform(movies['soup'])

## Calculate similarities between films

In [115]:
cosine_sim = cosine_similarity(count_matrix,count_matrix)

## Make recommendations based on film name
+ weighted_rating - IMDB's weighted rating formula

In [116]:
def weighted_rating(x, m, C):
    v = x['vote_count']
    R = x['vote_average']
    return (v / (v + m) * R) + (m / (m + v) * C)


def metadata_based_recommender(film_name, quantile_=0.6,n_movies=30):
    idx = np.flatnonzero(movies['original_title'] == film_name)
    if len(idx) != 0:
        idx = idx[0]
        top_list = list(enumerate(cosine_sim[idx]))
        del top_list[idx]
        top_list = sorted(top_list, key=itemgetter(1), reverse=True)

        curr_movies = movies.iloc[np.array(top_list)[:n_movies, 0]]
        m = curr_movies['vote_count'].quantile(quantile_)
        C = curr_movies['vote_average'].mean()
        curr_movies = curr_movies[(curr_movies['vote_count'] >= m)
                                  & (curr_movies['vote_average'].notnull()) &
                                  (curr_movies['vote_count'].notnull())]
        curr_movies['wr'] = curr_movies.apply(
            weighted_rating, args=(
                m,
                C,
            ), axis='columns')
        return curr_movies.sort_values(
            by='wr',
            ascending=False)[['title', 'vote_count', 'vote_average', 'genres']]

## Make a prediction

In [117]:
metadata_based_recommender('Deadpool').head(10)

Unnamed: 0,title,vote_count,vote_average,genres
17836,The Avengers,12000.0,7.40000000000000035527136788005009293556213378...,"[Science Fiction, Action, Adventure]"
25545,Kingsman: The Secret Service,6069.0,7.59999999999999964472863211994990706443786621...,"[Crime, Comedy, Action, Adventure]"
23065,Captain America: The Winter Soldier,5881.0,7.59999999999999964472863211994990706443786621...,"[Action, Adventure, Science Fiction]"
12600,Iron Man,8951.0,7.40000000000000035527136788005009293556213378...,"[Action, Science Fiction, Adventure]"
26569,Avengers: Age of Ultron,6908.0,7.29999999999999982236431605997495353221893310...,"[Action, Adventure, Science Fiction]"
26578,Captain America: Civil War,7462.0,7.09999999999999964472863211994990706443786621...,"[Adventure, Action, Science Fiction]"
17234,X-Men: First Class,5252.0,7.09999999999999964472863211994990706443786621...,"[Action, Science Fiction, Adventure]"
26573,Ant-Man,6029.0,7.00000000000000000000000000000000000000000000...,"[Science Fiction, Action, Adventure]"
20849,Iron Man 3,8951.0,6.79999999999999982236431605997495353221893310...,"[Action, Adventure, Science Fiction]"
17462,Captain America: The First Avenger,7174.0,6.59999999999999964472863211994990706443786621...,"[Action, Adventure, Science Fiction]"
