## Simple Recommenders

In [2]:
# Import Pandas
import pandas as pd

# Load Movies Metadata
metadata = pd.read_csv('data/movies_metadata.csv', low_memory=False)

# Print the first three rows
metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [3]:
# Calculate mean of vote average column
C = metadata['vote_average'].mean()
print(C)


5.618207215134185


In [4]:
# Calculate the minimum number of votes required to be in the chart, m
m = metadata['vote_count'].quantile(0.90)
print(m)


160.0


In [5]:
# Filter out all qualified movies into a new DataFrame
q_movies = metadata.copy().loc[metadata['vote_count'] >= m]
q_movies.shape


(4555, 24)

In [6]:
# Function that computes the weighted rating of each movie
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)


In [7]:
# Define a new feature 'score' and calculate its value with `weighted_rating()`
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)


In [8]:
#Sort movies based on score calculated above
q_movies = q_movies.sort_values('score', ascending=False)

#Print the top 15 movies
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(20)


Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


## Content-Based Recommender

In [9]:
#Print plot overviews of the first 5 movies.
metadata['overview'].head()


0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [10]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
metadata['overview'] = metadata['overview'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(metadata['overview'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape


(45466, 75827)

In [11]:
#Array mapping from feature integer indices to feature name.
tfidf.get_feature_names_out()[5000:5010]

array(['avails', 'avaks', 'avalanche', 'avalanches', 'avallone', 'avalon',
       'avant', 'avanthika', 'avanti', 'avaracious'], dtype=object)

In [12]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


In [13]:
cosine_sim.shape


(45466, 45466)

In [14]:
cosine_sim[1]


array([0.01504121, 1.        , 0.04681953, ..., 0.        , 0.02198641,
       0.00929411], shape=(45466,))

In [15]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates()


In [16]:
indices[:10]


title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64

In [17]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return metadata['title'].iloc[movie_indices]


In [18]:
get_recommendations('The Dark Knight Rises')


12481                                      The Dark Knight
150                                         Batman Forever
1328                                        Batman Returns
15511                           Batman: Under the Red Hood
585                                                 Batman
21194    Batman Unmasked: The Psychology of the Dark Kn...
9230                    Batman Beyond: Return of the Joker
18035                                     Batman: Year One
19792              Batman: The Dark Knight Returns, Part 1
3095                          Batman: Mask of the Phantasm
Name: title, dtype: object

In [19]:
get_recommendations('The Godfather')


1178               The Godfather: Part II
44030    The Godfather Trilogy: 1972-1990
1914              The Godfather: Part III
23126                          Blood Ties
11297                    Household Saints
34717                   Start Liquidation
10821                            Election
38030            A Mother Should Be Loved
17729                   Short Sharp Shock
26293                  Beck 28 - Familjen
Name: title, dtype: object

In [20]:
# Load keywords and credits
credits = pd.read_csv('data/credits.csv')
keywords = pd.read_csv('data/keywords.csv')

# Remove rows with bad IDs.
metadata = metadata.drop([19730, 29503, 35587])

# Convert IDs to int. Required for merging
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadata['id'] = metadata['id'].astype('int')

# Merge keywords and credits into your main metadata dataframe
metadata = metadata.merge(credits, on='id')
metadata = metadata.merge(keywords, on='id')


In [21]:
# Print the first two movies of your newly merged metadata
metadata.head(2)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [22]:
# Parse the stringified features into their corresponding python objects
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(literal_eval)


In [23]:
# Import Numpy
import numpy as np


In [24]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan


In [25]:
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []


In [26]:
# Define new director, cast, genres and keywords features that are in a suitable form.
metadata['director'] = metadata['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(get_list)


In [27]:
# Print the new features of the first 3 films
metadata[['title', 'cast', 'director', 'keywords', 'genres']].head(3)


Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"


In [28]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''


In [29]:
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)


In [30]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])


In [31]:
# Create a new soup feature
metadata['soup'] = metadata.apply(create_soup, axis=1)


In [32]:
metadata[['soup']].head(2)


Unnamed: 0,soup
0,jealousy toy boy tomhanks timallen donrickles ...
1,boardgame disappearance basedonchildren'sbook ...


In [33]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadata['soup'])


In [34]:
count_matrix.shape


(46628, 73881)

In [35]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)


In [36]:
# Reset index of your main DataFrame and construct reverse mapping as before
metadata = metadata.reset_index()
indices = pd.Series(metadata.index, index=metadata['title'])


In [37]:
get_recommendations('The Dark Knight Rises', cosine_sim2)


12541      The Dark Knight
10170        Batman Begins
9271                Shiner
9834       Amongst Friends
7732              Mitchell
516      Romeo Is Bleeding
11411         The Prestige
24040            Quicksand
24984             Deadfall
41043                 Sara
Name: title, dtype: object

## Custom Hybrid Recommendation System

In [39]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import itertools
from IPython.display import display

# Load data
metadata = pd.read_csv('data/movies_metadata.csv', low_memory=False)
credits = pd.read_csv('data/credits.csv')
keywords = pd.read_csv('data/keywords.csv')

# Remove problematic rows and convert IDs to integers
metadata = metadata.drop([19730, 29503, 35587])
metadata['id'] = metadata['id'].astype(int)
credits['id'] = credits['id'].astype(int)
keywords['id'] = keywords['id'].astype(int)
# Merge datasets on 'id'
metadata = metadata.merge(credits, on='id')
metadata = metadata.merge(keywords, on='id')

# Parse features
# Convert stringified lists/dictionaries to Python objects
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(literal_eval)

def get_director(crew):
    for member in crew:
        if member.get('job') == 'Director':
            return member.get('name', '')
    return ''

def get_list(x):
    if isinstance(x, list):
        names = [i.get('name', '') for i in x]
        return names[:3] if len(names) > 3 else names
    return []

# Extract director and top 3 items from cast, keywords, and genres
metadata['director'] = metadata['crew'].apply(get_director)
for feature in ['cast', 'keywords', 'genres']:
    metadata[feature] = metadata[feature].apply(get_list)

# Remove spaces and convert text to lowercase
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    elif isinstance(x, str):
        return str.lower(x.replace(" ", ""))
    return ''

for feature in ['cast', 'keywords', 'director', 'genres']:
    metadata[feature] = metadata[feature].apply(clean_data)

# Fill missing overview and extract release year
metadata['overview'] = metadata['overview'].fillna('')
metadata['release_date'] = pd.to_datetime(metadata['release_date'], errors='coerce')
metadata['year'] = metadata['release_date'].dt.year

# Create soup
# Combine overview, keywords, cast, director and genres into one string
def create_soup(x):
    return x['overview'] + " " + " ".join(x['keywords']) + " " + " ".join(x['cast']) + " " + x['director'] + " " + " ".join(x['genres'])

metadata['soup'] = metadata.apply(create_soup, axis=1)

# Convert the soup into TF-IDF features and compute cosine similarity
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(metadata['soup'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Reset index and create a reverse mapping from movie title to index
metadata = metadata.reset_index(drop=True)
indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates()

# Weighted rating
C = metadata['vote_average'].mean()
m = metadata['vote_count'].quantile(0.60)

def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    return (v / (v + m) * R) + (m / (m + v) * C)

# Recommender function
def get_recommendations(title, cosine_sim, vote_relaxation=0.5, year_relaxation=6, max_relax_steps=4, desired_results=30):
    if title not in indices:
        raise ValueError(f"Movie '{title}' was not found.")
    
    idx = indices[title]
    base = metadata.loc[idx]
    
    # Base criteria for filtering
    base_votes = base['vote_count'] if base['vote_count'] else 100
    base_year = base['year'] if not pd.isna(base['year']) else 2000
    base_genres = base['genres']
    base_cast = base['cast']
    base_director = base['director']
    base_keywords = base['keywords']
    
    # Get cosine similarity scores for the given movie (excluding the movie itself)
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:500]
    
    results = []
    relax_step = 0

    # Loop to relax filtering criteria if needed
    while len(results) < desired_results and relax_step <= max_relax_steps:
        results.clear()
        for i, score in sim_scores:
            row = metadata.iloc[i]
            if pd.isna(row['year']) or pd.isna(row['vote_average']) or not row['genres']:
                continue
            # For early relaxation steps, require at least one matching genre
            if relax_step < 2 and not set(row['genres']).intersection(base_genres):
                continue
            if abs(row['vote_count'] - base_votes) > base_votes * vote_relaxation * (relax_step + 1):
                continue
            if not (base_year - year_relaxation - 2 * relax_step <= row['year'] <= base_year + year_relaxation + 2 * relax_step):
                continue

            explanation = []
            if set(row['cast']).intersection(base_cast):
                explanation.append("shared actors")
            if row['director'] == base_director:
                explanation.append("same director")
            if set(row['genres']).intersection(base_genres):
                explanation.append("similar genre")
            if set(row['keywords']).intersection(base_keywords):
                explanation.append("similar keywords")

            results.append({
                'Title': row['title'],
                'Year': row['year'],
                'IMDb Rating': row['vote_average'],
                'Vote Count': int(row['vote_count']),
                'Weighted Score': weighted_rating(row),
                'Genres': ', '.join(row['genres']),
                'Explanation': ', '.join(explanation) if explanation else "semantic similarity"
            })

            if len(results) == desired_results:
                break
        relax_step += 1

    df_results = pd.DataFrame(results).drop_duplicates(subset='Title')
    df_results = df_results.sort_values(by='Weighted Score', ascending=False).reset_index(drop=True)
    return df_results

# Configure hyperparameters
grid = {
    'vote_relaxation': [0.4, 0.5, 0.6],
    'year_relaxation': [4, 6, 8]
}

user_input = input("Enter a movie you like: ")

best_score = -1
best_params = None
final_df = pd.DataFrame()

# Iterate over all parameter combinations to tune hyperparameters
for vr, yr in itertools.product(grid['vote_relaxation'], grid['year_relaxation']):
    try:
        temp_df = get_recommendations(user_input, cosine_sim, vote_relaxation=vr, year_relaxation=yr)
        avg_score = temp_df['Weighted Score'].mean()
        if avg_score > best_score:
            best_score = avg_score
            best_params = (vr, yr)
            final_df = temp_df
    except Exception as e:
        continue

# Display results
if not final_df.empty:
    print(f"\nBest parameters: vote_relaxation={best_params[0]}, year_relaxation={best_params[1]}")
    print(f"\nTop {len(final_df)} recommended movies based on: {user_input}\n")
    display(final_df[['Title', 'Year', 'IMDb Rating', 'Vote Count', 'Weighted Score', 'Genres', 'Explanation']])
else:
    print("No recommendations could be found.")



Best parameters: vote_relaxation=0.6, year_relaxation=8

Top 30 recommended movies based on: The Invisible Guest



Unnamed: 0,Title,Year,IMDb Rating,Vote Count,Weighted Score,Genres,Explanation
0,The Body,2012.0,7.5,233,7.38579,"thriller, mystery","shared actors, same director, similar genre"
1,Li'l Quinquin,2014.0,7.9,16,6.792772,"mystery, comedy, crime",similar genre
2,Before I Fall,2017.0,6.8,651,6.773237,"mystery, drama, thriller",similar genre
3,Unthinkable,2010.0,6.7,430,6.663317,"drama, thriller",similar genre
4,Buried,2010.0,6.6,853,6.582922,"drama, thriller, mystery",similar genre
5,Exam,2009.0,6.6,531,6.57285,"thriller, mystery",similar genre
6,Fermat's Room,2007.0,6.6,85,6.451759,"mystery, thriller",similar genre
7,Smoke & Mirrors,2016.0,6.7,50,6.44886,"history, thriller","shared actors, similar genre"
8,Lady Blue Shanghai,2010.0,6.7,22,6.258809,"drama, mystery",similar genre
9,The Bar,2017.0,6.3,122,6.224642,"horror, thriller, comedy","shared actors, similar genre"
