## Movie Recommendation System
### Credits, Genres and Keywords Based Recommender
This recommendation system is based on the following metadata: the top 3 actors, the director, related genres and the movie plot keywords.

In [1]:
import pandas as pd 
import numpy as np 
import pickle

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the movie catalogue from CSV. The module-level `df` defined here is the
# frame that get_recommendations() later indexes to build its result.
df = pd.read_csv('data/tmdb_5000.csv')

In [3]:
# Parse the stringified features into their corresponding python objects
from ast import literal_eval

def parse_stringified_features(df):
    """Convert the stringified list columns of ``df`` to Python objects, in place.

    The ``cast``, ``crew``, ``keywords`` and ``genres`` cells are parsed with
    ``ast.literal_eval``. Missing (NaN/None) or empty cells become ``[]``.
    Cells that already hold a list are left untouched, so calling this
    function again on an already-parsed frame is a no-op instead of an error.

    Args:
        df: DataFrame containing the four feature columns. Modified in place.

    Returns:
        None.

    Raises:
        ValueError, SyntaxError: If a non-empty cell is not a valid Python
            literal.
    """
    def _parse(value):
        # literal_eval rejects list objects, so pass parsed cells through.
        if isinstance(value, list):
            return value
        return literal_eval(value) if value else []

    for feature in ('cast', 'crew', 'keywords', 'genres'):
        # NaN -> '' first, so the truthiness check in _parse maps it to [].
        df[feature] = df[feature].fillna('').apply(_parse)


In [4]:
def get_director(crew_list):
    """Return the name of the first crew member whose job is 'Director'.

    Args:
        crew_list: List of crew dicts, each expected to carry ``'job'`` and
            ``'name'`` keys.

    Returns:
        The director's name, or ``np.nan`` when ``crew_list`` is not a list,
        no director is listed, or the director entry has no ``'name'`` key.
    """
    # Tolerate unparsed or missing cells (e.g. NaN) instead of raising.
    if not isinstance(crew_list, list):
        return np.nan
    for member in crew_list:
        # .get() so an entry without a 'job' key is skipped, not a KeyError.
        if member.get('job') == 'Director':
            return member.get('name', np.nan)
    return np.nan


def get_names_from_list(x, limit=3):
    """Return the ``'name'`` of the first ``limit`` entries of ``x``.

    Args:
        x: List of dicts that each have a ``'name'`` key. Any non-list value
            (e.g. a missing cell) yields an empty list.
        limit: Maximum number of names to return. Defaults to 3; pass
            ``None`` to return every name.

    Returns:
        A list with at most ``limit`` names, in their original order.
    """
    if not isinstance(x, list):
        return []
    # Slicing already copes with lists shorter than `limit`, so no explicit
    # length check is needed.
    return [row['name'] for row in x[:limit]]


def clean_data(x):
    """Lower-case a name, or every name in a list, and remove its spaces.

    A string yields a string, a list yields a list, and any other value
    (such as the NaN used for a missing director) yields ``''``.
    """
    if isinstance(x, str):
        return x.replace(' ', '').lower()
    if isinstance(x, list):
        return [item.replace(' ', '').lower() for item in x]
    # Neither a name nor a list of names: treat as missing.
    return ''


def create_combined_features_data(df):
    """Build the space-separated feature string for one movie.

    Despite its name, ``df`` is a single row (preprocess_df applies this
    function with ``axis=1``). The result is the cast names, the director,
    the genres and the keywords, in that order, joined by single spaces.
    """
    cast_text = ' '.join(df['cast'])
    genre_text = ' '.join(df['genres'])
    keyword_text = ' '.join(df['keywords'])
    return ' '.join([cast_text, df['director'], genre_text, keyword_text])


def preprocess_df(df):
    """Prepare ``df`` in place for vectorisation.

    Steps: parse the stringified columns, derive a ``director`` column from
    ``crew``, reduce ``cast``/``keywords``/``genres`` to lists of names,
    normalise all four features with clean_data, and finally add the
    ``combined_features_data`` text column.
    """
    parse_stringified_features(df)

    # Director: pulled out of the crew list, then normalised like the rest.
    df['director'] = df['crew'].apply(get_director).apply(clean_data)

    # List features: keep only the leading names, then normalise them.
    for column in ('cast', 'keywords', 'genres'):
        df[column] = df[column].apply(get_names_from_list).apply(clean_data)

    # One text blob per row, built from the cleaned features.
    df['combined_features_data'] = df.apply(create_combined_features_data, axis=1)

In [5]:
def get_recommendations(input_df, top_n=25):
    """Recommend the catalogue movies most similar to the movies in ``input_df``.

    ``input_df`` is preprocessed in place, vectorised with the pickled
    vectorizer, and compared by cosine similarity against the pickled count
    matrix. Each catalogue row's similarities to all input movies are summed,
    and the highest-scoring rows of the module-level ``df`` are returned.

    Args:
        input_df: Raw rows of the liked movies (same columns as ``df``).
            Modified in place by preprocess_df, so pass a copy.
        top_n: Maximum number of recommendations to return. Defaults to 25.

    Returns:
        A DataFrame of at most ``top_n`` rows of ``df``, ordered by descending
        score, with an added ``"Similarity Score"`` column and with the
        movies whose ``id`` appears in ``input_df`` removed.
    """
    preprocess_df(input_df)

    # NOTE(security): pickle.load can execute arbitrary code. Only load
    # artifacts that were produced by this project.
    with open("./data/vectorizer.pkl", "rb") as vectorizer_file:
        vectorizer = pickle.load(vectorizer_file)
    with open("./data/count_matrix.pkl", "rb") as count_matrix_file:
        count_matrix = pickle.load(count_matrix_file)

    # Vectorise the liked movies with the same vocabulary as the catalogue.
    count_matrix_input = vectorizer.transform(input_df["combined_features_data"])

    # One row per catalogue movie, one column per liked movie.
    cosine_sim_input = cosine_similarity(count_matrix, count_matrix_input)

    # Aggregate each catalogue movie's similarity to all liked movies.
    total_sim_scores = cosine_sim_input.sum(axis=1)
    ranked_indices = total_sim_scores.argsort()[::-1]

    # Over-fetch by the number of liked movies, since they may rank among
    # the best matches and are dropped below.
    # NOTE(review): assumes row i of count_matrix corresponds to df.iloc[i].
    candidate_indices = ranked_indices[: top_n + input_df.shape[0]]
    candidates = df.iloc[candidate_indices].copy()
    candidates["Similarity Score"] = total_sim_scores[candidate_indices]

    # Exclude the liked movies themselves, matched on 'id'.
    candidates = candidates[~candidates["id"].isin(input_df["id"])]

    # Cap after the exclusion so the result never exceeds top_n, even when
    # some liked movies did not land in the over-fetched slice.
    return candidates.head(top_n).copy()

In [6]:
# Look up the row label in df of each liked movie id (first match per id).
ids = [550, 210577, 2649, 1949]
idxs = [df.index[df['id'] == movie_id][0] for movie_id in ids]
idxs

[662, 693, 946, 421]

Input movies

In [11]:
# Titles of the liked movies, selected in a single .loc call.
df.loc[idxs, 'title']

662    Fight Club
693     Gone Girl
946      The Game
421        Zodiac
Name: title, dtype: object

In [7]:
# Pass a copy: get_recommendations preprocesses its argument in place.
liked_movies_df = df.loc[idxs].copy()
top_recommendations = get_recommendations(liked_movies_df)
top_recommendations.loc[:, ['id', 'title', 'vote_count', 'vote_average', 'Similarity Score']]

Unnamed: 0,id,title,vote_count,vote_average,Similarity Score
4589,43743,Fabled,0,0.0,1.441688
100,4922,The Curious Case of Benjamin Button,3292,7.3,1.13541
1010,4547,Panic Room,1267,6.5,1.123607
1553,807,Se7en,5765,8.1,1.123607
354,65754,The Girl with the Dragon Tattoo,2434,7.2,1.111803
439,11324,Shutter Island,6336,7.8,1.111803
3486,347764,Goddess of Love,9,6.2,1.089814
3978,351043,Amnesiac,52,4.1,1.089814
2915,206563,Trash,242,7.1,1.025708
2670,319910,Broken Horses,10,5.0,1.019427


Here, some of the recommended movies have a very low vote_count and vote_average (for example, *Fabled* has 0 votes).

### Ratings
- The recommender ignores ratings and popularity. A movie that shares much of its cast, genres or keywords with the input movies gets a high similarity score, even if its vote_count or vote_average is very poor, in which case it shouldn't be recommended to anyone.

- Therefore, we will add a mechanism to remove bad movies and return movies which are popular and have had a good critical response.

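- **Solution**: Take the top 25 movies by similarity score. Let $C$ be their mean vote_average and $m$ their median (50th percentile) vote_count. Calculate the weighted rating of each movie using IMDB's formula, $WR = \frac{v}{v+m} R + \frac{m}{v+m} C$, where $v$ is the movie's vote_count and $R$ its vote_average. Finally, keep only the movies whose weighted rating is at or above the 60th percentile of the weighted ratings.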

In [8]:
def calculate_weighted_rating(vote_count, vote_average, C, m):
    """Blend a movie's own rating with the mean rating, weighted by votes.

    Args:
        vote_count: Number of votes the movie received (v).
        vote_average: The movie's own average rating (R).
        C: Mean rating across the movies being compared.
        m: Vote-count weight given to ``C``.

    Returns:
        ``v / (v + m) * R + m / (v + m) * C``.
    """
    total_votes = vote_count + m
    own_weight = vote_count / total_votes
    prior_weight = m / total_votes
    return own_weight * vote_average + prior_weight * C

In [9]:
def calculate_and_filter_recommendations(
    top_recommendations, vote_count_quantile=0.50, rating_quantile=0.60
):
    """Score recommendations by weighted rating and keep the best-rated ones.

    ``C`` is the mean ``vote_average`` and ``m`` the ``vote_count_quantile``
    quantile of ``vote_count`` over ``top_recommendations``. Every movie gets
    a ``"Weighted Rating"`` from calculate_weighted_rating, and only movies
    at or above the ``rating_quantile`` quantile of that rating are kept.
    The three computed values are printed.

    Args:
        top_recommendations: DataFrame with ``id``, ``title``, ``vote_count``,
            ``vote_average`` and ``Similarity Score`` columns. Not modified.
        vote_count_quantile: Quantile of ``vote_count`` used as ``m``.
            Defaults to 0.50.
        rating_quantile: Quantile of the weighted rating used as the cut-off.
            Defaults to 0.60.

    Returns:
        A new DataFrame with the surviving rows and the columns ``id``,
        ``title``, ``vote_count``, ``vote_average``, ``Weighted Rating`` and
        ``Similarity Score``.
    """
    # Work on a copy: the argument is typically a filtered slice, and adding
    # a column to it in place both mutates the caller's frame and can raise
    # pandas' SettingWithCopyWarning.
    rated = top_recommendations.copy()

    # C (mean vote) and m (vote-count weight of the mean in the formula).
    C = rated["vote_average"].mean()
    m = rated["vote_count"].quantile(vote_count_quantile)

    rated["Weighted Rating"] = calculate_weighted_rating(
        rated["vote_count"],
        rated["vote_average"],
        C,
        m,
    )

    # Cut-off is relative to this batch rather than a fixed rating.
    min_weighted_rating = rated["Weighted Rating"].quantile(rating_quantile)

    print(f"Mean Vote (C): {C}")
    print(f"Minimum Vote Count Threshold (m): {m}")
    print(f"Minimum Weighted Rating Threshold (MIN_WEIGHTED_RATING): {min_weighted_rating}")

    rated = rated[rated["Weighted Rating"] >= min_weighted_rating]

    return rated[['id', 'title', 'vote_count', 'vote_average', 'Weighted Rating', 'Similarity Score']]


In [10]:
# Similarity-based candidates for the liked movies, then rating-based filtering.
top_recommendations = calculate_and_filter_recommendations(
    get_recommendations(df.loc[idxs].copy())
)

top_recommendations.loc[:, ['id', 'title', 'vote_count', 'vote_average', 'Weighted Rating', 'Similarity Score']]

Mean Vote (C): 5.892
Minimum Vote Count Threshold (m): 104.0
Minimum Weighted Rating Threshold (MIN_WEIGHTED_RATING): 6.439922109533469


Unnamed: 0,id,title,vote_count,vote_average,Weighted Rating,Similarity Score
100,4922,The Curious Case of Benjamin Button,3292,7.3,7.256881,1.13541
1010,4547,Panic Room,1267,6.5,6.453879,1.123607
1553,807,Se7en,5765,8.1,8.060874,1.123607
354,65754,The Girl with the Dragon Tattoo,2434,7.2,7.146402,1.111803
439,11324,Shutter Island,6336,7.8,7.769188,1.111803
2915,206563,Trash,242,7.1,6.736902,1.025708
2381,11468,Salton Sea,104,7.0,6.446,0.975971
3060,14283,The Red Violin,99,7.3,6.57866,0.961125
1328,23168,The Town,1468,7.0,6.926697,0.923646
1161,37799,The Social Network,3391,7.1,7.064054,0.92082
