In [2]:
import pandas as pd 
import numpy as np 
from ast import literal_eval
from tabulate import tabulate  # Import tabulate for clean table formatting
import itertools


In [3]:
# Previously used source, kept disabled for reference:
# movies = pd.read_parquet("tmbd_exports/tmbd_5000_api-call_merged.parquet")
# Load the movie table from a parquet file (the path is relative to the working directory).
movies = pd.read_parquet("../tmdb_api/tmdb_api_cleaned/movies_cleaned_hard.parquet")

In [4]:
# List the column names of the loaded frame
movies.columns

Index(['movie_id', 'budget', 'genres', 'homepage', 'keywords',
       'original_language', 'title', 'original_title', 'overview',
       'popularity', 'production_companies', 'production_countries',
       'release_date', 'revenue', 'runtime', 'spoken_languages', 'status',
       'tagline', 'vote_average', 'vote_count', 'cast', 'director',
       'poster_url', 'score'],
      dtype='object')

In [5]:
# Preview the first five rows
movies.head(5)

Unnamed: 0,movie_id,budget,genres,homepage,keywords,original_language,title,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,vote_average,vote_count,cast,director,poster_url,score
0,278,25000000,"Drama, Crime",,"prison, friendship, police brutality, corrupti...",en,The Shawshank Redemption,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,80.704,...,142.0,English,Released,Fear can hold you prisoner. Hope can set you f...,8.7,27833,"Morgan Freeman, Tim Robbins, Bob Gunton, Willi...",Frank Darabont,https://image.tmdb.org/t/p/w500/9cqNxx0GxF0bfl...,8.67294
1,238,6000000,"Drama, Crime",http://www.thegodfather.com/,"based on novel or book, loss of loved one, lov...",en,The Godfather,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",80.976,...,175.0,"English, Italiano, Latin",Released,An offer you can't refuse.,8.689,21124,"Marlon Brando, Al Pacino, James Caan, Robert D...",Francis Ford Coppola,https://image.tmdb.org/t/p/w500/3bhkrj58Vtu7en...,8.6536
2,240,13000000,"Drama, Crime",,"new year's eve, new york city, based on novel ...",en,The Godfather Part II,The Godfather Part II,In the continuing saga of the Corleone crime f...,43.3,...,202.0,"English, Italiano, Latin, Español",Released,The rise and fall of the Corleone empire.,8.6,12750,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",Francis Ford Coppola,https://image.tmdb.org/t/p/w500/hek3koDUyRQk7F...,8.543745
3,424,22000000,"Drama, History, War",http://www.schindlerslist.com/,"factory, hero, nazi, concentration camp, ss (n...",en,Schindler's List,Schindler's List,The true story of how businessman Oskar Schind...,42.53,...,195.0,"Deutsch, Polski, עִבְרִית, English",Released,"Whoever saves one life, saves the world entire.",8.567,16198,"Liam Neeson, Ben Kingsley, Ralph Fiennes, Caro...",Steven Spielberg,https://image.tmdb.org/t/p/w500/sF1U4EUQS8YHUY...,8.523075
4,157336,165000000,"Adventure, Drama, Science Fiction",http://www.interstellarmovie.net/,"rescue, future, spacecraft, race against time,...",en,Interstellar,Interstellar,The adventures of a group of explorers who mak...,100.309,...,169.0,English,Released,Mankind was born on Earth. It was never meant ...,8.5,36654,"Matthew McConaughey, Anne Hathaway, Michael Ca...",Christopher Nolan,https://image.tmdb.org/t/p/w500/gEU2QniE6E77NI...,8.480908


* STEP 1: TURN STRING COLUMNS INTO USEABLE OBJECTS

In [6]:
# Columns stored as comma-separated text that should become Python lists
columns_to_fix = ['cast', 'genres', 'director', 'keywords']


def _to_list(value):
    """Convert a single cell into a list of strings.

    * a string is split on ', '
    * a list is returned unchanged, so running this cell a second time does
      not wipe data that was already converted
    * anything else (e.g. a missing value) becomes an empty list
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        return value.split(', ')
    return []


# Convert each cell directly to a list. This avoids rendering the list as text
# and parsing it back with literal_eval, and it does not rely on the
# deprecated DataFrame.applymap.
for column in columns_to_fix:
    movies[column] = movies[column].map(_to_list)

  movies[columns_to_fix] = movies[columns_to_fix].applymap(literal_eval)


In [7]:
# Check: the converted columns should now display as lists
movies[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,The Shawshank Redemption,"[Morgan Freeman, Tim Robbins, Bob Gunton, Will...",[Frank Darabont],"[prison, friendship, police brutality, corrupt...","[Drama, Crime]"
1,The Godfather,"[Marlon Brando, Al Pacino, James Caan, Robert ...",[Francis Ford Coppola],"[based on novel or book, loss of loved one, lo...","[Drama, Crime]"
2,The Godfather Part II,"[Al Pacino, Robert Duvall, Diane Keaton, Rober...",[Francis Ford Coppola],"[new year's eve, new york city, based on novel...","[Drama, Crime]"


In [8]:
# Return the first 3 elements of the list, or the entire list if it is shorter. This makes the handling easier.
def get_list(x, top_n=3):
    """Return at most the first ``top_n`` entries of a list-valued cell.

    Args:
        x: The cell value. Either a list of plain values (used as they are) or
            a list made up only of dictionaries, in which case the ``'name'``
            value of every dictionary that has one is used.
        top_n: Maximum number of entries to keep (non-negative, default 3).

    Returns:
        A new list holding the first ``top_n`` entries, or an empty list when
        ``x`` is not a list (missing or malformed data).
    """
    if not isinstance(x, list):
        return []  # missing/malformed data

    # A list of dictionaries needs its 'name' values extracted; indexing a
    # plain string with ['name'] would raise a TypeError, hence the type check.
    if all(isinstance(i, dict) for i in x):
        names = [i['name'] for i in x if 'name' in i]
    else:
        names = x

    # Slicing already copes with lists shorter than top_n.
    return names[:top_n]

# Columns whose lists are shortened with get_list
features = ['cast', 'keywords', 'genres']

for feature in features:
    shortened = movies[feature].map(get_list)
    movies[feature] = shortened

In [9]:
# Check: cast, keywords and genres should now hold at most three entries each
movies[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,The Shawshank Redemption,"[Morgan Freeman, Tim Robbins, Bob Gunton]",[Frank Darabont],"[prison, friendship, police brutality]","[Drama, Crime]"
1,The Godfather,"[Marlon Brando, Al Pacino, James Caan]",[Francis Ford Coppola],"[based on novel or book, loss of loved one, lo...","[Drama, Crime]"
2,The Godfather Part II,"[Al Pacino, Robert Duvall, Diane Keaton]",[Francis Ford Coppola],"[new year's eve, new york city, based on novel...","[Drama, Crime]"


* STEP 2: CREATE FILTER BASED ON GENRE USER INPUT, CALCULATE WEIGHTED RATING OF SUBSET AND RETURN TOP 10 MOVIES

In [10]:
# Function to get all unique genres in the dataset -> this allows to provide a list of available genres before user input
def get_all_genres(movies_df):
    """Return every distinct genre found in ``movies_df['genres']``, sorted.

    Each entry of the ``genres`` column is iterated, so it has to be an
    iterable of genre names.
    """
    seen = set()
    for genre_list in movies_df['genres']:
        seen.update(genre_list)
    return sorted(seen)

# Function to filter movies by genre (case-insensitive) -> without this if user wrote 'action' instead of 'Action', no movies would come up 
def filter_movies_by_genre(movies_df, selected_genre):
    """Return the rows of ``movies_df`` whose ``genres`` list contains ``selected_genre``.

    The comparison ignores case: both the requested genre and every genre in
    a row are lowercased before they are compared.
    """
    wanted = selected_genre.lower()

    def has_genre(genre_list):
        for name in genre_list:
            if name.lower() == wanted:
                return True
        return False

    genre_mask = movies_df['genres'].apply(has_genre)
    return movies_df[genre_mask]

# Function to calculate weighted rating based on vote count threshold
def weighted_rating_threshold(data_frame, qntl=0.9):
    """Keep the most-voted movies of ``data_frame`` and give them a weighted score.

    Both statistics are computed on the frame that is passed in (for example
    an already filtered subset):

    * ``m`` - the ``qntl`` quantile of ``vote_count``; rows with fewer votes
      are dropped.
    * ``C`` - the mean of ``vote_average``.

    The score of a remaining row is ``v/(v+m) * R + m/(m+v) * C`` with
    ``R = vote_average`` and ``v = vote_count``.

    Args:
        data_frame: DataFrame with the columns 'vote_average' and 'vote_count'.
        qntl: Quantile of 'vote_count' used as the minimum-votes threshold.

    Returns:
        A copy of the rows with ``vote_count >= m`` and a 'score' column holding
        the weighted rating (an existing 'score' column is overwritten in the
        copy). ``data_frame`` itself is not modified.

    Raises:
        ValueError: If 'vote_average' or 'vote_count' is not a column.
    """
    # NOTE: this function is defined identically in several cells of this
    # notebook; keep the copies in sync or define it once in a shared cell.
    if not {"vote_average", "vote_count"}.issubset(data_frame.columns):
        raise ValueError("The columns 'vote_average' and 'vote_count' are missing in the DataFrame!")

    m = data_frame["vote_count"].quantile(qntl)  # Minimum votes required
    C = data_frame["vote_average"].mean()  # Mean vote of the frame passed in

    # Keep only the movies that meet the vote threshold
    q_movies = data_frame.loc[data_frame["vote_count"] >= m].copy()

    # Vectorized weighted rating: whole-column arithmetic instead of a
    # row-by-row apply; the formula is unchanged.
    R = q_movies["vote_average"]
    v = q_movies["vote_count"]
    q_movies["score"] = (v / (v + m) * R) + (m / (m + v) * C)

    return q_movies

# Show the genres the user can choose from
available_genres = get_all_genres(movies)
print("\nAvailable Genres:\n", ", ".join(available_genres), "\n", sep="")

# Ask which genre should be ranked
user_genre = input("Enter a genre from the list above: ").strip()

# Keep only the movies tagged with that genre
filtered_movies = filter_movies_by_genre(movies, user_genre)

if filtered_movies.empty:
    print("\nNo movies found in this genre.")
else:
    # Score the genre subset, then keep the ten best-scoring movies
    ranked_movies = weighted_rating_threshold(filtered_movies)
    top_movies = ranked_movies.sort_values(by="score", ascending=False).head(10)

    print("\nTop 10 Movies in the Selected Genre (Ranked by Weighted Rating):\n")
    # "pretty" renders an ASCII grid; showindex=False hides the DataFrame index
    print(tabulate(top_movies[['title', 'score', 'vote_average', 'vote_count']],
                   headers="keys", tablefmt="pretty", showindex=False))



Available Genres:
Action, Adventure, Animation, Comedy, Crime, Documentary, Drama, Family, Fantasy, History, Horror, Music, Mystery, Romance, Science Fiction, TV Movie, Thriller, War, Western



Enter a genre from the list above:  Thriller



Top 10 Movies in the Selected Genre (Ranked by Weighted Rating):

+--------------------------+-------------------+--------------+------------+
|          title           |       score       | vote_average | vote_count |
+--------------------------+-------------------+--------------+------------+
|       Pulp Fiction       | 8.204222910883608 |     8.5      |   28399    |
|         Parasite         | 8.082735526338777 |     8.5      |   18749    |
|          Se7en           | 8.04392901321065  |     8.4      |   21454    |
| The Silence of the Lambs | 7.920322964514592 |    8.345     |   16607    |
|      Shutter Island      | 7.912210607267146 |    8.201     |   24288    |
|   Inglourious Basterds   | 7.893658195191541 |     8.2      |   22609    |
|          Joker           | 7.878221691251503 |     8.14     |   26178    |
|       The Shining        | 7.827537536465515 |     8.2      |   17753    |
|         Memento          | 7.777936406549166 |     8.2      |   15109    |
|        

* STEP 2.B: FILTER WITH DIRECTOR
* Caveat: for now you need to enter the director's full name.

In [11]:
# Function to get all unique directors in the dataset -> this allows to provide a list of available directors before user input
def get_all_directors(movies_df):
    """Return every distinct director found in ``movies_df['director']``, sorted.

    Each entry of the ``director`` column is iterated, so it has to be an
    iterable of director names.
    """
    seen = set()
    for director_list in movies_df['director']:
        seen.update(director_list)
    return sorted(seen)

# Function to filter movies by director (case-insensitive) -> without this if user wrote 'roland emmerich' instead of 'Roland Emmerich', no movies would come up
def filter_movies_by_director(movies_df, selected_director):
    """Return the rows of ``movies_df`` whose ``director`` list contains ``selected_director``.

    The comparison ignores case, but the full name has to match: both the
    requested name and every name in a row are lowercased and then compared
    for equality.
    """
    wanted = selected_director.lower()

    def directed_by(director_list):
        for name in director_list:
            if name.lower() == wanted:
                return True
        return False

    director_mask = movies_df['director'].apply(directed_by)
    return movies_df[director_mask]

# Function to calculate weighted rating based on vote count threshold
def weighted_rating_threshold(data_frame, qntl=0.9):
    """Keep the most-voted movies of ``data_frame`` and give them a weighted score.

    Both statistics are computed on the frame that is passed in (for example
    an already filtered subset):

    * ``m`` - the ``qntl`` quantile of ``vote_count``; rows with fewer votes
      are dropped.
    * ``C`` - the mean of ``vote_average``.

    The score of a remaining row is ``v/(v+m) * R + m/(m+v) * C`` with
    ``R = vote_average`` and ``v = vote_count``.

    Args:
        data_frame: DataFrame with the columns 'vote_average' and 'vote_count'.
        qntl: Quantile of 'vote_count' used as the minimum-votes threshold.

    Returns:
        A copy of the rows with ``vote_count >= m`` and a 'score' column holding
        the weighted rating (an existing 'score' column is overwritten in the
        copy). ``data_frame`` itself is not modified.

    Raises:
        ValueError: If 'vote_average' or 'vote_count' is not a column.
    """
    # NOTE: this function is defined identically in several cells of this
    # notebook; keep the copies in sync or define it once in a shared cell.
    if not {"vote_average", "vote_count"}.issubset(data_frame.columns):
        raise ValueError("The columns 'vote_average' and 'vote_count' are missing in the DataFrame!")

    m = data_frame["vote_count"].quantile(qntl)  # Minimum votes required
    C = data_frame["vote_average"].mean()  # Mean vote of the frame passed in

    # Keep only the movies that meet the vote threshold
    q_movies = data_frame.loc[data_frame["vote_count"] >= m].copy()

    # Vectorized weighted rating: whole-column arithmetic instead of a
    # row-by-row apply; the formula is unchanged.
    R = q_movies["vote_average"]
    v = q_movies["vote_count"]
    q_movies["score"] = (v / (v + m) * R) + (m / (m + v) * C)

    return q_movies

# Show every director the user can choose from
available_director = get_all_directors(movies)
print("\nAvailable Directors:\n", ", ".join(available_director), "\n", sep="")

# Ask for the director (full name; the match ignores case)
user_director = input("Enter a director: ").strip()

# Keep only the movies credited to that director
filtered_movies = filter_movies_by_director(movies, user_director)

if filtered_movies.empty:
    print("\nNo movies found for this director.")
else:
    # NOTE(review): weighted_rating_threshold is called with its default
    # quantile, so only the director's movies at or above that vote_count
    # quantile are scored; the table may list fewer than ten titles.
    ranked_movies = weighted_rating_threshold(filtered_movies)
    top_movies = ranked_movies.sort_values(by="score", ascending=False).head(10)

    print("\nTop 10 Movies from selected Director (Ranked by Weighted Rating):\n")
    # "pretty" renders an ASCII grid; showindex=False hides the DataFrame index
    print(tabulate(top_movies[['title', 'score', 'vote_average', 'vote_count']],
                   headers="keys", tablefmt="pretty", showindex=False))



Available Directors:
A.R. Murugadoss, Aamir Khan, Aaron Blaise, Aaron Hann, Aaron Harvey, Aaron Horvath, Aaron Moorhead, Aaron Nee, Aaron Schneider, Aaron Seltzer, Aaron Sorkin, Abbas Kiarostami, Abby Kohn, Abdellatif Kechiche, Abderrahmane Sissako, Abe Forsythe, Abel Ferrara, Abel Ferry, Adam Alleca, Adam B. Stein, Adam Berg, Adam Brooks, Adam Doench, Adam Egypt Mortimer, Adam Elliot, Adam Gierasch, Adam Green, Adam MacDonald, Adam Marcus, Adam Mason, Adam McKay, Adam Nee, Adam Randall, Adam Rifkin, Adam Robitel, Adam Salky, Adam Schindler, Adam Shankman, Adam Smith, Adam Wingard, Adam Wood, Adele Lim, Adil El Arbi, Aditya Chopra, Aditya Dhar, Adrian Grünberg, Adrian Lyne, Adrienne Shelly, Adrián García Bogliano, Afonso Poyart, Agnieszka Holland, Agnieszka Wojtowicz-Vosloo, Agnès Jaoui, Agnès Obadia, Agnès Varda, Aharon Keshales, Aisling Walsh, Akan Satayev, Aki Kaurismäki, Akifumi Zako, Akira Kurosawa, Akira Takamura, Akiva Goldsman, Akiva Schaffer, Al Campbell, Alain Berbérian, Ala

Enter a director:  Roland Emmerich



Top 10 Movies from selected Director (Ranked by Weighted Rating):

+------------------+-------------------+--------------+------------+
|      title       |       score       | vote_average | vote_count |
+------------------+-------------------+--------------+------------+
| Independence Day | 6.622223268466207 |    6.892     |    9704    |
|       2012       | 6.038456437718277 |     5.8      |   12063    |
+------------------+-------------------+--------------+------------+


* STEP 2C: FILTER BY GENRE AND DIRECTOR

In [12]:
# Function to get all unique genres in the dataset
def get_all_genres(movies_df):
    """Collect the distinct genre names of ``movies_df['genres']`` in sorted order.

    Every entry of the column is treated as an iterable of genre names.
    """
    distinct = set().union(*movies_df['genres'])
    return sorted(distinct)

# Function to get all directors for a selected genre
def get_directors_for_genre(movies_df, selected_genre):
    """Return the sorted, distinct directors of all movies in ``selected_genre``.

    Args:
        movies_df: DataFrame whose 'genres' and 'director' columns hold lists
            of strings.
        selected_genre: Genre name; compared case-insensitively.

    Returns:
        Sorted list of unique director names; empty when no movie matches.
    """
    wanted = selected_genre.lower()  # lowercase once, not once per row

    # Filter movies by genre (case-insensitive)
    in_genre = movies_df['genres'].apply(
        lambda genres: any(g.lower() == wanted for g in genres)
    )
    genre_filtered = movies_df[in_genre]

    # Flatten the per-movie director lists lazily and de-duplicate them
    return sorted(set(itertools.chain.from_iterable(genre_filtered['director'])))

# Function to filter movies by genre (case-insensitive)
def filter_movies_by_genre(movies_df, selected_genre):
    """Select the rows whose ``genres`` list includes ``selected_genre``, ignoring case."""
    needle = selected_genre.lower()
    matches = movies_df['genres'].apply(
        lambda genre_list: needle in (name.lower() for name in genre_list)
    )
    return movies_df[matches]

# Function to filter movies by director (case-insensitive)
def filter_movies_by_director(movies_df, selected_director):
    """Select the rows whose ``director`` list includes ``selected_director``, ignoring case."""
    needle = selected_director.lower()
    matches = movies_df['director'].apply(
        lambda names: needle in (name.lower() for name in names)
    )
    return movies_df[matches]

# Function to calculate weighted rating based on vote count threshold
def weighted_rating_threshold(data_frame, qntl=0.9):
    """Keep the most-voted movies of ``data_frame`` and give them a weighted score.

    Both statistics are computed on the frame that is passed in (for example
    an already filtered subset):

    * ``m`` - the ``qntl`` quantile of ``vote_count``; rows with fewer votes
      are dropped.
    * ``C`` - the mean of ``vote_average``.

    The score of a remaining row is ``v/(v+m) * R + m/(m+v) * C`` with
    ``R = vote_average`` and ``v = vote_count``.

    Args:
        data_frame: DataFrame with the columns 'vote_average' and 'vote_count'.
        qntl: Quantile of 'vote_count' used as the minimum-votes threshold.

    Returns:
        A copy of the rows with ``vote_count >= m`` and a 'score' column holding
        the weighted rating (an existing 'score' column is overwritten in the
        copy). ``data_frame`` itself is not modified.

    Raises:
        ValueError: If 'vote_average' or 'vote_count' is not a column.
    """
    # NOTE: this function is defined identically in several cells of this
    # notebook; keep the copies in sync or define it once in a shared cell.
    if not {"vote_average", "vote_count"}.issubset(data_frame.columns):
        raise ValueError("The columns 'vote_average' and 'vote_count' are missing in the DataFrame!")

    m = data_frame["vote_count"].quantile(qntl)  # Minimum votes required
    C = data_frame["vote_average"].mean()  # Mean vote of the frame passed in

    # Keep only the movies that meet the vote threshold
    q_movies = data_frame.loc[data_frame["vote_count"] >= m].copy()

    # Vectorized weighted rating: whole-column arithmetic instead of a
    # row-by-row apply; the formula is unchanged.
    R = q_movies["vote_average"]
    v = q_movies["vote_count"]
    q_movies["score"] = (v / (v + m) * R) + (m / (m + v) * C)

    return q_movies

# Step 0: Display available genres
available_genres = get_all_genres(movies)
print("\nAvailable Genres:\n" + ", ".join(available_genres) + "\n")

# Step 1: Get user input for genre
user_genre = input("Enter a genre from the list above: ").strip()

# Step 2: Filter dataset based on selected genre
filtered_movies_by_genre = filter_movies_by_genre(movies, user_genre)

# Step 3: Get a list of directors for the selected genre
available_directors = get_directors_for_genre(filtered_movies_by_genre, user_genre)

# Branch instead of calling exit(): when nothing matches, the remaining steps
# are simply skipped and the interpreter/kernel session keeps running.
if not available_directors:
    print(f"\nNo directors found for the genre '{user_genre}'.")
else:
    print("\nAvailable Directors in this Genre:\n" + ", ".join(available_directors) + "\n")

    # Step 4: Get user input for director (full name; the match ignores case)
    user_director = input("Enter a director from the list above: ").strip()

    # Step 5: Filter dataset by both genre and director
    filtered_movies_by_genre_and_director = filter_movies_by_director(filtered_movies_by_genre, user_director)

    # Step 6: Apply weighted rating function to the filtered dataset
    if not filtered_movies_by_genre_and_director.empty:
        ranked_movies = weighted_rating_threshold(filtered_movies_by_genre_and_director)
        # Step 7: Sort by weighted score and return top 10
        top_movies = ranked_movies.sort_values(by="score", ascending=False).head(10)

        print("\nTop 10 Movies in the Selected Genre and Director (Ranked by Weighted Rating):\n")
        print(tabulate(top_movies[['title', 'score', 'vote_average', 'vote_count']],
                       headers="keys", tablefmt="pretty", showindex=False))  # Pretty table format

    else:
        print("\nNo movies found in this genre and director combination.")


Available Genres:
Action, Adventure, Animation, Comedy, Crime, Documentary, Drama, Family, Fantasy, History, Horror, Music, Mystery, Romance, Science Fiction, TV Movie, Thriller, War, Western



Enter a genre from the list above:  Action



Available Directors in this Genre:
A.R. Murugadoss, Aaron Harvey, Aaron Horvath, Aaron Nee, Aaron Schneider, Aaron Seltzer, Abel Ferrara, Abel Ferry, Adam Berg, Adam Doench, Adam McKay, Adam Nee, Adam Randall, Adam Rifkin, Adam Shankman, Adam Smith, Adam Wingard, Adil El Arbi, Aditya Dhar, Adrian Grünberg, Akifumi Zako, Akira Kurosawa, Akiva Schaffer, Alain Desrochers, Alan Mak Siu-Fai, Alan Parker, Alan Taylor, Albert Hughes, Albert Pyun, Alejandro Monteverde, Aleksander Bach, Alessandro Carloni, Alex Kurtzman, Alex Proyas, Alex Winter, Alex Zamm, Alexander Witt, Alexey Sidorov, Alfonso Cuarón, Alfred Hitchcock, Alister Grierson, Allan A. Goldstein, Allen Hughes, Amp Wong, Anders Thomas Jensen, Andrea Berloff, Andrei Konchalovsky, Andrew Davis, Andrew Fleming, Andrew Goth, Andrew Lau Wai-Keung, Andrew Lawrence, Andrew Marton, Andrew Morahan, Andrew Niccol, Andrew Sipes, Andrew Stanton, Andrew Traucki, Andrzej Bartkowiak, André Øvredal, Andy Fickman, Andy Muschietti, Andy Serkis, Andy

Enter a director from the list above:  Roland Emmerich



Top 10 Movies in the Selected Genre and Director (Ranked by Weighted Rating):

+------------------+-------------------+--------------+------------+
|      title       |       score       | vote_average | vote_count |
+------------------+-------------------+--------------+------------+
| Independence Day | 6.577590909090909 |    6.892     |    9704    |
|       2012       | 6.006492229688812 |     5.8      |   12063    |
+------------------+-------------------+--------------+------------+
