In [1]:
#Imports
import pandas as pd 
import numpy as np

from ast import literal_eval
from tabulate import tabulate 
import itertools

import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import MinMaxScaler

q_movies = pd.read_parquet("../tmdb_api/tmdb_api_cleaned/movies_cleaned_hard.parquet")

In [2]:
## Sara's Filtering ##

# Function to get all unique genres in the dataset -> this allows to provide a list of available genres before user input
def get_all_genres(movies_df):
    unique_genres = set(genre.strip() for genres in movies_df['genres'].dropna() for genre in genres.split(','))
    return sorted(unique_genres)

# Function to filter movies by genre (case-insensitive) -> without this if user wrote 'action' instead of 'Action', no movies would come up 
def filter_movies_by_genre(movies_df, selected_genre):
    cleaned_genre = selected_genre.lower().strip()
    genre_filter = movies_df["genres"].apply(lambda x: cleaned_genre in [g.lower().strip() for g in x.split(",")])
    return movies_df.loc[genre_filter][0:10] 

In [3]:
## Benedikt's tdf_idf ##

# Schritt 1: Cosinus-Ähnlichkeit aus den Filmbeschreibungen (cos1)
tfidf = TfidfVectorizer(stop_words='english')
q_movies['overview'] = q_movies['overview'].fillna('')  # Leere Strings für NaN-Werte

tfidf_matrix = tfidf.fit_transform(q_movies['overview'])
cosine_sim1 = linear_kernel(tfidf_matrix, tfidf_matrix)

# Schritt 2: Cosinus-Ähnlichkeit aus der "metadata soup" (cos2)
# Beispiel für "metadata soup": Hier gehen wir davon aus, dass es sich um eine Kombination von verschiedenen Metadaten handelt
# Du kannst z.B. Genre, Jahr, Bewertung, etc. kombinieren.
# Hier als Dummy-Beispiel kombinieren wir nur Genre-Daten (ersetze dies durch deine tatsächlichen Daten):

q_movies_3 = q_movies[["genres", "keywords", "title", "tagline", "cast", "director"]].copy()
# q_movies_3 = q_movies[["genres", "keywords", "cast", "director"]].copy()

# NaN-Werte vor dem Join entfernen & dann kombinieren
q_movies_3['soup'] = q_movies_3[["genres", "keywords", "title", "tagline", "cast", "director"]].fillna('').astype(str).agg(' '.join, axis=1)
# q_movies_3['soup'] = q_movies_3[["genres", "keywords", "title", "tagline", "cast", "director"]].fillna('').astype(str).agg(' '.join, axis=1)

# TF-IDF-Vektorisierung
metadata_vectorizer = TfidfVectorizer(stop_words='english')
metadata_matrix = metadata_vectorizer.fit_transform(q_movies_3['soup'])

# Kosinus-Ähnlichkeit berechnen
cosine_sim2 = linear_kernel(metadata_matrix, metadata_matrix)

# Schritt 3: Kombinieren der beiden Matrizen (Gewichtung ist optional)
# Beispiel für Gewichtung: cos1 bekommt 0.7 Gewicht, cos2 bekommt 0.3 Gewicht

# Sicherstellen, dass beide Matrizen Werte im gleichen Bereich haben.
# scaler = MinMaxScaler()
# cosine_sim1_scaled = scaler.fit_transform(cosine_sim1)
# cosine_sim2_scaled = scaler.fit_transform(cosine_sim2)

# cosine_sim_combined = 0.3 * cosine_sim1 + 0.7 * cosine_sim2
cosine_sim_combined = 0.8 * cosine_sim1 + 0.2 * cosine_sim2

## Updated for Filtering ##
## We prentend ther was a user user interface. User selected TDF based on genre filter or unfiltered TDF (0 or 1) 
## Selectioed Option: user_sel

def get_recommendations_filtered(title, genre, user_sel, cosine_sim1=cosine_sim1, cosine_sim2=cosine_sim2, cosine_sim_combined=cosine_sim_combined, method="combined"):
    # Der Index des Filmes, der dem Titel entspricht
    indices = pd.Series(q_movies.index, index=q_movies['title']).to_dict()

    idx = indices[title]  # Sicherer Weg, um den Index zu bekommen
    
    if method == "cos1":
        cosine_sim = cosine_sim1
    elif method == "cos2":
        cosine_sim = cosine_sim2
    else:
        cosine_sim = cosine_sim_combined
    
    # Berechne die paarweisen Ähnlichkeiten
    sim_scores = list(enumerate(cosine_sim[idx]))

    ## User selected genre-based filtering of TDF (1)
    if user_sel == 1:
    
        filtered_sim_scores = [
            (i, score) for i, score in sim_scores 
            if genre.lower().strip() in [g.lower().strip() for g in q_movies.iloc[i]['genres'].split(",")]
        ]
    else: 
        ## User selected unfiltered TDF (0)
        filtered_sim_scores = sim_scores
    
    # Sort by similarity score and take top 10 (excluding the input movie if present)
    filtered_sim_scores = sorted(filtered_sim_scores, key=lambda x: x[1], reverse=True)
    filtered_sim_scores = [s for s in filtered_sim_scores if s[0] != idx][1:21]
    
    # Get the movie indices
    movie_indices = [i[0] for i in filtered_sim_scores]
    
    # Return the filtered DataFrame
    return q_movies.iloc[movie_indices]

In [4]:
## Search Functions ##
available_genres = get_all_genres(q_movies)
available_genres_lower = [genre.lower() for genre in available_genres]
selected_genre = None
selected_title = None 

def search_genre(user_genre):
    global selected_genre
    # Exact Match
    if user_genre in available_genres_lower:
        selected_genre = available_genres[available_genres_lower.index(user_genre)]
        filtered_movies = filter_movies_by_genre(q_movies, selected_genre)
        
        if not filtered_movies.empty:
            print(f"\nTop 10 {selected_genre} Movies (Ranked by Weighted Rating):\n")
            print(tabulate(filtered_movies[['title', 'score', 'vote_average', 'vote_count']], 
                           headers="keys", tablefmt="pretty", showindex=False))
            return
        else:
            print("\nNo movies found in this genre.")
            return
    
    # Partial Match
    partial_matches = [genre for genre in available_genres if user_genre in genre.lower()]
    
    if partial_matches:
        print("\nDid you mean one of these? " + ", ".join(partial_matches))
        user_retry = input("Please re-enter genre: ").strip().lower()
        return search_genre(user_retry)
    
    user_retry = input("\nGenre not found, please try again: ").strip().lower()
    return search_genre(user_retry)

from tabulate import tabulate

selected_genre = None
selected_title = None

def search_titles(user_title):
    global selected_title
    global selected_genre
    
    # Exact Match
    if user_title.lower() in q_movies['title'].str.lower().values:
        selected_title = q_movies['title'][q_movies['title'].str.lower() == user_title.lower()].iloc[0]
        
        # This function call implements a HARD CODED user selection get_recommendations_filtered
        # User selects "genre based TDF": Thats 1 
        #  User selects "unfiltered TDF": Thats 0 
        
        filtered_recommendations = get_recommendations_filtered(selected_title, selected_genre, 0, method="combined")
    
        if isinstance(filtered_recommendations, str):
            print(f"\n{filtered_recommendations}")
            return
        
        print(f"\nTop 10 {selected_genre} Movies Similar to {selected_title}:\n")
        print(tabulate(filtered_recommendations[['title', 'score', 'vote_average', 'vote_count']], 
                       headers="keys", tablefmt="pretty", showindex=False))
        return
    
    # Partial Match
    partial_matches = q_movies['title'][q_movies['title'].str.lower().str.contains(user_title.lower(), na=False)]
    if not partial_matches.empty:
        print(f"\nDid you mean one of these?\n{'\n'.join(partial_matches[:5])}")
        user_retry = input("Please re-enter title: ").strip().lower()
        return search_titles(user_retry)
    
    user_retry = input("\nTitle not found, please try again: ").strip().lower()
    return search_titles(user_retry)

In [5]:
##  Search ##
print("\nAvailable Genres:\n" + ", ".join(available_genres) + "\n")
user_input = input("Enter a genre: ").strip().lower()
search_genre(user_input)

print(f"\nIf you want to search for more {selected_genre} movies, we can search by movie.")
user_input = input("\nEnter a movie: ").strip().lower()
search_titles(user_input)


Available Genres:
Action, Adventure, Animation, Comedy, Crime, Documentary, Drama, Family, Fantasy, History, Horror, Music, Mystery, Romance, Science Fiction, TV Movie, Thriller, War, Western



Enter a genre:  War



Top 10 War Movies (Ranked by Weighted Rating):

+------------------------+-------------------+--------------+------------+
|         title          |       score       | vote_average | vote_count |
+------------------------+-------------------+--------------+------------+
|    Schindler's List    | 8.523074580590686 |    8.567     |   16198    |
| Grave of the Fireflies | 8.337377761360749 |    8.452     |    5757    |
|      The Pianist       | 8.329785485155725 |     8.4      |    9368    |
|     Apocalypse Now     | 8.225001245936735 |     8.3      |    8382    |
|  Inglourious Basterds  | 8.172836579272605 |     8.2      |   22609    |
|  Saving Private Ryan   | 8.162024970362966 |     8.2      |   16093    |
|     Hacksaw Ridge      | 8.156299085464088 |     8.2      |   13948    |
|   The Great Dictator   | 8.125292263334806 |    8.301     |    3420    |
|     Paths of Glory     | 8.100586671726164 |     8.3      |    2979    |
|   Full Metal Jacket    | 8.04579324478022  |     


Enter a movie:  Schindler's List



Top 10 War Movies Similar to Schindler's List:

+-------------------------+-------------------+--------------+------------+
|          title          |       score       | vote_average | vote_count |
+-------------------------+-------------------+--------------+------------+
|       Resistance        | 6.594811237429137 |     7.0      |    433     |
|      The Tin Drum       | 6.581604773686326 |    6.944     |    474     |
|     The Railway Man     | 6.644061740228237 |     6.9      |    739     |
|   The Counterfeiters    | 6.956345322013464 |     7.4      |    622     |
|    Life Is Beautiful    | 8.394890741357733 |    8.446     |   13224    |
|        Defiance         | 6.685360350005975 |     6.8      |    1750    |
|      A Hidden Life      | 6.77400494304656  |     7.1      |    691     |
|      Woman in Gold      | 7.07634605045883  |     7.3      |    1383    |
|  Inglourious Basterds   | 8.172836579272605 |     8.2      |   22609    |
|     The Last Metro      | 6.774317787