In [1]:
#Imports
import pandas as pd 
import numpy as np

from tabulate import tabulate 

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load the movie table from a parquet file (the path is relative to the working directory).
# Later cells read these columns from it: title, genres, overview, keywords, tagline,
# cast, director, score, vote_average and vote_count.
q_movies = pd.read_parquet("movies_cleaned_hard.parquet")

In [2]:
## Sara's Filtering ##

# Function to get all unique genres in the dataset -> lets us show the list of available genres before asking for user input
def get_all_genres(movies_df):
    """Return the sorted list of distinct genre names found in movies_df['genres'].

    Each non-missing cell is split on commas; every piece is stripped and
    title-cased before de-duplication.

    Args:
        movies_df: DataFrame with a 'genres' column of comma-separated strings.

    Returns:
        Alphabetically sorted list of title-cased genre names.
    """
    unique_genres = set()
    for genres in movies_df['genres'].dropna():
        for genre in genres.split(','):
            label = genre.strip().title()
            # An empty cell or a stray comma ("Action,,Drama") yields blank pieces;
            # skip them so no empty "genre" is offered to the user.
            if label:
                unique_genres.add(label)
    return sorted(unique_genres)

# Function to filter movies by genre (case-insensitive) -> without this, a user typing 'action' instead of 'Action' would get no results
def filter_movies_by_genre(movies_df, selected_genre, top_n=10):
    """Return the first top_n rows whose 'genres' cell lists selected_genre.

    Matching is case-insensitive and on whole comma-separated entries, so
    "act" does not match "Action". Rows are returned in the order they
    appear in movies_df; no sorting happens here.

    Args:
        movies_df: DataFrame with a 'genres' column of comma-separated strings.
        selected_genre: Genre name to look for; case and surrounding whitespace are ignored.
        top_n: Maximum number of rows to return (default 10).

    Returns:
        DataFrame holding at most top_n matching rows.
    """
    cleaned_genre = selected_genre.lower().strip()

    def _lists_genre(value):
        # Missing genres (None/NaN) are not strings and can never match.
        if not isinstance(value, str):
            return False
        return cleaned_genre in (g.lower().strip() for g in value.split(","))

    genre_filter = movies_df["genres"].map(_lists_genre).astype(bool)
    return movies_df.loc[genre_filter].head(top_n)

In [3]:
## Benedikt's TF-IDF ##

# Step 1: similarity between movies based on their plot overviews (cos1)
tfidf = TfidfVectorizer(stop_words='english')
q_movies['overview'] = q_movies['overview'].fillna('')  # missing overviews become empty strings
tfidf_matrix = tfidf.fit_transform(q_movies['overview'])
# linear_kernel is a plain dot product; it equals cosine similarity here because
# TfidfVectorizer L2-normalises each row by default.
cosine_sim1 = linear_kernel(tfidf_matrix, tfidf_matrix)

# Step 2: similarity between movies based on a "metadata soup" (cos2)
# The soup is one text string per movie, built by joining the columns listed below.
SOUP_COLUMNS = ["genres", "keywords", "title", "tagline", "cast", "director"]
q_movies_soup = q_movies[SOUP_COLUMNS].copy()

# Replace missing values with '' and cast to str before joining, so every row joins cleanly
q_movies_soup['soup'] = q_movies_soup[SOUP_COLUMNS].fillna('').astype(str).agg(' '.join, axis=1)

# TF-IDF vectorisation of the soup
metadata_vectorizer = TfidfVectorizer(stop_words='english')
metadata_matrix = metadata_vectorizer.fit_transform(q_movies_soup['soup'])

# Pairwise similarity of the soup vectors
cosine_sim2 = linear_kernel(metadata_matrix, metadata_matrix)

# Step 3: blend the two matrices with fixed weights (overview 0.8, metadata 0.2)
OVERVIEW_WEIGHT = 0.8
METADATA_WEIGHT = 0.2
cosine_sim_combined = OVERVIEW_WEIGHT * cosine_sim1 + METADATA_WEIGHT * cosine_sim2

## Updated for Filtering ##

def get_recommendations_filtered(df, title, genre=None, cosine_sim1=None, cosine_sim2=None, cosine_sim_combined=None, method="combined", top_n=10):
    """Return the movies most similar to title, optionally restricted to one genre.

    Rows of df and rows of the similarity matrix are matched by *position*,
    so df may carry any index (for example one left over from earlier filtering).

    Args:
        df: DataFrame with 'title' and 'genres' columns; row i corresponds to
            row/column i of the similarity matrices.
        title: Exact title to look up. If several rows share it, the last one is used.
        genre: Optional genre name; when given, only movies listing it
            (case-insensitive, whole comma-separated entry) are returned.
        cosine_sim1: Similarity matrix used when method == "cos1".
        cosine_sim2: Similarity matrix used when method == "cos2".
        cosine_sim_combined: Similarity matrix used for any other method value.
        method: "cos1", "cos2", or anything else for the combined matrix.
        top_n: Maximum number of recommendations (default 10).

    Returns:
        The string "Title not found." if title is absent; otherwise a DataFrame
        of at most top_n rows, most similar first, never including the query movie.

    Raises:
        ValueError: If the matrix selected by method was not supplied.
    """
    # Positions (not index labels) of rows whose title matches exactly.
    title_positions = np.flatnonzero(
        (df['title'] == title).to_numpy(dtype=bool, na_value=False)
    )
    if title_positions.size == 0:
        return "Title not found."
    idx = int(title_positions[-1])

    if method == "cos1":
        cosine_sim = cosine_sim1
    elif method == "cos2":
        cosine_sim = cosine_sim2
    else:
        cosine_sim = cosine_sim_combined

    if cosine_sim is None:
        raise ValueError("Selected method's cosine similarity matrix is not provided.")

    # Similarity of the query movie to every movie, by row position.
    scores = np.asarray(cosine_sim[idx], dtype=float).ravel()

    # Candidate rows: everything except the query movie itself ...
    keep = np.arange(scores.shape[0]) != idx

    # ... narrowed to the requested genre when one is given.
    if genre is not None:
        wanted = genre.lower().strip()

        def _lists_genre(value):
            # Missing genres (None/NaN) are not strings and can never match.
            if not isinstance(value, str):
                return False
            return wanted in (g.lower().strip() for g in value.split(","))

        keep &= df['genres'].map(_lists_genre).to_numpy(dtype=bool)

    candidates = np.flatnonzero(keep)
    # Stable sort on the negated scores: descending similarity, ties keep row order.
    ranked = candidates[np.argsort(-scores[candidates], kind="stable")]

    return df.iloc[ranked[:top_n]]

In [4]:
## Search Functions ##

def search_genre(df, available_genres_lower, available_genres, user_genre=None):
    """Prompt until the user names a known genre, then print that genre's movies.

    Args:
        df: Movie table passed to filter_movies_by_genre; the printed table uses
            its 'title', 'score', 'vote_average' and 'vote_count' columns.
        available_genres_lower: Lower-cased genre names, index-aligned with available_genres.
        available_genres: Display-cased genre names.
        user_genre: Optional first guess (any case, surrounding whitespace ignored);
            when empty or None the user is prompted.

    Returns:
        The matched display-cased genre, or None when the genre is known but
        no movies were returned for it.
    """
    # Normalise a caller-supplied guess the same way typed input is normalised.
    query = user_genre.strip().lower() if user_genre else ""

    # Loop rather than recurse, so a long run of typos cannot hit the recursion limit.
    while True:
        while not query:
            query = input("Enter a genre: ").strip().lower()

        # Exact match
        if query in available_genres_lower:
            selected_genre = available_genres[available_genres_lower.index(query)]
            filtered_movies = filter_movies_by_genre(df, selected_genre)

            if filtered_movies.empty:
                print("\nNo movies found in this genre.")
                return None

            # NOTE(review): the header says "Ranked by Weighted Rating" but nothing here sorts;
            # presumably df arrives already ordered by 'score' - confirm.
            print(f"\nTop 10 {selected_genre} Movies (Ranked by Weighted Rating):\n")
            print(tabulate(filtered_movies[['title', 'score', 'vote_average', 'vote_count']],
                           headers="keys", tablefmt="pretty", showindex=False))
            return selected_genre

        # Partial match: suggest genres that contain the typed text
        partial_matches = [genre for genre in available_genres if query in genre.lower()]
        if partial_matches:
            print("\nDid you mean one of these? " + ", ".join(partial_matches))
            prompt = "Please re-enter genre: "
        else:
            prompt = "\nGenre not found, please try again: "

        query = input(prompt).strip().lower()

def search_titles(df, genre, cosine_sim1, cosine_sim2, cosine_sim_combined, user_title=None):
    """Prompt until the user names a known movie, then print movies similar to it.

    Args:
        df: Movie table with a 'title' column; the printed table also uses
            'score', 'vote_average' and 'vote_count'.
        genre: Optional genre restriction forwarded to get_recommendations_filtered.
        cosine_sim1: Similarity matrix forwarded to get_recommendations_filtered.
        cosine_sim2: Similarity matrix forwarded to get_recommendations_filtered.
        cosine_sim_combined: Similarity matrix forwarded to get_recommendations_filtered.
        user_title: Optional first guess (any case); when empty or None the user is prompted.

    Returns:
        None. Results are printed.
    """
    # Lower-case the titles once instead of on every comparison and retry.
    titles_lower = df['title'].str.lower()
    query = user_title.strip().lower() if user_title else ""

    # Loop rather than recurse, so a long run of typos cannot hit the recursion limit.
    while True:
        while not query:
            query = input("\nEnter a movie: ").strip().lower()

        # Exact match (case-insensitive); with duplicate titles the first spelling wins
        exact_matches = df['title'][titles_lower == query]
        if not exact_matches.empty:
            selected_title = exact_matches.iloc[0]
            filtered_recommendations = get_recommendations_filtered(df, selected_title, genre, cosine_sim1, cosine_sim2, cosine_sim_combined)

            # The lookup reports a missing title as a plain message string.
            if isinstance(filtered_recommendations, str):
                print(f"\n{filtered_recommendations}")
                return

            if genre:
                print(f"\nTop 10 {genre} Movies Similar to {selected_title}:\n")
            else:
                print(f"\nTop 10 Movies Similar to {selected_title}:\n")
            print(tabulate(filtered_recommendations[['title', 'score', 'vote_average', 'vote_count']],
                           headers="keys", tablefmt="pretty", showindex=False))
            return

        # Partial match: regex=False treats the typed text literally, so input such as
        # "(" cannot raise a regex error and "." does not match every title.
        partial_matches = df['title'][titles_lower.str.contains(query, na=False, regex=False)]
        if not partial_matches.empty:
            # Join outside the f-string: a backslash inside an f-string expression
            # is a SyntaxError before Python 3.12.
            suggestions = "\n".join(partial_matches[:5])
            print(f"\nDid you mean one of these?\n{suggestions}")
            prompt = "Please re-enter title: "
        else:
            prompt = "\nTitle not found, please try again: "

        query = input(prompt).strip().lower()

In [None]:
##  Search ##

# Genre names shown to the user, plus a lower-cased twin for case-insensitive lookups
available_genres = get_all_genres(q_movies)
available_genres_lower = [name.lower() for name in available_genres]

# Show the menu until a valid option is chosen, then run that branch once and stop.
while True:
    print("\n".join([
        "Welcome to the Movie Recommendation System",
        "Please choose an option:",
        "1. Explore by genre (Branch 1)",
        "2. Search directly by movie title (Branch 2)",
    ]))
    user_choice = input("Enter 1 or 2: ").strip()

    # Anything other than "1" or "2" shows the menu again.
    if user_choice not in ("1", "2"):
        print("Invalid choice. Please enter 1 or 2.")
        continue

    # Branch 2: go straight to the title search, with no genre restriction.
    if user_choice == "2":
        search_titles(q_movies, None, cosine_sim1, cosine_sim2, cosine_sim_combined)
        break

    # Branch 1: pick a genre first, then optionally look for similar movies inside it.
    print("\nAvailable Genres:\n" + ", ".join(available_genres) + "\n")
    selected_genre = search_genre(q_movies, available_genres_lower, available_genres)
    if selected_genre:
        print(f"\nIf you want to search for more {selected_genre} movies, we can search by movie.")
        search_titles(q_movies, selected_genre, cosine_sim1, cosine_sim2, cosine_sim_combined)
    break

Welcome to the Movie Recommendation System
Please choose an option:
1. Explore by genre (Branch 1)
2. Search directly by movie title (Branch 2)
