In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

from sklearn.preprocessing import MinMaxScaler


In [2]:
#q_movies = pd.read_parquet("../tmbd_exports/quality_movs_weighted_rating.parquet")
q_movies = pd.read_parquet("../tmdb_api/tmdb_api_cleaned/movies_cleaned_hard.parquet")

In [3]:
indices = pd.Series(q_movies.index, index = q_movies["title"])
indices.shape

(10797,)

In [4]:
# Schritt 1: Cosinus-Ähnlichkeit aus den Filmbeschreibungen (cos1)
tfidf = TfidfVectorizer(stop_words='english')
q_movies['overview'] = q_movies['overview'].fillna('')  # Leere Strings für NaN-Werte

tfidf_matrix = tfidf.fit_transform(q_movies['overview'])
cosine_sim1 = linear_kernel(tfidf_matrix, tfidf_matrix)

In [15]:
# Schritt 2: Cosinus-Ähnlichkeit aus der "metadata soup" (cos2)
# Beispiel für "metadata soup": Hier gehen wir davon aus, dass es sich um eine Kombination von verschiedenen Metadaten handelt
# Du kannst z.B. Genre, Jahr, Bewertung, etc. kombinieren.
# Hier als Dummy-Beispiel kombinieren wir nur Genre-Daten (ersetze dies durch deine tatsächlichen Daten):

q_movies_3 = q_movies[["genres", "keywords", "title", "tagline", "cast", "director"]].copy()

# NaN-Werte vor dem Join entfernen & dann kombinieren
#q_movies_3['soup'] = q_movies_3[['genres', 'keywords', 'cast', 'director']].fillna('').astype(str).agg(' '.join, axis=1)
q_movies_3['soup'] = q_movies_3[['genres', 'keywords', 'title', 'tagline', 'cast', 'director']].fillna('').astype(str).agg(' '.join, axis=1)

# TF-IDF-Vektorisierung
metadata_vectorizer = TfidfVectorizer(stop_words='english')
metadata_matrix = metadata_vectorizer.fit_transform(q_movies_3['soup'])

# Kosinus-Ähnlichkeit berechnen
cosine_sim2 = linear_kernel(metadata_matrix, metadata_matrix)

In [18]:
# Schritt 3: Kombinieren der beiden Matrizen (Gewichtung ist optional)
# Beispiel für Gewichtung: cos1 bekommt 0.7 Gewicht, cos2 bekommt 0.3 Gewicht

# Sicherstellen, dass beide Matrizen Werte im gleichen Bereich haben.
scaler = MinMaxScaler()
cosine_sim1_scaled = scaler.fit_transform(cosine_sim1)
cosine_sim2_scaled = scaler.fit_transform(cosine_sim2)

# cosine_sim_combined = 0.3 * cosine_sim1 + 0.7 * cosine_sim2
cosine_sim_combined = 0 * cosine_sim1_scaled + 1 * cosine_sim2_scaled

# Funktion zur Empfehlung
def get_recommendations_2(title, cosine_sim1=cosine_sim1, cosine_sim2=cosine_sim2, cosine_sim_combined=cosine_sim_combined, method="combined"):
    # Der Index des Filmes, der dem Titel entspricht

   # Get the index of the movie that matches the title
    indices = pd.Series(q_movies.index, index=q_movies['title']).to_dict()

    if title not in indices:
        return "Title not found."

    idx = indices[title]  # Sicherer Weg, um den Index zu bekommen
    
    if method == "cos1":
        cosine_sim = cosine_sim1
    elif method == "cos2":
        cosine_sim = cosine_sim2
    else:
        cosine_sim = cosine_sim_combined
    
    # Berechne die paarweisen Ähnlichkeiten
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Hole die Top-10 ähnlichen Filme
    sim_scores = sim_scores[1:11]  # Die ersten 10 ähnlichen Filme (den Film selbst ausschließen)
    
    # Holen der Film-Indizes
    movie_indices = [i[0] for i in sim_scores]
    
    # Rückgabe der entsprechenden Titel
    return q_movies['title'].iloc[movie_indices]

# Teste die Funktion für den Titel "Ex Machina" (ersetze dies durch einen Titel aus deinem Dataset)
print(get_recommendations_2("The Dark Knight Rises", method="cos1"))  # Nur basierend auf Cosinus der Filmbeschreibungen
print(get_recommendations_2("The Dark Knight Rises", method="cos2"))  # Nur basierend auf Cosinus der metadata soup
print(get_recommendations_2("The Dark Knight Rises", method="combined"))  # Kombiniert

1977        It's Kind of a Funny Story
5161                      The Paperboy
10905    Haunted Hospital: Heilstatten
11042                      The Kingdom
9597                          Catch-22
2484                          Papillon
5411                       He Got Game
10262                 Love and Bullets
8464               The Lazarus Project
6954                           Win Win
Name: title, dtype: object
5478     Jim & Andy: The Great Beyond
2642                  Man on the Moon
6007          Gonjiam: Haunted Asylum
6914      I'm a Cyborg, But That's OK
3095                 Grave Encounters
10561                  Shock Corridor
4509               Grave Encounters 2
4706            House on Haunted Hill
9807             The Mad Women's Ball
3212               Stonehearst Asylum
Name: title, dtype: object
5478     Jim & Andy: The Great Beyond
2642                  Man on the Moon
6007          Gonjiam: Haunted Asylum
6914      I'm a Cyborg, But That's OK
3095                 Gra

In [8]:
# Teste die Funktion für den Titel "Ex Machina" (ersetze dies durch einen Titel aus deinem Dataset)
print(get_recommendations_2("Black Swan", method="cos1"))  # Nur basierend auf Cosinus der Filmbeschreibungen
print(get_recommendations_2("Black Swan", method="cos2"))  # Nur basierend auf Cosinus der metadata soup
print(get_recommendations_2("Black Swan", method="combined"))  # Kombiniert

1952                           Babylon
5916                   Boris: The Film
1487                        The Artist
1891                  Sunset Boulevard
5615              She's Funny That Way
8772            Berberian Sound Studio
2969    Jay and Silent Bob Strike Back
6775         Jay and Silent Bob Reboot
7717          Downton Abbey: A New Era
155                      A Quiet Place
Name: title, dtype: object
1487             The Artist
1952                Babylon
11004           On the Town
1235          Hail, Caesar!
9773           Silent Movie
1891       Sunset Boulevard
2999          All About Eve
4167              Entourage
7060               Hoosiers
2868     Cannibal Holocaust
Name: title, dtype: object
1952                           Babylon
1487                        The Artist
5916                   Boris: The Film
1891                  Sunset Boulevard
2969    Jay and Silent Bob Strike Back
6775         Jay and Silent Bob Reboot
5615              She's Funny That Way
877

In [9]:
print(cosine_sim2.mean(), cosine_sim1.mean())
print(cosine_sim2_scaled.mean(), cosine_sim1_scaled.mean())
print(cosine_sim1.std(), cosine_sim2.std())

0.009672082973746112 0.0059963789860346985
0.009672082973746114 0.0059963789860347
0.016936937909565672 0.018516669082339152


In [11]:
q_movies_3.head()

Unnamed: 0,genres,keywords,title,tagline,cast,director,soup
14,"Drama, Crime","prison, friendship, police brutality, corrupti...",The Shawshank Redemption,Fear can hold you prisoner. Hope can set you f...,"Morgan Freeman, Tim Robbins, Bob Gunton, Willi...",Frank Darabont,"Drama, Crime prison, friendship, police brutal..."
53,"Drama, Crime","based on novel or book, loss of loved one, lov...",The Godfather,An offer you can't refuse.,"Marlon Brando, Al Pacino, James Caan, Robert D...",Francis Ford Coppola,"Drama, Crime based on novel or book, loss of l..."
119,"Drama, History, War","factory, hero, nazi, concentration camp, ss (n...",Schindler's List,"Whoever saves one life, saves the world entire.","Liam Neeson, Ben Kingsley, Ralph Fiennes, Caro...",Steven Spielberg,"Drama, History, War factory, hero, nazi, conce..."
215,"Drama, Crime","new year's eve, new york city, based on novel ...",The Godfather Part II,The rise and fall of the Corleone empire.,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",Francis Ford Coppola,"Drama, Crime new year's eve, new york city, ba..."
107,"Animation, Family, Fantasy","witch, parent child relationship, darkness, ba...",Spirited Away,On the other side of the tunnel was a mysterio...,"Rumi Hiiragi, Miyu Irino, Mari Natsuki, Takash...",Hayao Miyazaki,"Animation, Family, Fantasy witch, parent child..."
