In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

from sklearn.preprocessing import MinMaxScaler


In [2]:
#q_movies = pd.read_parquet("../tmbd_exports/quality_movs_weighted_rating.parquet")
q_movies = pd.read_parquet("../tmdb_api/tmdb_api_cleaned/movies_cleaned_hard.parquet")

In [3]:
indices = pd.Series(q_movies.index, index = q_movies["title"])
indices.shape

(9965,)

In [4]:
# Schritt 1: Cosinus-Ähnlichkeit aus den Filmbeschreibungen (cos1)
tfidf = TfidfVectorizer(stop_words='english')
q_movies['overview'] = q_movies['overview'].fillna('')  # Leere Strings für NaN-Werte

tfidf_matrix = tfidf.fit_transform(q_movies['overview'])
cosine_sim1 = linear_kernel(tfidf_matrix, tfidf_matrix)

In [20]:
# Schritt 2: Cosinus-Ähnlichkeit aus der "metadata soup" (cos2)
# Beispiel für "metadata soup": Hier gehen wir davon aus, dass es sich um eine Kombination von verschiedenen Metadaten handelt
# Du kannst z.B. Genre, Jahr, Bewertung, etc. kombinieren.
# Hier als Dummy-Beispiel kombinieren wir nur Genre-Daten (ersetze dies durch deine tatsächlichen Daten):

q_movies_3 = q_movies[["genres", "keywords", "title", "tagline", "cast", "director"]].copy()
# q_movies_3 = q_movies[["genres", "keywords", "cast", "director"]].copy()

# NaN-Werte vor dem Join entfernen & dann kombinieren
q_movies_3['soup'] = q_movies_3[["genres", "keywords", "title", "tagline", "cast", "director"]].fillna('').astype(str).agg(' '.join, axis=1)
# q_movies_3['soup'] = q_movies_3[["genres", "keywords", "title", "tagline", "cast", "director"]].fillna('').astype(str).agg(' '.join, axis=1)

# TF-IDF-Vektorisierung
metadata_vectorizer = TfidfVectorizer(stop_words='english')
metadata_matrix = metadata_vectorizer.fit_transform(q_movies_3['soup'])

# Kosinus-Ähnlichkeit berechnen
cosine_sim2 = linear_kernel(metadata_matrix, metadata_matrix)

In [38]:
# Schritt 3: Kombinieren der beiden Matrizen (Gewichtung ist optional)
# Beispiel für Gewichtung: cos1 bekommt 0.7 Gewicht, cos2 bekommt 0.3 Gewicht

# Sicherstellen, dass beide Matrizen Werte im gleichen Bereich haben.
# scaler = MinMaxScaler()
# cosine_sim1_scaled = scaler.fit_transform(cosine_sim1)
# cosine_sim2_scaled = scaler.fit_transform(cosine_sim2)

# cosine_sim_combined = 0.3 * cosine_sim1 + 0.7 * cosine_sim2
cosine_sim_combined = 0.2 * cosine_sim1 + 0.8 * cosine_sim2

# Funktion zur Empfehlung
def get_recommendations_2(title, cosine_sim1=cosine_sim1, cosine_sim2=cosine_sim2, cosine_sim_combined=cosine_sim_combined, method="combined"):
    # Der Index des Filmes, der dem Titel entspricht

   # Get the index of the movie that matches the title
    indices = pd.Series(q_movies.index, index=q_movies['title']).to_dict()

    if title not in indices:
        return "Title not found."

    idx = indices[title]  # Sicherer Weg, um den Index zu bekommen
    
    if method == "cos1":
        cosine_sim = cosine_sim1
    elif method == "cos2":
        cosine_sim = cosine_sim2
    else:
        cosine_sim = cosine_sim_combined
    
    # Berechne die paarweisen Ähnlichkeiten
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Hole die Top-10 ähnlichen Filme
    sim_scores = sim_scores[1:21]  # Die ersten 10 ähnlichen Filme (den Film selbst ausschließen)
    
    # Holen der Film-Indizes
    movie_indices = [i[0] for i in sim_scores]
    
    # Rückgabe der entsprechenden Titel
    return q_movies['title'].iloc[movie_indices]

# Teste die Funktion für den Titel "Ex Machina" (ersetze dies durch einen Titel aus deinem Dataset)
#print(get_recommendations_2("The Dark Knight Rises", method="cos1"))  # Nur basierend auf Cosinus der Filmbeschreibungen
#print(get_recommendations_2("The Dark Knight Rises", method="cos2"))  # Nur basierend auf Cosinus der metadata soup
#print(get_recommendations_2("The Dark Knight Rises", method="combined"))  # Kombiniert

cos1 = pd.DataFrame(get_recommendations_2("The Dark Knight Rises", method="cos1"))
cos2 = pd.DataFrame(get_recommendations_2("The Dark Knight Rises", method="cos2"))
combined= pd.DataFrame(get_recommendations_2("The Dark Knight Rises", method="combined"))

display("Recommendations for The Dark Knight Rises, Cos1: ", cos1)
display("Recommendations for The Dark Knight Rises, Cos2: ", cos2)
display("Recommendations for The Dark Knight Rises, Cos2: ", combined)

'Recommendations for The Dark Knight Rises, Cos1: '

Unnamed: 0,title
2377,"Batman: The Long Halloween, Part Two"
1854,"Batman: The Long Halloween, Part One"
5,The Dark Knight
4757,Batman: Gotham by Gaslight
1882,Justice League Dark
1289,Batman: Mask of the Phantasm
2420,Batman Returns
414,The Batman
1842,Batman Beyond: Return of the Joker
9571,Batman Forever


'Recommendations for The Dark Knight Rises, Cos2: '

Unnamed: 0,title
336,Batman Begins
5,The Dark Knight
17,Inception
8596,The Punisher
1307,Batman
5545,Kick-Ass 2
7514,7500
2020,Legend
7777,Don Jon
148,Spider-Man: No Way Home


'Recommendations for The Dark Knight Rises, Cos2: '

Unnamed: 0,title
5,The Dark Knight
336,Batman Begins
1307,Batman
17,Inception
8596,The Punisher
414,The Batman
2420,Batman Returns
5545,Kick-Ass 2
1289,Batman: Mask of the Phantasm
1854,"Batman: The Long Halloween, Part One"


In [24]:
# Teste die Funktion für den Titel wie "Ex Machina" oder "Black Swan" (ersetze dies durch einen Titel aus deinem Dataset)
print(get_recommendations_2("Ex Machina", method="cos1"))  # Nur basierend auf Cosinus der Filmbeschreibungen
print(get_recommendations_2("Ex Machina", method="cos2"))  # Nur basierend auf Cosinus der metadata soup
print(get_recommendations_2("Ex Machina", method="combined"))  # Kombiniert

9658                  Welcome to the Jungle
6212                    A Cure for Wellness
4211                           Fear of Rain
7238                        The Wrong Missy
6723                              Cold Skin
9895                            Escape Room
6070                 Can You Keep a Secret?
6074                     The Boss of It All
9436                                Stealth
7827                         The Love Punch
3804     The Seventh Company Has Been Found
2005                                  M3GAN
6572                         The Internship
1171                                RoboCop
1627                        La Belle Époque
5695    The Imaginarium of Doctor Parnassus
9413                           Empire State
5413                        The Quiet Earth
2027      Charlie and the Chocolate Factory
4178                               The Hunt
Name: title, dtype: object
8623                     Mother/Android
6729                            Archive
6616         

In [25]:
# Teste die Funktion für den Titel wie "Ex Machina" oder "Black Swan" (ersetze dies durch einen Titel aus deinem Dataset)
print(get_recommendations_2("Black Swan", method="cos1"))  # Nur basierend auf Cosinus der Filmbeschreibungen
print(get_recommendations_2("Black Swan", method="cos2"))  # Nur basierend auf Cosinus der metadata soup
print(get_recommendations_2("Black Swan", method="combined"))  # Kombiniert

4675                     Barbie in the Pink Shoes
1820                                    Ballerina
3881                                Take the Lead
4648                                  Red Sparrow
782                                 The Red Shoes
4868                                 Center Stage
6176                               StreetDance 3D
2976                  What Happened, Miss Simone?
5419    The Meyerowitz Stories (New and Selected)
368                                  Nightcrawler
4699                                    Hitchcock
1119                         Synecdoche, New York
2118                                        Bambi
4505                       Something's Gotta Give
5288                                   Flashdance
8470                                         Noel
8718               House at the End of the Street
2763                                         Rise
3164        Are You There God? It's Me, Margaret.
3398                     Barbie in the Nutcracker


In [26]:
print(cosine_sim2.mean(), cosine_sim1.mean())
print(cosine_sim2_scaled.mean(), cosine_sim1_scaled.mean())
print(cosine_sim1.std(), cosine_sim2.std())

0.00993363713513403 0.006051547436765764
0.00993363713513403 0.006051547436765764
0.017191494908895356 0.01891630380551497


In [27]:
q_movies_3.head()

Unnamed: 0,genres,keywords,title,tagline,cast,director,soup
0,"Drama, Crime","prison, friendship, police brutality, corrupti...",The Shawshank Redemption,Fear can hold you prisoner. Hope can set you f...,"Morgan Freeman, Tim Robbins, Bob Gunton, Willi...",Frank Darabont,"Drama, Crime prison, friendship, police brutal..."
1,"Drama, Crime","based on novel or book, loss of loved one, lov...",The Godfather,An offer you can't refuse.,"Marlon Brando, Al Pacino, James Caan, Robert D...",Francis Ford Coppola,"Drama, Crime based on novel or book, loss of l..."
2,"Drama, Crime","new year's eve, new york city, based on novel ...",The Godfather Part II,The rise and fall of the Corleone empire.,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",Francis Ford Coppola,"Drama, Crime new year's eve, new york city, ba..."
3,"Drama, History, War","factory, hero, nazi, concentration camp, ss (n...",Schindler's List,"Whoever saves one life, saves the world entire.","Liam Neeson, Ben Kingsley, Ralph Fiennes, Caro...",Steven Spielberg,"Drama, History, War factory, hero, nazi, conce..."
4,"Adventure, Drama, Science Fiction","rescue, future, spacecraft, race against time,...",Interstellar,Mankind was born on Earth. It was never meant ...,"Matthew McConaughey, Anne Hathaway, Michael Ca...",Christopher Nolan,"Adventure, Drama, Science Fiction rescue, futu..."
