In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

from sklearn.preprocessing import MinMaxScaler


In [4]:
# Load the movie table from the parquet export. Later cells read its
# title, overview, genres, keywords, tagline, cast and director columns.
q_movies = pd.read_parquet(
    path="../tmbd_exports/quality_movs_weighted_rating.parquet"
)

In [5]:
# Lookup series: keyed by movie title, holding the matching q_movies index label.
indices = pd.Series(data=q_movies.index, index=q_movies["title"])
indices.shape

(607,)

In [7]:
# Step 1: cosine similarity between the movie descriptions (cos1).
# Missing overviews become empty strings so the vectorizer only receives text.
q_movies["overview"] = q_movies["overview"].fillna("")

# TF-IDF rows are L2-normalised by default, so the plain dot product
# (linear kernel) of two rows is their cosine similarity.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(q_movies["overview"])
cosine_sim1 = linear_kernel(tfidf_matrix, tfidf_matrix)

In [21]:
# Step 2: cosine similarity from the "metadata soup" (cos2).
# The soup is a single text per movie assembled from several metadata columns.
# This cell works on a copy holding genres, keywords, title, tagline, cast and
# director, so q_movies itself does not receive the extra 'soup' column.
q_movies_3 = q_movies.loc[
    :, ["genres", "keywords", "title", "tagline", "cast", "director"]
].copy()

def create_soup(x):
    """Build the "metadata soup" text for a single movie row.

    Args:
        x: Row-like mapping (for example the ``pd.Series`` passed by
            ``DataFrame.apply(..., axis=1)``) with the keys ``genres``,
            ``keywords``, ``title``, ``tagline``, ``cast`` and ``director``.
            Each value may be a string, a list-like of items, or missing
            (``None`` / NaN).

    Returns:
        str: The six fields, in the order listed above, separated by single
        spaces. A missing field contributes an empty string.
    """
    def _as_text(value):
        # Strings are kept whole: ' '.join() on a str would insert a space
        # between every character, producing single-character tokens that
        # TfidfVectorizer's default token pattern discards.
        if isinstance(value, str):
            return value
        if value is None:
            return ''
        # List-like fields (e.g. a list of cast names) are flattened to text.
        if hasattr(value, '__iter__'):
            return ' '.join(str(item) for item in value)
        # Remaining scalars: NaN / pd.NA count as missing, anything else is stringified.
        return '' if pd.isna(value) else str(value)

    # The row itself is cleaned here; the global q_movies frame is not touched.
    fields = ('genres', 'keywords', 'title', 'tagline', 'cast', 'director')
    return ' '.join(_as_text(x[field]) for field in fields)

# Build one soup string per row, vectorise it with TF-IDF and take the
# pairwise dot products of the resulting rows as the metadata similarity.
q_movies_3["soup"] = q_movies_3.apply(create_soup, axis="columns")

metadata_vectorizer = TfidfVectorizer(stop_words="english")
metadata_matrix = metadata_vectorizer.fit_transform(q_movies_3["soup"])
cosine_sim2 = linear_kernel(metadata_matrix, metadata_matrix)

In [41]:
# Step 2: cosine similarity from the "metadata soup" (cos2).
# The soup joins genres, keywords, title, tagline, cast and director of each
# movie into one space-separated text. Missing values are replaced by empty
# strings and every field is cast to str before joining.
q_movies_3 = q_movies[
    ["genres", "keywords", "title", "tagline", "cast", "director"]
].assign(
    # Inside assign the frame holds exactly the six selected columns,
    # so the whole row is joined.
    soup=lambda frame: frame.fillna("").astype(str).agg(" ".join, axis=1)
)

# TF-IDF vectorisation of the soup texts.
metadata_vectorizer = TfidfVectorizer(stop_words="english")
metadata_matrix = metadata_vectorizer.fit_transform(q_movies_3["soup"])

# Pairwise similarity between all soup vectors.
cosine_sim2 = linear_kernel(metadata_matrix, metadata_matrix)

In [42]:
# Step 3: combine the two similarity matrices with a weighted sum.
# Weights of the overview similarity (cos1) and the metadata similarity (cos2).
OVERVIEW_WEIGHT = 0.8
METADATA_WEIGHT = 0.2

# Bring both matrices onto the same [0, 1] range before mixing them.
# MinMaxScaler scales every column independently; applied to the square
# matrix directly it would use a different factor per column, which breaks the
# symmetry of the similarities and distorts the ranking inside a row.
# Flattening to a single column makes it use one global min/max per matrix.
scaler = MinMaxScaler()
cosine_sim1_scaled = scaler.fit_transform(
    cosine_sim1.reshape(-1, 1)
).reshape(cosine_sim1.shape)
cosine_sim2_scaled = scaler.fit_transform(
    cosine_sim2.reshape(-1, 1)
).reshape(cosine_sim2.shape)

cosine_sim_combined = (
    OVERVIEW_WEIGHT * cosine_sim1_scaled + METADATA_WEIGHT * cosine_sim2_scaled
)

# Funktion zur Empfehlung
def get_recommendations_2(title, cosine_sim1=cosine_sim1, cosine_sim2=cosine_sim2, cosine_sim_combined=cosine_sim_combined, method="combined", top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Args:
        title: Exact movie title as stored in ``q_movies['title']``. If a title
            occurs more than once, the last matching row is used.
        cosine_sim1: Similarity matrix built from the overviews.
        cosine_sim2: Similarity matrix built from the metadata soup.
        cosine_sim_combined: Weighted combination of both matrices.
        method: ``"cos1"`` or ``"cos2"`` selects the respective matrix; any
            other value selects ``cosine_sim_combined``.
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        A ``pd.Series`` of titles ordered from most to least similar, or the
        string ``"Title not found."`` when ``title`` is not in ``q_movies``.

    Note:
        The default matrices are bound when this function is defined. Re-run
        the definition after recomputing the matrices, or pass them explicitly.
    """
    # Map each title to its row *position*. The similarity matrices and .iloc
    # are positional, so index labels would only work for a default 0..n-1 index.
    positions = {name: pos for pos, name in enumerate(q_movies['title'])}

    if title not in positions:
        return "Title not found."

    idx = positions[title]

    if method == "cos1":
        cosine_sim = cosine_sim1
    elif method == "cos2":
        cosine_sim = cosine_sim2
    else:
        cosine_sim = cosine_sim_combined

    # Rank every other movie by its similarity to the requested one. The movie
    # itself is excluded by position rather than by assuming it sorts first,
    # which does not hold for ties or for an all-zero similarity row.
    row_scores = cosine_sim[idx]
    candidates = (pos for pos in range(len(row_scores)) if pos != idx)
    ranked = sorted(candidates, key=lambda pos: row_scores[pos], reverse=True)

    movie_indices = ranked[:max(0, top_n)]

    return q_movies['title'].iloc[movie_indices]

# Try the recommender on "The Dark Knight Rises" with each similarity source:
#   cos1     - overview text only
#   cos2     - metadata soup only
#   combined - weighted mix of both
for sim_method in ("cos1", "cos2", "combined"):
    print(get_recommendations_2("The Dark Knight Rises", method=sim_method))

2                            The Dark Knight
291                                   Batman
398                           Batman Returns
74                             Batman Begins
583       Batman v Superman: Dawn of Justice
139                          American Sniper
558             Teenage Mutant Ninja Turtles
372                         Zero Dark Thirty
390    The Mortal Instruments: City of Bones
181                               District 9
Name: title, dtype: object
2      The Dark Knight
74       Batman Begins
36        The Prestige
398     Batman Returns
7         Interstellar
291             Batman
5            Inception
463         Kick-Ass 2
198           Kick-Ass
467           Child 44
Name: title, dtype: object
2                         The Dark Knight
398                        Batman Returns
291                                Batman
74                          Batman Begins
583    Batman v Superman: Dawn of Justice
372                      Zero Dark Thirty
558          T

In [43]:
# Try the recommender on "The Godfather" with each similarity source:
#   cos1     - overview text only
#   cos2     - metadata soup only
#   combined - weighted mix of both
for sim_method in ("cos1", "cos2", "combined"):
    print(get_recommendations_2("The Godfather", method=sim_method))

24         The Godfather: Part II
537                       The Boy
164                     Furious 7
502                      The Rite
293             The Bélier Family
572                      Hercules
45             The Usual Suspects
447                           War
152    Back to the Future Part II
291                        Batman
Name: title, dtype: object
24                The Godfather: Part II
109                       Apocalypse Now
62                              Scarface
415                     Ocean's Thirteen
300                        Love Actually
433                               Wanted
273                            True Grit
423    Cloudy with a Chance of Meatballs
104                  Requiem for a Dream
210                   The Age of Adaline
Name: title, dtype: object
24         The Godfather: Part II
537                       The Boy
164                     Furious 7
45             The Usual Suspects
109                Apocalypse Now
447                       

In [33]:
# Summary statistics of the similarity matrices.
# Lines 1 and 2 print the mean as (cos2, cos1): raw first, then scaled.
# Line 3 prints the raw standard deviation as (cos1, cos2).
print(np.mean(cosine_sim2), np.mean(cosine_sim1))
print(np.mean(cosine_sim2_scaled), np.mean(cosine_sim1_scaled))
print(np.std(cosine_sim1), np.std(cosine_sim2))

0.006230559396650522 0.009874253013023849
0.006230559396650524 0.009874253013023849
0.043820061123089116 0.04909710815109426


In [37]:
# Preview the first five rows of the metadata frame, including the soup column.
q_movies_3.head(n=5)

Unnamed: 0,genres,keywords,title,tagline,cast,director,soup
0,"Drama, Crime","prison, corruption, police brutality, prison c...",The Shawshank Redemption,Fear can hold you prisoner. Hope can set you f...,"Tim Robbins, Morgan Freeman, Bob Gunton, Clanc...",Frank Darabont,"Drama, Crime prison, corruption, police brutal..."
1,Drama,"support group, dual identity, nihilism, rage a...",Fight Club,Mischief. Mayhem. Soap.,"Edward Norton, Brad Pitt, Meat Loaf, Jared Let...",David Fincher,"Drama support group, dual identity, nihilism, ..."
2,"Drama, Action, Crime, Thriller","dc comics, crime fighter, secret identity, sca...",The Dark Knight,Why So Serious?,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",Christopher Nolan,"Drama, Action, Crime, Thriller dc comics, crim..."
3,"Thriller, Crime","transporter, brothel, drug dealer, boxer, massage",Pulp Fiction,Just because you are a character doesn't mean ...,"John Travolta, Samuel L. Jackson, Uma Thurman,...",Quentin Tarantino,"Thriller, Crime transporter, brothel, drug dea..."
4,"Drama, Crime","italy, love at first sight, loss of father, pa...",The Godfather,An offer you can't refuse.,"Marlon Brando, Al Pacino, James Caan, Richard ...",Francis Ford Coppola,"Drama, Crime italy, love at first sight, loss ..."
