In [1]:
# hybrid_recommender.ipynb (can also be run as a .py script)

import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
import requests
from dotenv import load_dotenv
import os
from IPython.display import Image, display

In [2]:
# --- Load Environment Variables ---
# Read a local .env file (if one exists) into the process environment, then
# look up the TMDb API key that get_poster_path() sends with every request.
load_dotenv()
api_key = os.environ.get("TMDB_API_KEY")  # None when the variable is not set

In [3]:
# --- Load Data ---
# All raw CSVs are read with paths relative to the notebook's working directory.
movies = pd.read_csv("../data/raw/movies.csv")  # used below via movieId, title, genres
tags = pd.read_csv("../data/raw/tags.csv")  # used below via movieId, tag
scores = pd.read_csv("../data/raw/genome-scores.csv")  # pivoted below on movieId, tagId, relevance
genome_tags = pd.read_csv("../data/raw/genome-tags.csv")  # not referenced by any visible cell
links = pd.read_csv("../data/raw/links.csv")  # movieId -> tmdbId lookup used to fetch posters
ratings = pd.read_csv("../data/raw/ratings.csv")  # userId/movieId rows used for per-user recommendations

In [4]:
# --- Preprocessing ---
# Remove the columns this cell derives before dropping incomplete rows, so the
# cell can be re-run safely. Without this, a second run would (a) drop every
# movie whose merged "tag" is NaN and (b) make the merge emit "tag_x"/"tag_y"
# instead of "tag". On the first run the drop is a no-op.
movies = movies.drop(columns=["tag", "combined"], errors="ignore").dropna()

# Combine genres and tags to form a movie content string
tags = tags.dropna(subset=["tag"])
# One row per movieId holding all of that movie's tags joined by spaces.
tags_combined = (
    tags.groupby("movieId")["tag"]
    .apply(lambda t: " ".join(str(x) for x in t))
    .reset_index()
)

# Left merge keeps movies that have no tags; their "tag" is NaN and is treated
# as an empty string when building the content text.
movies = pd.merge(movies, tags_combined, on="movieId", how="left")
movies["combined"] = movies["genres"].str.replace("|", " ", regex=False) + " " + movies["tag"].fillna("")

In [5]:
# --- TF-IDF Movie Embedding ---
# Learn a TF-IDF vocabulary from the per-movie "combined" text (genres + tags),
# limited to 300 terms via max_features, and encode every movie as one row.
# Row i of content_embeddings corresponds to row i of `movies`.
vectorizer = TfidfVectorizer(max_features=300)
content_embeddings = vectorizer.fit_transform(movies["combined"])

In [6]:
# --- Genome ("collab") Embedding: tag-relevance vector per movie from genome-scores ---
# One row per movieId and one column per tagId; (movie, tag) pairs that are
# missing from the scores table become 0.
collab_embeddings = scores.pivot(index="movieId", columns="tagId", values="relevance").fillna(0)

# Match movies present in both sets
# A single vectorised membership test replaces a per-row `in ndarray` check.
in_both = movies["movieId"].isin(collab_embeddings.index).to_numpy()
common_ids = movies[in_both].copy()

# Slice the TF-IDF matrix only while it still has one row per movie, so that
# re-running this cell does not index an already-filtered matrix a second time.
if content_embeddings.shape[0] == len(movies):
    content_embeddings = content_embeddings[np.flatnonzero(in_both)]
# Reorder/filter the genome rows to follow common_ids row order.
collab_embeddings = collab_embeddings.loc[common_ids["movieId"]]

assert content_embeddings.shape[0] == len(common_ids) == len(collab_embeddings), (
    "content and genome embeddings are out of sync with common_ids; "
    "re-run the TF-IDF cell before this one"
)

# Normalize and combine
scaler = MinMaxScaler()
collab_scaled = scaler.fit_transform(collab_embeddings)
# The same scaler object is refit here, so after this cell `scaler` holds the
# per-feature ranges of the content features only.
content_scaled = scaler.fit_transform(content_embeddings.toarray())

# Row i of hybrid_matrix describes the movie in row i (positional) of common_ids.
hybrid_matrix = np.hstack([collab_scaled, content_scaled])

In [7]:
# --- Fit KNN Model ---
# Fit a cosine-distance nearest-neighbour index over the hybrid vectors.
# n_neighbors=10 is only the default for kneighbors(); both recommendation
# functions in this notebook pass their own n_neighbors explicitly.
knn = NearestNeighbors(n_neighbors=10, metric="cosine")
knn.fit(hybrid_matrix)

In [8]:
# --- Recommendation Function ---
def get_movie_recommendations(movie_title, n_recs=10):
    """Print the movies most similar to a given title and show their posters.

    The first movie in ``common_ids`` whose title contains ``movie_title``
    (case-insensitive, literal substring) is used as the query. Its row in
    ``hybrid_matrix`` is looked up in the fitted ``knn`` model and the nearest
    other movies are printed; a poster is displayed when TMDb provides one.

    Args:
        movie_title (str): text to search for inside the movie titles.
        n_recs (int, optional): number of recommendations. Defaults to 10.

    Returns:
        None. Results are printed/displayed. If no title matches, a message
        is printed and nothing else happens.
    """
    title_hits = common_ids["title"].str.contains(
        movie_title, case=False, regex=False, na=False
    ).to_numpy()
    hit_positions = np.flatnonzero(title_hits)
    if hit_positions.size == 0:
        print(f"No movie found matching '{movie_title}'.")
        return

    # Use the positional row, not the index label: hybrid_matrix rows follow
    # the row order of common_ids, while common_ids keeps the index labels of
    # the unfiltered movies frame it was sliced from.
    query_pos = int(hit_positions[0])

    # One extra neighbour because the query movie normally matches itself;
    # capped because kneighbors rejects more neighbours than fitted samples.
    n_query = min(n_recs + 1, hybrid_matrix.shape[0])
    _, indices = knn.kneighbors([hybrid_matrix[query_pos]], n_neighbors=n_query)

    # Drop the query itself wherever it appears instead of assuming it is first.
    neighbour_positions = [pos for pos in indices[0] if pos != query_pos][:n_recs]
    recommendations = common_ids.iloc[neighbour_positions]

    for _, row in recommendations.iterrows():
        print(f"🎬 {row['title']} ({row['genres']})")
        tmdb_ids = links.loc[links["movieId"] == row["movieId"], "tmdbId"]
        if tmdb_ids.empty or pd.isna(tmdb_ids.iloc[0]):
            # No TMDb mapping for this movie, so there is nothing to look up.
            continue
        try:
            poster_path = get_poster_path(int(tmdb_ids.iloc[0]))
        except requests.RequestException:
            # A network failure should not abort the whole listing.
            poster_path = None
        if not poster_path:
            print("[Poster not available]")
            continue
        poster_url = f"https://image.tmdb.org/t/p/w342/{poster_path}"
        display(Image(url=poster_url))

In [9]:
# --- Get Poster from TMDb ---
def get_poster_path(tmdb_id, timeout=10):
    """Return the poster path TMDb reports for a movie, or None if unavailable.

    Args:
        tmdb_id (int): id of the movie in TMDb.
        timeout (float, optional): seconds to wait for the TMDb API before
            giving up. Defaults to 10.

    Returns:
        str | None: the ``poster_path`` field of the TMDb movie record. None
        when no API key is configured, the request fails or times out, the
        response status is not 200, the body is not valid JSON, or the record
        has no poster.
    """
    if not api_key:
        # Without a key the request cannot succeed; skip the network call.
        return None

    url = f"https://api.themoviedb.org/3/movie/{tmdb_id}"
    try:
        # Pass the key via params so it is URL-encoded rather than interpolated.
        # The timeout stops a stalled connection from hanging the kernel.
        response = requests.get(url, params={"api_key": api_key}, timeout=timeout)
    except requests.RequestException:
        return None

    if response.status_code != 200:
        return None

    try:
        data = response.json()
    except ValueError:
        # The JSON decode errors raised by requests subclass ValueError.
        return None
    return data.get("poster_path", None)

In [10]:
def get_user_vector(user_id, ratings_df, movie_embeddings_df):
    """Build a profile vector for one user from the movies they have rated.

    The profile is the plain (unweighted) column-wise mean of the embedding
    rows of every rated movie that exists in ``movie_embeddings_df``; the
    rating values themselves are not used.

    Args:
        user_id: value matched against ``ratings_df["userId"]``.
        ratings_df (pd.DataFrame): needs ``userId`` and ``movieId`` columns.
        movie_embeddings_df (pd.DataFrame): one embedding row per movie,
            indexed by movieId.

    Returns:
        np.ndarray | None: array of shape ``(1, n_features)``, or None when
        the user has no rated movie with an embedding.
    """
    belongs_to_user = ratings_df["userId"] == user_id
    has_embedding = ratings_df["movieId"].isin(movie_embeddings_df.index)
    rated_ids = ratings_df.loc[belongs_to_user & has_embedding, "movieId"]

    if len(rated_ids) == 0:
        return None

    profile = movie_embeddings_df.loc[rated_ids].mean(axis=0)
    # Shape (1, n_features) so the result can be passed straight to kneighbors.
    return profile.to_numpy().reshape(1, -1)

In [11]:
def recommend_for_user(user_id, ratings_df, movie_embeddings_df, movies_df, knn_model, links_df, n=10):
    """Print up to ``n`` unseen movies closest to a user's mean embedding.

    The user's profile vector comes from ``get_user_vector``. Its nearest
    neighbours are queried from ``knn_model``, movies the user has already
    rated are skipped, and each remaining title is printed with its poster
    when TMDb provides one.

    Args:
        user_id: id of the user (matched against ``ratings_df["userId"]``).
        ratings_df (pd.DataFrame): ratings with ``userId`` and ``movieId``.
        movie_embeddings_df (pd.DataFrame): embeddings indexed by movieId.
            Neighbour positions returned by ``knn_model`` are mapped through
            this index, so its rows are assumed to be in the order the model
            was fitted on.
        movies_df (pd.DataFrame): metadata with ``movieId``, ``title``, ``genres``.
        knn_model: fitted model exposing ``kneighbors``.
        links_df (pd.DataFrame): mapping with ``movieId`` and ``tmdbId``.
        n (int, optional): number of recommendations. Defaults to 10.

    Returns:
        None. Results are printed/displayed.
    """
    seen = set(ratings_df.loc[ratings_df["userId"] == user_id, "movieId"])
    user_vec = get_user_vector(user_id, ratings_df, movie_embeddings_df)

    if user_vec is None:
        print("Nicht genug Daten für diesen Nutzer.")
        return

    # Ask for enough neighbours that n unseen ones remain even if every
    # already-seen movie ranks ahead of them (a fixed 2*n can come up short).
    # Cap at the catalogue size: kneighbors rejects more neighbours than
    # fitted samples.
    seen_in_catalogue = int(movie_embeddings_df.index.isin(list(seen)).sum())
    n_candidates = min(n + seen_in_catalogue, len(movie_embeddings_df))
    if n_candidates <= 0:
        return

    _, indices = knn_model.kneighbors(user_vec, n_neighbors=n_candidates)

    recommended = []
    for idx in indices[0]:
        movie_id = movie_embeddings_df.index[idx]
        if movie_id in seen:
            continue
        recommended.append(movie_id)
        if len(recommended) >= n:
            break

    for mid in recommended:
        meta = movies_df[movies_df["movieId"] == mid]
        if meta.empty:
            # Embedding without metadata: nothing to print for this id.
            continue
        row = meta.iloc[0]
        print(f"🎬 {row['title']} ({row['genres']})")

        tmdb_row = links_df[links_df["movieId"] == mid]
        if tmdb_row.empty:
            continue
        tmdb_id_val = tmdb_row["tmdbId"].values[0]
        if pd.isna(tmdb_id_val):
            print("[TMDb ID fehlt – kein Poster verf\u00fcgbar]")
            continue

        try:
            poster = get_poster_path(int(tmdb_id_val))
        except requests.RequestException:
            # A network failure should not abort the remaining recommendations.
            poster = None
        if poster:
            display(Image(url=f"https://image.tmdb.org/t/p/w342/{poster}"))

In [12]:
# Recommendations for a specific movie
# The title is matched as a case-insensitive substring of the movie titles in
# common_ids, and the first matching row is used as the query.
get_movie_recommendations("Toy Story")

🎬 Monsters, Inc. (2001) (Adventure|Animation|Children|Comedy|Fantasy)


🎬 Toy Story 2 (1999) (Adventure|Animation|Children|Comedy|Fantasy)


🎬 Bug's Life, A (1998) (Adventure|Animation|Children|Comedy)


🎬 Finding Nemo (2003) (Adventure|Animation|Children|Comedy)


🎬 Toy Story 3 (2010) (Adventure|Animation|Children|Comedy|Fantasy|IMAX)


🎬 Ratatouille (2007) (Animation|Children|Drama)


🎬 Ice Age (2002) (Adventure|Animation|Children|Comedy)


🎬 Up (2009) (Adventure|Animation|Children|Drama)


🎬 Shrek (2001) (Adventure|Animation|Children|Comedy|Fantasy|Romance)


🎬 Antz (1998) (Adventure|Animation|Children|Comedy|Fantasy)


In [13]:
# recommendations for a specific user
# The embeddings frame is built on the fly: the rows of hybrid_matrix (the
# matrix knn was fitted on) labelled with the movieId of the matching
# common_ids row, so neighbour positions can be mapped back to movie ids.
recommend_for_user(
    user_id=3000,
    ratings_df=ratings,
    movie_embeddings_df=pd.DataFrame(hybrid_matrix, index=common_ids["movieId"]),
    movies_df=common_ids,
    knn_model=knn,
    links_df=links,
    n=10
)


🎬 Stand by Me (1986) (Adventure|Drama)


🎬 Life Itself (2014) (Documentary)


🎬 Lilies of the Field (1963) (Drama)


🎬 Unfinished Life, An (2005) (Drama)


🎬 Quiet Man, The (1952) (Drama|Romance)


🎬 Green Mile, The (1999) (Crime|Drama)


🎬 Forrest Gump (1994) (Comedy|Drama|Romance|War)


🎬 Silverado (1985) (Action|Western)


🎬 Christmas Story, A (1983) (Children|Comedy)


🎬 Magnificent Seven, The (1960) (Adventure|Western)
