In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import os
import pprint
import matplotlib.pyplot as plt
import seaborn as sns
import random
from collections import defaultdict

In [3]:
# Reload the filtered data sets into their variables.
output_dir = Path("Data/filtered_data")

# Resolve the location of every cleaned CSV file first ...
csv_path_links = output_dir.joinpath("links_cleaned.csv")
csv_path_movies = output_dir.joinpath("movies_cleaned.csv")
csv_path_ratings = output_dir.joinpath("ratings_cleaned.csv")
csv_path_tags = output_dir.joinpath("tags_cleaned.csv")

# ... then parse them, in the same order as the paths above.
links = pd.read_csv(filepath_or_buffer=csv_path_links)
movies = pd.read_csv(filepath_or_buffer=csv_path_movies)
ratings = pd.read_csv(filepath_or_buffer=csv_path_ratings)
tags = pd.read_csv(filepath_or_buffer=csv_path_tags)

In [4]:
# Train/test split: the full test frame plus its masked counterpart.
output_dir = Path("Data/Train_Test_Validate")

csv_path_train = output_dir.joinpath("train_df.csv")
csv_path_test = output_dir.joinpath("test_df.csv")
csv_path_test_masked = output_dir.joinpath("test_df_masked.csv")

training = pd.read_csv(filepath_or_buffer=csv_path_train)
test = pd.read_csv(filepath_or_buffer=csv_path_test)
test_masked = pd.read_csv(filepath_or_buffer=csv_path_test_masked)

# Aufgabe 2

In [11]:
def calculate_biases(df, beta_u=20, beta_i=20, iteration=10):
    """Estimate the regularised baseline biases of the model ``mu + b_u + b_i``.

    The user and item biases are refined alternately: every user bias is
    recomputed from the current item biases, then every item bias from the
    freshly updated user biases. The sums per user / per item are computed
    with ``numpy.bincount`` instead of Python loops over groupby groups, which
    gives the same numbers but stays usable on the full ratings table.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings with the columns "userId", "movieId" and "rating".
        The id columns must not contain missing values.
    beta_u : float
        Regularisation added to the rating count of a user; users with few
        ratings are shrunk more strongly towards a bias of 0.
    beta_i : float
        Regularisation added to the rating count of an item.
    iteration : int
        Number of alternating update rounds.

    Returns
    -------
    tuple
        ``(mu, b_u, b_i)``: the global mean rating, a dict userId -> bias and
        a dict movieId -> bias (both ordered by ascending id).
    """
    # Global average rating
    mu = df["rating"].mean()

    # Dense 0..n-1 positions for every user / item so biases can live in arrays.
    user_pos, user_ids = pd.factorize(df["userId"], sort=True)
    item_pos, item_ids = pd.factorize(df["movieId"], sort=True)
    n_users, n_items = len(user_ids), len(item_ids)

    ratings = df["rating"].to_numpy(dtype=float)
    # A missing rating contributes nothing to the sums (like Series.sum skipping
    # NaN) but still counts in the denominators, as len(group) did before.
    observed = ~np.isnan(ratings)
    residual = np.where(observed, ratings - mu, 0.0)

    user_counts = np.bincount(user_pos, minlength=n_users)
    item_counts = np.bincount(item_pos, minlength=n_items)

    user_bias = np.zeros(n_users)
    item_bias = np.zeros(n_items)

    for _ in range(iteration):
        # Update b_u: summed deviation per user, shrunk by beta_u
        user_sum = np.bincount(
            user_pos,
            weights=residual - observed * item_bias[item_pos],
            minlength=n_users,
        )
        user_bias = user_sum / (beta_u + user_counts)

        # Update b_i: uses the user biases computed just above
        item_sum = np.bincount(
            item_pos,
            weights=residual - observed * user_bias[user_pos],
            minlength=n_items,
        )
        item_bias = item_sum / (beta_i + item_counts)

    return mu, dict(zip(user_ids, user_bias)), dict(zip(item_ids, item_bias))

# Fit the baseline on the complete cleaned ratings table using the default
# arguments of calculate_biases; the three results are reused by the
# recommendation cells below.
mu_baseline, b_u_baseline, b_i_baseline = calculate_biases(ratings)


KeyboardInterrupt



In [None]:
# Predictions for the movies a user has not rated yet
def predict(mu, b_u, b_i, user_ID, unrated_Itmes):
    """Score items for one user with the baseline model ``mu + b_u + b_i``.

    A user or item without a learned bias contributes 0 instead of raising
    ``KeyError``, matching the fallback used by ``random_recom`` and
    ``compute_rmse``.

    Parameters
    ----------
    mu : float
        Global mean rating.
    b_u, b_i : dict
        Bias per userId / per movieId.
    user_ID
        Id of the user to predict for.
    unrated_Itmes : iterable
        Ids of the items to score.

    Returns
    -------
    list of tuple
        ``(item_id, predicted_rating)`` pairs, highest prediction first.
    """
    user_bias = b_u.get(user_ID, 0)
    prediction = [
        (item_ID, mu + user_bias + b_i.get(item_ID, 0))
        for item_ID in unrated_Itmes
    ]
    prediction.sort(key=lambda x: x[1], reverse=True)
    return prediction

# Top-N recommendations for one user
def get_top_n_reco(df,mu, b_u, b_i, user_ID, N = 20):
    """Print the N unrated movies with the highest predicted rating for a user.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings with the columns "userId" and "movieId".
    mu, b_u, b_i
        Baseline parameters as returned by ``calculate_biases``.
    user_ID
        Id of the user to recommend for.
    N : int
        Number of recommendations to print.

    Titles are looked up in the module-level ``movies`` frame.
    """
    # Movies the user has already rated. A set is required here: `x in series`
    # checks the Series *index*, not its values, so testing against the Series
    # itself would not filter out the rated movies.
    rated = set(df.loc[df["userId"] == user_ID, "movieId"])
    # All movies the user has not rated yet
    unrated_prod = [item for item in df["movieId"].unique() if item not in rated]
    # Predicted rating for every unrated movie, best N only
    top_n_pred = predict(mu, b_u, b_i,user_ID, unrated_prod)[:N]
    print(f"\n Top {N} Empfehlungen für Benutzer {user_ID} sind:")
    for rank, (movie_id, score) in enumerate(top_n_pred, start=1):
        movie_title = movies.loc[movies['movieId'] == movie_id, 'title'].values[0]
        print(f"{rank}. {movie_title} - Vorhergesagter Wert: {score:.2f}")


# Print the baseline top-N lists (default N of get_top_n_reco) for three
# example users: ids 1, 3 and 7.
get_top_n_reco(ratings,mu_baseline, b_u_baseline, b_i_baseline, 1)
get_top_n_reco(ratings,mu_baseline, b_u_baseline, b_i_baseline, 3)
get_top_n_reco(ratings,mu_baseline, b_u_baseline, b_i_baseline, 7)

In [None]:
def random_recom(df, user_ID, mu, b_u, b_i, movies, N=20):
    """Print N randomly chosen unrated movies for a user with their baseline score.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings with the columns "userId" and "movieId".
    user_ID
        Id of the user to recommend for.
    mu, b_u, b_i
        Baseline parameters as returned by ``calculate_biases``; a missing
        user or item bias counts as 0.
    movies : pandas.DataFrame
        Movie metadata with the columns "movieId" and "title".
    N : int
        Number of movies to draw (capped at the number of unrated movies).
    """
    # Movies the user has already rated, as a set so each membership test is
    # O(1) instead of a scan over the user's whole rating array.
    rated = set(df.loc[df['userId'] == user_ID, 'movieId'])

    # All other movies, in order of first appearance
    unrated_prod = [item for item in df["movieId"].unique() if item not in rated]

    # Random selection
    random_recommendations = random.sample(unrated_prod, min(N, len(unrated_prod)))

    print(f"\nZufällige Empfehlungen für User {user_ID} mit vorhergesagtem Score:")

    # The user bias is the same for every movie, so look it up once.
    bu = b_u.get(user_ID, 0)

    for rank, movie_id in enumerate(random_recommendations, start=1):
        title = movies.loc[movies['movieId'] == movie_id, 'title'].values[0]
        pred_score = mu + bu + b_i.get(movie_id, 0)
        print(f"{rank}. {title} — Vorhergesagter Score: {pred_score:.2f}")
        
# Random recommendations (20 each) for the users with ids 1, 3 and 7, scored
# with the baseline parameters.
random_recom(ratings, user_ID=1, mu=mu_baseline, b_u=b_u_baseline, b_i=b_i_baseline, movies=movies, N=20)
random_recom(ratings, user_ID=3, mu=mu_baseline, b_u=b_u_baseline, b_i=b_i_baseline, movies=movies, N=20)
random_recom(ratings, user_ID=7, mu=mu_baseline, b_u=b_u_baseline, b_i=b_i_baseline, movies=movies, N=20)

# Aufgabe 3

In [5]:
# User-based k-fold split with partially masked validation users
def cross_validation(dataset, parts=5, keep_frac=0.2, random_state=42):
    """Split ratings into user-disjoint folds with visible and hidden validation parts.

    The users are shuffled and divided into ``parts`` groups. For every group,
    all ratings of the other users form the training set; the ratings of the
    group's users are split per user into a visible part (a ``keep_frac``
    sample) and a hidden part (the remaining rows, used as ground truth).

    The hidden part is derived from the row index of the sampled rows rather
    than from a merge on (userId, movieId, rating). All three returned frames
    therefore keep exactly the columns of ``dataset`` (a merge would rename
    extra columns such as a timestamp to ``*_x`` / ``*_y``), and duplicate
    rows cannot be misclassified.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Ratings containing at least the column "userId".
    parts : int
        Number of folds.
    keep_frac : float
        Fraction of each validation user's ratings that stays visible. A user
        with a single rating keeps it visible. The sample size is the rounded
        product of fraction and rating count, so users with few ratings may
        end up without visible rows.
    random_state : int
        Seed for shuffling the users and for sampling the visible ratings.

    Returns
    -------
    tuple of lists
        ``(train_sets, validation_visible_sets, validation_ground_truth_sets)``,
        each holding one DataFrame per fold.
    """
    user_unique = dataset["userId"].unique()
    user_shuffled = pd.Series(user_unique).sample(frac=1, random_state=random_state).values
    user_folds = np.array_split(user_shuffled, parts)

    train_sets = []
    validation_visible_sets = []
    validation_ground_truth_sets = []

    for fold_users in user_folds:
        is_validation = dataset["userId"].isin(fold_users)
        train_set = dataset[~is_validation].copy()
        # Fresh unique index so the sampled rows can be removed by label below.
        validation_set = dataset[is_validation].reset_index(drop=True)

        # Visible ratings (all-but-X%): one sample per validation user
        sampled_per_user = [
            user_rows.sample(
                frac=keep_frac if len(user_rows) > 1 else 1.0,
                random_state=random_state,
            )
            for _, user_rows in validation_set.groupby("userId")
        ]
        if sampled_per_user:
            validation_visible = pd.concat(sampled_per_user)
        else:
            # Fold without users (more folds than users): empty frame, same columns
            validation_visible = validation_set.iloc[0:0]

        # Masked ratings = ground truth: every validation row that was not sampled
        validation_ground_truth = (
            validation_set.drop(index=validation_visible.index).reset_index(drop=True)
        )

        train_sets.append(train_set)
        validation_visible_sets.append(validation_visible.reset_index(drop=True))
        validation_ground_truth_sets.append(validation_ground_truth)

    return train_sets, validation_visible_sets, validation_ground_truth_sets


In [6]:
# Trial run: build the cross-validation folds from the training data and
# persist every fold as three CSV files.
df_train, df_val_visible, df_val_ground = cross_validation(training, parts=5, keep_frac=0.2)

output_dir = Path("Data/Train_Test_Validate/Versuch")
output_dir.mkdir(parents=True, exist_ok=True)

# Iterate over the folds that were actually produced instead of repeating the
# fold count as a literal, so changing `parts` above cannot desynchronise the loop.
for idx in range(len(df_train)):
    df_train[idx].to_csv(output_dir / f"train_set_{idx+1}.csv", index=False)
    df_val_visible[idx].to_csv(output_dir / f"validate_masked_{idx+1}.csv", index=False)
    df_val_ground[idx].to_csv(output_dir / f"validate_groundtruth_{idx+1}.csv", index=False)

In [13]:
# Trial run: reload the persisted folds.
# The directory name must match the one the folds were written to
# ("Train_Test_Validate" with a capital V); the lowercase spelling only works
# on case-insensitive filesystems.
output_dir = Path("Data/Train_Test_Validate/Versuch")

# Training sets
train_set_1 = pd.read_csv(output_dir / "train_set_1.csv")
train_set_2 = pd.read_csv(output_dir / "train_set_2.csv")
train_set_3 = pd.read_csv(output_dir / "train_set_3.csv")
train_set_4 = pd.read_csv(output_dir / "train_set_4.csv")
train_set_5 = pd.read_csv(output_dir / "train_set_5.csv")

# Validation sets (visible ratings, all-but-X%)
validate_masked_1 = pd.read_csv(output_dir / "validate_masked_1.csv")
validate_masked_2 = pd.read_csv(output_dir / "validate_masked_2.csv")
validate_masked_3 = pd.read_csv(output_dir / "validate_masked_3.csv")
validate_masked_4 = pd.read_csv(output_dir / "validate_masked_4.csv")
validate_masked_5 = pd.read_csv(output_dir / "validate_masked_5.csv")

# Ground truths (masked ratings, used for evaluation)
validate_groundtruth_1 = pd.read_csv(output_dir / "validate_groundtruth_1.csv")
validate_groundtruth_2 = pd.read_csv(output_dir / "validate_groundtruth_2.csv")
validate_groundtruth_3 = pd.read_csv(output_dir / "validate_groundtruth_3.csv")
validate_groundtruth_4 = pd.read_csv(output_dir / "validate_groundtruth_4.csv")
validate_groundtruth_5 = pd.read_csv(output_dir / "validate_groundtruth_5.csv")


In [15]:
# Mean Absolute Error (MAE)
def mean_absolute_error(y_true, y_pred):
    """Return the mean of the absolute element-wise differences of both inputs."""
    observed = np.array(y_true)
    estimated = np.array(y_pred)
    deviations = np.absolute(observed - estimated)
    return np.mean(deviations)

# Root Mean Square Error (RMSE)
def root_mean_square_error(y_true, y_pred):
    """Return the square root of the mean squared element-wise difference."""
    errors = np.array(y_true) - np.array(y_pred)
    mean_squared = np.mean(np.square(errors))
    return np.sqrt(mean_squared)

# Precision@N
def precision_at_n(predictions, relevant_items, N=15, threshold=4.0):
    """Share of the top-N predictions that are relevant.

    Parameters
    ----------
    predictions : iterable of (item_id, predicted_rating)
    relevant_items : iterable of (item_id, true_rating)
        Only entries with ``true_rating >= threshold`` count as relevant.
    N : int
        Length of the recommendation list. It is also the denominator, even
        when fewer than N predictions are available.
    threshold : float
        Minimum true rating for an item to be relevant.

    Returns
    -------
    float
        Hits within the top N divided by N; 0 when N is not positive.
    """
    if N <= 0:
        return 0

    # Build the relevant-id lookup once instead of once per prediction.
    relevant_ids = {item[0] for item in relevant_items if item[1] >= threshold}

    # Top-N recommendations by predicted rating
    top_n_predictions = sorted(predictions, key=lambda x: x[1], reverse=True)[:N]

    relevant_in_top_n = sum(1 for item in top_n_predictions if item[0] in relevant_ids)
    return relevant_in_top_n / N

# Recall@N
def recall_at_n(predictions, relevant_items, N=15, threshold=4.0):
    """Share of the relevant items that appear within the top-N predictions.

    Parameters
    ----------
    predictions : iterable of (item_id, predicted_rating)
    relevant_items : iterable of (item_id, true_rating)
        Only entries with ``true_rating >= threshold`` count as relevant.
    N : int
        Length of the recommendation list.
    threshold : float
        Minimum true rating for an item to be relevant.

    Returns
    -------
    float
        Hits within the top N divided by the number of relevant entries;
        0 when nothing is relevant.
    """
    relevant = [item for item in relevant_items if item[1] >= threshold]
    if not relevant:
        return 0

    # Build the id lookup once instead of once per prediction.
    relevant_ids = {item[0] for item in relevant}

    # Top-N recommendations by predicted rating
    top_n_predictions = sorted(predictions, key=lambda x: x[1], reverse=True)[:N]

    relevant_in_top_n = sum(1 for item in top_n_predictions if item[0] in relevant_ids)
    return relevant_in_top_n / len(relevant)

def precision_recall_at_n(predicted_list, true_list, N=15, threshold=4.0):
    """Average per-user precision and recall of the top-N predictions.

    ``predicted_list`` and ``true_list`` map a user id to a list of
    ``(item_id, rating)`` pairs. Users that are missing from ``true_list`` or
    that have no true rating of at least ``threshold`` are left out. An item
    counts as recommended when it is among the user's N highest predictions
    and its predicted rating reaches ``threshold``.

    Returns the tuple ``(mean precision, mean recall)`` over the evaluated users.
    """
    per_user_scores = []

    for uid in predicted_list:
        if uid in true_list:
            # Items the user really liked
            liked = {iid for iid, rating in true_list[uid] if rating >= threshold}
            if len(liked) == 0:
                continue

            # N best predictions, reduced to those predicted as relevant
            ranked = sorted(predicted_list[uid], key=lambda pair: pair[1], reverse=True)
            suggested = {iid for iid, score in ranked[:N] if score >= threshold}

            hits = suggested.intersection(liked)

            if suggested:
                user_precision = len(hits) / len(suggested)
            else:
                user_precision = 0
            user_recall = len(hits) / len(liked)

            per_user_scores.append((user_precision, user_recall))

    precisions = [scores[0] for scores in per_user_scores]
    recalls = [scores[1] for scores in per_user_scores]
    return np.mean(precisions), np.mean(recalls)

# Aufgabe 4

In [8]:
#Anpassung, da ich true und prediction am besten direkt hier berechne deshalb compute_rmse
from sklearn.metrics import mean_squared_error
def compute_rmse(mu, b_u, b_i, test_df):
    """RMSE of the baseline prediction ``mu + b_u + b_i`` on a ratings frame.

    Parameters
    ----------
    mu : float
        Global mean rating.
    b_u, b_i : dict
        Bias per userId / per movieId; a missing entry counts as 0.
    test_df : pandas.DataFrame
        Ratings with the columns "userId", "movieId" and "rating".

    Returns
    -------
    float
        Root mean squared difference between true and predicted ratings
        (NaN for an empty frame).
    """
    # Walk the two id columns directly; DataFrame.iterrows builds a Series for
    # every single row and is far slower on large frames.
    preds = [
        mu + b_u.get(u, 0) + b_i.get(i, 0)
        for u, i in zip(test_df["userId"], test_df["movieId"])
    ]
    errors = test_df["rating"].to_numpy(dtype=float) - np.asarray(preds, dtype=float)
    return np.sqrt(np.mean(np.square(errors)))

In [None]:
# === Parameters and result stores ===
beta_values = [1, 5, 10, 20]
rmse_list, mae_values, precision_values, recall_values = [], [], [], []

train_sets = [pd.read_csv(output_dir / f"train_set_{i}.csv") for i in range(1, 6)]
validate_masked_sets = [pd.read_csv(output_dir / f"validate_masked_{i}.csv") for i in range(1, 6)]
validate_groundtruth_sets = [pd.read_csv(output_dir / f"validate_groundtruth_{i}.csv") for i in range(1, 6)]

# === Main loop over all beta combinations and folds ===
for beta_u in beta_values:
    for beta_i in beta_values:
        all_rmse, all_mae, all_prec, all_rec = [], [], [], []

        for fold in range(len(train_sets)):
            train_set = train_sets[fold]
            val_masked = validate_masked_sets[fold]
            val_groundtruth = validate_groundtruth_sets[fold]

            # The folds are user-disjoint, so a validation user never occurs in
            # train_set. Their visible ratings have to be part of the fit,
            # otherwise every validation user is predicted with b_u = 0 and the
            # user bias is never actually evaluated.
            fit_set = pd.concat([train_set, val_masked], ignore_index=True)
            mu, b_u, b_i = calculate_biases(fit_set, beta_u=beta_u, beta_i=beta_i, iteration=10)

            # Predictions for the hidden (ground-truth) ratings
            predictions = [mu + b_u.get(u, 0) + b_i.get(i, 0)
                           for u, i in zip(val_groundtruth["userId"], val_groundtruth["movieId"])]
            true_ratings = val_groundtruth["rating"].values

            # Both error metrics are computed from the same prediction list
            # instead of predicting every row a second time for the RMSE.
            rmse = root_mean_square_error(true_ratings, predictions)
            mae = mean_absolute_error(true_ratings, predictions)

            # Group predictions and true ratings per user for the ranking metrics
            user_preds = defaultdict(list)
            user_truths = defaultdict(list)

            for u, i, true_r, pred_r in zip(val_groundtruth["userId"], val_groundtruth["movieId"], val_groundtruth["rating"], predictions):
                user_preds[u].append((i, pred_r))
                user_truths[u].append((i, true_r))

            precision, recall = precision_recall_at_n(user_preds, user_truths, N=15, threshold=4.0)

            all_rmse.append(rmse)
            all_mae.append(mae)
            all_prec.append(precision)
            all_rec.append(recall)

        # One fold-averaged entry per (beta_u, beta_i) combination
        rmse_list.append((beta_u, beta_i, np.mean(all_rmse)))
        mae_values.append((beta_u, beta_i, np.mean(all_mae)))
        precision_values.append((beta_u, beta_i, np.mean(all_prec)))
        recall_values.append((beta_u, beta_i, np.mean(all_rec)))

        print(f"β_u: {beta_u}, β_i: {beta_i}, RMSE: {np.mean(all_rmse):.4f}, MAE: {np.mean(all_mae):.4f}, "
              f"Precision@N: {np.mean(all_prec):.4f}, Recall@N: {np.mean(all_rec):.4f}")

# === Create heatmaps ===
def plot_heatmap(df, metric_name, cmap="viridis"):
    """Draw an annotated heatmap of one metric over the (beta_u, beta_i) grid.

    ``df`` needs the columns "beta_u", "beta_i" and ``metric_name``; beta_u
    values become the rows and beta_i values the columns of the heatmap.
    """
    grid = df.pivot(index="beta_u", columns="beta_i", values=metric_name)

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(grid, annot=True, fmt=".3f", cmap=cmap, ax=ax)
    ax.set_title(f"{metric_name} Heatmap für verschiedene β-Werte")
    ax.set_xlabel("β_i")
    ax.set_ylabel("β_u")
    plt.show()

# One long-format frame per metric: a row for every (beta_u, beta_i)
# combination together with its fold-averaged score.
df_results = pd.DataFrame(rmse_list, columns=["beta_u", "beta_i", "rmse"])
df_mae = pd.DataFrame(mae_values, columns=["beta_u", "beta_i", "mae"])
df_precision = pd.DataFrame(precision_values, columns=["beta_u", "beta_i", "precision"])
df_recall = pd.DataFrame(recall_values, columns=["beta_u", "beta_i", "recall"])

# One heatmap per metric.
plot_heatmap(df_results, "rmse")
plot_heatmap(df_mae, "mae")
plot_heatmap(df_precision, "precision")
plot_heatmap(df_recall, "recall")