# Imports

In [138]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import random
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import StepLR

# Daten laden und vorbereiten

In [139]:
rating_path = 'Datensaetze/rating.csv/rating.csv'
anime_path = 'Datensaetze/anime.csv'

# Laden der Daten
ratings_df = pd.read_csv(rating_path)
anime_df = pd.read_csv(anime_path)


valid_anime_id = set(anime_df['anime_id'])
ratings_df = ratings_df[ratings_df['anime_id'].isin(valid_anime_id)]

# (Eventuelle) Zeilen mit fehlenden IDs entfernen
ratings_df = ratings_df.dropna(subset=['user_id', 'anime_id'])

# Zeilen mit '-1' als Bewertungen entfernen
ratings_df = ratings_df[ratings_df['rating'] != -1]

# Benutzer- und Anime-IDs als Indizes
user_id_mapping = {id: idx for idx, id in enumerate(ratings_df['user_id'].unique())}
anime_id_mapping = {id: idx for idx, id in enumerate(ratings_df['anime_id'].unique())}


# IDs in der Tabelle umwandeln
ratings_df['user_id'] = ratings_df['user_id'].map(user_id_mapping)
ratings_df['anime_id'] = ratings_df['anime_id'].map(anime_id_mapping)


# Trainings- und Testdaten aufteilen
train_data, test_data = train_test_split(ratings_df, test_size=0.2, random_state=42)

# Torch-Daten
train_user = torch.tensor(train_data['user_id'].values, dtype = torch.long)
train_anime = torch.tensor(train_data['anime_id'].values, dtype = torch.long)
train_rating = torch.tensor(train_data['rating'].values, dtype = torch.float32)

test_user = torch.tensor(test_data['user_id'].values, dtype = torch.long)
test_anime = torch.tensor(test_data['anime_id'].values, dtype = torch.long)
test_rating = torch.tensor(test_data['rating'].values, dtype = torch.float32)

train_dataset = TensorDataset(train_user, train_anime, train_rating)
test_dataset = TensorDataset(test_user, test_anime, test_rating)

batch_size = 64

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Genres vorbereiten
anime_df['genre'] = anime_df['genre'].fillna('Unknown')
anime_df['genres'] = anime_df['genre'].str.split(',')
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(anime_df['genres'])
genre_tensor = torch.tensor(genre_matrix, dtype=torch.float32)


#   Collaboratives Filtern

In [140]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [141]:
class NeuralCollaborativeFiltering(nn.Module):
    def __init__(self, num_users, num_animes, embedding_dim, hidden_dim, dropout_rate=0.5):
        super(NeuralCollaborativeFiltering, self).__init__()
        # Embeddings für Benutzer und Anime
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.anime_embedding = nn.Embedding(num_animes, embedding_dim)
        
        self.fc1 = nn.Linear(embedding_dim*2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, int(hidden_dim/2))
        self.fc3 = nn.Linear(int(hidden_dim/2), 1)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, user_id, anime_id):
        user_embedding = self.user_embedding(user_id)
        anime_embedding = self.anime_embedding(anime_id)

        x = torch.cat([user_embedding, anime_embedding], dim=-1)

        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout(x)
        pred = self.fc3(x)
        
        return pred.flatten()


# Hybrid Recommander system (collaborative & content-based Filtering)

In [142]:
class HybridRecommendationModelWithNCF(nn.Module):
    def __init__(self, num_users, num_animes, embedding_dim, hidden_dim, num_content_features, dropout_rate=0.5):
        super(HybridRecommendationModelWithNCF, self).__init__()
        self.ncf = NeuralCollaborativeFiltering(num_users, num_animes, embedding_dim, hidden_dim, dropout_rate)
        self.content_fc = nn.Linear(num_content_features, embedding_dim)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, user_id, anime_id, num_content_features):
        ncf_output = self.ncf(user_id, anime_id)
        content_based_vector = self.content_fc(num_content_features)
        content_based_vector = self.dropout(content_based_vector)
        return ncf_output + content_based_vector.sum(1)


In [143]:

hybridRecommendationModelWithNCF = HybridRecommendationModelWithNCF(
    num_users = len(user_id_mapping),
    num_animes = len(anime_id_mapping), 
    embedding_dim = 300, 
    hidden_dim = 128,
    num_content_features = genre_tensor.shape[1]).to(device)


# Trainigs- und Evaluierungsvorbereitung

In [144]:
loss_func = nn.MSELoss()
optimizer = torch.optim.Adam(hybridRecommendationModelWithNCF.parameters(), lr=0.001)
scheduler = StepLR(optimizer, step_size=3, gamma=0.5)

In [145]:
def calculate_precision(y_prediction, y_target, tolerance=0.5):
    if not isinstance(y_prediction, torch.Tensor):
        y_prediction = torch.tensor(y_prediction)
    if not isinstance(y_target, torch.Tensor):
        y_target = torch.tensor(y_target) 

    deviations = torch.abs(y_prediction - y_target)

    correct_predictions = (deviations <= tolerance).sum().item()
    precision = correct_predictions/len(y_target)
    return precision

### Funtionen zur Berechnung von precision@k und recall@k

In [146]:
def precision_at_k(y_pred, y_true, threshold, k):
    precisions = []
    for user_idx in range(y_pred.shape[0]):
        # Top-K-Indizes der Vorhersage
        top_k_indices = np.argsort(y_pred[user_idx])[::-1][:k]
        # Relevante Elemente in der tachsächlichen Bewertung extrahieren
        relevante_items = np.where(y_true[user_idx] > threshold)[0]
        if len(relevante_items) == 0:
            precisions.append(1) # Das ist logisch, denn in diesem Fall haben wir kein empfholenes Item, das nicht relevant ist??????????????
        # Anzahl relevanter Treffer im Top-K
        else:
            numb_relv_item_in_top_k = len(set(relevante_items) & set(top_k_indices))
            # Precision für diesen Benutzer
            precisions.append(numb_relv_item_in_top_k/k)
    return np.mean(precisions)  

def recall_at_k(y_pred, y_true, threshold, k):
    recalls = []
    for user_idx in range(y_pred.shape[0]):
        # Top-K-Indizes der Vorhersage
        top_k_indices = np.argsort(y_pred[user_idx])[::-1][:k]
        # Relevante Elemente in der tachsächlichen Bewertung extrahieren
        relevante_items = np.where(y_true[user_idx] > threshold)[0]
        if len(relevante_items) == 0:
            recalls.append(1) # Das ist logisch, denn in diesem Fall haben kein relevantes Item, das nicht im Top-K empfholen ist????????????????
        # Anzahl relevanter Treffer im Top-K
        else:
            numb_relv_item_in_top_k = len(set(relevante_items) & set(top_k_indices))
            # Precision für diesen Benutzer
            recalls.append(numb_relv_item_in_top_k/len(relevante_items))
    return np.mean(recalls)  

In [147]:
def train(dataloader, model, loss_func, device, genre_tensor, optimizer):
  model.train()

  running_loss = 0.0
  THRESHOLD = 7
  K = 10
  y_predictions = []
  y_true = []

  for user_id, anime_id, rating in dataloader:

    user_id = user_id.to(device)
    anime_id = anime_id.to(device)
    rating = rating.to(torch.float32).to(device)
    
    num_content_features = genre_tensor[anime_id].to(device)

    optimizer.zero_grad()

    pred = model(user_id, anime_id, num_content_features)
    pred = pred.squeeze() 

    loss = loss_func(pred, rating)
    loss.backward()
    optimizer.step()

    running_loss += loss.item()

    y_predictions.extend(pred.detach().cpu().numpy())
    y_true.extend(rating.detach().cpu().numpy())
    
  train_accuracy = calculate_precision(y_true, y_predictions)
  rmse = root_mean_squared_error(y_true, y_predictions)
  rma = mean_absolute_error(y_true, y_predictions)
  r_quadrat = r2_score(y_true, y_predictions)
  train_loss = running_loss / len(dataloader)
  precision_at_k = precision_at_k(y_predictions, y_true, THRESHOLD, K)
  recall_at_k = recall_at_k(y_predictions, y_true, THRESHOLD, K)

  print(f"Average train loss :{train_loss:.4f}")
  print(f"Average train accuracy :{train_accuracy:.4f}")
  print(f"Train RMSE :{rmse:.4f}")
  print(f"Train RMA :{rma:.4f}")
  print(f"Train R-Quadrat :{r_quadrat:.4f}")
  print(f"Train Precision@{K}: {precision_at_k:.4f}")
  print(f"Train Recall@{K}: {recall_at_k:.4f}")
  return train_loss, train_accuracy, rmse, rma, r_quadrat, precision_at_k, recall_at_k

In [148]:
def test(dataloader, model, loss_func, device, genre_tensor):
  y_predictions = []
  y_true = []
  total_loss = 0.0
  THRESHOLD = 7
  K = 10
  model.eval()
  
  with torch.no_grad():
    for user_id, anime_id, rating in dataloader:
      user_id = user_id.to(device)
      anime_id = anime_id.to(device)
      rating = rating.to(torch.float32).to(device)

      num_content_features = genre_tensor[anime_id].to(device)

      pred = model(user_id, anime_id, num_content_features)
      pred = pred.squeeze() 
      loss = loss_func(pred, rating) 

      total_loss += loss.item()  

      y_predictions.extend(pred.detach().cpu().numpy())
      y_true.extend(rating.detach().cpu().numpy())

  #Berechnung der Metriken
  test_accuracy = calculate_precision(y_true, y_predictions)
  test_loss = total_loss / len(dataloader)
  rmse = root_mean_squared_error(y_true, y_predictions) 
  rma = mean_absolute_error(y_true, y_predictions)
  r_quadrat = r2_score(y_true, y_predictions)
  precision_at_k = precision_at_k(y_predictions, y_true, THRESHOLD, K)
  recall_at_k = recall_at_k(y_predictions, y_true, THRESHOLD, K)
  
  print(f"Average test loss: {test_loss:.4f}")
  print(f"Average test accuracy: {test_accuracy:.4f}")
  print(f"Test RMSE: {rmse:.4f}") 
  print(f"Test RMA: {rma:.4f}")
  print(f"Test R_Quadrat: {r_quadrat:.4f}")
  print(f"Test Precision@{K}: {precision_at_k:.4f}")
  print(f"Test Recall@{K}: {recall_at_k:.4f}")

  return test_loss, test_accuracy, rmse, rma, r_quadrat, precision_at_k, recall_at_k

In [149]:
epochs = 10

for e in range(epochs):
  print(f"Epoch {e+1}\n", 40*"-")
  print("TRAIN")
  average_train_loss, train_accuracy, train_rmse, train_rma, train_r_quadrat, train_precision_at_k, train_recall_at_k = train(train_dataloader, hybridRecommendationModelWithNCF, loss_func, device, genre_tensor, optimizer)
  print("TEST")
  average_test_loss, test_accuracy, test_rmse, test_rma, test_r_quadrat, test_precision_at_k, test_recall_at_k = test(test_dataloader, hybridRecommendationModelWithNCF, loss_func, device, genre_tensor)

print("Done!")

Epoch 1
 ----------------------------------------
TRAIN


KeyboardInterrupt: 

In [313]:
# trainiertes Modell speichern
torch.save(hybridRecommendationModelWithNCF.state_dict(), "model.pt")
print("Model saved to model.pt")

Model saved to model.pt


In [130]:
average_test_loss, test_accuracy, test_rmse, test_rma, test_r_quadrat, precision_at_k, recall_at_k = test(test_dataloader, hybridRecommendationModelWithNCF, loss_func, device, genre_tensor)


UnboundLocalError: cannot access local variable 'precision_at_k' where it is not associated with a value

In [86]:
def display_recommendations_and_favorites(model, user_id_mapping, anime_id_mapping, anime_df, ratings_df, genre_tensor, top_k=10, device='cpu'):

    valid_anime_id = set(anime_df['anime_id'])
    ratings_df = ratings_df[ratings_df['anime_id'].isin(valid_anime_id)]

    # Wähle einen zufälligen Benutzer aus
    random_user_id = random.choice(list(user_id_mapping.keys()))
    print(f"Zufällig ausgewählter Benutzer-ID: {random_user_id}")
    
    # Interne Benutzer-ID erhalten
    internal_user_id = user_id_mapping[random_user_id]
    
    # Zeige die (10) beliebtesten Animes des Benutzers
    user_ratings = ratings_df[ratings_df['user_id'] == internal_user_id]
    user_ratings_sorted = user_ratings.sort_values(by='rating', ascending=False).head(top_k)
    
    print("Die beliebtesten Animes des Benutzers:")
    for _, row in user_ratings_sorted.iterrows():
        anime_id = row['anime_id']
        anime_name = anime_df.loc[anime_df['anime_id'] == anime_id, 'name'].values[0]
        genres = anime_df.loc[anime_df['anime_id'] == anime_id, 'genre'].values[0]
        print(f"{anime_name} (Genre: {genres}) - Bewertung: {row['rating']:.2f}")
    
    # Finde Animes, die der Benutzer noch nicht bewertet hat

    all_anime_ids = []
    for _, real_anime_id in enumerate(anime_id_mapping):
        all_anime_ids.append(real_anime_id)

    all_anime_ids = set(all_anime_ids)

    seen_anime_ids = set(user_ratings['anime_id'].values)
    unseen_anime_ids = list(all_anime_ids - seen_anime_ids)

                                                                                                              
    # Vorhersagen für nicht bewertete Animes                                                                    
    user_tensor = torch.tensor([internal_user_id] * len(unseen_anime_ids), dtype=torch.long).to(device)  
    repr_unseen_anime_ids = [anime_id_mapping[anime_id] for anime_id in unseen_anime_ids if anime_id in anime_id_mapping]
    anime_tensor = torch.tensor(repr_unseen_anime_ids, dtype=torch.long).to(device)
    content_features = genre_tensor[anime_tensor].to(device) 
 
    model.eval()
    with torch.no_grad():
        predictions = model(user_tensor, anime_tensor, content_features).cpu().numpy()
    
    # Empfehlungen sortieren
    recommendations = list(zip(unseen_anime_ids, predictions))
    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)[:top_k]
    print("-----------------------------------------------------------------------")
    print("Die relevantesten Empfehlungen für den Benutzer:")
    for anime_id, predicted_score in recommendations:
        anime_name = anime_df.loc[anime_df['anime_id'] == anime_id, 'name'].values[0]
        genres = anime_df.loc[anime_df['anime_id'] == anime_id, 'genre'].values[0]
        print(f"{anime_name} (Genre: {genres}) - Vorhergesagte Bewertung: {predicted_score:.2f}")

# Aufrufen der Funktion
display_recommendations_and_favorites(
    model=hybridRecommendationModelWithNCF,
    user_id_mapping=user_id_mapping,
    anime_id_mapping=anime_id_mapping,
    anime_df=anime_df,
    ratings_df=ratings_df,
    genre_tensor=genre_tensor,
    top_k=10,
    device=device
)


Zufällig ausgewählter Benutzer-ID: 38284
Die beliebtesten Animes des Benutzers:
Prince of Tennis (Genre: Action, Comedy, School, Shounen, Sports) - Bewertung: 10.00
Sengoku Kitan Youtouden Soushuuhen (Genre: Action, Martial Arts, Samurai) - Bewertung: 10.00
Puchi Puri Yuushi (Genre: Comedy, Fantasy, Magic, Shoujo) - Bewertung: 10.00
Fushigi Yuugi OVA 2 (Genre: Adventure, Drama, Fantasy, Historical, Martial Arts, Romance, Shoujo) - Bewertung: 10.00
Rozen Maiden: Träumend (Genre: Action, Comedy, Drama, Magic, Seinen) - Bewertung: 10.00
JoJo no Kimyou na Bouken (Genre: Adventure, Drama, Fantasy, Horror, Shounen, Supernatural, Vampire) - Bewertung: 10.00
Aa! Megami-sama! (Genre: Comedy, Magic, Romance, Seinen, Supernatural) - Bewertung: 9.00
Final Fantasy VII: Last Order (Genre: Action, Adventure, Drama, Fantasy, Sci-Fi) - Bewertung: 8.00
Die relevantesten Empfehlungen für den Benutzer:
['15 Bishoujo Hyouryuuki'] (Genre: ['Comedy, Harem, Hentai']) - Vorhergesagte Bewertung: 13.54
['Persona

In [None]:
def prediction_for_specified_user(model, user_id_mapping, anime_id_mapping, anime_df, genre_tensor, k, filepath, ratings_df=ratings_df, device='cpu'):
    number_users = model.ncf.user_embedding.num_embeddings
    user_ratings_df = pd.read_csv(filepath)
    # (Eventuelle) Zeilen mit fehlenden IDs entfernen
    user_ratings_df = user_ratings_df.dropna(subset=['user_id', 'anime_id'])
    # Zeilen mit '-1' als Bewertungen entfernen
    user_ratings_df = user_ratings_df[user_ratings_df['rating'] != -1]

    user_real_id = user_ratings_df['user_id'].iloc[0]

    if user_real_id not in user_id_mapping:
        user_representational_id = len(user_id_mapping)
        user_id_mapping[user_real_id] = user_representational_id
    else:
        user_representational_id = user_id_mapping[user_real_id]
    if user_representational_id>=number_users:
        user_representational_id=number_users-1    

    user_ratings_df['user_id'] = user_representational_id

    combined_ratings_df  = pd.concat([ratings_df, user_ratings_df], ignore_index=True)

    valid_anime_id = set(anime_df['anime_id'])
    combined_ratings_df = combined_ratings_df[combined_ratings_df['anime_id'].isin(valid_anime_id)]

    all_anime_ids = []
    for _, real_anime_id in enumerate(anime_id_mapping):
        all_anime_ids.append(real_anime_id)

    all_anime_ids = set(all_anime_ids)

    seen_anime_ids = set(combined_ratings_df['anime_id'].values)
    unseen_anime_ids = list(all_anime_ids - seen_anime_ids)

    user_tensor = torch.tensor([user_representational_id] * len(unseen_anime_ids), dtype=torch.long).to(device)
    repr_unseen_anime_ids = [anime_id_mapping[anime_id] for anime_id in unseen_anime_ids if anime_id in anime_id_mapping]
    anime_tensor = torch.tensor(repr_unseen_anime_ids, dtype=torch.long).to(device)
    content_features = genre_tensor[anime_tensor].to(device)


    model.eval()
    with torch.no_grad():
        predictions = model(user_tensor, anime_tensor, content_features).cpu().numpy()

    recommendations = list(zip(unseen_anime_ids, predictions))
    sorted_recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)[:k]
    
    print("Die relevantestenen Empfehlungen für diesen Benutzer sind:")
    for anime_id, predicted_rating in sorted_recommendations:
        name = anime_df.loc[anime_df['anime_id'] == anime_id, 'name'].values[0]
        genre = anime_df.loc[anime_df['anime_id'] == anime_id, 'genre'].values[0]
        print(f"Name: {name}, Genre: {genre}, vorhergesagte Bewertung: {predicted_rating:.2f}")

    
prediction_for_specified_user(hybridRecommendationModelWithNCF,
                              user_id_mapping=user_id_mapping,
                              anime_id_mapping=anime_id_mapping,
                              anime_df=anime_df,
                              genre_tensor=genre_tensor,
                              k=10,
                              filepath='Datensaetze/test_thriller_films.csv',
                              device=device
                              )  

{1: 0, 2: 1, 3: 2, 5: 3, 7: 4, 8: 5, 9: 6, 10: 7, 11: 8, 12: 9, 14: 10, 15: 11, 16: 12, 17: 13, 18: 14, 19: 15, 20: 16, 21: 17, 22: 18, 23: 19, 24: 20, 25: 21, 26: 22, 27: 23, 28: 24, 29: 25, 30: 26, 31: 27, 32: 28, 33: 29, 34: 30, 35: 31, 36: 32, 37: 33, 38: 34, 39: 35, 40: 36, 41: 37, 42: 38, 43: 39, 44: 40, 45: 41, 46: 42, 47: 43, 48: 44, 50: 45, 51: 46, 52: 47, 53: 48, 55: 49, 56: 50, 57: 51, 58: 52, 59: 53, 60: 54, 61: 55, 62: 56, 63: 57, 64: 58, 65: 59, 66: 60, 67: 61, 68: 62, 69: 63, 70: 64, 71: 65, 72: 66, 73: 67, 74: 68, 75: 69, 76: 70, 77: 71, 78: 72, 79: 73, 80: 74, 81: 75, 82: 76, 83: 77, 84: 78, 85: 79, 86: 80, 87: 81, 88: 82, 90: 83, 91: 84, 92: 85, 93: 86, 94: 87, 95: 88, 96: 89, 97: 90, 98: 91, 99: 92, 100: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 107: 100, 108: 101, 109: 102, 110: 103, 111: 104, 112: 105, 113: 106, 114: 107, 115: 108, 116: 109, 117: 110, 118: 111, 119: 112, 120: 113, 121: 114, 122: 115, 123: 116, 124: 117, 125: 118, 126: 119, 127: 120,