> This is supposed to be kind of a blueprint of how to train and predict data

We'll start by reading the created matrix in form of data/preocessed/user_matrix.csv

In [4]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import pickle
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

In [5]:
project_dir = Path().resolve().parent
print(f"Project directory: {project_dir}")
genome_scores_path = os.path.join(project_dir, "data", "raw", "genome-scores.csv")
ratings_path = os.path.join(project_dir, "data", "raw", "ratings.csv")
user_matrix_path = os.path.join(project_dir, "data", "processed", "user_matrix.csv")
train_user_matrix_path = os.path.join(project_dir, "data", "processed", "train_user_matrix.csv")
test_user_matrix_path = os.path.join(project_dir, "data", "processed", "test_user_matrix.csv")


In [6]:
# Create movie embeddings data
df_scores = pd.read_csv(genome_scores_path,dtype={
    "movieId": "int32",
    "tagId": "int16",
    "relevance": "float32"})
movie_embeddings = df_scores.pivot(
    index="movieId",
    columns="tagId",
    values="relevance").fillna(0)

In [7]:
# Genres
df_movies = pd.read_csv(movies_path)
df_genres = df_movies["genres"].str.get_dummies(sep="|")
df_genres["movieId"] = df_movies["movieId"]

# Tags
df_tags = pd.read_csv(tags_path)
df_tags = df_tags.dropna(subset=["tag"])
df_tagtext = df_tags.groupby("movieId")["tag"].apply(lambda tags: " ".join(tags)).reset_index()

In [8]:
# # read the data
# user_matrix = pd.read_csv("../data/processed/user_matrix.csv")
# X = user_matrix.drop("userId", axis=1)
# y = user_matrix["userId"]

In [9]:
# Split the data into train and test sets, because it is dependent on the user
# we have to use a different approach for splitting the data
# we will use the userId as the key for splitting the data and generate train
# and test sets for each user
train_list = []
test_list = []

df_ratings = pd.read_csv(ratings_path, dtype={
    "userId": "int32",
    "movieId": "int32",
    "rating": "float32"})

for user_id, group in df_ratings.groupby("userId"):
    if len(group) < min_ratings:  # to small to predict
        continue
    train, test = train_test_split(group, test_size=test_size)
    train_list.append(train)
    test_list.append(test)

train_data = pd.concat(train_list)
test_data= pd.concat(test_list)


In [10]:
# Build user vectors
user_vectors = []
user_ids = []

for user_id, group in df_ratings.groupby("userId"):
    rated_movies = group["movieId"].values
    common_movies = [mid for mid in rated_movies if mid in movie_embeddings.index]
    if not common_movies:
        continue
    vectors = movie_embeddings.loc[common_movies]
    user_vector = vectors.mean(axis=0)
    user_vectors.append(user_vector)
    user_ids.append(user_id)

user_data = pd.DataFrame(user_vectors, index=user_ids).reset_index().rename(columns={"index": "userId"})

In [11]:
# KNN training
X = user_data.drop("userId", axis=1)
knn = NearestNeighbors(n_neighbors=n_neighbors, algorithm=algorithm)
knn.fit(X)

In [12]:
precisions = []

if n_users > 0:
    test_user_ids = test_data["userId"].unique()[:n_users]
else:
    test_user_ids = test_data["userId"].unique()

for user_id in tqdm(test_user_ids, desc="Evaluating"):
    try:
        test_row = test_data[test_data["userId"] == user_id]
        test_movie = test_row.iloc[0]["movieId"]

        user_history = train_data[train_data["userId"] == user_id]
        if user_history.empty:
            continue

        # hole Vektor für den aktuellen Test-User
        vector = user_data[user_data["userId"] == user_id].drop("userId", axis=1).values
        if vector.size == 0:
            continue

        _, indices = knn.kneighbors(vector, n_neighbors=n_neighbors)
        similar_users = user_data.iloc[indices[0]]["userId"].values

        recommended_movies = (
            train_data[train_data["userId"].isin(similar_users)]["movieId"]
            .value_counts()
            .head(n_neighbors)
            .index.tolist()
        )

        precision = 1 if test_movie in recommended_movies else 0
        precisions.append(precision)
    except Exception as e:
        print(f"Fehler bei User {user_id}: {e}")
        continue

print(f"\nPrecision@{n_neighbors}: {np.mean(precisions):.4f}")

Evaluating: 100%|██████████| 2000/2000 [06:10<00:00,  5.40it/s]


Precision@10: 0.0445





In [13]:
# Create the model
# model = NearestNeighbors(n_neighbors=10, algorithm="brute", metric="cosine")
model = NearestNeighbors(n_neighbors=20, algorithm="ball_tree").fit(X)

In [14]:
# Safe the model
model_path = "../models/knn_model.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, "wb") as f:
    pickle.dump(model, f)

print(f"Model saved at: {model_path}")

Model saved at: ../models/knn_model.pkl


In [15]:
# predict the neighbors
