# 1. Importi i učitavanje podataka + modela

In [1]:
import warnings
# Silence the UserWarning whose text begins with "pkg_resources is deprecated as an API".
# The filter has to be registered before the imports below so that it is already active
# when they run.
# NOTE(review): presumably one of the third-party imports below triggers it — confirm which.
warnings.filterwarnings("ignore", category=UserWarning, message="pkg_resources is deprecated as an API.*")

from pathlib import Path
import numpy as np
import pandas as pd

from scipy.sparse import load_npz
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
from joblib import load as joblib_load
from surprise import SVD, Dataset, Reader

# Project paths (relative, so they resolve against the current working directory;
# adjust them if your layout differs).
DATA_DIR = Path("..", "data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROC_DIR = DATA_DIR.joinpath("processed")
MODELS_DIR = Path("..", "models")


In [2]:
# 1) Load the movie table (the same file that notebooks 03 and 04 read).
df_movies = pd.read_parquet(PROC_DIR.joinpath("movies_merged.parquet"))

print(df_movies.shape)
df_movies.head(5)

(9708, 24)


Unnamed: 0,movieId,title,mean_rating,rating_count,text_features,genre_action,genre_adventure,genre_animation,genre_children,genre_comedy,...,genre_film-noir,genre_horror,genre_imax,genre_musical,genre_mystery,genre_romance,genre_sci-fi,genre_thriller,genre_war,genre_western
0,1,Toy Story (1995),3.92093,215,toy story pixar pixar fun,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),3.431818,110,jumanji fantasy magic board game robin willia...,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),3.259615,52,grumpier old men moldy old,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),2.357143,7,waiting to exhale,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),3.071429,49,father of the bride part ii pregnancy remake,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# 2) Load the raw ratings and filter them the same way as 04_collaborative_filtering.ipynb.
ratings = pd.read_csv(RAW_DIR.joinpath("ratings.csv"))

min_ratings_user = 5
min_ratings_item = 5
counts_u = ratings['userId'].value_counts()
counts_i = ratings['movieId'].value_counts()

# Keep a row only if both its user and its movie reach the thresholds.
# The counts come from the unfiltered table (one pass, not iterated to a fixed point).
ratings_f = ratings[
    ratings['userId'].map(counts_u).ge(min_ratings_user)
    & ratings['movieId'].map(counts_i).ge(min_ratings_item)
].copy()

print("Filtered ratings_f:", ratings_f.shape)
ratings_f.head(5)

Filtered ratings_f: (90274, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# 3) Hold-out split (intended to match notebook 04, so the identical test set is used).
from sklearn.model_selection import train_test_split as sk_split
from surprise import Reader

reader = Reader(rating_scale=(0.5, 5.0))
df_r = ratings_f.loc[:, ['userId', 'movieId', 'rating']].copy()

# A fixed seed together with shuffling keeps the 80/20 partition reproducible across runs.
train_df, test_df = sk_split(df_r, shuffle=True, random_state=42, test_size=0.2)

print("Train size:", train_df.shape, "Test size:", test_df.shape)


Train size: (72219, 3) Test size: (18055, 3)


In [5]:
# 4) Load the SVD model saved by the previous step.
# NOTE(review): joblib artifacts are pickle-based, so loading one executes code stored in
# the file — only load model files you produced yourself or otherwise trust.
svd_model: SVD = joblib_load(MODELS_DIR.joinpath("svd_model.pkl"))
svd_model


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1be4905a8f0>

In [6]:
# 5) Load the CBF feature matrix X (the TF-IDF vectorizer is optional).
X_cbf = load_npz(MODELS_DIR.joinpath("X_cbf.npz"))

# The vectorizer is not strictly needed in this notebook, but it can be loaded if required:
# tfidf = joblib_load(MODELS_DIR / "tfidf_text_features.joblib")

X_cbf.shape


(9708, 1766)

In [7]:
# 6) Map movieId -> row position in X_cbf / df_movies.
# Assumption: df_movies is in the same row order as in notebook 03, where X_cbf was built,
# so row i of X_cbf describes row i of df_movies. The order itself cannot be verified here,
# but the cheap invariants below catch a stale artifact or a frame with duplicated ids
# before wrong feature vectors get silently attached to movies.
assert X_cbf.shape[0] == len(df_movies), (
    f"X_cbf has {X_cbf.shape[0]} rows but df_movies has {len(df_movies)} rows"
)
assert df_movies["movieId"].is_unique, "duplicate movieId values make the id -> row mapping ambiguous"

movie_index_by_id = {mid: idx for idx, mid in enumerate(df_movies["movieId"])}
len(movie_index_by_id), df_movies.shape


(9708, (9708, 24))

# 2. CBF user profil – vektor ukusa korisnika

In [8]:
from collections import defaultdict

def build_user_profile(user_id, ratings_df, df_movies, X, min_rating=4.0):
    """
    Build a content-based taste vector for one user.

    The profile is the mean of the rows of ``X`` that belong to the movies the
    user rated ``>= min_rating`` in ``ratings_df``. Each liked movie contributes once.

    Parameters
    ----------
    user_id : value compared against ``ratings_df["userId"]``.
    ratings_df : DataFrame with "userId", "movieId" and "rating" columns
        (callers in this notebook pass the TRAIN split).
    df_movies : DataFrame with a "movieId" column. Row ``i`` is assumed to
        correspond to row ``i`` of ``X``.
    X : 2-D feature matrix that supports row selection with an integer array
        and ``.mean(axis=0)`` (e.g. a scipy CSR matrix).
    min_rating : lowest rating that counts as "liked".

    Returns
    -------
    np.ndarray of shape (1, d), or None when the user has no liked movie that
    is present in ``df_movies``.
    """
    user_ratings = ratings_df[ratings_df["userId"] == user_id]
    liked = user_ratings.loc[user_ratings["rating"] >= min_rating, "movieId"]

    # Resolve row positions from the df_movies argument itself rather than from a
    # notebook-level lookup table, so the function works with whatever frame the
    # caller passes and does not depend on another cell having been executed.
    idxs = np.flatnonzero(df_movies["movieId"].isin(liked).to_numpy())

    if idxs.size == 0:
        return None  # the user has no usable "liked" movies

    # For a scipy sparse matrix the mean comes back as np.matrix; convert it to a
    # plain ndarray and guarantee the (1, d) shape.
    user_vec = np.asarray(X[idxs].mean(axis=0))
    if user_vec.ndim == 1:
        user_vec = user_vec.reshape(1, -1)

    return user_vec



In [9]:
from sklearn.metrics.pairwise import cosine_similarity

def cbf_score(user_profile, movie_idx, X):
    """
    Cosine similarity between a user profile and a single movie.

    Parameters
    ----------
    user_profile : array-like of shape (d,) or (1, d), or None.
    movie_idx : row position of the movie in ``X``.
    X : feature matrix whose rows are movie vectors.

    Returns
    -------
    float: the similarity with negative values raised to 0.0. A missing
    profile (None) scores 0.0.
    """
    if user_profile is None:
        return 0.0

    # cosine_similarity needs 2-D input, so promote a flat vector to one row.
    profile_2d = np.asarray(user_profile)
    if profile_2d.ndim == 1:
        profile_2d = profile_2d[np.newaxis, :]

    similarity = cosine_similarity(X[movie_idx], profile_2d)
    return float(max(similarity.ravel()[0], 0.0))


In [10]:
def cbf_score_to_rating(sim, min_rating=0.5, max_rating=5.0):
    """
    Linearly map a score in [0, 1] onto the rating scale
    [min_rating, max_rating] (by default [0.5, 5.0]).

    No clipping is applied, so a score outside [0, 1] lands outside the scale.
    """
    rating_span = max_rating - min_rating
    return min_rating + rating_span * sim


In [11]:
def normalize_cf_rating(est, min_rating=0.5, max_rating=5.0):
    """
    Rescale a rating estimate from [min_rating, max_rating] to [0, 1].

    Uses only arithmetic, so it accepts a scalar as well as a pandas Series
    or NumPy array (element-wise). No clipping is applied.
    """
    offset = est - min_rating
    rating_span = max_rating - min_rating
    return offset / rating_span


In [12]:
def recommend_hybrid_for_user(
    user_id,
    svd_model,
    train_ratings,
    df_movies,
    X,
    alpha_cf=0.7,
    top_n=10,
    min_ratings_item=5
):
    """
    Rank unseen movies for one user with a weighted CF + CBF score.

    hyb_score = alpha_cf * normalize_cf_rating(cf_est) + (1 - alpha_cf) * cbf_sim

    Parameters
    ----------
    user_id : user to recommend for.
    svd_model : object exposing ``predict(user_id, movie_id).est``.
    train_ratings : DataFrame with "userId" and "movieId" columns. Also used to
        build the user profile, to exclude movies the user already rated, and
        to count ratings per movie.
    df_movies : DataFrame with "movieId", "title", "mean_rating" and
        "rating_count" columns. Row ``i`` is assumed to correspond to row ``i`` of ``X``.
    X : content feature matrix (rows = movies).
    alpha_cf : weight of the CF part, expected in 0..1.
    top_n : number of rows to return.
    min_ratings_item : a candidate needs at least this many ratings in ``train_ratings``.

    Returns
    -------
    DataFrame with columns movieId, title, hyb_score, cf_est, cbf_sim,
    mean_rating, rating_count, sorted by hyb_score descending. It is empty, with
    the same columns, when there is no candidate. Titles come from a left merge,
    so a movie missing from ``df_movies`` gets NaN metadata.
    """
    out_cols = ["movieId", "title", "hyb_score", "cf_est", "cbf_sim", "mean_rating", "rating_count"]

    # 1) user profile built from the TRAIN ratings
    user_profile = build_user_profile(user_id, train_ratings, df_movies, X, min_rating=4.0)

    # 2) movies the user already rated (excluded from the candidates)
    rated = set(train_ratings.loc[train_ratings["userId"] == user_id, "movieId"])

    # 3) candidates: unseen movies with enough ratings
    cnt_items = train_ratings["movieId"].value_counts()
    all_items = set(train_ratings["movieId"].unique())

    candidates = [
        iid for iid in all_items
        if (iid not in rated) and (cnt_items.get(iid, 0) >= min_ratings_item)
    ]

    if not candidates:
        # Same schema as the non-empty result so callers can treat both alike.
        return pd.DataFrame(columns=out_cols)

    # Row lookup derived from the df_movies argument, so the function does not depend
    # on a notebook-level mapping having been created by another cell.
    row_of_movie = {mid: pos for pos, mid in enumerate(df_movies["movieId"])}

    cf_est_list = [svd_model.predict(user_id, iid).est for iid in candidates]
    cbf_sim_list = [
        cbf_score(user_profile, row_of_movie[iid], X) if iid in row_of_movie else 0.0
        for iid in candidates
    ]

    res = pd.DataFrame({
        "movieId": candidates,
        "cf_est": cf_est_list,
        "cbf_sim": cbf_sim_list,
    })
    # The hybrid score is computed once, vectorised over the whole candidate table.
    res["hyb_score"] = alpha_cf * normalize_cf_rating(res["cf_est"]) + (1 - alpha_cf) * res["cbf_sim"]

    # attach titles and popularity statistics
    res = res.merge(df_movies[["movieId", "title", "mean_rating", "rating_count"]], on="movieId", how="left")

    res = res.sort_values("hyb_score", ascending=False).head(top_n)
    return res[out_cols]


In [13]:
# Demo user: the user id found in the first row of the filtered ratings.
sample_user = ratings_f["userId"].iat[0]

# Top-10 hybrid recommendations, restricted to movies with at least 20 train ratings.
hyb_recs = recommend_hybrid_for_user(
    sample_user,
    svd_model,
    train_df,
    df_movies,
    X_cbf,
    alpha_cf=0.7,
    top_n=10,
    min_ratings_item=20,
)

print(f"üé¨ Hibridne preporuke za korisnika {sample_user}:\n")
hyb_recs


üé¨ Hibridne preporuke za korisnika 1:



Unnamed: 0,movieId,title,hyb_score,cf_est,cbf_sim,mean_rating,rating_count
578,3275,"Boondock Saints, The (2000)",0.882499,5.0,0.60833,4.22093,43
801,6016,City of God (Cidade de Deus) (2002),0.87523,4.709865,0.734541,4.146667,75
388,1884,Fear and Loathing in Las Vegas (1998),0.869589,4.840361,0.648072,3.967391,46
224,908,North by Northwest (1959),0.869182,4.983629,0.57243,4.184211,57
293,1262,"Great Escape, The (1963)",0.86775,4.837024,0.643674,4.127907,43
276,1223,"Grand Day Out with Wallace and Gromit, A (1989)",0.866421,4.936801,0.587507,4.089286,28
96,57669,In Bruges (2008),0.861659,4.89382,0.593918,4.158537,41
407,2000,Lethal Weapon (1987),0.860641,4.733548,0.673631,3.673333,75
824,6350,Laputa: Castle in the Sky (Tenk√ª no shiro Rapy...,0.859503,4.781358,0.645045,4.0625,24
543,3000,Princess Mononoke (Mononoke-hime) (1997),0.859322,4.686414,0.693673,3.958333,48


In [14]:
# Pure SVD recommendations (same helper as in 04_collaborative_filtering; it can be copied over as-is).
def recommend_for_user_svd(user_id, algo, df_index, ratings_df, top_n=10, min_ratings_item=5):
    """
    Top-N movies for one user ranked by the CF model's estimated rating.

    Parameters
    ----------
    user_id : user to recommend for.
    algo : object exposing ``predict(user_id, movie_id).est``.
    df_index : DataFrame with "movieId" and "title" columns. The returned rows
        are rows of this frame and keep its index.
    ratings_df : DataFrame with "userId" and "movieId" columns. Defines the
        candidate pool, the movies the user already rated, and the counts per movie.
    top_n : maximum number of rows to return.
    min_ratings_item : a candidate needs at least this many ratings in ``ratings_df``.

    Returns
    -------
    DataFrame with columns movieId, title, est_rating, sorted by est_rating
    descending. It is empty, with the same columns, when there is no candidate
    or ``top_n <= 0``. Candidates missing from ``df_index`` are dropped.
    """
    result_cols = ["movieId", "title", "est_rating"]

    cnt_items = ratings_df["movieId"].value_counts()
    all_items = set(ratings_df["movieId"].unique())
    rated = set(ratings_df.loc[ratings_df["userId"] == user_id, "movieId"])

    candidates = [iid for iid in all_items - rated
                  if cnt_items.get(iid, 0) >= min_ratings_item]

    # np.argpartition raises ValueError when asked for more elements than exist,
    # so never request more than the number of candidates.
    k = min(top_n, len(candidates))
    if k <= 0:
        return pd.DataFrame(columns=result_cols)

    est = np.array([algo.predict(user_id, iid).est for iid in candidates])

    # Partial selection of the k best estimates; the final ordering is done by the sort below.
    top_idx = np.argpartition(est, -k)[-k:]
    est_by_movie = {candidates[i]: est[i] for i in top_idx}

    # Reuse the estimates already computed instead of calling the model a second time.
    df_top = df_index[df_index["movieId"].isin(list(est_by_movie))].copy()
    df_top["est_rating"] = df_top["movieId"].map(est_by_movie)
    return df_top.sort_values("est_rating", ascending=False).head(top_n)[result_cols]

# Pure-SVD top-10 for the demo user, restricted to movies with at least 20 train ratings.
svd_recs = recommend_for_user_svd(
    sample_user,
    svd_model,
    df_movies,
    train_df,
    top_n=10,
    min_ratings_item=20,
)

svd_recs


Unnamed: 0,movieId,title,est_rating
277,318,"Shawshank Redemption, The (1994)",5.0
602,750,Dr. Strangelove or: How I Learned to Stop Worr...,5.0
681,899,Singin' in the Rain (1952),5.0
975,1276,Cool Hand Luke (1967),5.0
1067,1387,Jaws (1975),5.0
2462,3275,"Boondock Saints, The (2000)",5.0
896,1193,One Flew Over the Cuckoo's Nest (1975),4.99989
686,904,Rear Window (1954),4.992014
690,908,North by Northwest (1959),4.983629
4581,6807,Monty Python's The Meaning of Life (1983),4.975612


In [15]:
def make_predictions_svd(test_df, svd_model):
    """
    Predict a rating with the CF model for every row of ``test_df``.

    ``test_df`` must have exactly three columns, in the order
    (user id, movie id, true rating). ``svd_model`` must expose
    ``predict(uid, iid).est``.

    Returns a list of ``(uid, iid, true_rating, estimate, None)`` tuples, one
    per row and in row order.
    """
    return [
        (uid, iid, true_r, svd_model.predict(uid, iid).est, None)
        for uid, iid, true_r in test_df.itertuples(index=False)
    ]


In [18]:
def make_predictions_cbf_hybrid(test_df, train_df, df_movies, X, svd_model, alpha_cf=0.7):
    """
    Produce CBF-only and hybrid rating predictions for every row of ``test_df``.

    Parameters
    ----------
    test_df : DataFrame with exactly three columns, in the order
        (user id, movie id, true rating).
    train_df : ratings used to build the user profiles.
    df_movies : DataFrame with a "movieId" column. Row ``i`` is assumed to
        correspond to row ``i`` of ``X``.
    X : content feature matrix (rows = movies).
    svd_model : object exposing ``predict(uid, iid).est``.
    alpha_cf : weight of the CF part in the hybrid score, expected in 0..1.

    Returns
    -------
    (preds_cbf, preds_hyb): two lists of ``(uid, iid, true_rating, estimate, None)``
    tuples in row order. A missing user profile or a movie absent from
    ``df_movies`` gives a similarity of 0.0.
    """
    # Row lookup derived from the df_movies argument, so the function does not depend
    # on a notebook-level mapping having been created by another cell.
    row_of_movie = {mid: pos for pos, mid in enumerate(df_movies["movieId"])}

    # Cache the user profiles so each one is built once, not once per test row.
    user_profiles = {}
    preds_cbf = []
    preds_hyb = []

    for uid, iid, true_r in test_df.itertuples(index=False):
        if uid not in user_profiles:
            user_profiles[uid] = build_user_profile(uid, train_df, df_movies, X, min_rating=4.0)
        up = user_profiles[uid]

        # CBF: similarity in [0, 1] mapped onto the rating scale
        midx = row_of_movie.get(iid)
        sim = cbf_score(up, midx, X) if midx is not None else 0.0
        cbf_est = cbf_score_to_rating(sim)

        # CF: model estimate normalised to [0, 1]
        cf_est = svd_model.predict(uid, iid).est
        cf_norm = normalize_cf_rating(cf_est)

        # Hybrid: blend in normalised space, then map back to the rating scale
        hyb_score = alpha_cf * cf_norm + (1 - alpha_cf) * sim
        hyb_est = cbf_score_to_rating(hyb_score)

        preds_cbf.append((uid, iid, true_r, cbf_est, None))
        preds_hyb.append((uid, iid, true_r, hyb_est, None))

    return preds_cbf, preds_hyb
