In [1]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, message="pkg_resources is deprecated as an API.*")

import json, sys, platform, subprocess, datetime as dt
from pathlib import Path
import pandas as pd
from joblib import dump
import numpy as np
from surprise import SVD, Dataset, Reader, accuracy
from sklearn.model_selection import train_test_split as sk_split, KFold

# 1. Učitacvanje podataka

In [2]:
ratings = pd.read_csv("../data/raw/ratings.csv")

# filtriraj ultra-retke korisnike/filmove radi stabilnosti
# min_ratings_user = 5
# min_ratings_item = 5
# counts_u = ratings['userId'].value_counts()
# counts_i = ratings['movieId'].value_counts()
# ratings_f = ratings[
#     ratings['userId'].isin(counts_u[counts_u >= min_ratings_user].index) &
#     ratings['movieId'].isin(counts_i[counts_i >= min_ratings_item].index)
# ].copy()
# print("Filtered:", ratings_f.shape)
ratings_f = ratings.copy()
ratings_f.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


# 2. Pristup cosine_similarity

In [3]:
# pivot film x korisnik (bolje item-based zbog veličine)
df_r = ratings_f[['userId','movieId','rating']].copy()
train_df, test_df = sk_split(df_r, test_size=0.2, random_state=42, shuffle=True)

R = train_df.pivot_table(index='movieId', columns='userId', values='rating')

# mean-centering po itemu (oduzmi prosečnu ocenu svakog filma) tj radi normalizaciju
item_mean = R.mean(axis=1)
R_mc = R.sub(item_mean, axis=0).fillna(0.0)  # prazno = 0 nakon centriranja
R_mc.shape

(8983, 610)

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

X_items = csr_matrix(R_mc.values)  # sparse radi brzine
S_items = cosine_similarity(X_items, dense_output=False)  # film x film (sparse)
movie_index = {mid: i for i, mid in enumerate(R_mc.index)}
index_movie = {i: mid for mid, i in movie_index.items()}

### Procena cosine_similarity-a

In [5]:
# korisnik -> (movieId, rating)
user_hist = ratings_f.groupby('userId')[['movieId','rating']].apply(
    lambda g: list(zip(g['movieId'].values, g['rating'].values))
)

def predict_user_item(user_id, movie_id, k=20, min_sim=0.0):
    if user_id not in user_hist or movie_id not in movie_index:
        return np.nan
    
    i_tgt = movie_index[movie_id]
    sims = S_items[i_tgt].toarray().ravel()  # sličnosti targeta ka svim filmovima

    # kandidati = filmovi koje je korisnik ocenio i postoje u matrici
    cand = []
    for m_id, r in user_hist[user_id]:
        j = movie_index.get(m_id)
        if j is None or j == i_tgt:
            continue
        s = sims[j]
        if s > min_sim:
            # koristimo odstupanje od proseka ocenjenog filma
            dev = r - item_mean.loc[m_id]
            cand.append((s, dev, m_id))

    if not cand:
        return np.nan

    # uzmi top-k po sličnosti
    cand.sort(key=lambda x: x[0], reverse=True)
    cand = cand[:k]

    # uteženi prosek odstupanja
    num = sum(s * dev for s, dev, _ in cand)
    den = sum(abs(s) for s, _, _ in cand)
    if den == 0:
        return np.nan

    # vrati na skalu dodavanjem proseka target filma (baseline)
    return float(item_mean.loc[movie_id] + num / den)

def predict_with_true(user_id, movie_id, k=20, min_sim=0.0):
    # 1) pronađi pravu ocenu
    true_rating = ratings_f[
        (ratings_f['userId'] == user_id) &
        (ratings_f['movieId'] == movie_id)
    ]['rating']
    
    if len(true_rating) > 0:
        true_rating = float(true_rating.iloc[0])
    else:
        true_rating = None  # korisnik nije ocenio film

    # 2) predviđanje
    pred = predict_user_item(user_id, movie_id, k=k, min_sim=min_sim)

    return {
        "user_id": user_id,
        "movie_id": movie_id,
        "true_rating": true_rating,
        "pred_rating": pred
    }

some_user = ratings_f['userId'].iloc[0]
some_movie = ratings_f['movieId'].iloc[0]
predict_with_true(some_user, some_movie, k=20)

{'user_id': 1,
 'movie_id': 1,
 'true_rating': 4.0,
 'pred_rating': 4.499699600539054}

In [6]:
def evaluate_item_cf(ratings_df, k=10, min_sim=0.0):
    abs_errors = []   # za MAE
    sq_errors = []    # za RMSE
    n_total = 0
    n_skipped = 0     # slučajevi gde nismo mogli da predvidimo (NaN)

    for row in ratings_df.itertuples(index=False):
        n_total += 1
        user_id = row.userId
        movie_id = row.movieId
        true_rating = row.rating

        pred_rating = predict_user_item(user_id, movie_id, k=k, min_sim=min_sim)

        if np.isnan(pred_rating):
            n_skipped += 1
            continue

        err = pred_rating - true_rating
        abs_errors.append(abs(err))
        sq_errors.append(err ** 2)

    mae = float(np.mean(abs_errors)) if abs_errors else np.nan
    rmse = float(np.sqrt(np.mean(sq_errors))) if sq_errors else np.nan
    coverage = (n_total - n_skipped) / n_total if n_total > 0 else 0.0

    return {
        "mae": mae,
        "rmse": rmse,
        "coverage": coverage,
        "n_total": n_total,
        "n_used": n_total - n_skipped,
        "n_skipped": n_skipped
    }

# evaluate_item_cf(test_df, k=10)
# {'mae': 0.6533359841168803,
#  'rmse': 0.8630645431263049,
#  'coverage': 0.9202697342324474,
#  'n_total': 20168,
#  'n_used': 18560,
#  'n_skipped': 1608}

### 3. Pristup SVD

### Train

In [7]:
reader = Reader(rating_scale=(0.5, 5.0))
df_r = ratings_f[['userId','movieId','rating']].copy()

train_df, test_df = sk_split(df_r, test_size=0.2, random_state=42, shuffle=True)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores, mae_scores = [], []

for tr_idx, val_idx in kf.split(train_df):
    tr_df = train_df.iloc[tr_idx]
    val_df = train_df.iloc[val_idx]

    # Surprise Dataset -> Trainset
    train_data = Dataset.load_from_df(tr_df, reader)
    trainset = train_data.build_full_trainset()

    # Model
    algo = SVD(
        n_factors=200,
        n_epochs=20,
        lr_all=0.007,
        reg_all=0.05,
        random_state=42
    )
    algo.fit(trainset)

    valset = list(val_df.itertuples(index=False, name=None))
    preds = algo.test(valset)
    rmse_scores.append(accuracy.rmse(preds, verbose=False))
    mae_scores.append(accuracy.mae(preds,  verbose=False))

print(f"CV RMSE mean: {np.mean(rmse_scores):.4f}  (std: {np.std(rmse_scores):.4f})")
print(f"CV MAE  mean: {np.mean(mae_scores):.4f}  (std: {np.std(mae_scores):.4f})")

CV RMSE mean: 0.8733  (std: 0.0057)
CV MAE  mean: 0.6723  (std: 0.0032)


### Test

In [8]:
final_data = Dataset.load_from_df(train_df, reader)
final_trainset = final_data.build_full_trainset()

final_algo = SVD(
    n_factors=200,
    n_epochs=20,
    lr_all=0.007,
    reg_all=0.05,
    random_state=42
)
final_algo.fit(final_trainset)

testset = list(test_df.itertuples(index=False, name=None))
test_preds = final_algo.test(testset)
print("Hold-out RMSE:", accuracy.rmse(test_preds, verbose=False))
print("Hold-out MAE :", accuracy.mae(test_preds,  verbose=False))

Hold-out RMSE: 0.8732595395633489
Hold-out MAE : 0.6680720298238061


### Ručno testiranje

In [9]:
df_movies = pd.read_parquet("../data/processed/movies_merged.parquet")
cnt_items = ratings_f['movieId'].value_counts()
all_items = set(ratings_f['movieId'].unique())

def recommend_for_user_svd(user_id, algo, df_index, top_n=10, min_ratings_item=5):
    rated = set(ratings_f.loc[ratings_f['userId'] == user_id, 'movieId'])
    candidates = [iid for iid in all_items - rated
                  if cnt_items.get(iid, 0) >= min_ratings_item]

    # proceni ocenu za sve kandidate
    est = np.array([algo.predict(user_id, iid).est for iid in candidates])

    # top-N
    if len(est) == 0:
        return pd.DataFrame(columns=["userId","movieId","title","est_rating"])

    idx = np.argpartition(est, -top_n)[-top_n:]
    idx = idx[np.argsort(est[idx])[::-1]]
    rec_iids = [candidates[i] for i in idx]
    out = pd.DataFrame({"userId": user_id, "movieId": rec_iids, "est_rating": est[idx]})
    # spajanje naslova (pretpostavka: df_index ima movieId,title)
    out = out.merge(df_index[["movieId","title"]], on="movieId", how="left")
    return out[["userId","movieId","title","est_rating"]]

In [10]:
# Odaberi korisnika (npr. prvog iz skupa)
sample_user = ratings_f['userId'].iloc[0]

# Pozovi funkciju i ispiši rezultate
recommendations = recommend_for_user_svd(
    user_id=sample_user,
    algo=final_algo,
    df_index=df_movies,
    top_n=10,
    min_ratings_item=20
)

# Prikaz
recommendations

Unnamed: 0,userId,movieId,title,est_rating
0,1,912,Casablanca (1942),5.0
1,1,1223,"Grand Day Out with Wallace and Gromit, A (1989)",5.0
2,1,4973,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",5.0
3,1,318,"Shawshank Redemption, The (1994)",5.0
4,1,1104,"Streetcar Named Desire, A (1951)",5.0
5,1,56782,There Will Be Blood (2007),5.0
6,1,1262,"Great Escape, The (1963)",4.943917
7,1,6016,City of God (Cidade de Deus) (2002),4.938689
8,1,858,"Godfather, The (1972)",4.930713
9,1,68954,Up (2009),4.928588


### Precision i recall

In [11]:
def precision_recall_at_k(predictions, k=10, threshold=4.0):
    # grupiši po korisniku
    from collections import defaultdict
    by_user = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        by_user[uid].append((iid, est, true_r))
    precisions, recalls = [], []
    for uid, vals in by_user.items():
        vals.sort(key=lambda x: x[1], reverse=True)
        topk = vals[:k]
        n_rel = sum((true >= threshold) for (_, _, true) in vals)  # svi relevantni u testu
        n_rec_k = sum((est >= threshold) for (_, est, _) in topk)
        n_rel_and_rec_k = sum(((true >= threshold) and (est >= threshold)) for (_, est, true) in topk)
        prec = n_rel_and_rec_k / k if k else 0
        rec = n_rel_and_rec_k / n_rel if n_rel else 0
        precisions.append(prec); recalls.append(rec)
    return np.mean(precisions), np.mean(recalls)

prec, rec = precision_recall_at_k(test_preds, k=10, threshold=4.0)
print(f"Precision@10={prec:.3f}  Recall@10={rec:.3f}")

Precision@10=0.330  Recall@10=0.279


# 4. Čuvanje modela

In [12]:
dump(final_algo, "../models/svd_model.pkl")

# load
# from joblib import load
# final_algo = load("../models/svd_model.pkl")

['../models/svd_model.pkl']