In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix, hstack

# 1. Učitavanje podataka

In [2]:
# Load the merged, pre-processed movie table that every later cell works on.
df = pd.read_parquet(Path("../data/processed/movies_merged.parquet"))

# 2. Izbor vektorizera

In [3]:
# Build the TF-IDF vectorizer and fit it on the text-features column.
tfidf = TfidfVectorizer(
    ngram_range=(1,3),     # word n-grams from unigrams up to trigrams
    min_df=4,              # lower document-frequency cut-off
    max_df=0.8,            # upper document-frequency cut-off
    sublinear_tf=True,     # sublinear term-frequency scaling
    stop_words='english',
)
# Fit and transform in one pass; the result is a sparse matrix with one row per movie.
X_text_features = tfidf.fit_transform(df["text_features"])

In [4]:
# from sklearn.feature_extraction.text import CountVectorizer

# bow = CountVectorizer(stop_words='english', min_df=3, max_df=0.8, ngram_range=(1,2), binary=False)
# X_text_features = bow.fit_transform(df["text_features"])

In [5]:
# from sklearn.feature_extraction.text import HashingVectorizer

# hv = HashingVectorizer(stop_words='english', n_features=2**18, alternate_sign=False, ngram_range=(1,3))
# X_text_features = hv.transform(df["text_features"])

In [6]:
# from sklearn.decomposition import TruncatedSVD
# from sklearn.preprocessing import Normalizer
# from sklearn.pipeline import make_pipeline

# svd = TruncatedSVD(n_components=128, random_state=0)
# lsa = make_pipeline(svd, Normalizer(copy=False))
# X_text_features_tfidf = tfidf.fit_transform(df["text_features"])
# X_text_features = lsa.fit_transform(X_text_features_tfidf)

# 3. Kreiranje sparse matrice za genre kolonu

In [7]:
# Names of all columns whose name starts with "genre_" (the genre indicator columns).
genre_cols = list(filter(lambda name: name.startswith("genre_"), df.columns))

In [8]:
from scipy.sparse import csr_matrix

# Turn the genre indicator columns into a float64 CSR matrix so they can be
# stacked next to the sparse text features.
X_genres = csr_matrix(df.loc[:, genre_cols].to_numpy(), dtype=np.float64)

# 4. Spajanje sparse matrica u jednu

In [9]:
from scipy.sparse import hstack

# Weighted horizontal concatenation of the two feature blocks:
# the genre block is scaled by alpha, the text block (tags + title) by 1 - alpha.
alpha = 0.6
X = hstack([X_genres * alpha, X_text_features * (1 - alpha)], format="csr")
X.shape

(9708, 1766)

# 5. Testiranje

### Ručna procena cosine_similarity-a

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Lookup tables between movie titles and rows of df.
# Lower-cased title -> row position; if a title repeats, the last occurrence wins.
idx_by_title = dict((name.lower(), pos) for pos, name in enumerate(df["title"]))
# Index label -> title.
title_by_idx = df["title"].to_dict()

def get_similar_by_index(i, top_n=10, min_ratings=0):
    """Return the movies most similar to the movie at row position ``i``.

    Similarity is the cosine similarity between row ``i`` of the global feature
    matrix ``X`` and every row of ``X``.

    Parameters
    ----------
    i : int
        Zero-based row position of the query movie in ``X`` (and in ``df``).
    top_n : int, default 10
        Maximum number of recommendations to return. Values larger than the
        number of movies are clamped instead of raising.
    min_ratings : int, default 0
        When greater than zero, only movies whose ``rating_count`` is at least
        this value are eligible.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId``, ``title``, ``mean_rating``, ``rating_count`` and
        ``similarity``, ordered by descending similarity with a fresh 0..n-1
        index. The query movie and movies filtered out by ``min_ratings`` are
        never returned, so the frame can hold fewer than ``top_n`` rows (and is
        empty when ``top_n <= 0``).
    """
    # Similarity of row i against every row (1 x n sparse result -> flat dense vector).
    scores = cosine_similarity(X[i], X, dense_output=False).toarray().ravel()

    # -inf marks "not eligible"; unlike -1 it can never collide with a real score.
    scores[i] = -np.inf  # never recommend the query movie itself
    if min_ratings > 0:
        eligible = df["rating_count"].to_numpy() >= min_ratings
        scores = np.where(eligible, scores, -np.inf)

    # Clamp k so argpartition never receives an out-of-range kth.
    k = min(int(top_n), scores.size)
    if k <= 0:
        top_idx = np.empty(0, dtype=np.intp)
    else:
        # Partial selection of the k best, then sort just those k by score (descending).
        top_idx = np.argpartition(scores, -k)[-k:]
        top_idx = top_idx[np.argsort(scores[top_idx])[::-1]]
        # Drop excluded rows that were only picked up to fill the k slots.
        top_idx = top_idx[np.isfinite(scores[top_idx])]

    # top_idx holds row positions, so select with iloc (loc would treat them as labels).
    columns = ["movieId", "title", "mean_rating", "rating_count"]
    out = df.iloc[top_idx][columns].copy()
    out["similarity"] = scores[top_idx]
    return out.reset_index(drop=True)

def get_similar_movies(title, top_n=10, min_ratings=0):
    """Look up a movie by title (case-insensitive) and return its nearest neighbours.

    The title is lower-cased and resolved through ``idx_by_title``; the actual
    search is delegated to ``get_similar_by_index``.

    Raises
    ------
    ValueError
        If the title is not present in ``idx_by_title``.
    """
    key = title.lower()
    if key not in idx_by_title:
        raise ValueError(f"Film '{title}' nije pronađen.")
    return get_similar_by_index(idx_by_title[key], top_n=top_n, min_ratings=min_ratings)


In [11]:
# Manual sanity check: neighbours of "Toy Story (1995)" among movies with >= 50 ratings.
get_similar_movies(
    title="Toy Story (1995)",
    top_n=10,
    min_ratings=50,
)


Unnamed: 0,movieId,title,mean_rating,rating_count,similarity
0,3114,Toy Story 2 (1999),3.860825,97,0.965533
1,4886,"Monsters, Inc. (2001)",3.871212,132,0.918367
2,78499,Toy Story 3 (2010),4.109091,55,0.8833
3,2355,"Bug's Life, A (1998)",3.516304,92,0.856316
4,4306,Shrek (2001),3.867647,170,0.844113
5,673,Space Jam (1996),2.707547,53,0.844113
6,6377,Finding Nemo (2003),3.960993,141,0.813157
7,5218,Ice Age (2002),3.688235,85,0.813157
8,2987,Who Framed Roger Rabbit? (1988),3.572165,97,0.785375
9,2005,"Goonies, The (1985)",3.570175,57,0.766652


### Mera sličnosti

In [12]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Work on a fixed random sample of movies so the dense similarity matrix stays small.
np.random.seed(0)
subset_idx = np.random.choice(len(df), size=min(500, len(df)), replace=False)
X_sub = X[subset_idx]
sim = cosine_similarity(X_sub, X_sub)  # dense (n_sub x n_sub) matrix for the sample

genre_cols = [c for c in df.columns if c.startswith("genre_")]
G = df.iloc[subset_idx][genre_cols].values

# Cast to bool first so the overlap test works whatever numeric dtype the genre
# columns have (a bitwise `&` would raise on float columns).
has_genre = G.astype(bool).astype(np.int32)

# shared[a, b] is True when movies a and b have at least one genre in common:
# the dot product of two indicator rows counts their common genres.
shared = (has_genre @ has_genre.T) > 0
off_diag = ~np.eye(len(subset_idx), dtype=bool)  # leave out self-pairs (a == b)

# Boolean-mask selection walks the matrix row by row, i.e. the same pair order
# an explicit double loop over (i, j) would produce.
same = sim[shared & off_diag]
diff = sim[~shared & off_diag]
print(f"Mean(sim | shared-genre): {same.mean():.4f}")
print(f"Mean(sim | no-shared-genre): {diff.mean():.4f}")
print(f"Δ = {same.mean() - diff.mean():.4f} (što veće, to bolje)")

# Δ noted for each vectorizer variant tried (presumably from earlier runs of this cell):
# lsa   0.4599
# hv    0.4376
# bow   0.3921
# tfidf 0.4630

Mean(sim | shared-genre): 0.4632
Mean(sim | no-shared-genre): 0.0002
Δ = 0.4630 (što veće, to bolje)


### Hit Rate - Koliko često se nalazi film u top 10 preporučenih

In [13]:
# Load the original (raw) user ratings.
ratings = pd.read_csv(Path("../data/raw/ratings.csv"))

def hitrate_at_k(k=10, min_user_likes=3, user_limit=500):
    """Leave-one-out style HitRate@k for the content-based recommender.

    Users are visited in ``ratings.groupby("userId")`` order. For each user the
    movies rated >= 4.0 are sorted by timestamp; the second-to-last liked movie
    is the query and the last liked movie is the target. A hit is counted when
    the target's ``movieId`` appears among the ``k`` neighbours returned by
    ``get_similar_by_index`` for the query.

    Parameters
    ----------
    k : int, default 10
        Number of recommendations requested per query.
    min_user_likes : int, default 3
        Minimum number of liked movies a user needs to be evaluated. At least
        two are always required, since both a query and a target are needed.
    user_limit : int, default 500
        Stop after this many users have been evaluated.

    Returns
    -------
    float
        ``hits / evaluated_users``, or NaN when no user could be evaluated.
    """
    # A query/target pair needs two liked movies, whatever the caller asked for;
    # without this guard min_user_likes=1 would fail on iloc[-2].
    required_likes = max(min_user_likes, 2)

    # movieId -> row position in df, built once instead of scanning df per user.
    # setdefault keeps the first occurrence if a movieId appears more than once.
    pos_by_movie = {}
    for pos, movie_id in enumerate(df["movieId"].tolist()):
        pos_by_movie.setdefault(movie_id, pos)

    hits = 0
    total = 0
    for _, grp in ratings.groupby("userId"):
        liked = grp[grp["rating"] >= 4.0].sort_values("timestamp")
        if len(liked) < required_likes:
            continue
        # Select the column before the row so the ids keep their integer dtype.
        liked_ids = liked["movieId"]
        target = liked_ids.iloc[-1]
        query = liked_ids.iloc[-2]
        # Skip users whose query or target movie is missing from df.
        if target not in pos_by_movie or query not in pos_by_movie:
            continue
        # get_similar_by_index expects a row position, not an index label.
        rec = get_similar_by_index(pos_by_movie[query], top_n=k, min_ratings=0)
        if (rec["movieId"] == target).any():
            hits += 1
        total += 1
        if total >= user_limit:
            break
    return hits / total if total > 0 else np.nan

# Evaluate on at most 500 users that have at least 3 liked movies.
hit_rate = hitrate_at_k(k=10, min_user_likes=3, user_limit=500)
print("HitRate@10 (CBF, leave-one-out proxy):", hit_rate)
# HitRate@10 noted for each vectorizer variant tried:
# lsa   0.01
# hv    0.012
# bow   0.01
# tfidf 0.01

HitRate@10 (CBF, leave-one-out proxy): 0.01


# 6. Čuvanje modela

In [14]:
from scipy.sparse import save_npz
import joblib

# Create the output directory first: save_npz / joblib.dump do not create missing
# parent directories, so this cell would otherwise fail on a fresh checkout.
Path("../models").mkdir(parents=True, exist_ok=True)

# Persist the combined feature matrix and the fitted TF-IDF vectorizer.
save_npz("../models/X_cbf.npz", X)
joblib.dump(tfidf, "../models/tfidf_text_features.joblib")

# How to load the artifacts later
# (joblib files are pickle-based: only load them from a trusted source):
#
# from scipy.sparse import load_npz
# import joblib
#
# X = load_npz("../models/X_cbf.npz")
# tfidf = joblib.load("../models/tfidf_text_features.joblib")

['../models/tfidf_text_features.joblib']