In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score

In [2]:
merged = pd.read_csv('processed_data/ratings_with_tmdb.csv')

In [3]:
merged.head()

Unnamed: 0,userId,rating,tmdbId
0,1,1.0,197
1,1,4.5,10474
2,1,5.0,238
3,1,5.0,240
4,1,5.0,207


In [4]:
movie_sim_graph = pd.read_csv('processed_data/umap_movie_graph_truncated.csv')
movie_sim_graph.head()

Unnamed: 0,source,target,weight
0,2,1379,0.1864265
1,2,5971,2.804486e-07
2,2,7015,3.330669e-16
3,2,42112,1.0
4,3,7974,0.1879906


In [5]:
adj = movie_sim_graph.groupby('source').apply(
    lambda df: list(zip(df['target'], df['weight']))
).to_dict()

  adj = movie_sim_graph.groupby('source').apply(


In [6]:
results = []
iterations = 50

for user, user_df in merged.groupby('userId'):
    mean_r = user_df['rating'].mean()
    user_df['label'] = (user_df['rating'] >= mean_r).astype(int)
    if user_df.shape[0] < 10:
        continue

    train = user_df.sample(frac=0.8, random_state=42)
    test  = user_df.drop(train.index)

    if len(test) < 5:
        continue

    beliefs = {}
    for _, r in train.iterrows():
        beliefs[r['tmdbId']] = float(r['label'])

    test_movies = test['tmdbId'].tolist()
    for m in test_movies:
        beliefs[m] = 0.5

    for _ in range(iterations):
        new_beliefs = beliefs.copy()
        for m in test_movies:
            neighbors = adj.get(m, [])
            if not neighbors: 
                continue
            
            num = 0; den = 0
            for nb, w in neighbors:
                if nb in beliefs:
                    num += w * beliefs[nb]
                    den += w
            
            if den > 0:
                new_beliefs[m] = num/den
        
        # check convergence
        if max(abs(new_beliefs[m]-beliefs[m]) for m in test_movies) < 1e-4:
            break

        beliefs = new_beliefs
    
    y_true = test['label'].values
    y_score = np.array([beliefs[m] for m in test_movies])
    y_pred = (y_score >= 0.5).astype(int)

    # Add class sizes
    results.append({
        'userId': user,
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'roc_auc': roc_auc_score(y_true, y_score) if len(np.unique(y_true))>1 else np.nan,
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'train_size': len(train),
        'test_size': len(test),
    })


results_df = pd.DataFrame(results)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [7]:
results_df.head()

Unnamed: 0,userId,accuracy,f1,roc_auc,precision,recall,train_size,test_size
0,1,0.8,0.857143,0.75,1.0,0.75,22,5
1,4,0.583333,0.736842,0.5,0.636364,0.875,50,12
2,5,1.0,1.0,,1.0,1.0,21,5
3,7,0.545455,0.666667,0.7,0.5,1.0,42,11
4,8,0.608696,0.756757,0.607143,0.608696,1.0,90,23


In [8]:
results_df.to_csv('prc_user_based_results.csv', index=False)