# In [6]:
import numpy as np
from scipy import sparse
import pmf
import pandas as pd
import  tqdm
from scipy.sparse import csr_matrix


def predictive_check_with_vad(model, train_data, vad_data, n_simulations=50):
    """Run a predictive check of ``model`` against held-out validation data.

    The model is fitted on the non-zero entries of ``train_data``, scored on
    the non-zero entries of ``vad_data``, and then scored on
    ``n_simulations`` replicated datasets generated by the fitted model at
    the same validation positions.

    Parameters
    ----------
    model
        Object exposing ``fit(data, rows, cols)``,
        ``pred_loglikeli(data, rows, cols)`` and
        ``generate_replicated_data()``.
    train_data, vad_data
        Matrices exposing ``.nonzero()`` (the caller in this file passes
        SciPy CSR matrices).
    n_simulations : int, optional
        Number of replicated datasets to draw and score.

    Returns
    -------
    float
        Fraction of replicated datasets whose log-likelihood is strictly
        greater than the log-likelihood of the actual validation data.
    """
    # Step 1: Fit the model on the observed training entries.
    train_rows, train_cols = train_data.nonzero()
    model.fit(train_data, train_rows, train_cols)

    # Step 2: Log-likelihood of the observed validation entries.
    # NOTE(review): the "ValueError: dimension mismatch" recorded with this
    # cell is raised before the simulation loop is reached, so it presumably
    # originates in `fit` or `pred_loglikeli`. Confirm against the pmf module
    # whether these methods expect the sparse matrix itself or its `.data`
    # vector, and that train/validation matrices share one shape.
    vad_rows, vad_cols = vad_data.nonzero()
    actual_vad_loglik = model.pred_loglikeli(vad_data, vad_rows, vad_cols)

    # Step 3: Score replicated datasets at the validation positions.
    simulated_logliks = []
    # `import tqdm` binds the module, which is not callable; the progress-bar
    # wrapper is the `tqdm.tqdm` class.
    for _ in tqdm.tqdm(range(n_simulations)):
        simulated_data = model.generate_replicated_data()
        simulated_logliks.append(
            model.pred_loglikeli(simulated_data, vad_rows, vad_cols)
        )

    # Step 4: Share of replicated datasets that score better than the real
    # validation data.
    proportion_better = np.mean(
        [loglik > actual_vad_loglik for loglik in simulated_logliks]
    )

    return proportion_better





# Load the pre-processed train and validation splits (one CSV per split).
train_data, vad_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'C:/Users/Sten Stokroos/Desktop/zelf/dat/proc/ml_wg/train.csv',
        'C:/Users/Sten Stokroos/Desktop/zelf/dat/proc/ml_wg/validation.csv',
    )
)

def convert_to_csr(df, shape=None):
    """Build a user-by-item CSR rating matrix from a long-format DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'uid'``, ``'sid'`` and ``'rating'``.
        ``uid`` and ``sid`` are cast to ``int`` and used directly as row and
        column indices, so they must already be zero-based.
    shape : tuple of (int, int), optional
        Explicit ``(n_users, n_items)`` for the result. Pass the same value
        for every split (train/validation/test) so the matrices are
        dimensionally compatible. When omitted, the shape is inferred as
        ``(max uid + 1, max sid + 1)`` from ``df`` alone.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix with ``rating`` stored at ``[uid, sid]``.

    Raises
    ------
    ValueError
        If ``df`` has no rows and ``shape`` is not given, since there is
        nothing to infer the dimensions from.
    """
    user_ids = df['uid'].astype(int).to_numpy()
    item_ids = df['sid'].astype(int).to_numpy()
    ratings = df['rating'].astype(float).to_numpy()

    if shape is None:
        if user_ids.size == 0:
            raise ValueError(
                "cannot infer the matrix shape from an empty DataFrame; "
                "pass shape=(n_users, n_items) explicitly"
            )
        # Largest index + 1 so that every observed id is a valid position.
        shape = (int(user_ids.max()) + 1, int(item_ids.max()) + 1)

    return csr_matrix((ratings, (user_ids, item_ids)), shape=shape)

train_sparse = convert_to_csr(train_data)
vad_sparse = convert_to_csr(vad_data)

# Each split only contains a subset of the users/items, so the two matrices
# can come out with different shapes. Grow both (in place) to a common
# (n_users, n_items) so validation indices are valid for the fitted model.
common_shape = tuple(
    max(dims) for dims in zip(train_sparse.shape, vad_sparse.shape)
)
train_sparse.resize(common_shape)
vad_sparse.resize(common_shape)






# Configure the Poisson factorization model and run the predictive check.
model = pmf.PoissonMF(
    n_components=100,
    max_iter=100,
    smoothness=100,
    random_state=42,
    a=0.3,
    b=0.3,
    c=0.3,
    d=0.3,
)
proportion_better = predictive_check_with_vad(
    model=model,
    train_data=train_sparse,
    vad_data=vad_sparse,
)
print(
    "Proportion of simulated datasets with higher log-likelihood than validation data:",
    proportion_better,
)


# Output of the cell above: ValueError: dimension mismatch