# In [None]:
import os
import numpy as np
import pandas as pd
from hpfrec import HPF
import heapq
import math
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Set random seed for reproducibility
# (seeds NumPy's global random state; the value is also printed for the run log)
randseed = 42
print("Random seed:", randseed)
np.random.seed(randseed)

# Directory that choose_data() joins with the input CSV file names.
dir_ml = 'C:/Users/Sten Stokroos/Desktop/NEW/zelf/Data/out/'

def choose_data(dat, test_size, fold=0, n_folds=5):
    """Load a rating dataset and return train / validation / test frames.

    Parameters
    ----------
    dat : str
        ``'ml2'`` reads the pre-split files ``ml_train2.csv`` and
        ``ml_test2.csv`` from ``dir_ml`` (no validation set is produced).
        ``'ml'`` reads ``ml-1m_full.csv`` from ``dir_ml``, holds out a test
        split and carves one validation fold out of the remaining rows.
    test_size :
        Forwarded to ``train_test_split`` for ``'ml'``; unused for ``'ml2'``.
    fold : int
        Zero-based index of the fold used as validation set (``'ml'`` only).
    n_folds : int
        Number of folds the shuffled training rows are divided into
        (``'ml'`` only).

    Returns
    -------
    tuple
        ``(train, val, test, n_users, n_items)``. ``val`` is ``None`` for
        ``'ml2'``. ``n_users`` / ``n_items`` count the unique ``userId`` /
        ``songId`` values of the training file (``'ml2'``) or of the full
        file (``'ml'``). For an unknown ``dat`` a message is printed and
        five ``None`` values are returned.
    """
    columns = ['userId', 'songId', 'rating']

    if dat == 'ml2':
        train = pd.read_csv(os.path.join(dir_ml, 'ml_train2.csv'), sep="\t", header=None, names=columns, usecols=[0, 1, 2], engine="python")
        test = pd.read_csv(os.path.join(dir_ml, 'ml_test2.csv'), sep="\t", header=None, names=columns, usecols=[0, 1, 2], engine="python")

        n_users = len(train['userId'].unique())
        n_items = len(train['songId'].unique())

        val = None  # No validation set for 'ml2'
    elif dat == 'ml':
        ml_full = pd.read_csv(os.path.join(dir_ml, 'ml-1m_full.csv'), sep="\t", header=None, names=columns, usecols=[0, 1, 2], engine="python")

        n_users = len(ml_full['userId'].unique())
        n_items = len(ml_full['songId'].unique())

        # Split the rating rows (not the users) into train and test sets
        train, test = train_test_split(ml_full, test_size=test_size, random_state=42)

        # Shuffle the training set
        train = train.sample(frac=1, random_state=42).reset_index(drop=True)

        # Create folds for cross-validation. Any remainder rows
        # (len(train) % n_folds) fall outside every fold and therefore
        # always stay in the training part.
        fold_size = len(train) // n_folds
        fold_start = fold * fold_size
        fold_stop = fold_start + fold_size
        val = train.iloc[fold_start:fold_stop]

        # Remaining data is the training set for this fold
        train = pd.concat([train.iloc[:fold_start], train.iloc[fold_stop:]]).reset_index(drop=True)
    else:
        print('Wrong data input')
        return None, None, None, None, None

    # Print the sizes of the datasets
    print(f"Train set size: {train.shape[0]} ratings")
    if val is None:
        # 'ml2' has no validation split; reporting a size would dereference None.
        print("Validation set size: no validation set")
    else:
        print(f"Validation set size: {val.shape[0]} ratings")
    print(f"Test set size: {test.shape[0]} ratings")

    return train, val, test, n_users, n_items

# Dataset key handed to choose_data(); that function recognises 'ml' and 'ml2'.
ml = 'ml'

# Define evaluation functions
def getNDCG(ranklist, gtItem):
    """Return the NDCG contribution of ``gtItem`` within ``ranklist``.

    The score is ``log(2) / log(position + 2)`` for the first zero-based
    position at which ``gtItem`` appears (so 1.0 at the top of the list),
    and 0 when the item is not in the list.
    """
    for position, candidate in enumerate(ranklist):
        if candidate == gtItem:
            return math.log(2) / math.log(position + 2)
    return 0

def eval_one_rating(idx, model, test_ratings, test_negatives, topk):
    """Compute the NDCG@topk of one held-out rating against its negatives.

    Parameters
    ----------
    idx : int
        Position of the rating in ``test_ratings`` and of its negative
        items in ``test_negatives``.
    model :
        Object exposing ``predict(user=..., item=...)``; it is expected to
        return one score per (user, item) pair, in input order.
    test_ratings : sequence
        Entries whose first element is the user and second element the
        ground-truth item.
    test_negatives : sequence
        ``test_negatives[idx]`` holds the negative item ids ranked against
        the ground-truth item.
    topk : int
        Length of the ranked list that is evaluated.

    Returns
    -------
    float or int
        ``getNDCG`` of the ground-truth item in the top-``topk`` list
        (0 when it is not ranked within the top ``topk``).
    """
    rating = test_ratings[idx]
    user = rating[0]
    gtItem = rating[1]

    # Build a fresh candidate list instead of appending to (and later popping
    # from) the caller's list: the shared negatives stay untouched even if
    # the prediction call raises.
    candidates = [*test_negatives[idx], gtItem]

    # Get prediction scores
    users = np.full(len(candidates), user, dtype='int32')
    predictions = model.predict(user=users, item=np.array(candidates))

    # A dict keeps one score per distinct item; if the ground-truth item is
    # also listed as a negative, its later (last) score wins.
    map_item_score = dict(zip(candidates, predictions))

    # Evaluate top rank list
    ranklist = heapq.nlargest(topk, map_item_score, key=map_item_score.get)
    return getNDCG(ranklist, gtItem)

# Experiment settings used by the per-fold loop below in this file.
k_values = [32]   # values passed as HPF(k=...), one model per value
n_folds = 5       # number of cross-validation folds iterated over
test_size = 0.1   # forwarded to choose_data() as the test split size

def load_negative_file(filename):
    """Read a tab-separated negatives file into a list of integer lists.

    Every line yields one entry in the result. The first tab-separated
    field of a line is skipped and the remaining fields are parsed as
    integers. Blank fields (for example the one produced by a trailing tab
    before the newline) are skipped instead of raising. A line with no
    further fields yields an empty list, so the result stays aligned with
    the line numbers of the file.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    list[list[int]]
        One list of negative item ids per line of the file.

    Raises
    ------
    ValueError
        If a non-blank field cannot be parsed as an integer.
    """
    negativeList = []
    with open(filename, "r") as f:
        # Iterating the file object yields each line and stops at EOF.
        for line in f:
            fields = line.split("\t")[1:]
            negativeList.append([int(x) for x in fields if x.strip()])
    return negativeList

# Per-fold experiment: for every fold, train one HPF model per k on binary
# exposure data, report validation log-likelihood and test NDCG@topk, and
# write the predicted user x item exposure matrix to CSV.

# Loop-invariant settings.
topk = 10
column_names = {'userId': 'UserId', 'songId': 'ItemId', 'rating': 'Count'}

for fold in range(n_folds):
    print(f"Processing fold {fold+1}/{n_folds}...")
    train_df, val_df, test_df, num_users, num_items = choose_data(ml, test_size, fold=fold, n_folds=n_folds)

    # Load negative samples for the current fold (same directory as the
    # rating files, so the path is built from dir_ml rather than repeated).
    negative_samples_file = os.path.join(dir_ml, f'ml2_negatives_fold_{fold}.csv')
    test_negatives = load_negative_file(negative_samples_file)

    # Rename columns to 'UserId', 'ItemId', 'Count'
    train_df = train_df.rename(columns=column_names)
    val_df = val_df.rename(columns=column_names)
    test_df = test_df.rename(columns=column_names)

    # Convert ratings to binary exposure data (1 if Count > 0, else 0)
    train_df['Count'] = (train_df['Count'] > 0).astype(int)
    val_df['Count'] = (val_df['Count'] > 0).astype(int)
    test_df['Count'] = (test_df['Count'] > 0).astype(int)

    # Get unique user and item IDs
    all_observed_df = pd.concat([train_df, val_df, test_df])
    user_ids = all_observed_df['UserId'].unique()
    item_ids = all_observed_df['ItemId'].unique()

    test_ratings = test_df[['UserId', 'ItemId']].values.tolist()

    # Negatives are looked up by test-row position during evaluation; fail
    # before the (slow) training instead of with an IndexError afterwards.
    if len(test_negatives) < len(test_ratings):
        raise ValueError(
            f"{negative_samples_file} has {len(test_negatives)} rows but "
            f"{len(test_ratings)} test ratings need negatives"
        )

    for k in k_values:
        print(f"Training HPF model with k={k} for fold {fold+1}...")
        recommender = HPF(
            k=k, a=0.3, a_prime=0.3, b_prime=1.0,
            c=0.3, c_prime=0.3, d_prime=1.0, ncores=-1,
            stop_crit='train-llk', check_every=10, stop_thr=1e-3,
            users_per_batch=None, items_per_batch=None, step_size=lambda x: 1/np.sqrt(x+2),
            maxiter=100, use_float=True, reindex=False, verbose=True,
            random_seed=None, allow_inconsistent_math=False, full_llk=False,
            alloc_full_phi=False, keep_data=True, save_folder=None,
            produce_dicts=True, keep_all_objs=True, sum_exp_trick=False
        )

        # Fit the model to the training data
        recommender.fit(train_df, val_df)

        # Calculate log-likelihood on the validation set
        llk = recommender.eval_llk(val_df)
        print(f"Log-likelihood for k={k}, fold {fold+1}: {llk['llk']}")

        # Evaluate the model using NDCG
        ndcg_scores = [eval_one_rating(i, recommender, test_ratings, test_negatives, topk) for i in tqdm(range(len(test_ratings)))]
        avg_ndcg = np.mean(ndcg_scores)
        print(f"Average NDCG for k={k}, fold {fold+1}: {avg_ndcg}")

        # Initialize an empty matrix for exposures
        # (rows follow user_ids, columns follow item_ids)
        exposure_matrix = np.zeros((len(user_ids), len(item_ids)))

        # Predict the exposure data for all user-item pairs
        for i, user in enumerate(user_ids):
            exposures = recommender.predict(user=[user] * len(item_ids), item=item_ids)
            exposure_matrix[i, :] = exposures

        # Convert the exposure matrix to a DataFrame
        exposure_df = pd.DataFrame(exposure_matrix, index=user_ids, columns=item_ids)

        # Save the exposure matrix to a CSV file with k and fold in the file name.
        # NOTE(review): index and header are dropped, so the file holds only the
        # values; row/column order is that of user_ids / item_ids above.
        output_file = f'C:/Users/Sten Stokroos/Desktop/NEW/zelf/Data/exposure_output/ml_exp_k_{k}_fold_{fold+1}.csv'
        exposure_df.to_csv(output_file, index=False, header=False)
