*UE Learning from User-generated Data, CP MMS, JKU Linz 2023*
# Challenge



In [87]:
import numpy as np
import pandas as pd
import random as rnd
import torch
from torch import nn, optim
from tqdm import tqdm
from rec import *

## Task 

Load the users, items and both the train interactions and test interactions
from the **new version of the lfm dataset** provided

In [217]:
def read(dataset, file):
    return pd.read_csv(dataset + '.' + file, sep='\t')

users = read("lfm-challenge", 'user') #id, country, age, gender, registration date
#print(users)
items = read("lfm-challenge", 'item') #id, artist, track, country of origin
train_inters = read("lfm-challenge", 'inter_train') # user id, item id, listening events
test_inters = read("lfm-challenge", 'inter_test') #user id, item id, listening events
train_test = pd.concat([train_inters, test_inters])

# print(train_inters)
# print(test_inters)
# print(train_test)
train_interaction_matrix = inter_matr_implicit(users=users, items=items, interactions=train_test,
                                               dataset_name="lfm-tiny")
# with np.printoptions(threshold=np.inf):
#     print(train_interaction_matrix)
test_interaction_matrix = inter_matr_implicit(users=users, items=items, interactions=test_inters,
                                              dataset_name="lfm-tiny")


In [218]:
def read_values_from_file(file_path): #  функция берет ids of test users 
    values = []
    k = 0
    try:
        with open(file_path, 'r') as file:
            for line in file:
                line = line.strip()  # Remove leading/trailing whitespaces
                if line:  # Skip empty lines
                    values.append(int(line))
                k+=1
                # if k==10:
                #     break
    except IOError:
        print("Error: Unable to read the file.")
    return values
print(read_values_from_file('test_indices.txt'))
indices = read_values_from_file('test_indices.txt')
#indices = range(0,1194)

[4251, 9092, 6483, 4517, 4353, 7505, 1504, 3152, 1606, 6897, 1771, 4815, 4173, 7909, 3592, 6689, 8063, 1954, 8530, 9346, 2202, 8896, 8598, 1247, 1572, 8070, 7687, 1849, 7330, 1367, 2340, 5343, 6779, 5069, 5256, 2810, 8733, 7546, 6189, 7438, 900, 8155, 5282, 3762, 4289, 4618, 6097, 657, 7312, 2211, 6274, 8691, 6594, 3554, 4318, 4493, 8899, 4947, 7072, 7183, 3882, 3577, 7421, 744, 6172, 6617, 7970, 836, 2684, 956, 9359, 7305, 427, 8231, 6749, 4235, 8257, 1895, 3522, 620, 919, 4820, 9368, 4227, 3030, 7789, 6084, 6538, 4554, 7514, 3982, 3426, 7198, 1272, 6637, 3195, 870, 5462, 4347, 8060]


In [219]:
# with np.printoptions(threshold=np.inf):
#     print(train_interaction_matrix)
display(np.shape(train_interaction_matrix))

(9699, 10000)

In [231]:
def recTopKPop(inter_matr: np.array,
               user: int,
               top_k: int,
               users: pd.DataFrame) -> np.array:
    '''
    inter_matr - np.array from the task 1;
    user - user_id, integer;
    top_k - expected length of the resulting list;
    users: pandas DataFrame consisting of user information for all users, requires a "country" column;


    returns - list/array of top K popular items that the user has never seen
              (sorted in the order of descending popularity);
    '''

    top_pop = None

    # global item-popularity distribution:
    item_pop = inter_matr.sum(axis=0)

    # finding items seen by the user, basicaly indices of non-zero elements ...
    # ... in the interaction array corresponding to the user:
    items_seen = np.nonzero(inter_matr[user])

    # force values seen by the user to become 'unpopular'
    item_pop[items_seen] = 0
    
    # # Filter users based on the same country as the user
    # country = users.loc[user, "country"]
    # Filter users based on the same country, age, and gender as the user
    country = users.loc[user, "country"]
    age = users.loc[user, "age_at_registration"]
    gender = users.loc[user, "gender"]
    
    # Extract year and month from registration_date
    registration_date = pd.to_datetime(users.loc[user, "registration_date"])
    registration_year_month = f"{registration_date.year}-{registration_date.month:02d}"

    # registration_year = pd.to_datetime(users.loc[user, "registration_date"]).year
    # same_country_users = users[users["country"] == country] 
    
    # same_group_users = users[(users["country"] == country) & (users["age_at_registration"] == age) & (users["gender"] == gender)]

    #same_group_users = users[(users["country"] == country) & (users["age_at_registration"] == age) & (pd.to_datetime(users["registration_date"]).dt.year == registration_year)]
    same_group_users = users[(users["country"] == country) & (users["age_at_registration"] == age) & (users["gender"] == gender) & (pd.to_datetime(users["registration_date"]).dt.strftime("%Y-%m") == registration_year_month)]

    # Exclude items seen by users from different countries
    # for idx, row in same_country_users.iterrows():
    for idx, row in same_group_users.iterrows():
        if idx != user:
            items_seen = np.nonzero(inter_matr[idx])
            item_pop[items_seen] = 0

    top_pop = np.full((top_k,), -1)

    # get indices of top_K (new) popular items
    t_pop = (-item_pop).argsort()[:top_k]
    top_pop[:len(t_pop)] = t_pop

    return top_pop

In [232]:
# Define the output file path
output_file = "rec_k12127091_Ibragimov_Timur(TRAIN+TEST_countrAgeGenderYearMonth).tsv"

# create a pandas Dataframe with user data that has at least a "country column"
result = []

# Open the file in write mode
with open(output_file, "w") as f:
    for elem in indices:
        top_10 = recTopKPop(inter_matr=train_interaction_matrix, user=elem, top_k=10, users=users)
        result.append(top_10)
        #print(result)
        # print(elem, "top_10: ",top_10)
        #print(type(top_10))
        
        # Write the user ID and recommendations to the file
        f.write(f"{elem}\t{','.join(map(str, top_10))}\n")
    result = np.array(result)
    # print(result, type(result))

In [234]:
output_file = "report_k12127091_Ibragimov_Timur.txt"
with open(output_file, "w") as f:
    f.write(f"a")

# Part  : Evaluation

In [167]:
def get_ndcg_score(predictions: np.ndarray, test_interaction_matrix: np.ndarray, topK=10) -> float:
    """
    predictions - np.ndarray - predictions of the recommendation algorithm for each user.
    test_interaction_matrix - np.ndarray - test interaction matrix for each user.
    topK - int - topK recommendations should be evaluated.
    
    returns - average ndcg score over all users.
    """
    score = None
    
    # TODO: YOUR IMPLEMENTATION.
    ndcg_scores = []
    num_users = predictions.shape[0]

    for i in range(num_users):
        # Get the topK recommendations for the i-th user
        topk_items = predictions[i][:topK]
        
        # Get the held-out items for the i-th user
        held_out_items = np.where(test_interaction_matrix[i] == 1)[0]
        
        # Compute the ideal DCG for the i-th user
        ideal_dcg = 0.0
        for j in range(min(topK, len(held_out_items))):
            if j == 0:
                    ideal_dcg += 1.0
            else:
                ideal_dcg += 1.0 / np.log2(j + 2)
        
        # Compute the actual DCG for the i-th user
        dcg = 0.0
        for j, item in enumerate(topk_items):
            if item in held_out_items:
                if j == 0:
                    dcg += 1.0
                else:
                    dcg += 1.0 / np.log2(j + 2)

        # Compute the nDCG score for the i-th user
        if ideal_dcg > 0:
            ndcg = dcg / ideal_dcg
        else:
            ndcg = 0.0
        
        ndcg_scores.append(ndcg)

    # Compute the average nDCG score over all users
    score = np.mean(ndcg_scores)

    return score

In [233]:
predictions = result
# print(predictions.shape[0])
test_interaction_matrix = test_interaction_matrix

ndcg_score = get_ndcg_score(predictions, test_interaction_matrix, topK=10)

print(ndcg_score)

0.026392208624811064



ndcg_score(recTopKPopByCountry): 0.009580595786269233

ndcg_score(recTopKPopByAgeGenderYear): 0.01116046819893001

ndcg_score(recTopKPopByCountryGender): 0.014961716384681991

ndcg_score(recTopKPopByCountryYear): 0.016949164841607835

ndcg_score(recTopKPopByCountryAgeGender): 0.017774002347438075 

ndcg_score(recTopKPopByCountryGenderYear): 0.01875258045152185

ndcg_score(recTopKPopByCountryAgeGenderYear): 0.022790104969650105 + 0.026392208624811064

ndcg_score(recTopKPopByCountryAgeYear): 0.02304267544108223 + 0.02647045421779115

ndcg_score(recTopKPopByCountryAgeGenderYearMonth): 0.027075819929509005 + 0.026392208624811064