PARTE PFAIR

In [63]:
import numba
from pathlib import Path
import pandas as pd
import pandas.core.groupby
import pandas.core.series
import logging
import sys
import numpy as np
import time
from sortedcontainers import SortedDict
from sklearn.preprocessing import minmax_scale
import math

In [64]:
work_dir = Path('.').resolve()
print(work_dir)
data_file = work_dir / 'results' / 'AMBAR' / 'NEUMF'
print(data_file)

M:\Framework\C-Fairness-RecSys\Provider\AMBAR-35d26542c99acb47dbf0e8e08d6d97b19b27c6b9
M:\Framework\C-Fairness-RecSys\Provider\AMBAR-35d26542c99acb47dbf0e8e08d6d97b19b27c6b9\results\AMBAR\NEUMF


In [65]:
#Preprocessing

logging.basicConfig(format='%(asctime)s -> %(message)s', level=logging.INFO)
logging.info("Loading data...")

#Algoritmo Matrix Factorization
alg = (data_file / 'neumf_default').with_suffix('.csv')
# Carga de datos de un archivo csv con el nombre del algoritmo
# el cual, al parecer no se encuentra en el repositorio
# Presumiblemente el csv contenia los datos de las recomendaciones por el algoritmo
df_rec = pd.read_csv(alg, usecols=['user_id', 'item_id', 'score'])

#Carga de datos de un archivo csv con el nombre de las pistas
df_items = pd.read_csv(work_dir / 'data' / 'PFair'/ 'tracks.csv')

#Merge de los csv de las recomendaciones del algoritmo con las pistas con "leftjoin"
df_merge1 = pd.merge(df_rec, df_items, on='item_id', how='left')

#Se extraen las columnas de user_id, item, score y group de la tabla df_merge1
data = df_merge1[['user_id', 'item', 'score', 'group']]

#Se renombra la columna item a item_id
data = data.rename(index=str, columns={"item": "item_id"})

#Se define una funcion log2 para calcular el logaritmo base 2 de un numero
def log2(n):
    return math.log10(n) / math.log10(2)

#Se define una funcion get_exposure para calcular la exposicion de una pista en base a su posicion
def get_exposure(position):
    return 1 / log2(1 + position)

#Se crea una nueva columna id para identificar cada pista
data['id'] = range(len(data))

#Se crea una nueva columna position para identificar la posicion de cada pista en la lista de recomendaciones
data['position'] = data.groupby('user_id').cumcount() + 1

#Se crea una nueva columna exposure para identificar la exposicion de cada pista en la lista de recomendaciones
data['exposure'] = data['position'].apply(get_exposure)

#Se crea una nueva tabla final con las columnas id, user_id, item_id, score, exposure, group y position 
final = data[['id','user_id', 'item_id', 'score', 'exposure', 'group', 'position']]

print(final)
#final.to_csv(alg+'_R.csv', index=False)

2025-01-06 02:24:41,572 -> Loading data...


              id    user_id  item_id     score  exposure          group  \
0              0     484751     5491  0.142756  1.000000         Europe   
1              1     484751    16921  0.142756  0.630930  North America   
2              2     484751    43818  0.142756  0.500000  North America   
3              3     484751     9841  0.142756  0.430677  North America   
4              4     484751    38703  0.142756  0.386853         Europe   
...          ...        ...      ...       ...       ...            ...   
1287995  1287995  999839889    45740  0.142756  0.100387  North America   
1287996  1287996  999839889    33102  0.142756  0.100372         Europe   
1287997  1287997  999839889     5496  0.142756  0.100358  North America   
1287998  1287998  999839889    31856  0.142756  0.100343         Europe   
1287999  1287999  999839889    21834  0.142756  0.100329  North America   

         position  
0               1  
1               2  
2               3  
3               4  

In [66]:
#VISIBILITY

#final = pd.read_csv(alg+'_R.csv')
amount = len(final)
logging.info("{} uploaded records".format(amount))

users = final.to_numpy()
items = df_items.to_numpy()

# Normalize rating (0.0-1.0 range)
users[:,3] = minmax_scale(users[:,3])

2025-01-06 02:24:43,378 -> 1288000 uploaded records


In [67]:
def get_continent_id(continent):
    continent_ids = {"Africa": 1, "Asia": 2, "Europe": 3,"North America": 4,
                     "Oceania": 5, "South America": 6}

    if continent in continent_ids:
        return continent_ids[continent]
    else:
        # If there are multiple continents for this item, we split them into a list
        if isinstance(continent, str):
            continents = continent.split("|")
            result = ""
            # Then we calculate an ID for each continent in the list and append them to a single value
            # For example, Africa|Europe|Oceania will become '135'
            for c in continents:
                result += str(get_continent_id(c))
            return int(result)
        else:
            return 0

# Number of items per continent
continent_dict = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}

topk = 10
for i in range(len(users)):
    # We calculate numerical value for each continent ID
    item = int(users[i][2]-1)
    continent_name = items[item][-1]
    users[i][-2] = get_continent_id(continent_name)

    # Then we count the number of occurences for each item by continent
    if users[i][-1] <= topk:
        group = int(users[i][-2])
        continents = [int(d) for d in str(group)]
        for c in continents:
            if c in continent_dict:
                continent_dict[c] += 1
    i += 1

In [68]:
# Leer split de entrenamiento y asignar columnas relacionadas al usuario
# ------------------------------ANTES SE LLAMABA train.csv-------------------------------
df_train = pd.read_csv( data_file / 'train_set.csv', usecols=['user', 'item', 'rating'] )
# Asignar columnas relacionadas a items
df_items.columns=['id','item', 'artist_id', 'group']

df_merge2 = pd.merge(df_train, df_items, on='item')
train = df_merge2[['user', 'id', 'rating', 'group']]
train = train.rename(index=str, columns={"id": "item"})
print(train)
#train.to_csv('train_R.csv', index=False)

            user   item  rating          group
0      133473995   1416       1         Europe
1      144231082   1416       2         Europe
2      175884756   1416       2         Europe
3      594791079   1416       2         Europe
4      113089606   1416       1         Europe
...          ...    ...     ...            ...
74511   77356380  17324       1           Asia
74512   77356380  17328       1           Asia
74513   77356380  17327       1           Asia
74514  535146439   5951       3  North America
74515  447270669  14347       1  North America

[74516 rows x 4 columns]


In [69]:
counter = {}

def count_groups(groups):
    global counter

    groups = str(groups).split("|")
    for g in groups:
        if g not in counter:
            counter[g] = 0
        counter[g] = counter[g] + 1

train["group"] = train["group"].apply(count_groups)

keys = counter.keys()
sorted_keys = sorted(keys)

sorted_counter = {}
for key in sorted_keys:
    sorted_counter[key] = counter[key]

print(sorted_counter)

total_t = sum(sorted_counter.values())

target = [g / total_t for g in sorted_counter.values()]

print (target)

{'Africa': 85, 'Asia': 5549, 'Europe': 13975, 'North America': 49714, 'Oceania': 2608, 'South America': 2585}
[0.0011406946159214129, 0.0744672285146814, 0.18754361479413817, 0.6671587310107896, 0.034999194803800524, 0.03469053626066885]


In [70]:
total = sum(continent_dict.values())
proportions = [c / total for c in continent_dict.values()]
initial_proportions = proportions.copy()
logging.info("Initial proportions: {}".format(initial_proportions))

target_proportions = target.copy()
logging.info("Target proportions: {}".format(target_proportions))

# The difference between target proportions and initial ones
# If the number is negative, we want to swap item out of Topk
# If the number is positive, we want to swap item into Topk
proportions_delta = np.array(target_proportions) - np.array(proportions)
logging.info("Proportions delta: {}".format(proportions_delta))

2025-01-06 02:24:45,612 -> Initial proportions: [0.0, 0.0, 0.3, 0.5, 0.1, 0.1]
2025-01-06 02:24:45,613 -> Target proportions: [0.0011406946159214129, 0.0744672285146814, 0.18754361479413817, 0.6671587310107896, 0.034999194803800524, 0.03469053626066885]
2025-01-06 02:24:45,614 -> Proportions delta: [ 0.00114069  0.07446723 -0.11245639  0.16715873 -0.06500081 -0.06530946]


In [71]:
def check_underrepresented(group):
    if group > len(proportions_delta):
        # If the item's group contains more than one continent, we need to split it
        continents = [int(d) for d in str(group)]
        sum_deltas = 0
        # To determine whether we need to swap, we calculate the sum of deltas for present continents
        # If the total is higher than 0, then the item is underrepresented and swap is beneficial
        for c in continents:
            sum_deltas += proportions_delta[c-1]
        return sum_deltas >= 0
    else:
        return proportions_delta[group-1] >= 0
# ORIGINAL TENIA 1000
topn = 100
def precompute(numpy_data, num_users):
    global users

    for i in range(num_users):
        s1 = [] # List of candidates for swapping out
        s2 = [] # List of candidates for swapping in

        for j in range(topn):
            index = i*topn+j # Current item index
            group = int(users[index][-2]) # Current item group

            is_underrepresented = check_underrepresented(group)

            if (users[index][-1] <= topk and not is_underrepresented):
                s1.append(users[index])
            elif (users[index][-1] > topk and is_underrepresented):
                s2.append(users[index])

        k = 0

        while len(s2) >= 1:
            for n in range(len(s1)-1, -1, -1):
                item1 = s1[n]
                item2 = s2[0]

                # Table reference: id, id_user, id_item, score, exposure, group, position
                loss = item1[3] - item2[3]

                # We save information about the swap into the numpy array
                numpy_data[i*topn*topk+k][0] = i
                numpy_data[i*topn*topk+k][1] = users[index][1]
                numpy_data[i*topn*topk+k][2] = item1[0]
                numpy_data[i*topn*topk+k][3] = item2[0]
                numpy_data[i*topn*topk+k][4] = loss

                k += 1

            del s2[0]

    return numpy_data

In [72]:
logging.info("Calculating initial losses...")

start = time.time()

num_users = int(len(users) / topn)
numpy_data = precompute(np.zeros([len(users)*10, 5]), num_users)

end = time.time()
logging.info('Elapsed: {}'.format(end - start))

numpy_data = numpy_data[numpy_data[:,1] != 0] # Remove empty rows
numpy_data = numpy_data[numpy_data[:,4].argsort()] # Sort by loss

logging.info('Possible number of swaps: {}'.format(len(numpy_data)))

2025-01-06 02:24:45,643 -> Calculating initial losses...
2025-01-06 02:24:48,260 -> Elapsed: 2.616241931915283
2025-01-06 02:24:48,451 -> Possible number of swaps: 386400


In [73]:
def update_proportions(group, is_underrepresented):
    global continent_dict

    continents = [int(d) for d in str(group)]
    value = 1 if is_underrepresented else -1
    for c in continents:
        continent_dict[c] += value

In [74]:
def rerank():
    global numpy_data, num_users, users, proportions, proportions_delta

    loss_total = 0.0
    i = 0
    completed_swaps = 0

    users_length = len(users)
    logging.info(f"Tamaño de users: {users_length}")

    while(i < len(numpy_data)):
        item_1 = int(numpy_data[i][2]-1) # Item to swap out
        item_2 = int(numpy_data[i][3]-1) # Item to swap in

        group_1 = int(users[item_1][-2])
        group_2 = int(users[item_2][-2])
        
        logging.debug(f"Intercambio {i}: item_1={item_1}, item_2={item_2}")
        
        if (check_underrepresented(group_1) == False and check_underrepresented(group_2) == True):
            # Preserve the original position values
            position_aux = users[item_1][-1]
            users[item_1][-1] = users[item_2][-1]
            users[item_2][-1] = position_aux

            # Preserve the original indexes
            index_aux = users[item_1][0]
            users[item_1][0] = users[item_2][0]
            users[item_2][0] = index_aux

            # Calculate loss when performing this swap
            loss = users[item_1][3] - users[item_2][3]

            # Swap the items
            users[[item_1, item_2]] = users[[item_2, item_1]]
            loss_total += loss

            # We recompute the current proportions and their delta
            update_proportions(group_1, False)
            update_proportions(group_2, True)
            total = sum(continent_dict.values())
            proportions = [c / total for c in continent_dict.values()]
            proportions_delta = np.array(target_proportions) - np.array(proportions)

            completed_swaps += 1

        i += 1

    logging.info('Completed swaps: {}'.format(completed_swaps))
    logging.info('Total iterations: {}'.format(i))
    return loss_total, users

In [75]:
start = time.time()

loss_total, result = rerank()

end = time.time()

logging.info('Elapsed: {}'.format(end - start))

continent_dict = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}

for i in range(len(result)):
    # Then we count the number of occurences for each item by continent
    if result[i][-1] <= topk:
        group = int(result[i][-2])
        continents = [int(d) for d in str(group)]
        for c in continents:
            if c in continent_dict:
                continent_dict[c] += 1
    i += 1

total = sum(continent_dict.values())
proportions = [c / total for c in continent_dict.values()]

logging.info("Initial proportions: {}".format(initial_proportions))
logging.info("Current proportions: {}".format(proportions))
logging.info("Target proportions: {}".format(target_proportions))
original_delta = np.array(target_proportions) - np.array(initial_proportions)
proportions_delta = np.array(target_proportions) - np.array(proportions)
logging.info("Original delta: {}".format(original_delta))
logging.info("Proportions delta: {}".format(proportions_delta))
logging.info("Total loss: {}".format(loss_total))

df_vis = pd.DataFrame(data=result, columns=['id', 'user_id', 'item_id', 'score', 'exposure', 'group', 'position'])
df_vis = df_vis.astype({'id': 'Int64', 'user_id': 'Int64', 'item_id': 'Int64', 'group': 'Int64', 'position': 'Int64'})
df_vis_final = df_vis.loc[(df_vis['position'] <= topk)]

df_vis_final = df_vis_final.sort_values(by=['user_id', 'position'], ascending=[True, True])
print(df_vis_final)

#df_vis_final.to_csv(alg+'_V.csv', index=False, header = True)
logging.info('Ended process.')

2025-01-06 02:24:48,508 -> Tamaño de users: 1288000
2025-01-06 02:24:50,103 -> Completed swaps: 1449
2025-01-06 02:24:50,104 -> Total iterations: 386400
2025-01-06 02:24:50,105 -> Elapsed: 1.596999168395996
2025-01-06 02:24:50,490 -> Initial proportions: [0.0, 0.0, 0.3, 0.5, 0.1, 0.1]
2025-01-06 02:24:50,491 -> Current proportions: [0.0, 0.01420807453416149, 0.1875, 0.5982919254658385, 0.1, 0.1]
2025-01-06 02:24:50,491 -> Target proportions: [0.0011406946159214129, 0.0744672285146814, 0.18754361479413817, 0.6671587310107896, 0.034999194803800524, 0.03469053626066885]
2025-01-06 02:24:50,492 -> Original delta: [ 0.00114069  0.07446723 -0.11245639  0.16715873 -0.06500081 -0.06530946]
2025-01-06 02:24:50,493 -> Proportions delta: [ 1.14069462e-03  6.02591540e-02  4.36147941e-05  6.88668055e-02
 -6.50008052e-02 -6.53094637e-02]
2025-01-06 02:24:50,493 -> Total loss: 0.0
2025-01-06 02:24:51,264 -> Ended process.


              id    user_id  item_id score  exposure  group  position
0              0     484751     5491   0.0       1.0      6         1
1              1     484751    16921   0.0   0.63093      4         2
2              2     484751    43818   0.0       0.5      4         3
3              3     484751     9841   0.0  0.430677      4         4
4              4     484751    38703   0.0  0.386853      3         5
...          ...        ...      ...   ...       ...    ...       ...
1287005  1287005  999839889     6212   0.0  0.356207      4         6
1287006  1287006  999839889       19   0.0  0.333333      4         7
1287007  1287007  999839889     5974   0.0  0.168613      4         8
1287008  1287008  999839889     9040   0.0  0.169294      4         9
1287009  1287009  999839889    22209   0.0  0.289065      5        10

[12880 rows x 7 columns]


In [76]:
#EXPOSURE
#df_vis_final = pd.read_csv(alg+'_V.csv')

amount = len(df_vis_final)
logging.info("{} uploaded records".format(amount))

users = df_vis_final.to_numpy()
items = df_items.to_numpy()

2025-01-06 02:24:51,274 -> 12880 uploaded records


In [77]:
def get_continent_id(continent):
    continent_ids = {"Africa": 1, "Asia": 2, "Europe": 3,"North America": 4, "Oceania": 5, "South America": 6}
    
    if continent in continent_ids:
        return continent_ids[continent]
    else:
        # If there are multiple continents for this item, we split them into a list
        if isinstance(continent, str):
            continents = continent.split("|")
            result = ""
            # Then we calculate an ID for each continent in the list and append them to a single value
            # For example, Africa|Europe|Oceania will become '135'
            for c in continents:
                result += str(get_continent_id(c))
            return int(result)
        else:
            return 0

# Number of items per continent
continent_dict = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}
  
for i in range(len(users)):
    # We calculate numerical value for each continent ID
    item = int(users[i][2]-1)
    continent_name = items[item][-1]
    users[i][-2] = get_continent_id(continent_name)
    
    # Then we count the total exposure for each item by continent 
    if users[i][-1] <= topk:
        group = int(users[i][-2])
        continents = [int(d) for d in str(group)]
        for c in continents:
            if c in continent_dict:
                continent_dict[c] += users[i][4]
                
    i += 1

In [78]:
total = sum(continent_dict.values())
proportions = [c / total for c in continent_dict.values()]
initial_proportions = proportions.copy()
logging.info("Initial proportions: {}".format(initial_proportions))

target_proportions = target.copy()
logging.info("Target proportions: {}".format(target_proportions))

# The difference between target proportions and initial ones
# If the number is negative, we want to swap item out of Topk
# If the number is positive, we want to swap item into Topk
proportions_delta = np.array(target_proportions) - np.array(proportions)
logging.info("Proportions delta: {}".format(proportions_delta))

2025-01-06 02:24:51,384 -> Initial proportions: [0.0, 0.005448640889873883, 0.1490901471614006, 0.5527499622133702, 0.06563869003216116, 0.227072559703194]
2025-01-06 02:24:51,385 -> Target proportions: [0.0011406946159214129, 0.0744672285146814, 0.18754361479413817, 0.6671587310107896, 0.034999194803800524, 0.03469053626066885]
2025-01-06 02:24:51,385 -> Proportions delta: [ 0.00114069  0.06901859  0.03845347  0.11440877 -0.0306395  -0.19238202]


In [79]:
# We determine if the current item's group is underrepresented or overrepresented
def check_underrepresented(group):
    if group > len(proportions_delta):
        # If the item's group contains more than one continent, we need to split it
        continents = [int(d) for d in str(group)]
        sum_deltas = 0
        # To determine whether we need to swap, we calculate the sum of deltas for present continents
        # If the total is higher than 0, then the item is underrepresented and swap is beneficial
        for c in continents:
            sum_deltas += proportions_delta[c-1]
        return sum_deltas >= 0
    else:
        return proportions_delta[group-1] >= 0

# Funcion para preparar los datos para un proceso de intercambio
# de elementos de la lista de recomendaciones, basado en la representacion de los grupos
def precompute(numpy_data, num_users):
    global users
    
    for i in range(num_users):
        s1 = [] # List of candidates for swapping out
        s2 = [] # List of candidates for swapping in
        
        order = {} # Dictionary for saving the order of items for the current user
        
        #s1
        for j in range(topk):
            index = i*topk+j # Current item index
            item_id = int(users[index][0]) # Current item ID
            group = int(users[index][-2]) # Current item group
            
            is_underrepresented = check_underrepresented(group)
            
            if not is_underrepresented:
                s1.append(users[index])
            
            order[item_id] = group
        
        m = 0
        
        #s2
        for n in range(len(s1)-1, -1, -1):
            for j in range(topk):
                index = i*topk+j # Current item index
                item_id = int(users[index][0]) # Current item ID
                group = int(users[index][-2]) # Current item group
                item1 = s1[n]
                
                is_underrepresented = check_underrepresented(group)
                
                if (is_underrepresented and item_id > s1[n][0]):
                    s2.append([item_id, users[index][1], users[index][2], users[index][3], 
                               users[index][4], order[item_id], users[index][6]])
            
            k = 0
            
            while len(s2) >= 1:
                item1 = s1[n]
                acc_count = 0
                for item in s2:
                    item2 = item
                    #count = max(item1[3] - item2[3], 0) + acc_count
                    count = abs(item1[3] - item2[3]) + acc_count
                    numpy_data[i*1000+k+m] = [i, users[index][1], item1[0], item2[0], count+len(s1)-n]

                    order[int(item1[0])] = int(item2[-2])
                    order[int(item2[0])] = int(item1[-2])

                    item1 = np.array([item2[0], item1[1], item1[2], item1[3], item2[4], item1[5], item2[6]])

                    acc_count += count
                    k += 1
                del s2[0]
            m += k
    
    return numpy_data

In [80]:
logging.info("Calculating initial losses...")

start = time.time()
# Calcular el numero de usuarios
num_users = int(len(users) / topk)
print(num_users)
# Precomputar los datos para un proceso de intercambio
# HICE UN CAMBIO, MULTIPLIQUE POR 1000. ORIGINALMENTE TENIA 100
numpy_data = precompute(np.zeros([len(users)*100, 5]), num_users)

end = time.time()
logging.info('Elapsed: {}'.format(end - start))

numpy_data = numpy_data[numpy_data[:,1] != 0] # Remove empty rows
numpy_data = numpy_data[numpy_data[:,4].argsort()] # Sort by loss

logging.info('Possible number of swaps: {}'.format(len(numpy_data)))

2025-01-06 02:24:51,414 -> Calculating initial losses...


1288


2025-01-06 02:24:51,635 -> Elapsed: 0.2199993133544922
2025-01-06 02:24:51,659 -> Possible number of swaps: 46368


In [81]:
def update_proportions(group, exp, is_underrepresented):
    global continent_dict
    
    continents = [int(d) for d in str(group)]
    value = exp if is_underrepresented else -exp
    for c in continents:
        continent_dict[c] += value

In [82]:
"""def rerank():
    global numpy_data, num_users, users, proportions, proportions_delta
    
    loss_total = 0.0
    i = 0
    completed_swaps = 0
    users_length = len(users)
    logging.info(f"Tamaño de users: {users_length}")

    while(i < len(numpy_data)):       
        item_1 = int(numpy_data[i][0] * 10  + (numpy_data[i][2]-1) % 50) # Item to swap out
        item_2 = int(numpy_data[i][0] * 10  + (numpy_data[i][3]-1) % 50) # Item to swap in
        # CODIGO AGREGADO
        if item_1 >= users_length or item_2 >= users_length:
            logging.error(f"Index out of bounds: item_1={item_1}, item_2={item_2}, users_length={users_length}")
            break
        # CODIGO AGREGADO
        
        group_1 = int(users[item_1][-2])
        group_2 = int(users[item_2][-2])
        
        exp = users[item_1][4] - users[item_2][4]
        #Si el ítem que se quiere remover no está subrepresentado y el
        #  ítem que se quiere agregar sí está subrepresentado, entonces
        #  procede al intercambio.
        if (check_underrepresented(group_1) == False and check_underrepresented(group_2) == True):
            # Preserve the original position values
            position_aux = users[item_1][-1]
            users[item_1][-1] = users[item_2][-1]
            users[item_2][-1] = position_aux
            
            # Preserve the original indexes
            index_aux = users[item_1][0]
            users[item_1][0] = users[item_2][0]
            users[item_2][0] = index_aux
            
            # Preserve the original exposure values
            exp_aux = users[item_1][4]
            users[item_1][4] = users[item_2][4]
            users[item_2][4] = exp_aux

            # Calculate loss when performing this swap
            loss = users[item_1][3] - users[item_2][3]

            # Swap the items
            users[[item_1, item_2]] = users[[item_2, item_1]]
            loss_total += loss

            # We recompute the current proportions and their delta
            update_proportions(group_1, exp, False)
            update_proportions(group_2, exp, True)
            total = sum(continent_dict.values())
            proportions = [c / total for c in continent_dict.values()]
            proportions_delta = np.array(target_proportions) - np.array(proportions)

            completed_swaps += 1
            
        i += 1
        
    logging.info('Completed swaps: {}'.format(completed_swaps))
    logging.info('Total iterations: {}'.format(i))
    return loss_total, users
"""

def rerank():
    global numpy_data, num_users, users, proportions, proportions_delta, completed_swaps
    loss_total = 0.0
    i = 0
    completed_swaps = 0
    users_length = len(users)
    logging.info(f"Tamaño de users: {users_length}")

    while(i < len(numpy_data)):
        # Fix the index calculation
        user_offset = int(numpy_data[i][0]) * topk  # Use topk instead of 10
        item_1_offset = int(numpy_data[i][2]-1) % topk
        item_2_offset = int(numpy_data[i][3]-1) % topk
        
        item_1 = user_offset + item_1_offset
        item_2 = user_offset + item_2_offset

        # Validate indices before proceeding
        if item_1 >= users_length or item_2 >= users_length:
            i += 1
            continue
            
        group_1 = int(users[item_1][-2])
        group_2 = int(users[item_2][-2])
        
        exp = users[item_1][4] - users[item_2][4]
        
        if (check_underrepresented(group_1) == False and check_underrepresented(group_2) == True):
            # Preserve the original position values
            position_aux = users[item_1][-1]
            users[item_1][-1] = users[item_2][-1]
            users[item_2][-1] = position_aux
            
            # Preserve the original indexes
            index_aux = users[item_1][0]
            users[item_1][0] = users[item_2][0]
            users[item_2][0] = index_aux
            
            # Preserve the original exposure values
            exp_aux = users[item_1][4]
            users[item_1][4] = users[item_2][4]
            users[item_2][4] = exp_aux

            # Calculate loss when performing this swap
            loss = users[item_1][3] - users[item_2][3]

            # Swap the items
            users[[item_1, item_2]] = users[[item_2, item_1]]
            loss_total += loss

            # We recompute the current proportions and their delta
            update_proportions(group_1, exp, False)
            update_proportions(group_2, exp, True)
            total = sum(continent_dict.values())
            proportions = [c / total for c in continent_dict.values()]
            proportions_delta = np.array(target_proportions) - np.array(proportions)

            completed_swaps += 1
            
        i += 1
        
    logging.info('Completed swaps: {}'.format(completed_swaps))
    logging.info('Total iterations: {}'.format(i))
    return loss_total, users


In [83]:
start = time.time()

loss_total, result = rerank()

end = time.time()

logging.info('Elapsed: {}'.format(end - start))

continent_dict = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}

for i in range(len(result)):
    if result[i][-1] <= topk:
        group = int(result[i][-2])
        continents = [int(d) for d in str(group)]
        for c in continents:
            if c in continent_dict:
                continent_dict[c] += result[i][4]
                
    i += 1
    
total = sum(continent_dict.values())
proportions = [c / total for c in continent_dict.values()]

logging.info("Initial proportions: {}".format(initial_proportions))
logging.info("Current proportions: {}".format(proportions))
logging.info("Target proportions: {}".format(target_proportions))
original_delta = np.array(target_proportions) - np.array(initial_proportions)
proportions_delta = np.array(target_proportions) - np.array(proportions)
logging.info("Original delta: {}".format(original_delta))
logging.info("Proportions delta: {}".format(proportions_delta))
logging.info("Total loss: {}".format(loss_total))

df_exp = pd.DataFrame(data=result, columns=['id', 'user_id', 'item_id', 'score', 'exposure', 'group', 'position'])
df_exp = df_exp.astype({'id': 'Int64', 'user_id': 'Int64', 'item_id': 'Int64', 'group': 'Int64', 'position': 'Int64'})
df_exp_final = df_exp.loc[(df_exp['position'] <= topk)]

df_exp_final = df_exp_final.sort_values(by=['user_id', 'position'], ascending=[True, True])
print(df_exp_final)

df_exp_final.to_csv((work_dir / 'neumf_default_exp').with_suffix('.csv'), index=False, header = True)

logging.info('Ended process.')

#Save log result

log_filename = work_dir / f'neumf_default_results.log'
with open(log_filename, 'w') as f:
    f.write("=== PFair Reranking Results ===\n\n")
    f.write(f"Initial proportions: {initial_proportions}\n")
    f.write(f"Current proportions: {proportions}\n")
    f.write(f"Target proportions: {target_proportions}\n")
    f.write(f"Original delta: {original_delta.tolist()}\n")
    f.write(f"Proportions delta: {proportions_delta.tolist()}\n")
    f.write(f"Total loss: {loss_total}\n")
    f.write(f"Completed swaps: {completed_swaps}\n")
    f.write(f"Total iterations: {i}\n")
    f.write(f"Execution time: {end - start} seconds\n")

logging.info(f'Results saved to {log_filename}')

2025-01-06 02:24:51,754 -> Tamaño de users: 12880
2025-01-06 02:24:52,174 -> Completed swaps: 11870
2025-01-06 02:24:52,175 -> Total iterations: 46368
2025-01-06 02:24:52,228 -> Elapsed: 0.47400450706481934
2025-01-06 02:24:52,248 -> Initial proportions: [0.0, 0.005448640889873883, 0.1490901471614006, 0.5527499622133702, 0.06563869003216116, 0.227072559703194]
2025-01-06 02:24:52,249 -> Current proportions: [0.0, 0.008037125816874645, 0.16591429473031452, 0.66719492008533, 0.06100750825974197, 0.09784615110773895]
2025-01-06 02:24:52,250 -> Target proportions: [0.0011406946159214129, 0.0744672285146814, 0.18754361479413817, 0.6671587310107896, 0.034999194803800524, 0.03469053626066885]
2025-01-06 02:24:52,250 -> Original delta: [ 0.00114069  0.06901859  0.03845347  0.11440877 -0.0306395  -0.19238202]
2025-01-06 02:24:52,251 -> Proportions delta: [ 1.14069462e-03  6.64301027e-02  2.16293201e-02 -3.61890745e-05
 -2.60083135e-02 -6.31556148e-02]
2025-01-06 02:24:52,251 -> Total loss: 0.0


            id    user_id  item_id score  exposure  group  position
0            0     484751    16921   0.0       1.0      4         1
1            1     484751    43818   0.0   0.63093      4         2
2            2     484751     9841   0.0       0.5      4         3
3            3     484751     6212   0.0  0.430677      4         4
4            4     484751       19   0.0  0.386853      4         5
...        ...        ...      ...   ...       ...    ...       ...
12875  1287005  999839889     5974   0.0  0.356207      4         6
12876  1287006  999839889     5491   0.0  0.333333      6         7
12877  1287007  999839889    22209   0.0  0.168613      5         8
12878  1287008  999839889     9040   0.0  0.169294      4         9
12879  1287009  999839889     9841   0.0  0.289065      4        10

[12880 rows x 7 columns]
