In [1]:
import sys
import os

# Add '../..' (resolved against the current working directory) to the import
# search path so the `import src` below can be resolved from there.
# NOTE(review): presumably this is the repository root that contains `src` — confirm.
parent_path = '../..'
sys.path.append(parent_path)

import src

In [2]:
import numpy as np
import time

def original_item_mean_calc(interactions_df, items_embeddings):
    """
    Baseline, row-by-row computation of one context vector per interaction.

    Rows are visited in DataFrame order with ``iterrows()``. For each row the
    context is the running sum of the embeddings of the items that the same
    user had in *earlier* rows, divided by ``max(1, count)`` — i.e. the mean of
    the user's previous item embeddings, or a zero vector when the user has no
    earlier rows. The current row's item is added to the running state only
    after its context has been recorded.

    Args:
        interactions_df: DataFrame holding the columns named by
            ``src.COLUMN_USER_ID`` and ``src.COLUMN_ITEM_ID``.
        items_embeddings: 2-D array indexed as ``items_embeddings[item_id]``;
            ``shape[1]`` is used as the embedding length.

    Returns:
        list: One NumPy array of length ``items_embeddings.shape[1]`` per row
        of ``interactions_df``, in row order.
    """
    contexts = []
    # Per-user running state: summed embedding ('acum_emb') and number of
    # items accumulated so far ('count').
    users_current_info = {}
     
    for _, row in interactions_df.iterrows():
        user_id = row[src.COLUMN_USER_ID]
        item_id = row[src.COLUMN_ITEM_ID]

        # Lazily create the state the first time a user is seen.
        if user_id not in users_current_info:
            users_current_info[user_id] = {
                'acum_emb': np.zeros((items_embeddings.shape[1], )),
                'count': 0
            }
        
        # Record the context BEFORE adding the current item; max(1, ...) avoids
        # dividing by zero on a user's first row (the result is the zero vector).
        contexts.append(users_current_info[user_id]['acum_emb'] / max(1, users_current_info[user_id]['count']))

        # The slice keeps at most shape[1] leading entries of the item's row.
        # NOTE(review): for a 2-D embeddings array this is the whole row — confirm the slice is needed.
        users_current_info[user_id]['acum_emb'] += items_embeddings[item_id][:items_embeddings.shape[1]]
        users_current_info[user_id]['count'] += 1
    
    return contexts

In [3]:
import pandas as pd

# Seed NumPy's global RNG so the synthetic data below is the same on every run.
# The order of the random draws that follow determines the generated data.
np.random.seed(42)

# Sizes of the synthetic benchmark.
qnt_interactions = 1_000_000
qnt_items = 1_000
qnt_users = 1_000
embeddings_size = 128

# Random (user, item) pairs: integer ids drawn from [0, qnt_users) and
# [0, qnt_items), one pair per interaction.
interactions_df = pd.DataFrame({
    src.COLUMN_USER_ID: np.random.randint(0, qnt_users, qnt_interactions),
    src.COLUMN_ITEM_ID: np.random.randint(0, qnt_items, qnt_interactions)
})

# Random embedding table with one row of length `embeddings_size` per item,
# so embeddings[item_id] is that item's embedding.
embeddings = np.random.randn(qnt_items, embeddings_size)

display(interactions_df)
print(embeddings)

Unnamed: 0,id_user,id_item
0,102,561
1,435,272
2,860,50
3,270,560
4,106,945
...,...,...
999995,90,939
999996,317,459
999997,478,157
999998,20,581


[[ 0.07625305 -2.40397124  0.88920569 ... -0.61930533  0.04758986
  -1.52609338]
 [-0.61174723 -2.5630066   0.14089729 ...  0.00434078 -1.02284649
  -0.69097982]
 [ 2.50829613 -0.31981701 -0.08647952 ...  0.52865101  0.01792331
  -0.53275675]
 ...
 [-0.53067589  1.14584774  0.64468006 ...  0.80784909 -0.57144674
  -2.11550449]
 [ 0.2180888   1.41055271  0.44752302 ...  1.02060622  0.4953454
   0.5482535 ]
 [ 0.59584657 -0.09841745  0.4816981  ...  0.37223684  1.72974581
  -0.37055014]]


In [4]:
# Time the row-by-row baseline. perf_counter() is a monotonic, high-resolution
# clock intended for measuring elapsed intervals, unlike the wall clock
# returned by time.time(), which can be adjusted while the cell is running.
start_time = time.perf_counter()
result1 = original_item_mean_calc(interactions_df, embeddings)
time_1 = time.perf_counter() - start_time  # elapsed seconds, reused later for the speed-up ratio

print(f'Demorou {time_1} segundos')

Demorou 67.81588625907898 segundos


In [9]:
def new_item_mean_calc_vectorized(interactions_df, items_embeddings):
    """
    Compute, for every interaction, the mean embedding of the items the same
    user interacted with in earlier rows, using NumPy array operations.

    Rows are interpreted in DataFrame order. Interactions are grouped per user
    with a stable sort (so each user's rows keep their original relative
    order), and a cumulative sum is taken once per user instead of once per
    interaction. Sums are accumulated in float64.

    Args:
        interactions_df (pd.DataFrame): Interactions holding the columns named
            by ``src.COLUMN_USER_ID`` and ``src.COLUMN_ITEM_ID``. User ids only
            need to be sortable (they do not have to be dense or 0-based);
            item ids must be valid integer row indices into
            ``items_embeddings``.
        items_embeddings (np.ndarray): 2-D array where
            ``items_embeddings[item_id]`` is the embedding of that item.

    Returns:
        list: One context per interaction, in the original row order. Each
        context is a list of ``items_embeddings.shape[1]`` floats; it is all
        zeros for a user's first interaction. An empty DataFrame yields ``[]``.
    """
    user_ids = interactions_df[src.COLUMN_USER_ID].values
    item_ids = interactions_df[src.COLUMN_ITEM_ID].values
    n_rows = len(user_ids)
    embedding_dim = items_embeddings.shape[1]

    # Pre-filled with zeros: a user's first interaction has no history, so its
    # row is simply never overwritten.
    contexts = np.zeros((n_rows, embedding_dim), dtype=float)
    if n_rows == 0:
        return contexts.tolist()

    # Stable sort keeps each user's interactions in their original order.
    order = np.argsort(user_ids, kind='stable')
    sorted_users = user_ids[order]

    # Positions (in sorted order) where the user id changes delimit the groups.
    group_starts = np.flatnonzero(sorted_users[1:] != sorted_users[:-1]) + 1
    group_bounds = np.concatenate(([0], group_starts, [n_rows]))

    for start, stop in zip(group_bounds[:-1], group_bounds[1:]):
        rows = order[start:stop]  # original row positions of this user's interactions

        # Embeddings of every interaction except the last one: the k-th
        # cumulative sum is the history seen by the user's (k+1)-th interaction.
        history = items_embeddings[item_ids[rows[:-1]]]
        running_sum = np.cumsum(history, axis=0, dtype=float)

        # Number of previous items for the 2nd, 3rd, ... interaction: 1, 2, ...
        seen = np.arange(1, len(rows))[:, None]
        contexts[rows[1:]] = running_sum / seen

    return contexts.tolist()

In [21]:
def new_item_mean_calc(interactions_df, items_embeddings):
    """
    Compute one context vector per interaction: the mean embedding of the
    items the same user interacted with in earlier rows.

    Same sequential algorithm as ``original_item_mean_calc``, but it iterates
    over plain NumPy arrays instead of ``DataFrame.iterrows()`` and keeps the
    per-user running state in preallocated arrays.

    Args:
        interactions_df (pd.DataFrame): Interactions holding the columns named
            by ``src.COLUMN_USER_ID`` and ``src.COLUMN_ITEM_ID``. User ids may
            be any sortable values (sparse, negative or non-0-based ids are
            fine); item ids must be valid row indices into
            ``items_embeddings``.
        items_embeddings (np.ndarray): 2-D array where
            ``items_embeddings[item_id]`` is the embedding of that item.

    Returns:
        list: One NumPy array of length ``items_embeddings.shape[1]`` per row
        of ``interactions_df``, in row order; a zero vector for a user's first
        interaction.
    """
    user_ids = interactions_df[src.COLUMN_USER_ID].values
    item_ids = interactions_df[src.COLUMN_ITEM_ID].values

    # Map every user id to a dense slot in [0, n_users) so the state arrays are
    # sized from the data rather than from a hard-coded user count.
    unique_users, user_slots = np.unique(user_ids, return_inverse=True)
    n_users = len(unique_users)
    embedding_dim = items_embeddings.shape[1]

    acum_embs = np.zeros((n_users, embedding_dim))  # running embedding sum per user
    counts = np.zeros(n_users, dtype=np.int64)      # items accumulated per user

    contexts = []
    for slot, item_id in zip(user_slots, item_ids):
        # Record the context before adding the current item; max(1, ...) avoids
        # dividing by zero on a user's first interaction. The division creates
        # a new array, so later in-place updates do not alter stored contexts.
        contexts.append(acum_embs[slot] / max(1, counts[slot]))

        acum_embs[slot] += items_embeddings[item_id]
        counts[slot] += 1

    return contexts

In [22]:
# Time the array-based implementation with the monotonic, high-resolution
# perf_counter() clock (time.time() is a wall clock and may be adjusted
# while the cell is running).
start_time = time.perf_counter()
result2 = new_item_mean_calc(interactions_df, embeddings)
time_2 = time.perf_counter() - start_time

print(f'Demorou {time_2} ({time_1 / time_2:.2f}x mais rápido) segundos')
print('Resultados iguais ?')
# array_equal requires identical shapes AND identical values; a plain `==`
# would broadcast (or fail) if the two results ever differed in shape.
print(np.array_equal(result1, result2))

Demorou 4.193735837936401 (16.17x mais rápido) segundos
Resultados iguais ?
True
