# Aula 10 - Recomendação baseada em sessão - Exercícios

In [1]:
import pandas as pd
import numpy as np

### Leitura do arquivo 2019-Oct-sample.csv (vide Aula 10 - Exemplos) caso não possua o arquivo

In [2]:
subset = pd.read_csv('./2019-Oct-sample.csv')
subset.head()

Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session
0,2019-10-31 06:23:12 UTC,view,1005115,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c
1,2019-10-31 06:23:52 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c
2,2019-10-31 06:25:30 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c
3,2019-10-31 06:26:58 UTC,view,1004858,electronics.smartphone,samsung,00000056-a206-40dd-b174-a072550fa38c
4,2019-10-31 06:28:21 UTC,view,1005104,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c


In [3]:
map_items = {item: idx for idx, item in enumerate(subset.product_id.unique())}
map_sessions = {item: idx for idx, item in enumerate(subset.user_session.unique())}
subset['itemId'] = subset['product_id'].map(map_items)
subset['sessionId'] = subset['user_session'].map(map_sessions)
subset.head()

Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session,itemId,sessionId
0,2019-10-31 06:23:12 UTC,view,1005115,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,0,0
1,2019-10-31 06:23:52 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,1,0
2,2019-10-31 06:25:30 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,1,0
3,2019-10-31 06:26:58 UTC,view,1004858,electronics.smartphone,samsung,00000056-a206-40dd-b174-a072550fa38c,2,0
4,2019-10-31 06:28:21 UTC,view,1005104,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,3,0


In [4]:
n_items = subset['itemId'].max()+1
print('No. items: ', n_items)
n_sessions = subset['sessionId'].max()+1
print('No. sessions: ', n_sessions)

No. items:  42581
No. sessions:  483508


In [5]:
# create a dataset
# remove sessions with less than 2 items
def create_data(df):
    df.sort_values(by=['sessionId', 'event_time'], inplace=True, ignore_index=True)
    sessions, session = [], []
    for index, value in df.iterrows():
        if index != 0:
            if value["sessionId"] == df.at[index-1, "sessionId"]:
                if value["event_type"] == 'view':
                    session.append(value["itemId"])
            else:
                if len(session) > 1:
                    sessions.append((df.at[index-1, "sessionId"], session))
                session = [value["itemId"]]
        else:
            session.append(value["itemId"])
    return sessions

In [6]:
sessions = create_data(subset)

In [7]:
import random

random.shuffle(sessions)
split = len(sessions) * 0.8
train = sessions[:int(split)]
test = sessions[int(split):]
print('No. train sessions: ', len(train))
print('No. test sessions: ', len(test))

No. train sessions:  237531
No. test sessions:  59383


In [8]:
def jaccard(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

In [9]:
actual_session = test[37]
target = actual_session[1][0:-1]
print(actual_session)
print(target)
subset.loc[subset.sessionId==actual_session[0]]

(413016, [1764, 1764, 352, 352])
[1764, 1764, 352]


Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session,itemId,sessionId
1710744,2019-10-23 12:34:00 UTC,view,1004436,electronics.smartphone,samsung,107af9f0-92bc-482a-ab72-14d4787ae1ac,1764,413016
1710745,2019-10-23 12:34:52 UTC,view,1004436,electronics.smartphone,samsung,107af9f0-92bc-482a-ab72-14d4787ae1ac,1764,413016
1710746,2019-10-23 12:35:53 UTC,view,1004766,electronics.smartphone,samsung,107af9f0-92bc-482a-ab72-14d4787ae1ac,352,413016
1710747,2019-10-23 12:36:25 UTC,view,1004766,electronics.smartphone,samsung,107af9f0-92bc-482a-ab72-14d4787ae1ac,352,413016


In [10]:
def compute_score(train, target, itemId):
    candidate_sessions = []
    for s in range(len(train)):
        if itemId in train[s][1]:
            candidate_sessions.append(train[s][1])
    
    score = 0
    for n in range(len(candidate_sessions)):
        score += jaccard(candidate_sessions[n], target)
    
    return score


In [11]:
categories = subset.loc[subset.sessionId==actual_session[0]]['category_code'].unique().tolist()
candidate_items = subset.loc[subset.category_code.isin(categories)]['itemId'].unique().tolist()
len(candidate_items)

1190

In [12]:
ranking = []
for i in range(len(candidate_items)):
    ranking.append((compute_score(train, target, candidate_items[i]), candidate_items[i]))

ranking.sort()
ranking.reverse()
print(ranking[0:10])

[(185.11428563853138, 352), (78.27063199228311, 1764), (57.242783767755355, 136), (32.0487716897276, 6), (26.532767825253533, 18), (24.152746201712205, 153), (22.408210966996066, 147), (21.613262807707365, 17), (17.4373353654766, 222), (15.996059381043912, 79)]


In [13]:
subset.loc[subset.itemId==21083]

Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session,itemId,sessionId
262378,2019-10-28 11:56:34 UTC,view,21408343,electronics.clocks,romanson,02880533-d7ce-45bc-bc70-5d348203de11,21083,63162
329712,2019-10-11 15:52:48 UTC,view,21408343,electronics.clocks,romanson,032e7ca0-c78a-4167-b40b-e695e86a2f03,21083,79442
361345,2019-10-08 18:25:15 UTC,view,21408343,electronics.clocks,romanson,037aa64f-9798-4a10-b692-6d60d757ba6b,21083,87016
493880,2019-10-30 13:46:25 UTC,view,21408343,electronics.clocks,romanson,04c22360-b6c7-47bb-af50-501b85ce3399,21083,119192
657797,2019-10-08 17:07:25 UTC,view,21408343,electronics.clocks,romanson,0658adfb-8f43-4869-8f68-f11e16a40e45,21083,158819
667941,2019-10-09 07:40:36 UTC,view,21408343,electronics.clocks,romanson,06725817-89ff-4b8f-8c1a-adda02bd13a8,21083,161355
714952,2019-10-15 16:32:32 UTC,view,21408343,electronics.clocks,romanson,06e60861-aecc-4544-866b-a01499926aae,21083,172626
714953,2019-10-15 16:33:16 UTC,view,21408343,electronics.clocks,romanson,06e60861-aecc-4544-866b-a01499926aae,21083,172626
909150,2019-10-19 01:05:03 UTC,view,21408343,electronics.clocks,romanson,08c3c7ba-3874-45a7-a836-5952a02ed413,21083,219468
1055780,2019-10-07 10:35:35 UTC,view,21408343,electronics.clocks,romanson,0a3039ef-a7d3-4101-959b-30003cb161d4,21083,255199


## Para essa aula, você poderá escolher um dentre os três exercícios abaixo para resolver.

***Exercício 01:*** A função compute_score() definida acima e explicada na aula, é a implementação do algoritmo Session-based KNN (S-KNN). Implemente uma variação da função que represente o algoritmo Sequential Session-based KNN (S-SKNN). Compare o desempenho de ambas as funções na recomendação do último item de uma sessão qualquer do conjunto de teste. Para fazer essa comparação, utilize a métrica Average Precision. 

In [14]:
# TODO

***Exercício 02:*** Implemente outra variação da função compute_score() que represente o algoritmo Sequential Filter Session-based KNN (SF-SKNN). Compare o desempenho desse algoritmo com as demais abordagens para uma sessão qualquer do conjunto de teste.

In [15]:
# TODO

***Exercício 03:*** Na aula utilizamos uma estratégia trivial para selecionar itens candidatos para poder calcular seu escore: selecionamos apenas itens da mesma categoria que os itens que estão na sessão atual. Isso pode ser um problema, pois numa sessão, um usuário pode estar visualizando um produto e o sistema poderia recomendar um produto de outra categoria (exemplo: usuário visualiza/compra um smartphone, e o sistema recomenda uma capa protetora). Pense e implemente uma estratégia para selecionar os itens candidatos para os quais será calculado o escore via função compute_score(). Lembre-se de que quanto menor a quantidade de itens candidatos, mais rápido o sistema irá gerar a recomendação top N. Explique sua estratégia.

### A estratégia abordada foi de unir dois conjuntos de recomendação, o primeiro contendo os itens já recomendados normalmente e o segundo os itens mais populares das 3 categorias mais próximas da mais frequente.

In [16]:
from collections import defaultdict
from itertools import combinations

Construir um mapa de afinidade baseado na métrica de Jaccard

In [22]:
# Agrupar o dataset completo por sessão para obter uma lista de categorias únicas por sessão
session_categories = subset.groupby('category_code')['sessionId'].apply(set).to_dict()

affinity_scores = {}
all_categories = list(session_categories.keys())

# Calcular a similaridade de Jaccard para cada par único de categorias
for cat1, cat2 in combinations(all_categories, 2):
    # Obter os conjuntos de sessões para cada categoria
    sessions1 = session_categories.get(cat1, set())
    sessions2 = session_categories.get(cat2, set())
    
    # Calcular o score de Jaccard
    score = jaccard(sessions1, sessions2)
    
    # Guardar o score se for significativo (maior que zero)
    if score > 0:
        if cat1 not in affinity_scores: affinity_scores[cat1] = {}
        if cat2 not in affinity_scores: affinity_scores[cat2] = {}
        affinity_scores[cat1][cat2] = score
        affinity_scores[cat2][cat1] = score
# Cria o mapa de afinidade próprimanete dito
affinity_map = {}
for cat, neighbors in affinity_scores.items():
    # Escolher top 3 vizinhos
    top_neighbors = sorted(neighbors.items(), key=lambda item: item[1], reverse=True)[:3]
    affinity_map[cat] = [neighbor[0] for neighbor in top_neighbors]

In [27]:
example_category = 'appliances.environment.vacuum'
if example_category in affinity_map:
    print(affinity_map[example_category])

['appliances.kitchen.washer', 'appliances.iron', 'appliances.kitchen.refrigerators']


In [34]:
# Calcular a popularidade global de todo os itens 
item_popularity = subset['itemId'].value_counts()

In [41]:
def candidates(actual_session, k=3):
    # Obter a categoria principal da sessão atual
    try:
        main_category = actual_session['category_code'].value_counts().idxmax()
    except ValueError:
        return [], []
    
    # Encontrar categoriar vizinhas
    neighbors = affinity_map.get(main_category, [])
        
    # Mix all the relevant categories
    relevant_categories = [main_category] + (neighbors)
    
    # get the top 3 itens for each of the relevat categories 
    top_k_items = []
    for category in relevant_categories:
        itens_na_categoria = subset[subset['category_code'] == category]['itemId'].unique()
        
        pop_in_cat = item_popularity[item_popularity.index.isin(itens_na_categoria)]
        
        top_k_items.extend(pop_in_cat.head(k).index.tolist())
    
    candidate_items = list(set(top_k_items))
    
    return candidate_items, relevant_categories

Meapear o id do item para o nome

In [45]:
item_details = subset[['itemId', 'category_code', 'brand']].drop_duplicates(subset='itemId').set_index('itemId')
map_id_to_name = {
    itemId: f"{row['brand']} {row['category_code']}" 
    for itemId, row in item_details.fillna('').iterrows()
}

In [None]:
actual_session_df = subset.loc[subset.sessionId == actual_session[0]]

candidate_items_top3, _ = candidates(actual_session_df)

candidate_names = [
    map_id_to_name.get(itemId) 
    for itemId in candidate_items_top3
]

for name in candidate_names:
    print(name)

apple electronics.smartphone
xiaomi electronics.audio.headphone
samsung electronics.smartphone
apple electronics.audio.headphone
samsung electronics.video.tv
samsung electronics.smartphone
apple electronics.clocks
artel electronics.video.tv
samsung electronics.video.tv
apple electronics.audio.headphone
samsung electronics.clocks
xiaomi electronics.clocks


In [47]:
def compute_score(train, target, itemId):
    candidate_sessions = []
    for s in range(len(train)):
        if itemId in train[s][1]:
            candidate_sessions.append(train[s][1])
    
    score = 0
    for n in range(len(candidate_sessions)):
        score += jaccard(candidate_sessions[n], target)
    
    return score

In [48]:
target_session_items = actual_session[1]

ranking = []

for item_id in candidate_items_top3:
    score = compute_score(train, target_session_items, item_id)
    ranking.append((score, item_id))

ranking.sort(key=lambda x: x[0], reverse=True)

top_10_recomendacoes = []
for score, item_id in ranking[:10]:
    nome_do_produto = map_id_to_name.get(item_id, f"ID Desconhecido: {item_id}")
    top_10_recomendacoes.append({'Produto': nome_do_produto, 'Score': score})


recomendacoes_df = pd.DataFrame(top_10_recomendacoes)
display(recomendacoes_df)

Unnamed: 0,Produto,Score
0,samsung electronics.smartphone,50.571144
1,samsung electronics.smartphone,23.9873
2,apple electronics.smartphone,5.458682
3,xiaomi electronics.audio.headphone,0.480648
4,apple electronics.audio.headphone,0.410742
5,apple electronics.clocks,0.276149
6,apple electronics.audio.headphone,0.25811
7,samsung electronics.video.tv,0.231602
8,samsung electronics.clocks,0.175223
9,samsung electronics.video.tv,0.132009
