# Aula 10 - Recomendação baseada em sessão - exemplos

In [1]:
import pandas as pd
import numpy as np

### Leitura do arquivo 2019-Oct-sample.csv

In [2]:
subset = pd.read_csv('./2019-Oct-sample.csv')
subset.head()

Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session
0,2019-10-31 06:23:12 UTC,view,1005115,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c
1,2019-10-31 06:23:52 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c
2,2019-10-31 06:25:30 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c
3,2019-10-31 06:26:58 UTC,view,1004858,electronics.smartphone,samsung,00000056-a206-40dd-b174-a072550fa38c
4,2019-10-31 06:28:21 UTC,view,1005104,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c


In [3]:
map_items = {item: idx for idx, item in enumerate(subset.product_id.unique())}
map_sessions = {item: idx for idx, item in enumerate(subset.user_session.unique())}
subset['itemId'] = subset['product_id'].map(map_items)
subset['sessionId'] = subset['user_session'].map(map_sessions)
subset.head()

Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session,itemId,sessionId
0,2019-10-31 06:23:12 UTC,view,1005115,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,0,0
1,2019-10-31 06:23:52 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,1,0
2,2019-10-31 06:25:30 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,1,0
3,2019-10-31 06:26:58 UTC,view,1004858,electronics.smartphone,samsung,00000056-a206-40dd-b174-a072550fa38c,2,0
4,2019-10-31 06:28:21 UTC,view,1005104,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,3,0


In [4]:
n_items = subset['itemId'].max()+1
print('No. items: ', n_items)
n_sessions = subset['sessionId'].max()+1
print('No. sessions: ', n_sessions)

No. items:  42581
No. sessions:  483508


In [5]:
# create a dataset
# remove sessions with less than 2 items
def create_data(df):
    df.sort_values(by=['sessionId', 'event_time'], inplace=True, ignore_index=True)
    sessions, session = [], []
    for index, value in df.iterrows():
        if index != 0:
            if value["sessionId"] == df.at[index-1, "sessionId"]:
                if value["event_type"] == 'view':
                    session.append(value["itemId"])
            else:
                if len(session) > 1:
                    sessions.append((df.at[index-1, "sessionId"], session))
                session = [value["itemId"]]
        else:
            session.append(value["itemId"])
    return sessions

In [6]:
sessions = create_data(subset)

In [7]:
print('No. sessions: ', len(sessions))
print('Session 1:', sessions[1])
subset.loc[subset.sessionId==1]

No. sessions:  296914
Session 1: (1, [6, 7, 8, 9, 10, 11, 12, 9, 13, 9, 0, 14, 1, 15, 16, 17])


Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session,itemId,sessionId
7,2019-10-06 11:24:45 UTC,view,1004768,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,6,1
8,2019-10-06 11:25:54 UTC,view,1005098,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,7,1
9,2019-10-06 11:25:59 UTC,view,1005073,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,8,1
10,2019-10-06 11:26:39 UTC,view,1004871,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,9,1
11,2019-10-06 11:26:53 UTC,view,1004751,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,10,1
12,2019-10-06 11:27:05 UTC,view,1004653,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,11,1
13,2019-10-06 11:27:24 UTC,view,1005015,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,12,1
14,2019-10-06 11:28:05 UTC,view,1004871,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,9,1
15,2019-10-06 11:28:34 UTC,view,1003527,electronics.smartphone,xiaomi,00000083-8816-4d58-a9b8-f52f54186edc,13,1
16,2019-10-06 11:28:45 UTC,view,1004871,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,9,1


In [8]:
import random

random.shuffle(sessions)
split = len(sessions) * 0.8
train = sessions[:int(split)]
test = sessions[int(split):]
print('No. train sessions: ', len(train))
print('No. test sessions: ', len(test))

No. train sessions:  237531
No. test sessions:  59383


In [9]:
def jaccard(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

In [10]:
actual_session = test[3]
target = actual_session[1][0:-1]
print(actual_session)
print(target)
subset.loc[subset.sessionId==actual_session[0]]

(480724, [1619, 93, 5760, 5760])
[1619, 93, 5760]


Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session,itemId,sessionId
1989117,2019-10-28 18:49:49 UTC,view,3701134,appliances.environment.vacuum,bosch,132f42ee-9b0e-43d3-8478-068da8d438a3,1619,480724
1989118,2019-10-28 18:54:50 UTC,view,3701084,appliances.environment.vacuum,xiaomi,132f42ee-9b0e-43d3-8478-068da8d438a3,93,480724
1989119,2019-10-28 19:04:33 UTC,view,3700828,appliances.environment.vacuum,philips,132f42ee-9b0e-43d3-8478-068da8d438a3,5760,480724
1989120,2019-10-28 19:06:07 UTC,view,3700828,appliances.environment.vacuum,philips,132f42ee-9b0e-43d3-8478-068da8d438a3,5760,480724


In [11]:
def compute_score(train, target, itemId):
    candidate_sessions = []
    for s in range(len(train)):
        if itemId in train[s][1]:
            candidate_sessions.append(train[s][1])
    
    score = 0
    for n in range(len(candidate_sessions)):
        score += jaccard(candidate_sessions[n], target)
    
    return score
    
compute_score(train=train, target=target, itemId=931)

0.0

In [12]:
categories = subset.loc[subset.sessionId==actual_session[0]]['category_code'].unique().tolist()
candidate_items = subset.loc[subset.category_code.isin(categories)]['itemId'].unique().tolist()
candidate_items

[93,
 171,
 175,
 196,
 199,
 231,
 232,
 233,
 234,
 235,
 236,
 237,
 295,
 312,
 512,
 513,
 559,
 645,
 648,
 649,
 650,
 651,
 652,
 653,
 654,
 655,
 656,
 657,
 658,
 710,
 711,
 810,
 908,
 993,
 994,
 1079,
 1080,
 1110,
 1122,
 1196,
 1197,
 1198,
 1199,
 1200,
 1201,
 1202,
 1203,
 1204,
 1205,
 1206,
 1289,
 1290,
 1291,
 1292,
 1293,
 1374,
 1375,
 1376,
 1377,
 1378,
 1379,
 1405,
 1499,
 1527,
 1552,
 1617,
 1618,
 1619,
 1620,
 1621,
 1622,
 1623,
 1657,
 1658,
 1659,
 1660,
 1791,
 1847,
 1937,
 1938,
 1943,
 1944,
 1945,
 2080,
 2101,
 2102,
 2103,
 2193,
 2194,
 2195,
 2196,
 2293,
 2297,
 2298,
 2433,
 2605,
 2667,
 2668,
 2736,
 2737,
 2738,
 3075,
 3163,
 3178,
 3179,
 3180,
 3181,
 3199,
 3200,
 3201,
 3202,
 3216,
 3221,
 3222,
 3223,
 3287,
 3298,
 3321,
 3428,
 3464,
 3465,
 3466,
 3478,
 3479,
 3480,
 3481,
 3482,
 3483,
 3542,
 3566,
 3596,
 3628,
 3876,
 3883,
 3919,
 3940,
 3941,
 3942,
 3943,
 3944,
 3945,
 3946,
 3947,
 3948,
 4093,
 4203,
 4238,
 4260,


In [13]:
ranking = []
for i in range(len(candidate_items)):
    ranking.append((compute_score(train, target, candidate_items[i]), candidate_items[i]))

ranking.sort()
ranking.reverse()
print(ranking[0:10])

[(93.77613906065321, 1619), (54.39865682471801, 93), (20.454496258535112, 810), (15.416207032483264, 1527), (15.354235276112906, 5760), (10.164347268853518, 1657), (9.878098896084811, 196), (8.67006150576479, 295), (7.108629633492461, 312), (7.1006393485986905, 3181)]


In [14]:
subset.loc[subset.itemId==1046]

Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session,itemId,sessionId
2519,2019-10-12 18:16:54 UTC,view,5100719,electronics.clocks,samsung,000729f8-7f6c-43f6-b1dd-aab45a29f5c1,1046,652
5660,2019-10-01 13:27:59 UTC,view,5100719,electronics.clocks,samsung,000fdfe4-e1f0-4a93-9c22-f04066ad895e,1046,1461
5705,2019-10-01 10:07:25 UTC,view,5100719,electronics.clocks,samsung,000ff41f-8d94-449e-86fc-25e0957da685,1046,1470
5706,2019-10-01 10:08:04 UTC,view,5100719,electronics.clocks,samsung,000ff41f-8d94-449e-86fc-25e0957da685,1046,1470
6330,2019-10-12 19:48:00 UTC,view,5100719,electronics.clocks,samsung,00118404-d57f-4480-b917-df839bc6a188,1046,1642
...,...,...,...,...,...,...,...,...
1998579,2019-10-31 03:54:14 UTC,view,5100719,electronics.clocks,samsung,1347912f-0cb1-4086-b174-d7f10f275ffa,1046,483132
1998631,2019-10-11 16:58:51 UTC,view,5100719,electronics.clocks,samsung,1347c04a-6cfd-4cca-925e-60cead2e1931,1046,483151
1998632,2019-10-11 16:59:10 UTC,view,5100719,electronics.clocks,samsung,1347c04a-6cfd-4cca-925e-60cead2e1931,1046,483151
1998633,2019-10-11 17:00:34 UTC,view,5100719,electronics.clocks,samsung,1347c04a-6cfd-4cca-925e-60cead2e1931,1046,483151
