In [4]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

In [6]:
def _read_jsonl(path):
    """Read a JSON Lines file (one JSON record per line) into a DataFrame."""
    return pd.read_json(path, lines=True)


# Raw inputs: one frame per JSON Lines file.
users = _read_jsonl("../data/raw/users.jsonl")
sessions = _read_jsonl("../data/raw/sessions.jsonl")
products = _read_jsonl("../data/raw/products.jsonl")

In [7]:
# Score given to each session event. Event types that are not keys of this
# mapping come out of .map() as NaN.
# NOTE(review): views and purchases carry the same weight (5) - confirm intended.
event_scores = {'VIEW_PRODUCT': 5, 'BUY_PRODUCT': 5}
sessions['score'] = sessions['event_type'].map(event_scores)
sessions

Unnamed: 0,session_id,timestamp,user_id,product_id,event_type,offered_discount,purchase_id,score
0,124,2021-07-30 00:57:46,102,1040,VIEW_PRODUCT,10,,5
1,125,2021-05-13 04:51:17,102,1297,VIEW_PRODUCT,0,,5
2,126,2021-03-26 17:23:58,102,1288,VIEW_PRODUCT,0,,5
3,126,2021-03-26 17:25:28,102,1151,VIEW_PRODUCT,0,,5
4,126,2021-03-26 17:26:46,102,1055,VIEW_PRODUCT,0,,5
...,...,...,...,...,...,...,...,...
118386,20275,2021-06-07 01:00:33,301,1052,VIEW_PRODUCT,10,,5
118387,20275,2021-06-07 01:00:44,301,1153,VIEW_PRODUCT,10,,5
118388,20275,2021-06-07 01:03:47,301,1054,VIEW_PRODUCT,10,,5
118389,20275,2021-06-07 01:05:02,301,1049,VIEW_PRODUCT,10,,5


In [8]:
# Total score per (user, product) pair, capped at 5 so repeated events on the
# same product do not push the value above a single interaction's score.
pair_scores = (
    sessions
    .groupby(['user_id', 'product_id'])['score']
    .sum()
    .reset_index()
)
pair_scores['score'] = pair_scores['score'].clip(upper=5)

# Pivot to a user x product grid, fill the pairs with no recorded score with 0,
# then melt back to long form so every cell of the grid has an explicit row.
group = (
    pd.pivot_table(pair_scores, values='score', index='user_id', columns='product_id')
    .fillna(0)
    .stack()
    .reset_index()
    .rename(columns={0: 'score'})
)

# Binary target: 1 when the pair has a positive score, otherwise 0.
group['user_view'] = (group['score'] > 0).astype('int64')
group

Unnamed: 0,user_id,product_id,score,user_view
0,102,1001,0.0,0
1,102,1002,5.0,1
2,102,1003,5.0,1
3,102,1004,5.0,1
4,102,1005,5.0,1
...,...,...,...,...
63795,301,1315,5.0,1
63796,301,1316,5.0,1
63797,301,1317,5.0,1
63798,301,1318,5.0,1


In [9]:
# Rescale the raw score into [0, 1]; the scaler expects a 2-D column, hence the reshape.
std = MinMaxScaler(feature_range=(0, 1))
score_column = group['score'].values.reshape(-1, 1)
group['interaction_score'] = std.fit_transform(score_column)

group

Unnamed: 0,user_id,product_id,score,user_view,interaction_score
0,102,1001,0.0,0,0.0
1,102,1002,5.0,1,1.0
2,102,1003,5.0,1,1.0
3,102,1004,5.0,1,1.0
4,102,1005,5.0,1,1.0
...,...,...,...,...,...
63795,301,1315,5.0,1,1.0
63796,301,1316,5.0,1,1.0
63797,301,1317,5.0,1,1.0
63798,301,1318,5.0,1,1.0


In [10]:
def price_bin(price):
    """Map a price to an ordinal bucket index in the range 0-8.

    Buckets are closed on the right: a price equal to an upper bound belongs
    to that bound's bucket. Anything that is not ``<=`` the last bound
    (including values such as NaN, for which every comparison is false)
    falls through to bucket 8.
    """
    upper_bounds = (25, 50, 100, 250, 500, 1000, 2000, 4000)
    for bucket, bound in enumerate(upper_bounds):
        if price <= bound:
            return bucket
    return len(upper_bounds)

In [11]:
def rating_bin(rating):
    """Map a rating to an ordinal bucket index in the range 0-5.

    Bucket edges sit at the half points (0.5, 1.5, ..., 4.5) and are closed on
    the right. Anything that is not ``<=`` 4.5 (including values such as NaN,
    for which every comparison is false) falls through to bucket 5.
    """
    upper_bounds = (0.5, 1.5, 2.5, 3.5, 4.5)
    for bucket, bound in enumerate(upper_bounds):
        if rating <= bound:
            return bucket
    return len(upper_bounds)

In [12]:
# Columns kept after joining the product and user tables onto the interactions.
selected_columns = [
    'user_id', 'product_id', 'product_name', 'category_path',
    'price', 'user_rating', 'score', 'interaction_score', 'user_view',
]

# Left joins keep every (user, product) row of `group`. This cell overwrites
# `group` and replaces price / user_rating with their bin indices, so it is
# meant to run once per kernel session.
group = (
    group
    .merge(products, on="product_id", how="left")
    .merge(users, on="user_id", how="left")
    [selected_columns]
    .assign(
        price=lambda frame: frame['price'].apply(price_bin),
        user_rating=lambda frame: frame['user_rating'].apply(rating_bin),
    )
)
group

Unnamed: 0,user_id,product_id,product_name,category_path,price,user_rating,score,interaction_score,user_view
0,102,1001,Telefon Siemens Gigaset DA310,Telefony i akcesoria;Telefony stacjonarne,2,5,0.0,0.0,0
1,102,1002,Kyocera FS-1135MFP,Komputery;Drukarki i skanery;Biurowe urządzeni...,7,1,5.0,1.0,1
2,102,1003,Kyocera FS-3640MFP,Komputery;Drukarki i skanery;Biurowe urządzeni...,8,4,5.0,1.0,1
3,102,1004,Fallout 3 (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,1,3,5.0,1.0,1
4,102,1005,Szalone Króliki Na żywo i w kolorze (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,1,0,5.0,1.0,1
...,...,...,...,...,...,...,...,...,...
63795,301,1315,Jabra Talk,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,2,0,5.0,1.0,1
63796,301,1316,Plantronics Voyager Legend,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,3,4,5.0,1.0,1
63797,301,1317,Plantronics Savi W740,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,6,4,5.0,1.0,1
63798,301,1318,Plantronics Savi W710,Sprzęt RTV;Audio;Słuchawki,5,5,5.0,1.0,1


In [13]:
# Random ~80/20 row-level split of the interaction table.
# A seeded generator keeps the split - and every metric computed from it -
# identical across "Restart Kernel -> Run All"; the original unseeded draw
# produced a different train/test partition on each run.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8

split_rng = np.random.default_rng(SPLIT_SEED)
mask = split_rng.random(len(group)) < TRAIN_FRACTION  # True -> row goes to train
trainset = group[mask]
testset = group[~mask]
trainset

Unnamed: 0,user_id,product_id,product_name,category_path,price,user_rating,score,interaction_score,user_view
2,102,1003,Kyocera FS-3640MFP,Komputery;Drukarki i skanery;Biurowe urządzeni...,8,4,5.0,1.0,1
6,102,1007,Dead Space 3 (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,2,5,5.0,1.0,1
7,102,1008,Tom Clancy&#39;s Rainbow Six Vegas (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,1,5,5.0,1.0,1
8,102,1009,Kinect Joy Ride (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,2,1,5.0,1.0,1
9,102,1010,BioShock 2 (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,2,2,5.0,1.0,1
...,...,...,...,...,...,...,...,...,...
63794,301,1314,Assassin&#39;s Creed (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,1,2,0.0,0.0,0
63795,301,1315,Jabra Talk,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,2,0,5.0,1.0,1
63796,301,1316,Plantronics Voyager Legend,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,3,4,5.0,1.0,1
63797,301,1317,Plantronics Savi W740,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,6,4,5.0,1.0,1


In [14]:
# User x product score matrix built from the training rows only. Pairs that are
# absent from `trainset` have no value after the pivot and are filled with 0.
train_matrix = (
    trainset
    .pivot_table(values='score', index='user_id', columns='product_id')
    .fillna(0)
)
train_matrix

product_id,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,...,1310,1311,1312,1313,1314,1315,1316,1317,1318,1319
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
102,0.0,0.0,5.0,0.0,0.0,0.0,5.0,5.0,5.0,5.0,...,0.0,0.0,0.0,0.0,5.0,5.0,5.0,5.0,0.0,0.0
103,0.0,0.0,0.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,0.0,0.0,0.0,5.0,5.0,5.0,5.0,0.0,5.0,5.0
104,0.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,0.0,0.0,...,0.0,5.0,5.0,0.0,5.0,0.0,0.0,5.0,0.0,0.0
105,5.0,0.0,0.0,5.0,5.0,5.0,5.0,5.0,0.0,5.0,...,0.0,5.0,0.0,0.0,5.0,5.0,5.0,5.0,0.0,0.0
106,0.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,0.0,5.0,0.0,0.0,5.0,5.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,5.0,5.0,5.0,5.0,5.0,5.0,0.0,0.0,5.0,5.0,...,0.0,5.0,0.0,0.0,0.0,5.0,5.0,5.0,0.0,5.0
298,0.0,0.0,5.0,5.0,5.0,5.0,0.0,5.0,5.0,0.0,...,0.0,0.0,0.0,0.0,5.0,5.0,0.0,5.0,0.0,0.0
299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
300,0.0,0.0,0.0,0.0,5.0,0.0,5.0,5.0,5.0,5.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# One row of content features per product seen in the training rows,
# ordered by product_id.
feature_columns = ['product_id', 'category_path', 'price', 'user_rating']
product_cat = (
    trainset[feature_columns]
    .drop_duplicates(subset='product_id')
    .sort_values('product_id')
)
product_cat

Unnamed: 0,product_id,category_path,price,user_rating
319,1001,Telefony i akcesoria;Telefony stacjonarne,2,5
639,1002,Komputery;Drukarki i skanery;Biurowe urządzeni...,7,1
2,1003,Komputery;Drukarki i skanery;Biurowe urządzeni...,8,4
322,1004,Gry i konsole;Gry na konsole;Gry Xbox 360,1,3
323,1005,Gry i konsole;Gry na konsole;Gry Xbox 360,1,0
...,...,...,...,...
314,1315,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,2,0
315,1316,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,3,4
316,1317,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,6,4
317,1318,Sprzęt RTV;Audio;Słuchawki,5,5


In [16]:
# --- Item-item similarity on binned price -----------------------------------
# With a single feature the Euclidean distance is |bin_i - bin_j|, so the
# similarity is 1 / (1 + |bin_i - bin_j|): identical bins score 1.
price_matrix = np.reciprocal(euclidean_distances(np.array(product_cat['price']).reshape(-1, 1)) + 1)
euclidean_matrix1 = pd.DataFrame(price_matrix, columns=product_cat['product_id'], index=product_cat['product_id'])

# --- Item-item similarity on binned rating (same construction) ---------------
rating_matrix = np.reciprocal(euclidean_distances(np.array(product_cat['user_rating']).reshape(-1, 1)) + 1)
euclidean_matrix2 = pd.DataFrame(rating_matrix, columns=product_cat['product_id'], index=product_cat['product_id'])

# --- Item-item similarity on category path (TF-IDF + cosine) -----------------
tfidf_vectorizer = TfidfVectorizer()
doc_term = tfidf_vectorizer.fit_transform(list(product_cat['category_path']))
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()
dt_matrix = pd.DataFrame(doc_term.toarray().round(3), index=list(product_cat['product_id']), columns=feature_names)
cos_similar_matrix = pd.DataFrame(cosine_similarity(dt_matrix.values), columns=product_cat['product_id'], index=product_cat['product_id'])

# --- Combine and score --------------------------------------------------------
# Element-wise product of the three similarities, then project each user's
# training scores through it to get a raw score for every product.
similarity_matrix = euclidean_matrix1.multiply(euclidean_matrix2).multiply(cos_similar_matrix)
content_scores = train_matrix.dot(similarity_matrix)

# Min-max scale to [0, 1]; the scaler works column-wise, i.e. each product's
# scores are rescaled independently across users.
std = MinMaxScaler(feature_range=(0, 1))
# Take the labels from the dot-product result itself so rows/columns cannot be
# mislabelled if the set of users or products ever differs from `trainset`.
content_matrix = pd.DataFrame(
    std.fit_transform(content_scores.values),
    columns=list(content_scores.columns),
    index=list(content_scores.index),
)
content_matrix



Unnamed: 0,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,...,1310,1311,1312,1313,1314,1315,1316,1317,1318,1319
102,0.442528,0.404883,0.647633,0.836574,0.713570,0.785704,0.952613,0.922382,0.837164,0.785704,...,0.697561,0.789192,0.689776,0.589751,0.861066,0.812993,0.805039,0.835672,0.312814,0.438830
103,0.659426,0.384914,0.443936,0.856530,0.814848,0.862864,0.945294,0.870682,0.819957,0.862864,...,0.751676,0.622028,0.799004,0.866966,0.883194,0.878321,0.917830,0.619027,1.000000,0.950357
104,0.359136,1.000000,0.994433,0.734704,0.594540,0.659644,0.801163,0.770151,0.606104,0.659644,...,0.627882,0.597763,0.649849,0.505587,0.686990,0.337902,0.445121,0.686211,0.307124,0.337143
105,0.967610,0.282837,0.372036,0.617409,0.550520,0.624163,0.639182,0.614499,0.586820,0.624163,...,0.524166,0.712264,0.574446,0.619507,0.628182,0.916085,0.975371,0.970742,0.338904,0.545490
106,0.617791,0.681215,0.569498,0.875279,0.667663,0.840489,0.865541,0.819826,0.783191,0.840489,...,0.924134,0.880691,0.893750,1.000000,0.868489,0.588766,0.915614,0.888026,0.609395,0.563649
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,0.829796,0.793067,0.690111,0.778085,0.637282,0.782131,0.807464,0.777582,0.734689,0.782131,...,0.544574,0.627195,0.599721,0.429546,0.719144,0.863168,0.690323,0.777794,0.253591,0.850230
298,0.391397,0.562303,0.934621,0.835435,0.747890,0.832525,0.804474,0.779880,0.809443,0.832525,...,0.754252,0.694579,0.710903,0.528325,0.873651,0.699753,0.534607,0.782679,0.308147,0.346154
299,0.066127,0.073762,0.033777,0.114601,0.122383,0.117930,0.093079,0.103083,0.127174,0.117930,...,0.083697,0.069488,0.104825,0.093926,0.125745,0.058159,0.076652,0.090890,0.059841,0.073277
300,0.003476,0.017808,0.005595,0.126885,0.157672,0.130460,0.170243,0.143883,0.128887,0.130460,...,0.033562,0.130215,0.017921,0.022790,0.118483,0.014838,0.009749,0.004100,0.011277,0.003722


In [17]:
# Melt the user x product score matrix into one row per (user, product) pair.
long_scores = content_matrix.stack().reset_index()
column_names = {'level_0': 'user_id', 'level_1': 'product_id', 0: 'predicted_interaction'}
content_df = long_scores.rename(columns=column_names)
content_df

Unnamed: 0,user_id,product_id,predicted_interaction
0,102,1001,0.442528
1,102,1002,0.404883
2,102,1003,0.647633
3,102,1004,0.836574
4,102,1005,0.713570
...,...,...,...
63795,301,1315,0.943119
63796,301,1316,0.933119
63797,301,1317,0.899556
63798,301,1318,0.885893


In [18]:
# Attach the predicted score to each held-out (user, product) row and turn it
# into a binary prediction.
PREDICTION_COLUMNS = ['predicted_interaction', 'predicted_view']
VIEW_THRESHOLD = 0.5  # predicted_interaction >= threshold counts as a predicted view

# Drop prediction columns left by a previous execution first, so re-running the
# cell does not create suffixed duplicate columns (`_x` / `_y`) and then fail.
testset = (
    testset
    .drop(columns=PREDICTION_COLUMNS, errors='ignore')
    .merge(content_df, on=['user_id', 'product_id'])
)
testset['predicted_view'] = (testset['predicted_interaction'] >= VIEW_THRESHOLD).astype('int64')
testset

Unnamed: 0,user_id,product_id,product_name,category_path,price,user_rating,score,interaction_score,user_view,predicted_interaction,predicted_view
0,102,1001,Telefon Siemens Gigaset DA310,Telefony i akcesoria;Telefony stacjonarne,2,5,0.0,0.0,0,0.442528,0
1,102,1002,Kyocera FS-1135MFP,Komputery;Drukarki i skanery;Biurowe urządzeni...,7,1,5.0,1.0,1,0.404883,0
2,102,1004,Fallout 3 (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,1,3,5.0,1.0,1,0.836574,1
3,102,1005,Szalone Króliki Na żywo i w kolorze (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,1,0,5.0,1.0,1,0.713570,1
4,102,1006,Call of Duty 4 Modern Warfare (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,2,2,5.0,1.0,1,0.785704,1
...,...,...,...,...,...,...,...,...,...,...,...
12678,301,1278,Intenso Music Walker 8GB,Sprzęt RTV;Przenośne audio i video;Odtwarzacze...,2,2,5.0,1.0,1,0.488395,0
12679,301,1296,Sencor SDA-630,Sprzęt RTV;Video;Telewizory i akcesoria;Anteny...,2,5,0.0,0.0,0,0.648637,1
12680,301,1297,Telmor DSP-860,Sprzęt RTV;Video;Telewizory i akcesoria;Anteny...,3,3,5.0,1.0,1,0.710251,1
12681,301,1306,One For All SV 9215,Sprzęt RTV;Video;Telewizory i akcesoria;Anteny...,2,5,0.0,0.0,0,0.648637,1


In [19]:
# Accuracy = share of held-out rows where the binary prediction equals the
# observed label. Compared column-wise instead of looping over rows.
matches = testset['user_view'] == testset['predicted_view']
# int(...) keeps `correct` a plain Python float so the printed text is unchanged.
correct = int(matches.sum()) / len(testset)
print(f"Accuracy: {correct*100}%")

Accuracy: 61.32618465662698%
