In [280]:
import numpy as np
import pandas as pd
from scipy import spatial
import operator
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

In [281]:
users = pd.read_json("./IUM21Z_Zad_02_01/users.jsonl", lines=True)
sessions = pd.read_json("./IUM21Z_Zad_02_01/sessions.jsonl", lines=True)
products = pd.read_json("./IUM21Z_Zad_02_01/products.jsonl", lines=True)

# Model A

In [282]:
sessionsA = sessions.copy()

In [283]:
sessionsA['score'] = sessionsA['event_type'].map({'VIEW_PRODUCT':1, 'BUY_PRODUCT':0})

groupA = sessionsA.groupby(['user_id', 'product_id'])['score'].sum().reset_index()

groupA = pd.pivot_table(groupA, values='score', index='user_id', columns='product_id')
groupA = groupA.fillna(0)
groupA = groupA.stack().reset_index()
groupA = groupA.rename(columns={0:'score'})
groupA['user_view'] = groupA['score'].apply(lambda x: 1 if x > 0 else 0)
groupA

Unnamed: 0,user_id,product_id,score,user_view
0,102,1001,0.0,0
1,102,1002,3.0,1
2,102,1003,2.0,1
3,102,1004,2.0,1
4,102,1005,5.0,1
...,...,...,...,...
63795,301,1315,1.0,1
63796,301,1316,1.0,1
63797,301,1317,4.0,1
63798,301,1318,4.0,1


In [284]:
mask = np.random.rand(len(groupA)) < 0.8
trainsetA = groupA[mask]
testsetA = groupA[~mask]
trainsetA = trainsetA.reset_index()
trainsetA

Unnamed: 0,index,user_id,product_id,score,user_view
0,0,102,1001,0.0,0
1,1,102,1002,3.0,1
2,2,102,1003,2.0,1
3,4,102,1005,5.0,1
4,9,102,1010,6.0,1
...,...,...,...,...,...
51015,63794,301,1314,0.0,0
51016,63795,301,1315,1.0,1
51017,63796,301,1316,1.0,1
51018,63797,301,1317,4.0,1


In [285]:
def one_hot_encode(element, list):
    one_hot_encode_list = []
    
    for e in list:
        if element == e:
            one_hot_encode_list.append(1)
        else:
            one_hot_encode_list.append(0)
    return one_hot_encode_list

In [286]:
category_list = products['category_path'].unique()

productsA = products.copy()
productsA['category_path'] = productsA['category_path'].apply(lambda x: one_hot_encode(x, category_list))
productsA['price'] = (productsA['price'] - productsA['price'].min()) / (productsA['price'].max() - productsA['price'].min())
productsA['user_rating'] = (productsA['user_rating'] - productsA['user_rating'].min()) / (productsA['user_rating'].max() - productsA['user_rating'].min())

In [287]:
def similarity(product_id1, product_id2):
    a = productsA.iloc[product_id1]
    b = productsA.iloc[product_id2]
    
    categoryA = a['category_path']
    categoryB = b['category_path']
    category_distance = spatial.distance.cosine(categoryA, categoryB)
    
    priceA = a['price']
    priceB = b['price']
    price_distance = abs(priceA - priceB) * 1
    
    ratingA = a['user_rating']
    ratingB = b['user_rating']
    rating_distance = abs(ratingA - ratingB)
    
    return category_distance + price_distance + rating_distance

In [288]:
def get_distances(product_id):
    p = products.index[products['product_id'] == product_id][0]
    distances = []
    
    for index, product in products.iterrows():
        if product['product_id'] != product_id:
            dist = similarity(index, p)
            distances.append(dist)
        else:
            distances.append(0)
    
    return distances

def get_neighbours(distances, K):
    distances = [(index, dist) for index, dist in enumerate(distances)]
    distances.sort(key=operator.itemgetter(1))
    neighbours = []
    
    for x in range(K):
        neighbours.append(distances[x])
    return neighbours

In [289]:
all_distances = []
for i in range(len(products)):
    all_distances.append(get_distances(products['product_id'].iloc[i]))

In [290]:
def get_recommendationA(user_id, dataset, K, all_distances):
    s = dataset.index[dataset['user_id'] == user_id].tolist()
    ids = []
    for i in s:
        ids.append(dataset['product_id'].iloc[i])
    
    distances = [0] * len(products)
    for id in ids:
        i = products.index[products['product_id'] == id][0]
        score = dataset.loc[(dataset['user_id'] == user_id) & (dataset['product_id'] == id), 'score']
        if len(score) != 0:
            score = score.item()
            for p in range(len(products)):
                distances[p] = distances[p] + all_distances[i][p] * score
    
    maxi = max(distances)
    for id in ids:
        distances[products.index[products['product_id'] == id][0]] = maxi
    
    return get_neighbours(distances, K)

In [291]:
user_mask = np.random.rand(len(users)) < 0.2
test_users = users[user_mask]
test_users

Unnamed: 0,user_id,name,city,street
5,107,Eryk Sendor,Wrocław,pl. Diamentowa 929
9,111,Rafał Jagieło,Radom,pl. Swierkowa 652
14,116,Albert Grupa,Wrocław,pl. Działkowa 20/57
15,117,Błażej Kołaczek,Kraków,aleja Jaskółcza 74/10
17,119,Iwo Nagel,Szczecin,ul. Solidarnosci 984
20,122,Adam Kliber,Radom,al. Tysiąclecia 437
21,123,Melania Fiedoruk,Radom,al. Broniewskiego 97/46
23,125,Melania Garncarek,Wrocław,pl. Borówkowa 658
38,140,Adrianna Szumny,Radom,aleja Łanowa 20
39,141,Klara Pietrus,Warszawa,ulica Owocowa 269


In [292]:
correct = 0
for index, user in test_users.iterrows():
    print(user['user_id'])
    recommendations = get_recommendationA(user['user_id'], trainsetA, 5, all_distances)
    for recommendation in recommendations:
        id = products.iloc[recommendation[0]]['product_id']
        view = testsetA[(testsetA['product_id'] == id) & (testsetA['user_id'] == user['user_id'])]['user_view']
        if len(view) != 0:
            view = view.item()
            if view == 1:
                correct = correct + 1
correct / (5 * len(test_users))

107
111
116
117
119
122
123
125
140
141
142
150
171
177
178
183
187
190
197
200
201
210
214
218
221
222
225
226
231
235
236
237
259
265
266
269
271
272
273
275
280
283
287
291
301


0.3511111111111111

# Model B

In [293]:
sessionsB = sessions.copy()

In [294]:
sessionsB['score'] = sessionsB['event_type'].map({'VIEW_PRODUCT':5, 'BUY_PRODUCT':5})

groupB = sessionsB.groupby(['user_id', 'product_id'])['score'].sum().reset_index()

groupB = sessionsB.groupby(['user_id', 'product_id'])['score'].sum().reset_index()
groupB['score'] = groupB['score'].apply(lambda x: 5 if x>5 else x)
groupB = pd.pivot_table(groupB, values='score', index='user_id', columns='product_id')
groupB = groupB.fillna(0)
groupB = groupB.stack().reset_index()
groupB = groupB.rename(columns={0:'score'})
groupB['user_view'] = groupB['score'].apply(lambda x: 1 if x > 0 else 0)

std1 = MinMaxScaler(feature_range=(0, 1))
std1.fit(groupB['score'].values.reshape(-1,1))
groupB['interaction_score'] = std1.transform(groupB['score'].values.reshape(-1,1))
groupB

Unnamed: 0,user_id,product_id,score,user_view,interaction_score
0,102,1001,0.0,0,0.0
1,102,1002,5.0,1,1.0
2,102,1003,5.0,1,1.0
3,102,1004,5.0,1,1.0
4,102,1005,5.0,1,1.0
...,...,...,...,...,...
63795,301,1315,5.0,1,1.0
63796,301,1316,5.0,1,1.0
63797,301,1317,5.0,1,1.0
63798,301,1318,5.0,1,1.0


In [295]:
def price_bin(price):
    if price <= 25:
        return 0
    if price <= 50:
        return 1
    if price <= 100:
        return 2
    if price <= 250:
        return 3
    if price <= 500:
        return 4
    if price <= 1000:
        return 5
    if price <= 2000:
        return 6
    if price <= 4000:
        return 7
    else:
        return 8

In [296]:
def rating_bin(rating):
    if rating <= 0.5:
        return 0
    if rating <= 1.5:
        return 1
    if rating <= 2.5:
        return 2
    if rating <= 3.5:
        return 3
    if rating <= 4.5:
        return 4
    else:
        return 5

In [297]:
groupB = pd.merge(groupB, products, on="product_id", how="left")
groupB = pd.merge(groupB, users, on="user_id", how="left")
groupB = groupB[['user_id', 'product_id', 'product_name', 'category_path', 'price', 'user_rating', 'score', 'interaction_score', 'user_view']]
groupB['price'] = groupB['price'].apply(lambda x: price_bin(x))
groupB['user_rating'] = groupB['user_rating'].apply(lambda x: rating_bin(x))
groupB

Unnamed: 0,user_id,product_id,product_name,category_path,price,user_rating,score,interaction_score,user_view
0,102,1001,Telefon Siemens Gigaset DA310,Telefony i akcesoria;Telefony stacjonarne,2,5,0.0,0.0,0
1,102,1002,Kyocera FS-1135MFP,Komputery;Drukarki i skanery;Biurowe urządzeni...,7,1,5.0,1.0,1
2,102,1003,Kyocera FS-3640MFP,Komputery;Drukarki i skanery;Biurowe urządzeni...,8,4,5.0,1.0,1
3,102,1004,Fallout 3 (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,1,3,5.0,1.0,1
4,102,1005,Szalone Króliki Na żywo i w kolorze (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,1,0,5.0,1.0,1
...,...,...,...,...,...,...,...,...,...
63795,301,1315,Jabra Talk,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,2,0,5.0,1.0,1
63796,301,1316,Plantronics Voyager Legend,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,3,4,5.0,1.0,1
63797,301,1317,Plantronics Savi W740,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,6,4,5.0,1.0,1
63798,301,1318,Plantronics Savi W710,Sprzęt RTV;Audio;Słuchawki,5,5,5.0,1.0,1


In [298]:
trainsetB = groupB[mask]
testsetB = groupB[~mask]
trainsetB = trainsetB.reset_index()
trainsetB

Unnamed: 0,index,user_id,product_id,product_name,category_path,price,user_rating,score,interaction_score,user_view
0,0,102,1001,Telefon Siemens Gigaset DA310,Telefony i akcesoria;Telefony stacjonarne,2,5,0.0,0.0,0
1,1,102,1002,Kyocera FS-1135MFP,Komputery;Drukarki i skanery;Biurowe urządzeni...,7,1,5.0,1.0,1
2,2,102,1003,Kyocera FS-3640MFP,Komputery;Drukarki i skanery;Biurowe urządzeni...,8,4,5.0,1.0,1
3,4,102,1005,Szalone Króliki Na żywo i w kolorze (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,1,0,5.0,1.0,1
4,9,102,1010,BioShock 2 (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,2,2,5.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...
51015,63794,301,1314,Assassin&#39;s Creed (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,1,2,0.0,0.0,0
51016,63795,301,1315,Jabra Talk,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,2,0,5.0,1.0,1
51017,63796,301,1316,Plantronics Voyager Legend,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,3,4,5.0,1.0,1
51018,63797,301,1317,Plantronics Savi W740,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,6,4,5.0,1.0,1


In [299]:
train_matrix = pd.pivot_table(trainsetB, values='score', index='user_id', columns='product_id')
train_matrix = train_matrix.fillna(0)
train_matrix

product_id,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,...,1310,1311,1312,1313,1314,1315,1316,1317,1318,1319
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
102,0.0,5.0,5.0,0.0,5.0,0.0,0.0,0.0,0.0,5.0,...,5.0,5.0,0.0,0.0,5.0,5.0,0.0,5.0,0.0,0.0
103,0.0,5.0,0.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,0.0,5.0,0.0,5.0,5.0,5.0,5.0,0.0,5.0,5.0
104,0.0,5.0,5.0,5.0,0.0,5.0,0.0,5.0,0.0,5.0,...,0.0,5.0,5.0,0.0,5.0,5.0,0.0,5.0,5.0,0.0
105,5.0,0.0,0.0,5.0,5.0,5.0,5.0,5.0,0.0,5.0,...,0.0,5.0,0.0,5.0,5.0,5.0,5.0,0.0,0.0,5.0
106,0.0,5.0,0.0,5.0,5.0,5.0,0.0,5.0,5.0,5.0,...,5.0,5.0,0.0,5.0,0.0,0.0,5.0,5.0,5.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,5.0,0.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,0.0,5.0,0.0,5.0,0.0,5.0,5.0,5.0,0.0,5.0
298,0.0,5.0,5.0,5.0,0.0,5.0,0.0,5.0,5.0,0.0,...,0.0,5.0,0.0,0.0,5.0,5.0,0.0,5.0,0.0,0.0
299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
300,0.0,0.0,0.0,5.0,5.0,0.0,0.0,0.0,5.0,5.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [300]:
product_cat = trainsetB[['product_id', 'category_path', 'price', 'user_rating']].drop_duplicates('product_id')
product_cat = product_cat.sort_values(by='product_id')
product_cat

Unnamed: 0,product_id,category_path,price,user_rating
0,1001,Telefony i akcesoria;Telefony stacjonarne,2,5
1,1002,Komputery;Drukarki i skanery;Biurowe urządzeni...,7,1
2,1003,Komputery;Drukarki i skanery;Biurowe urządzeni...,8,4
258,1004,Gry i konsole;Gry na konsole;Gry Xbox 360,1,3
3,1005,Gry i konsole;Gry na konsole;Gry Xbox 360,1,0
...,...,...,...,...
252,1315,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,2,0
513,1316,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,3,4
253,1317,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,6,4
515,1318,Sprzęt RTV;Audio;Słuchawki,5,5


In [301]:
price_matrix = np.reciprocal(euclidean_distances(np.array(product_cat['price']).reshape(-1,1))+1)
euclidean_matrix1 = pd.DataFrame(price_matrix,columns=product_cat['product_id'],index=product_cat['product_id'])

rating_matrix = np.reciprocal(euclidean_distances(np.array(product_cat['user_rating']).reshape(-1,1))+1)
euclidean_matrix2 = pd.DataFrame(rating_matrix,columns=product_cat['product_id'],index=product_cat['product_id'])

tfidf_vectorizer = TfidfVectorizer()
doc_term = tfidf_vectorizer.fit_transform(list(product_cat['category_path']))
dt_matrix = pd.DataFrame(doc_term.toarray().round(3), index=[i for i in product_cat['product_id']], columns=tfidf_vectorizer.get_feature_names())
cos_similar_matrix = pd.DataFrame(cosine_similarity(dt_matrix.values),columns=product_cat['product_id'],index=product_cat['product_id'])

similarity_matrix = euclidean_matrix1.multiply(euclidean_matrix2).multiply(cos_similar_matrix)
content_matrix = train_matrix.dot(similarity_matrix)
std2 = MinMaxScaler(feature_range=(0, 1))
std2.fit(content_matrix.values)
content_matrix = std2.transform(content_matrix.values)
content_matrix = pd.DataFrame(content_matrix,columns=sorted(trainset['product_id'].unique()),index=sorted(trainset['user_id'].unique()))
content_matrix

Unnamed: 0,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,...,1310,1311,1312,1313,1314,1315,1316,1317,1318,1319
102,0.380102,0.761861,0.503524,0.865636,0.758806,0.751898,0.814728,0.840909,0.753040,0.751898,...,0.864656,0.985660,0.813156,0.788712,0.823701,0.874231,0.547681,0.692474,0.421023,0.377816
103,0.551436,0.644349,0.236190,0.904568,0.922844,0.901505,0.963755,0.960624,0.921271,0.901505,...,0.680994,0.838141,0.684154,0.806754,0.938051,0.774413,0.978806,0.622375,0.880675,0.887086
104,0.335422,0.678543,0.623920,0.738500,0.587572,0.640004,0.684172,0.748921,0.573496,0.640004,...,0.771739,0.717694,0.826251,0.762775,0.709707,0.630313,0.406510,0.631558,0.901194,0.339357
105,0.911502,0.226735,0.293610,0.668941,0.639906,0.609262,0.611094,0.649048,0.621114,0.609262,...,0.541442,0.626122,0.657128,0.752640,0.622252,0.740407,0.763379,0.554563,0.401997,0.937247
106,0.475729,0.808827,0.654306,0.833594,0.728637,0.751120,0.771693,0.811299,0.721894,0.751120,...,0.861501,0.802219,0.759103,0.890825,0.793013,0.550805,0.777445,0.796635,0.980328,0.490452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,0.946727,0.510279,0.695633,0.817139,0.801380,0.815927,0.826374,0.832324,0.814808,0.815927,...,0.656869,0.808855,0.831642,0.884519,0.791167,0.935271,0.982338,0.903096,0.443055,1.000000
298,0.457304,0.833541,0.716598,0.801065,0.748995,0.820113,0.836868,0.821330,0.816043,0.820113,...,0.914754,0.942701,0.858934,0.752570,0.818851,0.782453,0.683254,0.838218,0.438898,0.411310
299,0.065601,0.063641,0.027444,0.156358,0.144178,0.103845,0.102426,0.129648,0.136636,0.103845,...,0.087599,0.086698,0.123334,0.211044,0.126164,0.031731,0.065852,0.064594,0.111863,0.073763
300,0.008944,0.017392,0.005500,0.169940,0.175348,0.138680,0.091910,0.092997,0.146341,0.138680,...,0.063256,0.139286,0.073839,0.036411,0.133666,0.021948,0.016293,0.009519,0.021707,0.015983


In [302]:
content_df = content_matrix.stack().reset_index()
content_df = content_df.rename(columns={'level_0':'user_id','level_1':'product_id',0:'predicted_interaction'})
content_df

Unnamed: 0,user_id,product_id,predicted_interaction
0,102,1001,0.380102
1,102,1002,0.761861
2,102,1003,0.503524
3,102,1004,0.865636
4,102,1005,0.758806
...,...,...,...
63795,301,1315,0.673584
63796,301,1316,0.752022
63797,301,1317,0.781264
63798,301,1318,0.756315


In [303]:
def get_recommendationB(content_df, user_id, dataset, K):
    s = dataset.index[dataset['user_id'] == user_id].tolist()
    ids = []
    for i in s:
        ids.append(dataset['product_id'].iloc[i])
        
    user_content_df = content_df.loc[content_df['user_id'] == user_id]
    
    mini = user_content_df['predicted_interaction'].min()
    for id in ids:
        user_content_df.loc[user_content_df['product_id'] == id, 'predicted_interaction'] = mini
        
    user_content_df = user_content_df.sort_values(by="predicted_interaction", ascending=False)
    
    recommendations = []
    for x in range(K):
        recommendations.append(user_content_df['product_id'].iloc[x])
    return recommendations

In [304]:
get_recommendationB(content_df, 102, trainsetB, 5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


[1250, 1191, 1213, 1107, 1096]

In [305]:
correct = 0
for index, user in test_users.iterrows():
    print(user['user_id'])
    recommendations = get_recommendationB(content_df, user['user_id'], trainsetB, 5)
    for recommendation in recommendations:
        view = testsetB[(testsetB['product_id'] == recommendation) & (testsetB['user_id'] == user['user_id'])]['user_view']
        if len(view) != 0:
            view = view.item()
            if view == 1:
                correct = correct + 1
correct / (5 * len(test_users))

107
111
116
117
119
122
123
125
140
141
142
150
171
177
178
183
187
190
197
200
201
210
214
218
221
222
225
226
231
235
236
237
259
265
266
269
271
272
273
275
280
283
287
291
301


0.43555555555555553