In [1]:
import numpy as np
import pandas as pd
from scipy import spatial
import operator
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

In [2]:
users = pd.read_json("../data/raw/users.jsonl", lines=True)
sessions = pd.read_json("../data/raw/sessions.jsonl", lines=True)
products = pd.read_json("../data/raw/products.jsonl", lines=True)

# Model A

In [3]:
sessionsA = sessions.copy()

In [4]:
sessionsA['score'] = sessionsA['event_type'].map({'VIEW_PRODUCT':1, 'BUY_PRODUCT':0})

groupA = sessionsA.groupby(['user_id', 'product_id'])['score'].sum().reset_index()

groupA = pd.pivot_table(groupA, values='score', index='user_id', columns='product_id')
groupA = groupA.fillna(0)
groupA = groupA.stack().reset_index()
groupA = groupA.rename(columns={0:'score'})
groupA['user_view'] = groupA['score'].apply(lambda x: 1 if x > 0 else 0)
groupA

Unnamed: 0,user_id,product_id,score,user_view
0,102,1001,0.0,0
1,102,1002,3.0,1
2,102,1003,2.0,1
3,102,1004,2.0,1
4,102,1005,5.0,1
...,...,...,...,...
63795,301,1315,1.0,1
63796,301,1316,1.0,1
63797,301,1317,4.0,1
63798,301,1318,4.0,1


In [5]:
mask = np.random.rand(len(groupA)) < 0.8
trainsetA = groupA[mask]
testsetA = groupA[~mask]
trainsetA = trainsetA.reset_index()
trainsetA

Unnamed: 0,index,user_id,product_id,score,user_view
0,0,102,1001,0.0,0
1,1,102,1002,3.0,1
2,2,102,1003,2.0,1
3,3,102,1004,2.0,1
4,4,102,1005,5.0,1
...,...,...,...,...,...
51103,63795,301,1315,1.0,1
51104,63796,301,1316,1.0,1
51105,63797,301,1317,4.0,1
51106,63798,301,1318,4.0,1


In [6]:
def one_hot_encode(element, list):
    one_hot_encode_list = []
    
    for e in list:
        if element == e:
            one_hot_encode_list.append(1)
        else:
            one_hot_encode_list.append(0)
    return one_hot_encode_list

In [7]:
category_list = products['category_path'].unique()

productsA = products.copy()
productsA['category_path'] = productsA['category_path'].apply(lambda x: one_hot_encode(x, category_list))
productsA['price'] = (productsA['price'] - productsA['price'].min()) / (productsA['price'].max() - productsA['price'].min())
productsA['user_rating'] = (productsA['user_rating'] - productsA['user_rating'].min()) / (productsA['user_rating'].max() - productsA['user_rating'].min())

In [8]:
def similarity(product_id1, product_id2):
    a = productsA.iloc[product_id1]
    b = productsA.iloc[product_id2]
    
    categoryA = a['category_path']
    categoryB = b['category_path']
    category_distance = spatial.distance.cosine(categoryA, categoryB)
    
    priceA = a['price']
    priceB = b['price']
    price_distance = abs(priceA - priceB) * 1
    
    ratingA = a['user_rating']
    ratingB = b['user_rating']
    rating_distance = abs(ratingA - ratingB)
    
    return category_distance + price_distance + rating_distance

In [9]:
def get_distances(product_id):
    p = products.index[products['product_id'] == product_id][0]
    distances = []
    
    for index, product in products.iterrows():
        if product['product_id'] != product_id:
            dist = similarity(index, p)
            distances.append(dist)
        else:
            distances.append(0)
    
    return distances

def get_neighbours(distances, K):
    distances = [(index, dist) for index, dist in enumerate(distances)]
    distances.sort(key=operator.itemgetter(1))
    neighbours = []
    
    for x in range(K):
        neighbours.append(distances[x])
    return neighbours

In [10]:
all_distances = []
for i in range(len(products)):
    all_distances.append(get_distances(products['product_id'].iloc[i]))

In [11]:
def get_recommendationA(user_id, dataset, K, all_distances):
    s = dataset.index[dataset['user_id'] == user_id].tolist()
    ids = []
    for i in s:
        ids.append(dataset['product_id'].iloc[i])
    
    distances = [0] * len(products)
    for id in ids:
        i = products.index[products['product_id'] == id][0]
        score = dataset.loc[(dataset['user_id'] == user_id) & (dataset['product_id'] == id), 'score']
        if len(score) != 0:
            score = score.item()
            for p in range(len(products)):
                distances[p] = distances[p] + all_distances[i][p] * score
    
    maxi = max(distances)
    for id in ids:
        distances[products.index[products['product_id'] == id][0]] = maxi
    
    return get_neighbours(distances, K)

In [12]:
user_mask = np.random.rand(len(users)) < 0.2
test_users = users[user_mask]
test_users

Unnamed: 0,user_id,name,city,street
6,108,Sylwia Karwot,Poznań,ul. Okrzei 771
17,119,Iwo Nagel,Szczecin,ul. Solidarnosci 984
19,121,Stefan Zajko,Radom,pl. Daszyńskiego 41
26,128,Malwina Bondyra,Warszawa,ul. Myśliwska 600
28,130,Kacper Szrama,Wrocław,ulica Boczna 79
29,131,Leon Dawidek,Kraków,ul. Armii Krajowej 98
30,132,Marcin Piaścik,Warszawa,plac Okrężna 85
42,144,Jacek Prażmo,Szczecin,aleja Kołłątaja 42
44,146,Miłosz Kasica,Poznań,pl. Andersa 346
45,147,Tymon Smagacz,Szczecin,aleja Cisowa 69/85


In [13]:
correct = 0
for index, user in test_users.iterrows():
    print(user['user_id'])
    recommendations = get_recommendationA(user['user_id'], trainsetA, 5, all_distances)
    for recommendation in recommendations:
        id = products.iloc[recommendation[0]]['product_id']
        view = testsetA[(testsetA['product_id'] == id) & (testsetA['user_id'] == user['user_id'])]['user_view']
        if len(view) != 0:
            view = view.item()
            if view == 1:
                correct = correct + 1
correct / (5 * len(test_users))

108
119
121
128
130
131
132
144
146
147
149
154
166
175
180
183
188
190
195
203
210
211
213
217
224
228
236
239
242
255
259
260
261
270
273
275
280
284
288
289
291
294
295
297
298
300


0.32608695652173914

# Model B

In [14]:
sessionsB = sessions.copy()

In [15]:
sessionsB['score'] = sessionsB['event_type'].map({'VIEW_PRODUCT':5, 'BUY_PRODUCT':5})

groupB = sessionsB.groupby(['user_id', 'product_id'])['score'].sum().reset_index()

groupB = sessionsB.groupby(['user_id', 'product_id'])['score'].sum().reset_index()
groupB['score'] = groupB['score'].apply(lambda x: 5 if x>5 else x)
groupB = pd.pivot_table(groupB, values='score', index='user_id', columns='product_id')
groupB = groupB.fillna(0)
groupB = groupB.stack().reset_index()
groupB = groupB.rename(columns={0:'score'})
groupB['user_view'] = groupB['score'].apply(lambda x: 1 if x > 0 else 0)

std1 = MinMaxScaler(feature_range=(0, 1))
std1.fit(groupB['score'].values.reshape(-1,1))
groupB['interaction_score'] = std1.transform(groupB['score'].values.reshape(-1,1))
groupB

Unnamed: 0,user_id,product_id,score,user_view,interaction_score
0,102,1001,0.0,0,0.0
1,102,1002,5.0,1,1.0
2,102,1003,5.0,1,1.0
3,102,1004,5.0,1,1.0
4,102,1005,5.0,1,1.0
...,...,...,...,...,...
63795,301,1315,5.0,1,1.0
63796,301,1316,5.0,1,1.0
63797,301,1317,5.0,1,1.0
63798,301,1318,5.0,1,1.0


In [16]:
def price_bin(price):
    if price <= 25:
        return 0
    if price <= 50:
        return 1
    if price <= 100:
        return 2
    if price <= 250:
        return 3
    if price <= 500:
        return 4
    if price <= 1000:
        return 5
    if price <= 2000:
        return 6
    if price <= 4000:
        return 7
    else:
        return 8

In [17]:
def rating_bin(rating):
    if rating <= 0.5:
        return 0
    if rating <= 1.5:
        return 1
    if rating <= 2.5:
        return 2
    if rating <= 3.5:
        return 3
    if rating <= 4.5:
        return 4
    else:
        return 5

In [18]:
groupB = pd.merge(groupB, products, on="product_id", how="left")
groupB = pd.merge(groupB, users, on="user_id", how="left")
groupB = groupB[['user_id', 'product_id', 'product_name', 'category_path', 'price', 'user_rating', 'score', 'interaction_score', 'user_view']]
groupB['price'] = groupB['price'].apply(lambda x: price_bin(x))
groupB['user_rating'] = groupB['user_rating'].apply(lambda x: rating_bin(x))
groupB

Unnamed: 0,user_id,product_id,product_name,category_path,price,user_rating,score,interaction_score,user_view
0,102,1001,Telefon Siemens Gigaset DA310,Telefony i akcesoria;Telefony stacjonarne,2,5,0.0,0.0,0
1,102,1002,Kyocera FS-1135MFP,Komputery;Drukarki i skanery;Biurowe urządzeni...,7,1,5.0,1.0,1
2,102,1003,Kyocera FS-3640MFP,Komputery;Drukarki i skanery;Biurowe urządzeni...,8,4,5.0,1.0,1
3,102,1004,Fallout 3 (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,1,3,5.0,1.0,1
4,102,1005,Szalone Króliki Na żywo i w kolorze (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,1,0,5.0,1.0,1
...,...,...,...,...,...,...,...,...,...
63795,301,1315,Jabra Talk,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,2,0,5.0,1.0,1
63796,301,1316,Plantronics Voyager Legend,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,3,4,5.0,1.0,1
63797,301,1317,Plantronics Savi W740,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,6,4,5.0,1.0,1
63798,301,1318,Plantronics Savi W710,Sprzęt RTV;Audio;Słuchawki,5,5,5.0,1.0,1


In [19]:
trainsetB = groupB[mask]
testsetB = groupB[~mask]
trainsetB = trainsetB.reset_index()
trainsetB

Unnamed: 0,index,user_id,product_id,product_name,category_path,price,user_rating,score,interaction_score,user_view
0,0,102,1001,Telefon Siemens Gigaset DA310,Telefony i akcesoria;Telefony stacjonarne,2,5,0.0,0.0,0
1,1,102,1002,Kyocera FS-1135MFP,Komputery;Drukarki i skanery;Biurowe urządzeni...,7,1,5.0,1.0,1
2,2,102,1003,Kyocera FS-3640MFP,Komputery;Drukarki i skanery;Biurowe urządzeni...,8,4,5.0,1.0,1
3,3,102,1004,Fallout 3 (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,1,3,5.0,1.0,1
4,4,102,1005,Szalone Króliki Na żywo i w kolorze (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,1,0,5.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...
51103,63795,301,1315,Jabra Talk,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,2,0,5.0,1.0,1
51104,63796,301,1316,Plantronics Voyager Legend,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,3,4,5.0,1.0,1
51105,63797,301,1317,Plantronics Savi W740,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,6,4,5.0,1.0,1
51106,63798,301,1318,Plantronics Savi W710,Sprzęt RTV;Audio;Słuchawki,5,5,5.0,1.0,1


In [20]:
train_matrix = pd.pivot_table(trainsetB, values='score', index='user_id', columns='product_id')
train_matrix = train_matrix.fillna(0)
train_matrix

product_id,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,...,1310,1311,1312,1313,1314,1315,1316,1317,1318,1319
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
102,0.0,5.0,5.0,5.0,5.0,5.0,5.0,0.0,5.0,0.0,...,5.0,5.0,0.0,0.0,5.0,5.0,5.0,5.0,0.0,0.0
103,0.0,5.0,0.0,0.0,5.0,5.0,5.0,5.0,5.0,5.0,...,0.0,5.0,0.0,5.0,5.0,5.0,0.0,0.0,5.0,5.0
104,0.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,0.0,5.0,...,0.0,5.0,5.0,0.0,5.0,0.0,0.0,0.0,5.0,0.0
105,0.0,0.0,0.0,5.0,0.0,5.0,5.0,5.0,0.0,5.0,...,0.0,5.0,0.0,5.0,5.0,5.0,5.0,5.0,0.0,5.0
106,0.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,0.0,5.0,...,0.0,5.0,0.0,5.0,0.0,0.0,5.0,5.0,5.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,0.0,5.0,...,0.0,5.0,0.0,5.0,0.0,0.0,5.0,5.0,0.0,5.0
298,0.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0
299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
300,0.0,0.0,0.0,5.0,5.0,0.0,5.0,0.0,0.0,5.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
product_cat = trainsetB[['product_id', 'category_path', 'price', 'user_rating']].drop_duplicates('product_id')
product_cat = product_cat.sort_values(by='product_id')
product_cat

Unnamed: 0,product_id,category_path,price,user_rating
0,1001,Telefony i akcesoria;Telefony stacjonarne,2,5
1,1002,Komputery;Drukarki i skanery;Biurowe urządzeni...,7,1
2,1003,Komputery;Drukarki i skanery;Biurowe urządzeni...,8,4
3,1004,Gry i konsole;Gry na konsole;Gry Xbox 360,1,3
4,1005,Gry i konsole;Gry na konsole;Gry Xbox 360,1,0
...,...,...,...,...
264,1315,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,2,0
265,1316,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,3,4
266,1317,Telefony i akcesoria;Akcesoria telefoniczne;Ze...,6,4
267,1318,Sprzęt RTV;Audio;Słuchawki,5,5


In [23]:
price_matrix = np.reciprocal(euclidean_distances(np.array(product_cat['price']).reshape(-1,1))+1)
euclidean_matrix1 = pd.DataFrame(price_matrix,columns=product_cat['product_id'],index=product_cat['product_id'])

rating_matrix = np.reciprocal(euclidean_distances(np.array(product_cat['user_rating']).reshape(-1,1))+1)
euclidean_matrix2 = pd.DataFrame(rating_matrix,columns=product_cat['product_id'],index=product_cat['product_id'])

tfidf_vectorizer = TfidfVectorizer()
doc_term = tfidf_vectorizer.fit_transform(list(product_cat['category_path']))
dt_matrix = pd.DataFrame(doc_term.toarray().round(3), index=[i for i in product_cat['product_id']], columns=tfidf_vectorizer.get_feature_names())
cos_similar_matrix = pd.DataFrame(cosine_similarity(dt_matrix.values),columns=product_cat['product_id'],index=product_cat['product_id'])

similarity_matrix = euclidean_matrix1.multiply(euclidean_matrix2).multiply(cos_similar_matrix)
content_matrix = train_matrix.dot(similarity_matrix)
std2 = MinMaxScaler(feature_range=(0, 1))
std2.fit(content_matrix.values)
content_matrix = std2.transform(content_matrix.values)
content_matrix = pd.DataFrame(content_matrix,columns=sorted(trainsetB['product_id'].unique()),index=sorted(trainsetB['user_id'].unique()))
content_matrix



Unnamed: 0,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,...,1310,1311,1312,1313,1314,1315,1316,1317,1318,1319
102,0.447620,0.796220,0.882357,0.926009,0.790181,0.810112,0.908526,0.949583,0.809410,0.810112,...,0.866913,0.833715,0.755432,0.629313,0.945055,0.840856,0.765107,0.895708,0.332683,0.441692
103,0.486131,0.743097,0.675911,0.735216,0.724684,0.813123,0.864365,0.820647,0.825825,0.813123,...,0.906438,0.793649,0.880346,0.806132,0.811701,0.742028,0.580967,0.449835,0.913095,0.813825
104,0.359534,0.848971,0.726626,0.727342,0.637200,0.699225,0.794419,0.777952,0.647956,0.699225,...,0.824098,0.739690,0.864437,0.681130,0.727671,0.376796,0.422270,0.310579,0.829240,0.334096
105,0.537420,0.277900,0.366352,0.639179,0.538415,0.596203,0.633679,0.661526,0.598045,0.596203,...,0.436339,0.530597,0.582254,0.632901,0.626382,0.707072,0.806201,0.923985,0.298925,0.876495
106,0.539348,0.846598,0.871160,0.733971,0.680143,0.757498,0.833838,0.767710,0.742514,0.757498,...,0.839943,0.758023,0.794439,0.881206,0.734474,0.559867,0.815986,0.837964,0.950674,0.512659
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,0.876787,0.958646,0.976231,0.762311,0.659011,0.785851,0.848705,0.772237,0.709060,0.785851,...,0.651131,0.736530,0.810427,0.818461,0.714678,0.578652,0.883875,0.856980,0.399924,0.941027
298,0.470685,0.840599,0.905895,0.810684,0.791241,0.905296,0.923860,0.860458,0.867432,0.905296,...,0.938996,0.855560,0.879513,0.686790,0.871171,0.593317,0.712688,0.790760,0.387393,0.416767
299,0.080036,0.071267,0.033226,0.141109,0.147908,0.119749,0.120877,0.159759,0.127230,0.119749,...,0.102691,0.089743,0.136470,0.215489,0.142970,0.063305,0.091536,0.096718,0.111357,0.090370
300,0.008885,0.011866,0.004480,0.156561,0.117478,0.133853,0.134981,0.093604,0.117607,0.133853,...,0.068602,0.135918,0.079698,0.036750,0.124932,0.022328,0.015469,0.009337,0.021609,0.015862


In [24]:
content_df = content_matrix.stack().reset_index()
content_df = content_df.rename(columns={'level_0':'user_id','level_1':'product_id',0:'predicted_interaction'})
content_df

Unnamed: 0,user_id,product_id,predicted_interaction
0,102,1001,0.447620
1,102,1002,0.796220
2,102,1003,0.882357
3,102,1004,0.926009
4,102,1005,0.790181
...,...,...,...
63795,301,1315,0.782215
63796,301,1316,0.825555
63797,301,1317,0.838486
63798,301,1318,0.817094


In [25]:
def get_recommendationB(content_df, user_id, dataset, K):
    s = dataset.index[dataset['user_id'] == user_id].tolist()
    ids = []
    for i in s:
        ids.append(dataset['product_id'].iloc[i])
        
    user_content_df = content_df.loc[content_df['user_id'] == user_id]
    
    mini = user_content_df['predicted_interaction'].min()
    for id in ids:
        user_content_df.loc[user_content_df['product_id'] == id, 'predicted_interaction'] = mini
        
    user_content_df = user_content_df.sort_values(by="predicted_interaction", ascending=False)
    
    recommendations = []
    for x in range(K):
        recommendations.append(user_content_df['product_id'].iloc[x])
    return recommendations

In [26]:
get_recommendationB(content_df, 102, trainsetB, 5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


[1193, 1267, 1244, 1250, 1168]

In [27]:
correct = 0
for index, user in test_users.iterrows():
    print(user['user_id'])
    recommendations = get_recommendationB(content_df, user['user_id'], trainsetB, 5)
    for recommendation in recommendations:
        view = testsetB[(testsetB['product_id'] == recommendation) & (testsetB['user_id'] == user['user_id'])]['user_view']
        if len(view) != 0:
            view = view.item()
            if view == 1:
                correct = correct + 1
correct / (5 * len(test_users))

108
119


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


121
128
130
131
132
144
146
147
149
154
166
175
180
183
188
190
195
203
210
211
213
217
224
228
236
239
242
255
259
260
261
270
273
275
280
284
288
289
291
294
295
297
298
300


0.40869565217391307