In [26]:
import pandas as pd
import re
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, euclidean_distances, cosine_similarity

In [27]:
# Load the three raw JSON-lines exports (one JSON record per line) in a fixed
# order: users, sessions, products.
users_df, sessions_df, products_df = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        "../data/users.jsonl",
        "../data/sessions.jsonl",
        "../data/products.jsonl",
    )
)

In [28]:
# Working copy of the product catalogue for the content-based model, so the
# raw `products_df` frame is never modified by later cells.
products_cbf = products_df.copy()
# NOTE: an earlier draft built the product-name TF-IDF matrix and its
# linear-kernel similarity directly in this cell; that computation now lives in
# the cell that creates `product_cat`.
# TODO: the draft also sketched a second TF-IDF over `category_path` (with the
# ";" separators replaced by spaces) and a matching category similarity matrix.
# That variant was never enabled.

In [29]:
# Score every session event: a VIEW_PRODUCT event is worth 1 point, any other
# event type is worth 3 points.
model_B_sessions = sessions_df.copy()
model_B_sessions['action_points'] = (
    model_B_sessions['event_type'].eq("VIEW_PRODUCT").map({True: 1, False: 3})
)

# Total points per (user, product) pair, kept as a flat three-column frame.
actions_sums = (
    model_B_sessions
    .groupby(['user_id', 'product_id'])['action_points']
    .sum()
    .reset_index()
)

# Wide user x product matrix; pairs with no recorded interaction become 0.
actions_sums_excel = actions_sums.pivot_table(
    index='user_id',
    columns='product_id',
    values='action_points',
).fillna(0)

# Display the matrix in long form as a quick sanity check.
actions_sums_excel.stack()

user_id  product_id
102      1001          1.0
         1002          0.0
         1003          0.0
         1004          0.0
         1005          0.0
                      ... 
301      1615          0.0
         1616          0.0
         1617          0.0
         1618          1.0
         1619          0.0
Length: 123800, dtype: float64

In [30]:
def min_max_scale(df, columns):
    """Rescale the given columns of ``df`` to the [0, 1] range (min-max scaling).

    Each listed column is replaced by ``(x - min) / (max - min)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale. It is modified in place and also returned.
    columns : iterable of str
        Names of the numeric columns to rescale.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns rescaled.

    Notes
    -----
    A column whose values are all equal has ``max - min == 0``; dividing by
    that range would turn every entry into NaN. Such a column is mapped to
    0.0 instead. Missing values stay missing in either case.
    """
    for column in columns:
        low = df[column].min()
        span = df[column].max() - low
        if span == 0:
            # Constant column: every non-missing value equals `low`, so the
            # shifted values are already 0; skip the 0/0 division.
            df[column] = (df[column] - low).astype(float)
        else:
            df[column] = (df[column] - low) / span
    return df

In [31]:
# Attach catalogue attributes to every (user, product) interaction total.
user_and_product = pd.merge(actions_sums, products_cbf, on="product_id", how="left")
# Keep only the columns used below. The explicit .copy() gives min_max_scale,
# which assigns columns in place, an independent frame to write into.
user_and_product = user_and_product[['user_id', 'product_id', 'product_name', 'price', 'user_rating', 'user_rating_count']].copy()
user_and_product = min_max_scale(user_and_product, ["price",'user_rating_count','user_rating'])

# One row per product; this row order defines the row/column order of every
# per-product similarity matrix built from `product_cat`.
product_cat = user_and_product[['product_id', 'product_name', 'price', 'user_rating', 'user_rating_count']].drop_duplicates('product_id')

tfidf_name = TfidfVectorizer()
# Vectorise exactly one name per `product_cat` row. De-duplicating the names
# first (as with .unique()) would give fewer rows than there are products
# whenever two products share a name, so the resulting similarity matrix would
# no longer line up with the product_id-labelled matrices it is multiplied with.
products_content_name = tfidf_name.fit_transform(product_cat['product_name'])
# TfidfVectorizer L2-normalises rows by default, so this linear kernel is the
# cosine similarity between product names.
cosine_sim_name = linear_kernel(products_content_name, products_content_name)


In [32]:
def _inverse_distance(values):
    """Return the pairwise closeness matrix 1 / (1 + |a - b|) for a 1-D series."""
    as_column = values.to_numpy().reshape(-1, 1)
    return np.reciprocal(euclidean_distances(as_column) + 1)


# Closeness of products by (scaled) price, labelled by product_id on both axes.
price_matrix = _inverse_distance(product_cat['price'])
euclidean_matrix1 = pd.DataFrame(
    price_matrix,
    index=product_cat['product_id'],
    columns=product_cat['product_id'],
)

# Closeness of products by (scaled) user rating.
rating_matrix = _inverse_distance(product_cat['user_rating'])
euclidean_matrix2 = pd.DataFrame(
    rating_matrix,
    index=product_cat['product_id'],
    columns=product_cat['product_id'],
)

# Closeness of products by (scaled) number of ratings.
# NOTE: `rating_matrix` is deliberately reassigned here, as in the original
# cell, so it ends up holding the rating-count matrix.
rating_matrix = _inverse_distance(product_cat['user_rating_count'])
euclidean_matrix3 = pd.DataFrame(
    rating_matrix,
    index=product_cat['product_id'],
    columns=product_cat['product_id'],
)

In [33]:
# Element-wise product of the four per-product similarity components.
# `cosine_sim_name` is a plain array, so it is combined by position and must
# share the row/column order of the product_id-labelled frames.
similarity_matrix = (
    euclidean_matrix1
    * euclidean_matrix2
    * euclidean_matrix3
    * cosine_sim_name
)

# Score every product for every user: interaction points weighted by how
# similar each interacted product is to the candidate product.
content_matrix = actions_sums_excel @ similarity_matrix
content_matrix

product_id,1001,1006,1013,1018,1023,1029,1033,1034,1047,1057,...,1162,1188,1286,1231,1572,1320,1181,1383,1447,1610
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
102,1.316841,2.768688,1.627099,2.490445,2.763227,2.898180,2.950113,6.799455,1.426620,1.769907,...,0.144552,0.591990,0.206984,0.302711,0.219330,0.170876,0.584887,2.007868,0.706865,0.000000
103,26.371025,27.015099,32.593937,43.178265,32.336778,18.553848,38.865658,29.830548,17.541889,20.544531,...,9.128892,14.088958,14.820611,17.569253,23.022856,18.118155,12.096249,42.543019,14.334669,7.584558
104,35.190331,35.813788,34.454195,46.740823,41.886982,16.612856,33.222729,40.162023,19.240438,26.977573,...,7.696515,17.507954,12.686443,16.532528,18.278920,18.004343,13.853529,39.443221,17.103974,6.334087
105,6.390550,10.494817,14.352590,14.304791,11.138143,9.773563,8.093654,8.385915,6.836536,15.136092,...,6.140933,5.726747,8.730778,10.401922,11.916329,11.641913,4.282386,13.518417,4.977697,4.876977
106,35.744005,34.609923,47.176679,51.099122,45.663773,18.901365,41.061196,41.813094,27.458136,23.909515,...,10.941364,20.266379,19.739914,20.516727,27.144640,28.061142,17.322309,64.222028,18.656087,6.019948
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,26.296291,40.490529,48.291429,50.456002,49.711967,29.666520,52.241581,34.215663,26.173362,20.374900,...,10.438539,20.058078,18.693910,18.823544,24.798901,22.058671,17.269181,62.317721,19.047648,8.140728
298,32.808619,42.673948,57.866682,58.590323,52.110276,15.889676,33.258614,39.219607,26.073562,25.029697,...,11.071833,22.883459,20.122223,17.643774,27.099006,24.470373,18.774761,44.993596,21.625049,9.155962
299,22.994992,24.013827,34.692790,30.254513,30.637455,15.337927,25.758622,19.425585,16.562677,14.769962,...,6.567806,12.793090,10.980826,10.028940,14.993119,16.278739,11.319679,30.364656,11.417674,4.044823
300,7.501783,10.302134,24.021595,16.652971,13.912381,5.612402,15.771098,11.840327,11.095435,4.176386,...,4.713105,6.960640,9.118326,5.589003,13.706659,10.008110,6.549386,20.737283,6.124180,1.765761
