In [2]:
import pandas as pd
import re
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, euclidean_distances, cosine_similarity

In [3]:
# Load the three JSON-Lines exports (one JSON object per line).
# Read order is fixed: users, sessions, products.
users_df, sessions_df, products_df = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        "../data/users.jsonl",
        "../data/sessions.jsonl",
        "../data/products.jsonl",
    )
)

In [4]:
# Work on a copy so the raw product table stays untouched.
products_cbf = products_df.copy()
# Category levels are separated by ';' in the raw data; turn the path into
# space-separated text so the vectorizer tokenizes each level's words.
products_cbf["category_path"] = products_cbf["category_path"].apply(lambda x: " ".join(x.split(";")))
tfidf_name = TfidfVectorizer()
tfidf_category = TfidfVectorizer()

# Learn vocabulary and IDF weights from the distinct texts only (so repeated
# texts do not skew the document frequencies), but transform EVERY product row.
# Row i of each matrix therefore describes row i of products_cbf, which is what
# the position-based lookups further down (indices_* and .iloc) rely on.
# Transforming only the unique texts would give one row per distinct text and
# misalign the category matrix, where many products share one path.
products_content_name = tfidf_name.fit(products_cbf['product_name'].unique()).transform(products_cbf['product_name'])
products_content_category = tfidf_category.fit(products_cbf['category_path'].unique()).transform(products_cbf['category_path'])

# Pairwise dot products of the TF-IDF rows: one (n_products, n_products)
# similarity matrix per text field.
cosine_sim_name = linear_kernel(products_content_name, products_content_name)
cosine_sim_category = linear_kernel(products_content_category, products_content_category)

In [5]:
# Reverse lookups: text label -> row position in products_cbf.
# Series.drop_duplicates() compares VALUES (the row index, which is already
# unique), so it never removed repeated labels. Deduplicate on the index
# instead, keeping the first product for each label, so that indices[label]
# always yields a single position rather than a Series of positions.
indices_category = pd.Series(products_cbf.index, index=products_cbf['category_path'])
indices_category = indices_category[~indices_category.index.duplicated(keep='first')]
indices_product = pd.Series(products_cbf.index, index=products_cbf['product_name'])
indices_product = indices_product[~indices_product.index.duplicated(keep='first')]

print(cosine_sim_name.shape)

(619, 619)


In [6]:
def get_recommendations(title, cosine_sim, indices, top_n=5, products=None):
    """Return the names of the products most similar to ``title``.

    Parameters
    ----------
    title : hashable
        Label to look up in ``indices`` (for example a product name).
    cosine_sim : 2-D array-like
        Pairwise similarity matrix indexed by row position. It is assumed to
        be row-aligned with ``products`` -- confirm this for every matrix
        passed in.
    indices : pandas.Series
        Maps a label to a row position. If a label occurs more than once,
        the first position is used.
    top_n : int, default 5
        Maximum number of recommendations to return.
    products : pandas.DataFrame, optional
        Table with a ``product_name`` column. Defaults to the notebook-level
        ``products_cbf``.

    Returns
    -------
    pandas.Series
        ``product_name`` of up to ``top_n`` products, most similar first. The
        queried product itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not a label of ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")
    if products is None:
        products = products_cbf

    # Row position of the queried product.
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # Duplicated label: the lookup returned several positions.
        idx = idx.iloc[0]

    # Rank every product by its similarity to the queried one. sorted() is
    # stable, so ties keep their original row order.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Exclude the queried product by position, not by assuming it ranks first:
    # another product with identical text ties with it and may sort ahead.
    neighbours = [position for position, _ in ranked if position != idx][:top_n]

    return products['product_name'].iloc[neighbours]

In [7]:
# Example query: products whose names are most similar to this monitor's name.
get_recommendations(
    title='Monitor LED AOC U27P2; 27 " 3840 x 2160 px IPS / PLS',
    cosine_sim=cosine_sim_name,
    indices=indices_product,
)

165    Monitor LED Samsung LS27A700NWUXEN 27 " 3840 x...
344    Monitor LED Samsung LS27A800NMUXEN 27 " 3840 x...
385    Monitor LED LG 27UL500-W 27 " 3840 x 2160 px I...
497    Monitor LED LG 27UP850-W 27 " 3840 x 2160 px I...
100    Monitor LED Samsung U28R550UQR 28 " 3840 x 216...
Name: product_name, dtype: object

In [8]:
# Score each session event: a product view is worth 1 point, any other
# event type is worth 3.
model_B_sessions = sessions_df.assign(
    action_points=sessions_df['event_type'].eq("VIEW_PRODUCT").map({True: 1, False: 3})
)

# Total points per (user, product) pair, in long format.
actions_sums = model_B_sessions.groupby(
    ['user_id', 'product_id'], as_index=False
)['action_points'].sum()

# Wide user x product matrix; pairs with no interaction become 0.
actions_sums_excel = actions_sums.pivot_table(
    values='action_points',
    index='user_id',
    columns='product_id',
).fillna(0)
actions_sums_excel

product_id,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,...,1610,1611,1612,1613,1614,1615,1616,1617,1618,1619
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
102,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
103,18.0,3.0,0.0,2.0,8.0,9.0,11.0,1.0,5.0,4.0,...,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
104,25.0,1.0,1.0,0.0,13.0,16.0,15.0,9.0,5.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
105,3.0,1.0,1.0,0.0,0.0,5.0,3.0,0.0,5.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0
106,24.0,2.0,3.0,0.0,7.0,12.0,16.0,13.0,26.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,3.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,14.0,2.0,4.0,2.0,10.0,19.0,18.0,8.0,0.0,4.0,...,2.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
298,19.0,1.0,6.0,1.0,8.0,17.0,24.0,13.0,0.0,3.0,...,1.0,0.0,4.0,3.0,0.0,2.0,0.0,0.0,1.0,5.0
299,15.0,1.0,0.0,3.0,5.0,10.0,16.0,1.0,0.0,5.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
300,3.0,1.0,3.0,0.0,5.0,2.0,7.0,2.0,5.0,5.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
def min_max_scale(df, columns):
    """Rescale the given columns of ``df`` to the [0, 1] range.

    Each listed column is replaced by ``(x - min) / (max - min)``. A column
    whose values are all equal has a zero range; it is mapped to 0.0 instead
    of dividing 0 by 0, which would turn the whole column into NaN. Missing
    values stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to rescale. It is modified in place.
    columns : iterable of str
        Names of the numeric columns to rescale.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    for column in columns:
        low = df[column].min()
        span = df[column].max() - low
        if span == 0:
            # Constant column: every value sits at the minimum.
            df[column] = (df[column] - low).astype(float)
        else:
            df[column] = (df[column] - low) / span
    return df

In [10]:
# Attach product attributes to every (user, product) interaction row. The
# left join keeps interactions even if a product is missing from the catalogue.
user_and_product = pd.merge(actions_sums, products_cbf, on="product_id", how="left")
# Take an explicit copy of the column subset: min_max_scale assigns columns in
# place, and doing that on a selection of another frame triggers pandas'
# SettingWithCopyWarning.
user_and_product = user_and_product[['user_id', 'product_id', 'product_name', 'price', 'user_rating', 'user_rating_count']].copy()
# NOTE(review): min/max are taken over interaction rows, so products that were
# never interacted with do not influence the scaling -- confirm this is intended.
user_and_product = min_max_scale(user_and_product, ["price",'user_rating_count','user_rating'])
user_and_product

Unnamed: 0,user_id,product_id,product_name,price,user_rating,user_rating_count
0,102,1001,"Monitor LED Philips 193V5LSB2/10 18.5 "" 1366 x...",0.458458,0.80,0.975806
1,102,1006,"Monitor LED AOC 24B2XDAM 24 "" VA",0.689690,0.20,0.588710
2,102,1013,"Monitor LED iiyama G-Master 24 "" 1920 x 1080 p...",0.799810,0.98,0.105847
3,102,1018,"Monitor LED Samsung C24F390 24 "" 1920 x 1080 p...",0.585586,0.24,0.677419
4,102,1023,"Monitor LED Samsung LC27RG50FQUXEN 27 "" 1920 x...",0.890891,0.30,0.930444
...,...,...,...,...,...,...
42458,301,1592,Myszka bezprzewodowa Logitech inny sensor opty...,0.162683,0.74,0.025202
42459,301,1596,Klawiatura z myszką Titanum,0.034975,0.04,0.380040
42460,301,1603,Kamera internetowa Logitech C505,0.179179,0.64,0.005040
42461,301,1612,Słuchawki nauszne Sennheiser PC 2 CHAT,0.081261,0.46,0.898185


In [11]:
# Price similarity between every pair of rows: 1 / (1 + |price_i - price_j|),
# so identical prices score 1 and the score shrinks as prices diverge.
# The matrix is n_rows x n_rows. The +1 and the reciprocal are applied in place
# on the distance matrix so no additional arrays of that size are allocated;
# the resulting values are the same.
# NOTE(review): rows here are (user, product) interactions, not unique
# products, so the same product pair appears many times -- confirm that a
# per-interaction matrix (rather than a per-product one) is what is wanted.
price_matrix = euclidean_distances(np.array(user_and_product['price']).reshape(-1, 1))
price_matrix += 1
np.reciprocal(price_matrix, out=price_matrix)
price_matrix.shape

(42463, 42463)