In [17]:
import pandas as pd
import re
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [18]:
users_df = pd.read_json("../data/users.jsonl", lines=True)
sessions_df = pd.read_json("../data/sessions.jsonl", lines=True)
products_df = pd.read_json("../data/products.jsonl", lines=True)

In [22]:
products_cbf = products_df.copy()
products_cbf["category_path"] = products_cbf["category_path"].apply(lambda x: " ".join(x.split(";")))                                                           
tfidf_name = TfidfVectorizer()
tfidf_category = TfidfVectorizer()

products_content_name = tfidf_name.fit_transform(products_cbf['product_name'].unique())
products_content_category = tfidf_category.fit_transform(products_cbf['category_path'].unique())

cosine_sim_name = linear_kernel(products_content_name, products_content_name)
cosine_sim_category = linear_kernel(products_content_category, products_content_category)

In [23]:
indices_category = pd.Series(products_cbf.index, index=products_cbf['category_path']).drop_duplicates()
indices_product = pd.Series(products_cbf.index, index=products_cbf['product_name']).drop_duplicates()

print(indices_category)
print(indices_product)

category_path
Monitory komputerowe Monitory               0
Klawiatury Przewodowe                       1
Monitory komputerowe Monitory               2
Zestaw klawiatura i mysz Bezprzewodowe      3
Monitory komputerowe Monitory               4
                                         ... 
Myszki Bezprzewodowe                      614
Klawiatury Bezprzewodowe                  615
Myszki Przewodowe                         616
Monitory komputerowe Monitory             617
Zestaw klawiatura i mysz Bezprzewodowe    618
Length: 619, dtype: int64
product_name
Monitor LED Philips 193V5LSB2/10 18.5 " 1366 x 768 px TN           0
Klawiatura mechaniczna Marvo KG962 EN B                            1
Monitor LED iiyama XUB2390HS-B1 23 " 1920 x 1080 px IPS / PLS      2
Zestaw bezprzewodowy 2w1 klawiatura + mysz Natec                   3
Monitor LED AOC U27P2; 27 " 3840 x 2160 px IPS / PLS               4
                                                                ... 
Myszka bezprzewodowa Logite

In [24]:
def get_recommendations(title, cosine_sim, indices):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return products_cbf['product_name'].iloc[movie_indices]

In [25]:
get_recommendations('Monitor LED AOC U27P2; 27 " 3840 x 2160 px IPS / PLS', cosine_sim_name, indices_product)

165    Monitor LED Samsung LS27A700NWUXEN 27 " 3840 x...
344    Monitor LED Samsung LS27A800NMUXEN 27 " 3840 x...
385    Monitor LED LG 27UL500-W 27 " 3840 x 2160 px I...
497    Monitor LED LG 27UP850-W 27 " 3840 x 2160 px I...
100    Monitor LED Samsung U28R550UQR 28 " 3840 x 216...
182    Monitor LED AOC 27V2Q 27 " 1920 x 1080 px IPS ...
407    Monitor LED AOC Q2790PQE 27 " 2560 x 1440 px I...
211    Monitor LED BenQ 9H.LF9LA.TBE 32 " 3840 x 2160...
47                     Monitor AOC U27P2CA 27" IPS / PLS
401    Monitor LED Samsung F32TU870VR 32 " 3840 x 216...
Name: product_name, dtype: object