In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, euclidean_distances, cosine_similarity

In [2]:
users_df = pd.read_json("../data/users.jsonl", lines=True)
sessions_df = pd.read_json("../data/sessions.jsonl", lines=True)
products_df = pd.read_json("../data/products.jsonl", lines=True)

In [27]:
products_cbf = products_df.copy()

In [181]:
model_B_sessions = sessions_df.copy()
model_B_sessions['action_points'] = model_B_sessions['event_type'].map(lambda x: 1 if x == "VIEW_PRODUCT" else 3)
actions_sums = model_B_sessions.groupby(['user_id', 'product_id'], as_index=False)['action_points'].sum()
actions_sums_excel = pd.pivot_table(actions_sums, values='action_points', index='user_id', columns='product_id')
actions_sums_excel = actions_sums_excel.fillna(0)
actions_sums_excel_2 = actions_sums_excel.copy()
actions_sums_excel = actions_sums_excel.stack().reset_index().rename(columns={0:"action_points"})
actions_sums_excel["viewed"] = actions_sums_excel['action_points'].apply(lambda x: 1 if x > 0 else 0)
train_data = actions_sums_excel.sample(frac = 0.8)
test_data = actions_sums_excel.drop(train_data.index)

In [182]:
def min_max_scale(df, columns):
    for column in columns:
        df[column] = (df[column] - df[column].min()) / (df[column].max() - df[column].min())
    return df

In [183]:
user_and_product = pd.merge(train_data, products_cbf, on="product_id", how="left")
user_and_product = user_and_product[['user_id', 'product_id', 'product_name', 'price', 'user_rating', 'user_rating_count']]
user_and_product = min_max_scale(user_and_product, ["price",'user_rating_count','user_rating'])
product_cat = user_and_product[['product_id', 'product_name', 'price', 'user_rating', 'user_rating_count']].drop_duplicates('product_id')
tfidf_name = TfidfVectorizer()
products_content_name = tfidf_name.fit_transform(product_cat['product_name'].unique())
cosine_sim_name = linear_kernel(products_content_name, products_content_name)


In [184]:
price_matrix = np.reciprocal(euclidean_distances(np.array(product_cat['price']).reshape(-1,1))+1)
euclidean_matrix1 = pd.DataFrame(price_matrix,columns=product_cat['product_id'],index=product_cat['product_id'])

rating_matrix = np.reciprocal(euclidean_distances(np.array(product_cat['user_rating']).reshape(-1,1))+1)
euclidean_matrix2 = pd.DataFrame(rating_matrix,columns=product_cat['product_id'],index=product_cat['product_id'])

rating_matrix = np.reciprocal(euclidean_distances(np.array(product_cat['user_rating_count']).reshape(-1,1))+1)
euclidean_matrix3 = pd.DataFrame(rating_matrix,columns=product_cat['product_id'],index=product_cat['product_id'])

In [185]:
similarity_matrix = euclidean_matrix1.multiply(euclidean_matrix2).multiply(euclidean_matrix3).multiply(cosine_sim_name)
content_matrix = actions_sums_excel_2.dot(similarity_matrix)
content_df = content_matrix.stack().reset_index()
content_df = content_df.rename(columns={'level_0':'user_id','level_1':'product_id',0:'predicted_interaction'})
content_df

Unnamed: 0,user_id,product_id,predicted_interaction
0,102,1497,0.176448
1,102,1276,0.228600
2,102,1168,0.480910
3,102,1084,0.337705
4,102,1457,0.514518
...,...,...,...
123795,301,1585,7.144697
123796,301,1268,6.804175
123797,301,1045,23.898203
123798,301,1393,15.536729


In [186]:
def get_recommendationB(content_df, user_id, dataset, K):
    '''
    Zwraca id produktów K "najlepszych" rekomendacji dla podanego użytkownika.
    Jako dataset należy podać zbiór danych zawierający liczbę "kliknięć" dla każdej pary użytkownik-produkt.
    Rekomendacje nie zawierają produktów, które użytkownik już kiedyś wyświetlił.
    '''    
    content_df = content_df.loc[content_df['user_id'] == user_id]
    user_content_df = content_df.sort_values(by="predicted_interaction", ascending=False)
    for pid in dataset[(dataset["viewed"]==1) & (dataset["user_id"]==user_id)]["product_id"]:
        user_content_df.drop(user_content_df[user_content_df['product_id'] == pid].index, inplace = True)
  
    return list(user_content_df.head(K)["product_id"])

In [189]:
test_users = users_df.sample(frac = 0.05)["user_id"]
K = 5
correct = 0
for user in test_users:
    recommendations = get_recommendationB(content_df, user, train_data, K)
    for recommendation in recommendations:
        view = test_data[(test_data['product_id'] == recommendation) & (test_data['user_id'] == user)]['viewed']
        if view.any():
            correct = correct + 1
result = correct / (K * len(test_users))

print("{:.4f}".format(result))

0.3800
