In [3]:
import pandas as pd
import re
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB

In [4]:
users_df = pd.read_json("../data/users.jsonl", lines=True)
sessions_df = pd.read_json("../data/sessions.jsonl", lines=True)
products_df = pd.read_json("../data/products.jsonl", lines=True)
print(users_df)

     user_id             name      city                 street
0        102  Monika Forysiak    Poznań      plac Dębowa 11/53
1        103  Kacper Malewicz   Wrocław   aleja Browarna 79/72
2        104    Tomasz Janiuk    Kraków  ulica Cegielniana 318
3        105    Roksana Mućka     Radom        plac Perłowa 48
4        106     Wiktor Jarka  Warszawa            al. Bema 37
..       ...              ...       ...                    ...
195      297    Błażej Pachla    Kraków      ulica Lisia 09/00
196      298     Cezary Jonak    Gdynia    aleja Kołłątaja 110
197      299     Sylwia Karol   Wrocław       al. Podleśna 999
198      300      Bruno Cisoń     Radom   ulica Malinowa 64/08
199      301        Tola Osik  Szczecin    plac Tulipanowa 386

[200 rows x 4 columns]


In [5]:
def extract_color(products_df):
    for index, row in products_df.iterrows():
        color = re.search("'color': '\w+'", str(row["optional_attributes"]))
        if color != None:
            products_df.loc[index, "color"] = color.group(0)[10:-1]
        else:
            products_df.loc[index, "color"] = None
    return products_df

In [7]:
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]], prefix="", prefix_sep="")
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return(res) 

In [8]:
def normalize(df, columns_to_norm):
    for feature_name in df.columns:
        if feature_name in columns_to_norm:
            max_value = df[feature_name].max()
            min_value = df[feature_name].min()
            df[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return df

In [64]:
products_model_A = products_df.copy()
products_model_A = extract_color(products_model_A)
products_model_A = encode_and_bind(products_model_A, 'category_path')
products_model_A = products_model_A.drop(columns=["product_name", "brand", "optional_attributes","weight_kg","color", "user_rating_count"])
products_model_A = normalize(products_model_A, ['price', 'user_rating'])
products_model_A.to_pickle("products_a.pickle")
X = products_model_A.drop(columns=["product_id"])
nbrs = NearestNeighbors(n_neighbors=5, algorithm='brute', metric="cosine").fit(X)



In [65]:
import pickle
with open("model_basic.pickle", "wb") as f:
    pickle.dump(nbrs, f)

In [10]:
def get_users_views(sessions):
    sessions_model_A = sessions.copy()
    sessions_model_A['view'] = sessions_model_A['event_type'].map(lambda x: 1 if x == "VIEW_PRODUCT" else 0)
    users_views = sessions_model_A.groupby(['user_id', 'product_id'], as_index=False)['view'].sum()
    return users_views

In [16]:
def get_product_params(product_id, products):
    ret_product = products.loc[products["product_id"] == product_id]
    ret_product = ret_product.drop(columns="product_id")
    return ret_product
    

In [57]:
def get_recommendationsA(user_id, model, users_views, products, k):
    most_viewed = users_views.loc[users_views["user_id"] == user_id].sort_values(by=["view"], ascending=False)
    viewed_products = list(most_viewed["product_id"])
    final_reccomendation = []
    for product in viewed_products:
        distances, recommended = model.kneighbors(get_product_params(product, products))
        for recommended_product in recommended[0]:
            recommended_product_id = products.iloc[[recommended_product]]['product_id']
            if int(recommended_product_id) not in viewed_products:
                final_reccomendation.append(int(recommended_product_id))
            if len(final_reccomendation) == k:
                return final_reccomendation

In [63]:
k = 5
user_views_data = get_users_views(sessions_df).reset_index()
train_data_A = user_views_data.sample(frac = 0.8)
test_data_A = user_views_data.drop(train_data_A.index)

test_users = users_df.sample(frac = 0.05)["user_id"]

correct = 0
for user in test_users:
    recommendations = get_recommendationsA(user, nbrs, train_data_A, products_model_A, k)
    for recommendation in recommendations:
        view = test_data_A.loc[(test_data_A['product_id'] == recommendation) & (test_data_A['user_id'] == user)]['view']
        if view.any():
            correct += 1
            
result = correct / (k*len(test_users))

print("{:.4f}".format(result))

0.1800
