In [1]:
# potrzebne importy
import pickle
import re

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

In [6]:
# Load the raw datasets. `lines=True` makes pandas read each file as
# JSON Lines, i.e. one JSON object per line.
users_df = pd.read_json("../data/users.jsonl", lines=True)
sessions_df = pd.read_json("../data/sessions.jsonl", lines=True)
products_df = pd.read_json("../data/products.jsonl", lines=True)

#### Funkcje pomocnicze do przetwarzania danych

In [3]:
def extract_color(products_df):
    """Add a ``color`` column parsed from each product's ``optional_attributes``.

    Each attribute value is converted to its string form and searched for a
    ``'color': '<word>'`` entry. Rows without such an entry get ``None``.

    Args:
        products_df: DataFrame with an ``optional_attributes`` column.
            It is modified in place.

    Returns:
        The same DataFrame object, with the ``color`` column added.
    """
    # Raw string: "\w" inside a regular string literal is an invalid escape.
    color_pattern = re.compile(r"'color': '(\w+)'")

    def _color_of(attributes):
        match = color_pattern.search(str(attributes))
        return match.group(1) if match else None

    # Assigning the whole column at once creates it even for an empty frame
    # and does not depend on the index labels being unique.
    products_df["color"] = products_df["optional_attributes"].map(_color_of)
    return products_df

In [4]:
def encode_and_bind(original_dataframe, feature_to_encode):
    """Replace one categorical column with its one-hot encoded columns.

    The indicator columns are named after the category values themselves
    (no prefix) and are appended after the existing columns. The input
    frame is left untouched.

    Args:
        original_dataframe: DataFrame containing ``feature_to_encode``.
        feature_to_encode: Name of the categorical column to expand.

    Returns:
        A new DataFrame without ``feature_to_encode`` and with one
        indicator column per distinct value of it.
    """
    one_hot = pd.get_dummies(
        original_dataframe[[feature_to_encode]], prefix="", prefix_sep=""
    )
    combined = pd.concat([original_dataframe, one_hot], axis=1)
    return combined.drop(columns=[feature_to_encode])

In [5]:
def normalize(df, columns_to_norm):
    """Min-max scale the selected columns of ``df`` in place.

    Every column of ``df`` that is listed in ``columns_to_norm`` is rescaled
    as ``(x - min) / (max - min)``. Names in ``columns_to_norm`` that are not
    columns of ``df`` are ignored.

    Args:
        df: DataFrame to scale. It is modified in place.
        columns_to_norm: Collection of column names to rescale.

    Returns:
        The same DataFrame object, with the selected columns rescaled.
    """
    for feature_name in df.columns:
        if feature_name not in columns_to_norm:
            continue
        min_value = df[feature_name].min()
        value_range = df[feature_name].max() - min_value
        shifted = df[feature_name] - min_value
        if value_range == 0:
            # Constant column: dividing would give 0/0 = NaN for every row,
            # so map all values to 0.0 instead.
            df[feature_name] = shifted.astype(float)
        else:
            df[feature_name] = shifted / value_range
    return df

In [8]:
def get_users_views(sessions):
    """Count product views per (user, product) pair.

    Every session row contributes 1 when its ``event_type`` equals
    ``"VIEW_PRODUCT"`` and 0 otherwise; the contributions are summed for
    each ``user_id`` / ``product_id`` combination. The input frame is not
    modified.

    Args:
        sessions: DataFrame with ``user_id``, ``product_id`` and
            ``event_type`` columns.

    Returns:
        DataFrame with columns ``user_id``, ``product_id`` and ``view``.
    """
    def _view_flag(event_type):
        return 1 if event_type == "VIEW_PRODUCT" else 0

    flagged = sessions.assign(view=sessions['event_type'].map(_view_flag))
    per_user_product = flagged.groupby(['user_id', 'product_id'], as_index=False)
    return per_user_product['view'].sum()

In [9]:
def get_product_params(product_id, products):
    """Return the feature row(s) of one product, without its identifier.

    Args:
        product_id: Value to look up in the ``product_id`` column.
        products: DataFrame with a ``product_id`` column plus feature columns.

    Returns:
        DataFrame holding the matching row(s) with the ``product_id`` column
        removed; empty when no row matches.
    """
    is_requested = products["product_id"] == product_id
    return products.loc[is_requested].drop(columns="product_id")
    

## Model naiwny

Model naiwny korzysta z algorytmu K najbliższych sąsiadów (KNN). Na podstawie bazy wszystkich produktów obliczane są odległości między produktami według metryki cosinusowej. Atrybuty wybrane do obliczania odległości to: cena, ścieżka kategorii oraz ocena użytkowników. Następnie budowana jest tablica krotek (użytkownik, produkt, liczba wyświetleń produktu), czyli tablica popularności danego produktu u danego użytkownika. Tablica ta stanowi zbiór treningowy: dla każdego użytkownika produkty są sortowane malejąco według liczby wyświetleń, a następnie dla najczęściej oglądanego produktu uruchamiany jest algorytm KNN, który znajduje produkty najbardziej do niego podobne. Jeżeli polecane produkty zostały już wcześniej wyświetlone przez użytkownika, algorytm szuka produktów podobnych do kolejnych, mniej popularnych pozycji, tak aby wśród rekomendacji znalazły się wyłącznie produkty, których użytkownik jeszcze nie widział.

In [34]:
def prepare_products_set_basic(products_df):
    """Build the product feature table used by the basic model.

    Works on a copy of ``products_df``: one-hot encodes ``category_path``,
    drops the columns the basic model does not use and min-max scales
    ``price`` and ``user_rating``. ``product_id`` is kept.

    Args:
        products_df: Raw products DataFrame.

    Returns:
        A new DataFrame with ``product_id`` and the model features.
    """
    unused_columns = [
        "product_name", "brand", "optional_attributes",
        "weight_kg", "color", "user_rating_count",
    ]
    # The colour is extracted first and then discarded with the other
    # unused columns, so it never reaches the feature table.
    with_color = extract_color(products_df.copy())
    encoded = encode_and_bind(with_color, 'category_path')
    features = encoded.drop(columns=unused_columns)
    return normalize(features, ['price', 'user_rating'])

In [40]:
# Build the product feature table for the basic (naive) model.
products_set = prepare_products_set_basic(products_df)

In [46]:
def train_basic_model(train_set, k=5, algo="brute", metric="cosine"):
    """Fit a nearest-neighbours index on the product features.

    Args:
        train_set: DataFrame with a ``product_id`` column plus the feature
            columns; ``product_id`` is excluded from the fitted features.
        k: Number of neighbours returned by default from ``kneighbors``.
        algo: Value passed as ``algorithm`` to ``NearestNeighbors``.
        metric: Distance metric passed to ``NearestNeighbors``.

    Returns:
        The fitted ``NearestNeighbors`` estimator.
    """
    # Fit on the argument rather than on a notebook-level global, so the
    # function works on a fresh kernel and for any feature table passed in.
    X = train_set.drop(columns=["product_id"])
    return NearestNeighbors(n_neighbors=k, algorithm=algo, metric=metric).fit(X)

In [47]:
# Fit the basic nearest-neighbours model with the function's defaults
# (k=5, brute-force search, cosine metric).
nbrs = train_basic_model(products_set)

In [172]:
# Persist the preprocessed product features and the fitted model to disk.
# `products_set` is the feature table built earlier in this notebook; the
# previously used name `products_model_A` is not defined anywhere in it.
products_set.to_pickle("products_basic.pickle")
with open("model_basic.pickle", "wb") as f:
    pickle.dump(nbrs, f)

In [327]:
def recommendations_basic(user_id, model, users_views, products, k):
    """Recommend up to ``k`` products the user has not viewed yet.

    The user's viewed products are visited from most to least viewed. For
    each one the model's nearest neighbours are collected, skipping products
    the user has already viewed and products already recommended.

    Args:
        user_id: Identifier of the user to recommend for.
        model: Fitted object exposing ``kneighbors(X, return_distance=False)``
            that returns row positions into ``products``.
        users_views: DataFrame with ``user_id``, ``product_id`` and ``view``
            columns.
        products: Feature table with a ``product_id`` column, in the same
            row order the model was fitted on.
        k: Maximum number of recommendations to return.

    Returns:
        List of at most ``k`` product ids (Python ints). The list is shorter,
        possibly empty, when not enough unseen neighbours exist.
    """
    if k <= 0:
        return []

    user_rows = users_views.loc[users_views["user_id"] == user_id]
    viewed_products = list(
        user_rows.sort_values(by=["view"], ascending=False)["product_id"]
    )
    # Products that must not be recommended: already viewed or already picked.
    excluded = set(viewed_products)
    final_recommendation = []

    for product in viewed_products:
        query = get_product_params(product, products)
        if query.empty:
            # Viewed product is missing from the catalogue; kneighbors would
            # raise on an empty query, so move on to the next product.
            continue
        neighbours = model.kneighbors(query, return_distance=False)
        for position in neighbours[0]:
            # Scalar lookup; int() on a one-element Series is deprecated.
            pid = int(products["product_id"].iloc[position])
            if pid in excluded:
                continue
            excluded.add(pid)
            final_recommendation.append(pid)
            if len(final_recommendation) == k:
                return final_recommendation

    # Fewer than k candidates: return what was found instead of None.
    return final_recommendation

In [217]:
# Split the per-user view counts into train/test parts and pick the users
# the model is evaluated on. A fixed seed keeps the split, and therefore
# the reported metric, reproducible across kernel restarts.
SPLIT_SEED = 42
# reset_index() keeps the previous row index as an extra "index" column;
# it is left as is so the columns of the saved frames do not change.
user_views_data = get_users_views(sessions_df).reset_index()
train_data = user_views_data.sample(frac=0.8, random_state=SPLIT_SEED)
test_data = user_views_data.drop(train_data.index)
test_users = users_df.sample(frac=0.05, random_state=SPLIT_SEED)["user_id"]

In [329]:

# Evaluate the naive model: the share of recommendation slots (k per test
# user) filled with a product the user viewed in the held-out test data.
train_data_basic = train_data.copy()
test_data_basic = test_data.copy()
k = 5

# (user, product) pairs with at least one view in the test split, built once
# so each recommendation is checked with a set lookup instead of a frame scan.
viewed_rows = test_data_basic.loc[test_data_basic['view'] > 0]
viewed_in_test = set(zip(viewed_rows['user_id'], viewed_rows['product_id']))

correct = 0
for user in test_users:
    # `products_set` is the prepared product feature table; the previously
    # used name `products_model_A` is not defined anywhere in this notebook.
    recommendations = recommendations_basic(user, nbrs, train_data_basic, products_set, k)
    # The recommender may yield nothing for a user with too few candidates.
    for recommendation in recommendations or []:
        if (user, recommendation) in viewed_in_test:
            correct += 1

# Guard against an empty user sample to avoid a division by zero.
evaluated_slots = k * len(test_users)
result = correct / evaluated_slots if evaluated_slots else 0.0

print("{:.4f}".format(result))

0.2000


In [319]:
# Persist the train/test splits of the per-user view counts used in the
# evaluation above.
train_data_basic.to_pickle("train_data_basic.pickle")
test_data_basic.to_pickle("test_data_basic.pickle")

In [321]:
# Show the ids of the sampled test users as a plain Python list.
test_users.tolist()


[299, 172, 191, 183, 126, 135, 245, 136, 127, 248]