In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime, timedelta

from tqdm import tqdm

In [110]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

from scipy.sparse import csr_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [16]:
def min_preprocessing(actions, catalog):
    """Apply the minimal clean-up needed before modelling, without touching the inputs.

    Parameters
    ----------
    actions : pd.DataFrame
        Action log with at least the columns ``date``, ``products`` (a
        list-like of product ids per row) and ``loc_user_id``.
    catalog : pd.DataFrame
        Product catalog with at least the columns ``add_date``, ``shop_id``,
        ``category_id``, ``product_id`` and ``title``.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``(actions, catalog)`` as new frames:

        * actions: one row per (action, product); ``products`` is renamed to
          ``productId`` and cast to int (rows without a product get id 0),
          ``date`` is parsed to datetime and ``loc_user_id`` is dropped.
        * catalog: ``add_date`` parsed to datetime, the id columns cast to
          int and ``title`` dropped.

    Raises
    ------
    KeyError
        If ``loc_user_id`` or ``title`` is missing from the respective frame.

    Notes
    -----
    The frames passed in are left unchanged, so the function can be called
    repeatedly on the same raw data (e.g. when the notebook cell is re-run).
    """
    # `explode` already returns a new frame, so everything after it is free to
    # modify the result without leaking changes back to the caller's frame.
    actions = (
        actions
        .explode('products')
        .drop(columns='loc_user_id')
        .rename(columns={'products': 'productId'})
    )
    actions['date'] = pd.to_datetime(actions['date'])
    # Exploding an empty product list yields NaN; such rows get product id 0.
    actions['productId'] = actions['productId'].fillna(0).astype(int)

    # `drop` returns a new frame as well, which protects the caller's catalog.
    catalog = catalog.drop(columns=['title'])
    catalog['add_date'] = pd.to_datetime(catalog['add_date'])
    for id_column in ('shop_id', 'category_id', 'product_id'):
        catalog[id_column] = catalog[id_column].astype(int)

    return actions, catalog

In [14]:
# Read the raw action log and the pre-processed shop catalog from parquet.
train_actions = pd.read_parquet(
    '../data/raw/train_actions.pq',
    engine='pyarrow',
)
stokman_catalog = pd.read_parquet(
    '../data/raw/stokman_catalog_preprocessed.pq',
    engine='pyarrow',
)

In [17]:
# Clean both frames, then drop the raw names so only the processed ones remain.
(actions, catalog) = min_preprocessing(
    actions=train_actions,
    catalog=stokman_catalog,
)
del stokman_catalog, train_actions

In [77]:
# Number of logged actions per user, most active users first.
# A stable sort keeps the relative order of tied users fixed, so the set of
# users that makes the cutoff below is the same on every run.
users_actions = (
    actions
    .groupby('user_id')
    .agg({'action': 'count'})
    .sort_values(by='action', ascending=False, kind='stable')
)

# Keep only the rows of the 10 000 most active users. `users_actions` is
# already sorted, so its first 10 000 index entries are the users to keep.
actions_cutted = actions[actions.user_id.isin(users_actions.index[:10000])]

In [113]:
# Only `actions_cutted` is used from here on; release the larger frames.
del users_actions, actions

In [88]:
# Time-based split: rows on or after the split date are test, earlier rows are train.
split_date = '2024-09-26'
on_or_after_split = actions_cutted['date'] >= split_date
before_split = actions_cutted['date'] < split_date

actions_test = actions_cutted[on_or_after_split]

# Train keeps only users that also appear in the test period ...
test_period_users = actions_test.user_id.unique()
actions_train = actions_cutted[before_split & actions_cutted.user_id.isin(test_period_users)]

# ... and test keeps only users that have some training history.
train_period_users = actions_train.user_id.unique()
actions_test = actions_test[actions_test.user_id.isin(train_period_users)]

In [247]:
# Empty user x product score matrix for the training period.
# Rows: users seen in training. Columns: catalog products that occur in training.
train_product_ids = catalog.product_id[
    catalog.product_id.isin(actions_train['productId'].unique())
].unique()

# `dtype=float` matters: without it pandas builds an all-NaN *object* frame,
# which makes the later update/fillna steps slow and dependent on pandas'
# deprecated implicit downcasting of object columns.
train = pd.DataFrame(
    index=actions_train['user_id'].unique(),
    columns=train_product_ids,
    dtype=float,
)

In [251]:
# Empty user x product score matrix for the test period.
# Rows: users seen in the test period. Columns: the same product set as the
# training matrix (catalog products that occur in training), so both matrices
# share one feature space.
test_matrix_product_ids = catalog.product_id[
    catalog.product_id.isin(actions_train['productId'].unique())
].unique()

# `dtype=float` matters: without it pandas builds an all-NaN *object* frame,
# which makes the later update/fillna steps slow and dependent on pandas'
# deprecated implicit downcasting of object columns.
test = pd.DataFrame(
    index=actions_test['user_id'].unique(),
    columns=test_matrix_product_ids,
    dtype=float,
)

БЫЛО <br>
actions_coefs_arr = np.array([1, 6, 8, -4, -4, 10, 1, 2, 2, 4]) <br>
СТАЛО (после 5-го результата) <br>
actions_coefs_arr = np.array([1, 6, 8, -1, -1, 100, 1, 2, 2, 4]) <br>
После 8-го результата <br>
actions_coefs_arr = np.array([1, 1, 1, 1, 1, 10, 1, 1, 1, 1])

In [310]:
# Action names listed in action-code order: the position in this tuple is the code.
_ACTION_NAMES = (
    'view',
    'like',
    'addB',
    'delB',
    'clearB',
    'order',
    'listB',
    'visit',
    'visitCategory',
    'search',
)

# Action code -> action name.
ACTIONS = dict(enumerate(_ACTION_NAMES))

# Weight of a single occurrence of each action in the interaction score.
_ACTION_WEIGHTS = {
    'view': 1,
    'like': 5,
    'addB': 4,
    'delB': 0,
    'clearB': 0,
    'order': 10,
    'listB': 3,
    'visit': 2,
    'visitCategory': 1,
    'search': 1,
}

# Same weights as a vector indexed by action code.
actions_coefs_arr = np.array([_ACTION_WEIGHTS[ACTIONS[code]] for code in sorted(ACTIONS)])

In [249]:
# Count how often each user performed each action on each product:
# rows are (user_id, productId) pairs, one column per action code.
action_counts_train = (
    actions_train
    .groupby(['user_id', 'productId', 'action'])
    .size()
    .unstack(fill_value=0)
)

# `actions_coefs_arr` is positional (one weight per key of ACTIONS), so the
# count columns must be exactly those codes in that order. A code that never
# occurs in this slice would otherwise be missing and break the product below.
# Assumes the `action` column holds the integer codes used as ACTIONS keys —
# anything else is reported instead of being weighted incorrectly.
action_codes = list(ACTIONS)
unknown_codes = action_counts_train.columns.difference(action_codes)
if len(unknown_codes) > 0:
    raise ValueError(
        f'actions_train contains action codes missing from ACTIONS: {unknown_codes.tolist()}'
    )
action_counts_train = (
    action_counts_train
    .reindex(columns=action_codes, fill_value=0)
    .reset_index()
)

# Weighted sum of the action counts = interaction score of the (user, product) pair.
action_counts_train['score'] = action_counts_train[action_codes].values @ actions_coefs_arr

# Reshape to user x product and copy the scores into the pre-built `train` matrix;
# `update` only touches the users/products that exist in `train`.
pivot_scores_train = action_counts_train.pivot_table(values='score', index='user_id', columns='productId', fill_value=0)
train.update(pivot_scores_train)

In [252]:
# Count how often each user performed each action on each product:
# rows are (user_id, productId) pairs, one column per action code.
action_counts_test = (
    actions_test
    .groupby(['user_id', 'productId', 'action'])
    .size()
    .unstack(fill_value=0)
)

# `actions_coefs_arr` is positional (one weight per key of ACTIONS), so the
# count columns must be exactly those codes in that order. The test period is
# short, so a rare action code can easily be absent from it; without the
# reindex the product below would fail on the shape mismatch.
# Assumes the `action` column holds the integer codes used as ACTIONS keys —
# anything else is reported instead of being weighted incorrectly.
action_codes = list(ACTIONS)
unknown_codes = action_counts_test.columns.difference(action_codes)
if len(unknown_codes) > 0:
    raise ValueError(
        f'actions_test contains action codes missing from ACTIONS: {unknown_codes.tolist()}'
    )
action_counts_test = (
    action_counts_test
    .reindex(columns=action_codes, fill_value=0)
    .reset_index()
)

# Weighted sum of the action counts = interaction score of the (user, product) pair.
action_counts_test['score'] = action_counts_test[action_codes].values @ actions_coefs_arr

# Reshape to user x product and copy the scores into the pre-built `test` matrix;
# `update` only touches the users/products that exist in `test`.
pivot_scores_test = action_counts_test.pivot_table(values='score', index='user_id', columns='productId', fill_value=0)
test.update(pivot_scores_test)

In [253]:
# The scores now live in `train`; release the intermediate frames.
del action_counts_train, pivot_scores_train

In [254]:
# The scores now live in `test`; release the intermediate frames.
del action_counts_test, pivot_scores_test

In [255]:
# Pairs without any interaction get score 0.
# Cast to float BEFORE filling: if `train` is still an object-dtype frame,
# filling it directly relies on pandas' deprecated silent downcasting of
# object columns. Casting first gives a plain float matrix in every version.
train = train.astype(float).fillna(0)

In [256]:
# Pairs without any interaction get score 0.
# Cast to float BEFORE filling: if `test` is still an object-dtype frame,
# filling it directly relies on pandas' deprecated silent downcasting of
# object columns. Casting first gives a plain float matrix in every version.
test = test.astype(float).fillna(0)

  test.fillna(0, inplace=True)


In [284]:
# Baseline decomposition: 25 latent components, randomized solver, fixed seed.
svd = TruncatedSVD(
    algorithm='randomized',
    n_components=25,
    n_oversamples=10,
    random_state=42,
)
svd.fit(train)

In [291]:
# Sweep the number of latent components (5, 10, ..., 50) and, for each setting,
# record the MSE between the `test` matrix and its reconstruction through an
# SVD fitted on `train`. The inner loop only ever yields n_oversamples = 10.
# NOTE(review): after the loop `svd`, `predicted_ratings` and `mse` keep the
# values of the LAST iteration; other cells of this notebook read those names.
mses = []
for i in range(5, 55, 5):
    for j in range(10, 11, 1):
        svd = TruncatedSVD(n_components=i, n_oversamples=j, random_state=42, algorithm='randomized')
        svd.fit(train)
        test_transformed = svd.transform(test)
        # Item-factor matrix of the fitted decomposition
        item_factors = svd.components_  # used as-is; no transpose is applied here

        # Reconstructed scores for every (test user, product) pair
        predicted_ratings = np.dot(test_transformed, item_factors)
        mse = (np.mean((test.values.flatten() - predicted_ratings.flatten()) ** 2))
        mses.append(mse)
        print(f'i = {i}, j= {j}, mse = {mse:.5f}')
print("MSE на тестовой выборке:", mses)

i = 5, j= 10, mse = 0.00462
i = 10, j= 10, mse = 0.00461
i = 15, j= 10, mse = 0.00461
i = 20, j= 10, mse = 0.00460
i = 25, j= 10, mse = 0.00460
i = 30, j= 10, mse = 0.00459
i = 35, j= 10, mse = 0.00458
i = 40, j= 10, mse = 0.00458
i = 45, j= 10, mse = 0.00458
i = 50, j= 10, mse = 0.00458
MSE на тестовой выборке: [np.float64(0.004621053392097344), np.float64(0.004612716518334134), np.float64(0.004605438709940398), np.float64(0.004600245642055351), np.float64(0.004597280455908079), np.float64(0.004594838082856883), np.float64(0.004583538553977201), np.float64(0.0045811764172529815), np.float64(0.004578650293874213), np.float64(0.004576780330754094)]


In [311]:
# Full user x product score matrix built from ALL selected actions
# (train and test periods together).
load_product_ids = catalog.product_id[
    catalog.product_id.isin(actions_cutted['productId'].unique())
].unique()

# `dtype=float` avoids the all-NaN *object* frame pandas would otherwise build,
# which makes update/fillna slow and dependent on deprecated implicit downcasting.
load = pd.DataFrame(
    index=actions_cutted['user_id'].unique(),
    columns=load_product_ids,
    dtype=float,
)

# Count how often each user performed each action on each product:
# rows are (user_id, productId) pairs, one column per action code.
action_counts_load = (
    actions_cutted
    .groupby(['user_id', 'productId', 'action'])
    .size()
    .unstack(fill_value=0)
)

# `actions_coefs_arr` is positional (one weight per key of ACTIONS), so the
# count columns must be exactly those codes in that order.
# Assumes the `action` column holds the integer codes used as ACTIONS keys —
# anything else is reported instead of being weighted incorrectly.
action_codes = list(ACTIONS)
unknown_codes = action_counts_load.columns.difference(action_codes)
if len(unknown_codes) > 0:
    raise ValueError(
        f'actions_cutted contains action codes missing from ACTIONS: {unknown_codes.tolist()}'
    )
action_counts_load = (
    action_counts_load
    .reindex(columns=action_codes, fill_value=0)
    .reset_index()
)

# Weighted sum of the action counts = interaction score of the (user, product) pair.
action_counts_load['score'] = action_counts_load[action_codes].values @ actions_coefs_arr

# Reshape to user x product and copy the scores into `load`;
# `update` only touches the users/products that exist in `load`.
pivot_scores_load = action_counts_load.pivot_table(values='score', index='user_id', columns='productId', fill_value=0)
load.update(pivot_scores_load)
del pivot_scores_load, action_counts_load

# Pairs without any interaction get score 0.
load = load.fillna(0)

In [312]:
# Refit on the full interaction matrix. No estimator is created here: this
# reuses whichever `svd` object an earlier cell left behind, so its
# hyper-parameters depend on which cell was run last.
svd.fit(load)

In [313]:
# Project users into the latent space, then map back to product space to get
# a reconstructed score for every (user, product) pair.
item_factors = svd.components_
load_transformed = svd.transform(load)
predicted_ratings_load = load_transformed.dot(item_factors)

In [314]:
# Reconstructed scores as a labelled user x product frame.
results = pd.DataFrame(predicted_ratings_load, index=load.index, columns=load.columns)

# Catalog rows of the products that occur in the selected actions.
valid_catalog = catalog[catalog.product_id.isin(actions_cutted['productId'].unique())]

# Attach a category to every action (left join: actions without a catalog match stay).
user_cats_ordering = actions_cutted.merge(
    valid_catalog[['product_id', 'category_id']],
    left_on='productId',
    right_on='product_id',
    how='left',
)

# Number of actions per (user, category), largest counts first.
category_counts = (
    user_cats_ordering
    .groupby(['user_id', 'category_id'])
    .agg({'action': 'count'})
    .reset_index()
    .sort_values(by='action', ascending=False)
)

# Up to 25 most frequent categories per user; the count column is no longer needed.
top_25_categories = (
    category_counts
    .groupby('user_id')
    .head(25)
    .reset_index(drop=True)
    .drop(columns='action')
)

# Products that belong to any of those categories.
# NOTE(review): this takes the union of ALL users' top categories, so every
# user is ranked over the same product pool — confirm this is intended.
# `drop_duplicates` guards against a catalog listing a product id more than
# once, which would create duplicate columns and repeated recommendations.
valid_product_ids = valid_catalog.loc[
    valid_catalog.category_id.isin(top_25_categories.category_id.unique()),
    'product_id',
].drop_duplicates()

results_sorted_cats = results[valid_product_ids.tolist()]

# 25 highest-scored products per user, stored as a list of string ids.
top_25_results_with_cats = results_sorted_cats.apply(lambda x: x.nlargest(25).index.tolist(), axis=1)
top25_with_cats = top_25_results_with_cats.to_frame().reset_index().rename({'index':'user_id', 0:'products'}, axis=1)
top25_with_cats.products = top25_with_cats.products.apply(lambda elem: [str(x) for x in elem])

# Only the first 3000 users are written to the submission file.
top25_with_cats.iloc[0:3000].to_csv('../output/predictions/SVD_load_with_cats_7.csv', index=False)

In [292]:
# Prints whatever `mse` currently holds; nothing is recomputed in this cell.
print("MSE на тестовой выборке:", mse)

MSE на тестовой выборке: 0.004576780330754094


In [261]:
# Label the reconstructed test scores with the users (rows) and products (columns) of `test`.
results = pd.DataFrame(
    data=predicted_ratings,
    columns=test.columns,
    index=test.index,
)

In [270]:
# Catalog rows of the products that occur in the training actions.
train_product_set = actions_train['productId'].unique()
valid_catalog = catalog.loc[catalog.product_id.isin(train_product_set)]

In [273]:
# Attach a category to every action (left join: actions without a catalog match stay).
user_cats_ordering = pd.merge(
    actions_cutted,
    valid_catalog[['product_id', 'category_id']],
    how='left',
    left_on='productId',
    right_on='product_id',
)

In [274]:
# Number of actions per (user, category), largest counts first.
category_counts = (
    user_cats_ordering
    .groupby(['user_id', 'category_id'])
    .agg({'action': 'count'})
    .reset_index()
    .sort_values(by='action', ascending=False)
)

# Because the rows are sorted by count, the first 25 rows of each user are
# that user's (up to) 25 most frequent categories.
per_user = category_counts.groupby('user_id')
top_25_categories = per_user.head(25).reset_index(drop=True)

In [275]:
# Keep only the (user, category) pairs; the action count is no longer needed.
# Selecting the wanted columns, rather than dropping `action` in place, lets
# this cell be re-run without failing on the already-removed column.
top_25_categories = top_25_categories[['user_id', 'category_id']]

In [277]:
# Products that belong to any category found in `top_25_categories`.
# NOTE(review): this takes the union of ALL users' top categories, so every
# user is ranked over the same product pool — confirm this is intended.
# `drop_duplicates` guards against a catalog listing a product id more than
# once, which would create duplicate columns and repeated recommendations
# when the ids are used to select score columns.
valid_product_ids = valid_catalog.loc[
    valid_catalog.category_id.isin(top_25_categories.category_id.unique()),
    'product_id',
].drop_duplicates()

In [278]:
# Restrict the score matrix to the columns of the category-filtered products.
category_product_columns = valid_product_ids.tolist()
results_sorted_cats = results.loc[:, category_product_columns]

In [279]:
# For every user (row), the ids of the 25 highest-scored products.
top_25_results_with_cats = results_sorted_cats.apply(
    lambda user_scores: user_scores.nlargest(25).index.tolist(),
    axis=1,
)

# Series -> two-column frame (user_id, products) with product ids as strings.
top25_with_cats = (
    top_25_results_with_cats
    .to_frame()
    .reset_index()
    .rename(columns={'index': 'user_id', 0: 'products'})
)
top25_with_cats['products'] = top25_with_cats['products'].map(
    lambda product_ids: [str(product_id) for product_id in product_ids]
)

# Only the first 3000 users are written to the submission file.
top25_with_cats.head(3000).to_csv('../output/predictions/SVD_result_with_cats_3.csv', index=False)

In [190]:
# For every user (row), the ids of the 25 highest-scored products over all columns.
top_25_results = results.apply(
    lambda user_scores: user_scores.nlargest(25).index.tolist(),
    axis=1,
)

# Series -> two-column frame (user_id, products) with product ids as strings.
top25_new = (
    top_25_results
    .to_frame()
    .reset_index()
    .rename(columns={'index': 'user_id', 0: 'products'})
)
top25_new['products'] = top25_new['products'].map(
    lambda product_ids: [str(product_id) for product_id in product_ids]
)

# Only the first 3000 users are written to the submission file.
top25_new.head(3000).to_csv('../output/predictions/SVD_result_9.csv', index=False)

In [191]:
# For every user (row), the ids of the 10 highest-scored products over all columns.
top_10_results = results.apply(
    lambda user_scores: user_scores.nlargest(10).index.tolist(),
    axis=1,
)

# Series -> two-column frame (user_id, products) with product ids as strings.
top10_new = (
    top_10_results
    .to_frame()
    .reset_index()
    .rename(columns={'index': 'user_id', 0: 'products'})
)
top10_new['products'] = top10_new['products'].map(
    lambda product_ids: [str(product_id) for product_id in product_ids]
)

# Only the first 3000 users are written to the submission file.
top10_new.head(3000).to_csv('../output/predictions/SVD_result_10.csv', index=False)