In [60]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime, timedelta

from tqdm import tqdm

In [2]:
# Load the raw action log and the product catalog (in that order).
train_actions, stokman_catalog = (
    pd.read_parquet(parquet_path, engine='pyarrow')
    for parquet_path in (
        '../data/raw/train_actions.pq',
        '../data/raw/stokman_catalog_preprocessed.pq',
    )
)

In [3]:
def min_preprocessing(actions, catalog):
    """Apply the minimal type clean-up needed before scoring.

    Neither input frame is modified; new frames are returned, so the
    function can safely be called more than once on the same inputs.

    Parameters
    ----------
    actions : pd.DataFrame
        Action log with at least a ``date`` column and a list-like
        ``products`` column.
    catalog : pd.DataFrame
        Product catalog with ``add_date``, ``shop_id``, ``category_id``,
        ``product_id`` and ``title`` columns.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``(actions, catalog)`` where

        * ``actions`` has ``date`` parsed to datetime and one row per
          (action, product); ``products`` is renamed to ``productId`` and
          cast to int, with ``0`` standing in for actions that carry no
          product;
        * ``catalog`` has ``add_date`` parsed to datetime, the id columns
          cast to int and the ``title`` column removed.

    Raises
    ------
    KeyError
        If ``catalog`` has no ``title`` column.
    """
    # `assign` returns a new frame, so the caller's `actions` stays untouched.
    actions_out = actions.assign(date=pd.to_datetime(actions['date']))
    actions_out = actions_out.explode('products')
    # Rows whose product list was empty/missing explode to NaN; map them to 0.
    actions_out['products'] = actions_out['products'].fillna(0).astype(int)
    actions_out = actions_out.rename(columns={'products': 'productId'})

    catalog_out = catalog.assign(
        add_date=pd.to_datetime(catalog['add_date']),
        shop_id=catalog['shop_id'].astype(int),
        category_id=catalog['category_id'].astype(int),
        product_id=catalog['product_id'].astype(int),
    ).drop(columns=['title'])

    return actions_out, catalog_out

In [4]:
# Clean both raw frames, then drop the raw names so only the
# preprocessed `actions` / `catalog` remain in the namespace.
actions, catalog = min_preprocessing(train_actions, stokman_catalog)
del train_actions, stokman_catalog

In [5]:
# Number of logged actions per user, most active users first.
users_actions = (
    actions
    .groupby('user_id')[['action']]
    .count()
    .sort_values('action', ascending=False)
)

In [6]:
# Keep only the rows that belong to the 10000 most active users.
most_active_user_ids = (
    users_actions
    .sort_values(by='action', ascending=False)
    .head(10000)
    .index
)
actions_cutted = actions.loc[actions['user_id'].isin(most_active_user_ids)]
del actions, users_actions

In [7]:
# Dense user x product score matrix, created already filled with integer
# zeros. Building an empty (all-NaN, object-dtype) frame and filling it
# afterwards is much more expensive and triggers pandas' object-downcasting
# FutureWarning on `fillna`.
scores = pd.DataFrame(
    0,
    index=actions_cutted['user_id'].unique(),
    columns=catalog.product_id.unique(),
)

In [8]:
# Replace any missing cell with 0 so every user/product pair starts
# from a neutral score.
scores = scores.fillna(0)

  scores = scores.fillna(0)


In [37]:
# Action code -> action name (presumably the codes stored in the `action`
# column of the log; verify against the data source).
ACTIONS = dict(enumerate([
    'view',
    'like',
    'addB',
    'delB',
    'clearB',
    'order',
    'listB',
    'visit',
    'visitCategory',
    'search',
]))

# Weight that one occurrence of each action code adds to a score.
action_coefficients = dict(zip(
    ACTIONS,
    (
        1,    # view
        6,    # like
        8,    # addB
        -4,   # delB
        -4,   # clearB
        10,   # order
        0,    # listB
        2,    # visit
        2,    # visitCategory
        4,    # search
    ),
))

# The same weights as a vector ordered by action code, for matrix products.
actions_coefs_arr = np.array([action_coefficients[code] for code in sorted(ACTIONS)])

In [10]:
# One row per (user, product); one column per action code holding how many
# times that action occurred for the pair (0 when it never did).
action_counts_df = (
    actions_cutted
    .groupby(['user_id', 'productId', 'action'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)

In [86]:
# Select the per-action count columns by label rather than by position:
# `iloc[:, 2:]` would also pick up `score` when this cell is re-run, and it
# would silently misalign with the weights if some action code never occurs
# in the data (unstack only creates columns for codes that are present).
_count_cols = [
    col for col in action_counts_df.columns
    if col not in ('user_id', 'productId', 'score')
]
# Look each weight up by action code so the column/weight pairing is explicit.
_weights = np.array([action_coefficients[int(col)] for col in _count_cols])
# Weighted sum of action counts = interaction score of the (user, product) pair.
action_counts_df['score'] = action_counts_df[_count_cols].values @ _weights

In [88]:
# Reshape the long (user, product, score) table into a wide user x product
# matrix; pairs with no recorded interaction get a score of 0.
pivot_scores = action_counts_df.pivot_table(
    index='user_id',
    columns='productId',
    values='score',
    fill_value=0,
)

In [89]:
# Write the computed scores into the pre-allocated user x product matrix.
# `update` works in place and returns None, so there is nothing to assign.
scores.update(pivot_scores)

In [90]:
# The wide pivot is no longer needed once it has been written into `scores`.
del pivot_scores

In [92]:
# Matrix size (users, products) before pruning products without any score.
scores.shape

(10000, 91888)

In [93]:
# Prune products that no selected user interacted with: keep a column only
# if at least one of its cells is non-zero.
has_any_score = (scores != 0).any()
scores = scores.loc[:, has_any_score]

In [94]:
# Matrix size (users, products) after the all-zero product columns were dropped.
scores.shape

(10000, 62174)

In [96]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

In [97]:
# Factorise the user x product score matrix into 2 latent components.
# TruncatedSVD's default solver is randomized, so the seed is pinned to make
# the factors (and every recommendation derived from them) reproducible
# across kernel restarts.
svd = TruncatedSVD(n_components=2, random_state=42)
decomposed_matrix = svd.fit_transform(scores)  # user factors: one row per user
item_matrix = svd.components_                  # item factors: one column per product

User features matrix:
[[ 5.45602900e+03 -1.04852608e+00]
 [ 1.09400169e+03 -1.78226156e-01]
 [ 6.51999354e+02 -9.69670755e-02]
 ...
 [ 3.31999813e+02 -5.01851991e-02]
 [ 2.19999950e+02 -3.34752651e-02]
 [ 3.02000326e+02 -4.82740071e-02]]

Item features matrix:
[[ 8.18733261e-08  5.77285700e-07  9.56147397e-07 ...  3.18106536e-08
   8.40247645e-06  9.67356661e-08]
 [-4.31445397e-07 -3.16535203e-06 -5.62487861e-06 ... -1.67253953e-07
  -5.18509972e-05 -5.06760865e-07]]


In [99]:
# Multiply the user and item factors back together to get the low-rank
# approximation of the score matrix, labelled like `scores`.
reconstructed_matrix = decomposed_matrix @ item_matrix
predicted_ratings = pd.DataFrame(
    reconstructed_matrix,
    index=scores.index,
    columns=scores.columns,
)

In [117]:
def _top_25_product_ids(user_row):
    """Return the column labels of the 25 highest predicted ratings in a row."""
    return user_row.nlargest(25).index.tolist()


# One list of 25 product ids per user, best prediction first.
top_25_items_per_user = predicted_ratings.apply(_top_25_product_ids, axis=1)

In [118]:
# Turn the Series of lists into a two-column frame (user_id, products) and
# store every product id as a string.
top_25_items_per_user = (
    top_25_items_per_user
    .reset_index()
    .rename(columns={'index': 'user_id', 0: 'products'})
)
top_25_items_per_user['products'] = top_25_items_per_user['products'].map(
    lambda product_ids: [str(product_id) for product_id in product_ids]
)

In [135]:
# Export the recommendations of the first 3000 users (row order of
# `top_25_items_per_user`) without the positional index column.
top_25_items_per_user.iloc[0:3000].to_csv('../output/predictions/SVD_result_1.csv', index=False)

In [125]:
# Distribution of the raw interaction scores, to see how skewed they are.
action_counts_df.score.describe()

count    378209.000000
mean         16.236977
std         141.601901
min         -20.000000
25%           1.000000
50%           1.000000
75%           2.000000
max       23564.000000
Name: score, dtype: float64

In [127]:
# Truncate every predicted rating to an integer with a single vectorised
# cast, instead of re-assigning tens of thousands of columns one at a time
# in a Python loop.
predicted_ratings = predicted_ratings.astype(int)

In [129]:
# After truncation many products are 0 for every user; keep a column only
# if at least one of its cells is non-zero.
has_any_rating = (predicted_ratings != 0).any()
predicted_ratings = predicted_ratings.loc[:, has_any_rating]

In [137]:
def _best_25_product_ids(user_row):
    """Return the column labels of the 25 highest integer ratings in a row."""
    return user_row.nlargest(25).index.tolist()


# Same top-25 selection as before, now on the truncated integer ratings.
top25_new = predicted_ratings.apply(_best_25_product_ids, axis=1)

In [138]:
# Turn the Series of lists into a (user_id, products) frame, store the
# product ids as strings and export the first 3000 users.
top25_new = (
    top25_new
    .reset_index()
    .rename(columns={'index': 'user_id', 0: 'products'})
)
top25_new['products'] = top25_new['products'].map(
    lambda product_ids: [str(product_id) for product_id in product_ids]
)
top25_new.head(3000).to_csv('../output/predictions/SVD_result_2.csv', index=False)