In [61]:
import pandas as pd
import numpy as np

from preprocess import train_val_test_split, prepare_splitted_data

In [48]:
# Project root. Kept WITHOUT a trailing slash so that paths built from it with
# an explicit '/' separator do not end up with a doubled '//'.
PROJECT_PATH = '~/negative_feedback'
# Directory that receives the train / validation / test parquet files.
# Also kept without a trailing slash; consumers append '/<file name>'.
DATA_PATH = f'{PROJECT_PATH}/data/zvuk'

# Column used as the relevance signal and the minimum value that counts as a
# positive interaction.
RELEVANCE_COL = 'play_duration'
RELEVANCE_THRESHOLD = 30

In [14]:
# Read the raw interaction log from its parquet dump.
path_to_df = '~/negative_feedback/data/zvuk-interactions.parquet'
data = pd.read_parquet(path=path_to_df)

In [15]:
# The rest of the notebook refers to the event time as `timestamp`.
data = data.rename({'datetime': 'timestamp'}, axis='columns')

In [16]:
def prepare_clickstream(
    clickstream_df,
    num_users=20000,
    min_items_per_user=5,
    min_items=5,
    min_duration=0,
    aggregate_repeated_items=True,
    aggregation_type='sum',
    random_state=None,
):
    """Filter, subsample and re-index a raw clickstream.

    Steps, in order:

    1. Drop rows whose ``play_duration`` is below ``min_duration``.
    2. Optionally collapse repeated ``(user_id, track_id, session_id)`` rows
       into a single row (the first one encountered) that carries the
       aggregated ``play_duration``.
    3. Keep users with strictly more than ``min_items_per_user`` rows.
    4. Draw a random subset of at most ``num_users`` of the remaining users.
    5. Keep tracks with strictly more than ``min_items`` rows.
    6. Sort by ``user_id`` and ``timestamp`` and replace the raw ids with
       dense integer codes.

    Because tracks are filtered after users, a user can end up with fewer
    rows than the user threshold in the returned frame.

    Parameters
    ----------
    clickstream_df : pandas.DataFrame
        Must contain the columns ``user_id``, ``track_id``, ``session_id``,
        ``timestamp`` and ``play_duration``. The input frame is not modified.
    num_users : int
        Upper bound on the number of users to sample. If fewer users survive
        the filtering, all of them are kept.
    min_items_per_user : int
        Users need strictly more than this many rows to be kept.
    min_items : int
        Tracks need strictly more than this many rows to be kept.
    min_duration : number
        Rows with ``play_duration`` below this value are dropped first.
    aggregate_repeated_items : bool
        Whether to collapse repeated ``(user, track, session)`` rows.
    aggregation_type : {'sum', 'max'}
        How ``play_duration`` of repeated rows is combined.
    random_state : int or None
        Seed for the user subsample. ``None`` (default) keeps the previous
        behaviour of drawing from NumPy's global random state.

    Returns
    -------
    pandas.DataFrame
        Frame with ``user_idx``, ``item_idx`` and ``session_idx`` code columns
        in place of the raw id columns, sorted by user and timestamp.

    Raises
    ------
    ValueError
        If aggregation is requested with an unsupported ``aggregation_type``.
    """
    group_cols = ['user_id', 'track_id', 'session_id']

    # Fail early with a clear message instead of a KeyError on a merge-suffix
    # column further down.
    if aggregate_repeated_items and aggregation_type not in ('sum', 'max'):
        raise ValueError(
            f"aggregation_type must be 'sum' or 'max', got {aggregation_type!r}"
        )

    clickstream_df = clickstream_df[clickstream_df['play_duration'] >= min_duration]
    print(f'Filtered `play_duration` < {min_duration}')

    if aggregate_repeated_items:
        # A fresh RangeIndex keeps the alignment below unambiguous and matches
        # the index a merge-based aggregation would produce.
        clickstream_df = clickstream_df.reset_index(drop=True)
        aggregated = (
            clickstream_df.groupby(group_cols)['play_duration']
            .transform(aggregation_type)
        )
        # Dropping and re-assigning moves `play_duration` to the last column,
        # so the output column layout is unchanged for existing callers.
        clickstream_df = (
            clickstream_df
            .drop(columns='play_duration')
            .assign(play_duration=aggregated)
            .drop_duplicates(subset=group_cols)
        )
        print('Aggregated')

    # NOTE: the comparison is strict, so a user with exactly
    # `min_items_per_user` rows is dropped.
    user_counts = clickstream_df['user_id'].value_counts()
    eligible_users = user_counts[user_counts > min_items_per_user].index
    clickstream_df = clickstream_df[clickstream_df['user_id'].isin(eligible_users)]
    print('Filtered users by interactions num')

    unique_users = clickstream_df['user_id'].unique()
    # Sampling without replacement cannot return more users than exist.
    sample_size = min(num_users, len(unique_users))
    rng = np.random if random_state is None else np.random.default_rng(random_state)
    selected_users = rng.choice(unique_users, size=sample_size, replace=False)
    clickstream_df = clickstream_df[clickstream_df['user_id'].isin(selected_users)]
    print(f'Picked {sample_size} users')

    # NOTE: strict comparison here as well.
    item_counts = clickstream_df['track_id'].value_counts()
    frequent_items = item_counts[item_counts > min_items].index
    clickstream_df = clickstream_df[clickstream_df['track_id'].isin(frequent_items)]
    print('Filtered items by interactions num')

    # A non-inplace sort returns a new frame, so the inserts below do not
    # write into a slice of the caller's data.
    clickstream_df = clickstream_df.sort_values(by=['user_id', 'timestamp'])
    code_columns = (
        (0, 'user_idx', 'user_id'),
        (2, 'item_idx', 'track_id'),
        (4, 'session_idx', 'session_id'),
    )
    for position, code_col, id_col in code_columns:
        clickstream_df.insert(
            position, code_col, clickstream_df[id_col].astype('category').cat.codes
        )

    return clickstream_df.drop(group_cols, axis=1)

In [19]:
# Build the filtered, re-indexed interaction table using the default thresholds.
# The user subsample inside prepare_clickstream is drawn with np.random.choice
# and no seed is set in the cells shown here, so a rerun may pick different users.
prepared_data = prepare_clickstream(data)

Filtered `play_duration` < 0
Aggregated
Filtered users by interactions num
Picked 20000 users
Filtered items by interactions num


In [21]:
# Keep only the columns needed downstream and give the code columns the
# generic `user_id` / `item_id` names.
prepared_data = (
    prepared_data
    .loc[:, ['user_idx', 'item_idx', 'timestamp', 'play_duration']]
    .rename(columns={'user_idx': 'user_id', 'item_idx': 'item_id'})
)

In [46]:
# Share of interactions that fall below the relevance threshold; uses the
# config constants so this check stays in sync with the split below.
(prepared_data[RELEVANCE_COL] < RELEVANCE_THRESHOLD).mean()

0.25330848779665244

In [43]:
# Fraction of items that have fewer than 5 interactions.
(prepared_data.groupby('item_id')['user_id'].count() < 5).mean()

0.0

In [44]:
# Fraction of users that have fewer than 5 interactions.
(prepared_data.groupby('user_id')['item_id'].count() < 5).mean()

0.007255078554988492

In [51]:
# Split the prepared interactions into train / validation / test frames with
# the project helper, passing the relevance threshold and column from the
# config cell.
# NOTE(review): test_quantile=0.9 presumably sets a timestamp-quantile cut-off
# for the held-out part — confirm in preprocess.train_val_test_split.
train, val, test = train_val_test_split(prepared_data, RELEVANCE_THRESHOLD, RELEVANCE_COL, test_quantile=0.9)

Filtering items..
Number of items before 147781
Number of items after 107397
Interactions length before: 8776895
Interactions length after: 8255964
Filtering users..
Number of users before 19978
Number of users after 19693
Interactions length before: 8255964
Interactions length after: 8254635


In [52]:
# Number of distinct users in the validation split.
val['user_id'].nunique()

4347

In [53]:
# Number of distinct users in the training split.
train['user_id'].nunique()

19047

In [54]:
# Distribution of the number of training interactions per user.
train.groupby('user_id')['item_id'].count().describe()

count    19047.000000
mean       390.044154
std        756.232773
min          1.000000
25%         29.000000
50%        116.000000
75%        413.000000
max      15122.000000
Name: item_id, dtype: float64

In [55]:
def filter_last_positive(df):
    """Keep only users whose last interaction is a positive one.

    Rows are ordered by ``user_id`` and ``timestamp``; a user is retained when
    the final row in that order has ``RELEVANCE_COL`` greater than or equal to
    ``RELEVANCE_THRESHOLD``. All rows of retained users are returned, in the
    sorted order. The input frame is not modified.
    """
    ordered = df.sort_values(['user_id', 'timestamp'])
    # One row per user: the latest one after sorting.
    final_rows = ordered.groupby('user_id').tail(1)
    is_positive = final_rows[RELEVANCE_COL] >= RELEVANCE_THRESHOLD
    kept_users = final_rows.loc[is_positive, 'user_id']
    return ordered[ordered['user_id'].isin(kept_users)]

In [56]:
# Restrict validation to users whose last interaction (by timestamp) reaches
# the relevance threshold.
val = filter_last_positive(val)

In [57]:
# Distinct validation users left after the last-positive filter.
val['user_id'].nunique()

3091

In [59]:
# Persist the three splits next to each other under DATA_PATH.
train.to_parquet(path=DATA_PATH + '/train.parquet', index=False)
test.to_parquet(path=DATA_PATH + '/test.parquet', index=False)
val.to_parquet(path=DATA_PATH + '/validation.parquet', index=False)