# Setup

In [4]:
import boto3

import columns
import constants
from utils.io.s3 import download_dataframe, list_objects

In [5]:
# Create a boto3 session and an S3 client bound to the storage.yandexcloud.net endpoint.
s3_session = boto3.session.Session()
s3_client = s3_session.client(
    service_name='s3',
    endpoint_url='https://storage.yandexcloud.net',
)

In [6]:
# Fetch the dataset parquet file from the bucket into a DataFrame.
dataset_key = constants.DATA_PATH / 'dataset.parquet'
dataset_df = download_dataframe(s3_client, constants.S3_BUCKET, dataset_key)

# Browse data

In [5]:
# Preview the first rows of the dataset to see which columns it contains.
dataset_df.head()

Unnamed: 0,user_id,artist_id
0,d705b538-1bd8-48a3-9dad-8941dee23ff7,69c71d72-7ed8-42c4-b9ec-c33976a310b9
1,d705b538-1bd8-48a3-9dad-8941dee23ff7,30bf469f-9abd-4011-a210-ff19fee29d49
2,d705b538-1bd8-48a3-9dad-8941dee23ff7,a26c9335-2459-4c89-a00c-fdecbeb2c8c4
3,d705b538-1bd8-48a3-9dad-8941dee23ff7,69c903b5-dff0-4ded-86e4-ea97ac4e1265
4,d705b538-1bd8-48a3-9dad-8941dee23ff7,af8eef9d-13aa-4ffa-b77a-777645ce76cd


In [6]:
# Summarise every column, including non-numeric ones (count / unique / top / freq).
dataset_df.describe(include='all')

Unnamed: 0,user_id,artist_id
count,2275900,2275900
unique,50000,90076
top,d4ac1f72-0347-4719-bebe-8c12ec738f57,5cd0ffb5-0cf2-4ecd-8c5b-ca2102e33198
freq,119,10471


In [10]:
# Distribution of the number of dataset rows (artist entries) per user.
# GroupBy.size() counts the rows of each group with a built-in reducer instead
# of invoking a Python lambda once per user; the resulting Series keeps the
# selected column's name, so the summary is unchanged.
dataset_df.groupby(columns.USER_ID_COLUMN)[columns.ARTIST_ID_COLUMN].size().describe()

count    50000.000000
mean        45.518000
std          8.025457
min         16.000000
25%         42.000000
50%         45.000000
75%         48.000000
max        119.000000
Name: artist_id, dtype: float64

In [11]:
# Distribution of the number of dataset rows (user entries) per artist.
# GroupBy.size() counts the rows of each group with a built-in reducer instead
# of invoking a Python lambda once per artist; the resulting Series keeps the
# selected column's name, so the summary is unchanged.
dataset_df.groupby(columns.ARTIST_ID_COLUMN)[columns.USER_ID_COLUMN].size().describe()

count    90076.000000
mean        25.266442
std        159.648585
min          1.000000
25%          1.000000
50%          3.000000
75%          8.000000
max      10471.000000
Name: user_id, dtype: float64

In [13]:
# True only when every (user, artist) pair occurs exactly once in the dataset.
(
    dataset_df
    .groupby([columns.USER_ID_COLUMN, columns.ARTIST_ID_COLUMN])
    .size()
    .eq(1)
    .all()
)

False

In [15]:
# Number of users whose artist list contains at least one repeated artist:
# a user qualifies when the count of distinct artists differs from the row count.
# Both counts come from built-in group reducers (nunique / size) rather than a
# Python lambda evaluated once per user; NaN handling matches the original
# comparison (nunique ignores NaN, size counts every row).
artists_by_user = dataset_df.groupby(columns.USER_ID_COLUMN)[columns.ARTIST_ID_COLUMN]
(artists_by_user.nunique() != artists_by_user.size()).sum()

54

# Show recommendations

In [11]:
# Identifier of the pipeline whose submissions are listed and loaded below.
pipeline_id = 'w2v'

In [12]:
# List the objects stored under the selected pipeline's submission path.
submissions_prefix = constants.SUBMISSION_PATH / pipeline_id
list_objects(s3_client, constants.S3_BUCKET, submissions_prefix)

['hardml/recsys/lesson5/submissions/w2v/20240309T103120.parquet',
 'hardml/recsys/lesson5/submissions/w2v/20240309T104819.parquet',
 'hardml/recsys/lesson5/submissions/w2v/20240309T120211.parquet',
 'hardml/recsys/lesson5/submissions/w2v/20240309T121129.parquet',
 'hardml/recsys/lesson5/submissions/w2v/20240309T121502.parquet',
 'hardml/recsys/lesson5/submissions/w2v/20240310T090904.parquet']

In [13]:
# Download one specific submission file for the pipeline and preview its first rows.
submission_key = constants.SUBMISSION_PATH / pipeline_id / '20240310T090904.parquet'
submission_df = download_dataframe(s3_client, constants.S3_BUCKET, submission_key)
submission_df.head(n=20)

Unnamed: 0,user_id,y_rec
0,0000037c-9533-48b4-8d12-7b808fd4dabb,"[40d5f4f7-3560-4638-a562-055013eee0d4, d1947eb..."
1,00009c72-2a2b-4cbe-999c-aa7579c72ef4,"[5689dc20-471e-44d7-a428-1d3bcae7c7e3, 50bf3bf..."
2,0001c857-a396-43c5-b8bc-e32c41636a88,"[5cd0ffb5-0cf2-4ecd-8c5b-ca2102e33198, cc4f544..."
3,0003534e-6452-43b4-9afd-7d8f01e05ded,"[50bf3bfe-4b58-4a6e-a193-ccc406a6f6d7, b632249..."
4,0005762b-85e7-4283-98ed-cbb402bdda8c,"[310cdc9f-5ea4-4244-b4e8-f2943109c44f, 7de993c..."
5,00066930-0e79-43de-8b0b-489ae642b951,"[93bf07d3-5233-4270-a3c4-9815e9d786da, 60b3db7..."
6,00094b33-eae0-4d36-a599-dc911d63e7ea,"[c70a9c8c-a44c-4db1-a3ac-268a5326521a, d1947eb..."
7,000d2745-9420-40ee-a62a-667f68c7512a,"[3edd02d9-f2ff-4591-bf9b-106e890c08ef, b5baedd..."
8,000f6aa7-094e-4685-9ccf-a8a703387463,"[e0d0391a-7454-4d3e-a690-950204ef59bf, 1b73674..."
9,000f8349-8692-4d79-bc5b-5b9abee3b7d5,"[5cd0ffb5-0cf2-4ecd-8c5b-ca2102e33198, 40d5f4f..."


In [26]:
# Show the recommendation list of the first user whose id starts with the given prefix.
# The user-id column is referenced through columns.USER_ID_COLUMN, like the rest
# of the notebook, instead of a repeated string literal.
user_id_prefix = '1d664c61-55cb'
mask = submission_df[columns.USER_ID_COLUMN].str.startswith(user_id_prefix)
# .iloc[0] picks the first matching row by position; raises IndexError if nothing matches.
submission_df.loc[mask, 'y_rec'].iloc[0]

array(['35a3b882-19ee-421c-8135-3bd7f7773b4c',
       'ea63c8fd-4e9d-4735-8f87-15c27cd98540',
       'c6bc589a-9bd8-4903-b0c9-4eac9e2e2ee6',
       'c1ed6dd4-6e22-42c5-983b-8a219955b420',
       '42b2f695-ea98-4f1c-8ece-0f61167fdbc9',
       'c5920dc9-bde8-4a16-8c64-98c51458497e',
       '3809986d-afcc-4329-98c5-f1272de708b4',
       '4b23366c-7fe0-4529-8034-90c8866c3e4e',
       'a040d9c1-5e56-462b-bc5b-6ab6cc5b2844',
       '79e81ed7-39fd-4046-a942-e3a0ded32039'], dtype=object)