# Setup

In [1]:
import boto3

import columns
import constants
from utils.io.s3 import download_dataframe, list_objects

In [2]:
# Build a boto3 session and an S3 client that talks to the S3-compatible
# endpoint at storage.yandexcloud.net. No credentials are passed here, so
# boto3 has to resolve them from its own configuration/environment.
s3_session = boto3.session.Session()
s3_client = s3_session.client(
    service_name='s3',
    endpoint_url='https://storage.yandexcloud.net',
)

In [3]:
# Fetch the interactions dataset (dataset.parquet under DATA_PATH) from the bucket.
dataset_df = download_dataframe(
    s3_client,
    constants.S3_BUCKET,
    constants.DATA_PATH / 'dataset.parquet',
)

# Browse data

In [5]:
# Peek at the first rows to see the columns and the id format.
dataset_df.head()

Unnamed: 0,user_id,artist_id
0,d705b538-1bd8-48a3-9dad-8941dee23ff7,69c71d72-7ed8-42c4-b9ec-c33976a310b9
1,d705b538-1bd8-48a3-9dad-8941dee23ff7,30bf469f-9abd-4011-a210-ff19fee29d49
2,d705b538-1bd8-48a3-9dad-8941dee23ff7,a26c9335-2459-4c89-a00c-fdecbeb2c8c4
3,d705b538-1bd8-48a3-9dad-8941dee23ff7,69c903b5-dff0-4ded-86e4-ea97ac4e1265
4,d705b538-1bd8-48a3-9dad-8941dee23ff7,af8eef9d-13aa-4ffa-b77a-777645ce76cd


In [6]:
# Summary statistics for every column; include='all' also covers the
# non-numeric id columns (count / unique / top / freq).
dataset_df.describe(include='all')

Unnamed: 0,user_id,artist_id
count,2275900,2275900
unique,50000,90076
top,d4ac1f72-0347-4719-bebe-8c12ec738f57,5cd0ffb5-0cf2-4ecd-8c5b-ca2102e33198
freq,119,10471


In [10]:
# Distribution of the number of interaction rows per user (repeated
# user-artist rows are counted as separate rows).
# `size()` is the built-in group-length aggregation: it yields the same counts
# as `agg(lambda items: len(items))` without one Python call per group.
dataset_df.groupby(columns.USER_ID_COLUMN)[columns.ARTIST_ID_COLUMN].size().describe()

count    50000.000000
mean        45.518000
std          8.025457
min         16.000000
25%         42.000000
50%         45.000000
75%         48.000000
max        119.000000
Name: artist_id, dtype: float64

In [11]:
# Distribution of the number of interaction rows per artist (repeated
# user-artist rows are counted as separate rows).
# `size()` is the built-in group-length aggregation: it yields the same counts
# as `agg(lambda items: len(items))` without one Python call per group.
dataset_df.groupby(columns.ARTIST_ID_COLUMN)[columns.USER_ID_COLUMN].size().describe()

count    90076.000000
mean        25.266442
std        159.648585
min          1.000000
25%          1.000000
50%          3.000000
75%          8.000000
max      10471.000000
Name: user_id, dtype: float64

In [13]:
# Does every (user, artist) pair occur exactly once? False means the dataset
# contains repeated user-artist rows.
pair_counts = dataset_df.groupby([columns.USER_ID_COLUMN, columns.ARTIST_ID_COLUMN]).size()
pair_counts.eq(1).all()

False

In [15]:
# Number of users whose history contains at least one repeated artist:
# a user is flagged when the count of distinct artists differs from the
# number of rows in the group.
# Uses the built-in group aggregations instead of a per-group Python lambda.
# NOTE: nunique() skips missing values, so a user with a null artist id would
# also be flagged — the same behaviour as the original lambda.
artists_by_user = dataset_df.groupby(columns.USER_ID_COLUMN)[columns.ARTIST_ID_COLUMN]
(artists_by_user.nunique() != artists_by_user.size()).sum()

54

# Show recommendations

In [5]:
# Name of the pipeline whose submissions are inspected below; it is used as
# the sub-path under constants.SUBMISSION_PATH.
pipeline_id = 'baseline'

In [6]:
# List what is stored in the bucket under this pipeline's submission path.
list_objects(s3_client, constants.S3_BUCKET, constants.SUBMISSION_PATH / pipeline_id)

['hardml/recsys/lesson5/submissions/baseline/20240227T182341.parquet',
 'hardml/recsys/lesson5/submissions/baseline/20240227T182609.parquet',
 'hardml/recsys/lesson5/submissions/baseline/20240227T182800.parquet',
 'hardml/recsys/lesson5/submissions/baseline/20240227T182855.parquet',
 'hardml/recsys/lesson5/submissions/baseline/20240227T183229.parquet',
 'hardml/recsys/lesson5/submissions/baseline/20240227T183406.parquet']

In [7]:
# Path of the submission file to inspect for the selected pipeline.
submission_key = constants.SUBMISSION_PATH / pipeline_id / '20240227T183406.parquet'
# Download it and preview the first rows.
submission_df = download_dataframe(s3_client, constants.S3_BUCKET, submission_key)
submission_df.head()

Unnamed: 0,user_id,y_rec
0,0000037c-9533-48b4-8d12-7b808fd4dabb,"[5cd0ffb5-0cf2-4ecd-8c5b-ca2102e33198, 1228929..."
1,00009c72-2a2b-4cbe-999c-aa7579c72ef4,"[a26c9335-2459-4c89-a00c-fdecbeb2c8c4, ee0f3f0..."
2,0001c857-a396-43c5-b8bc-e32c41636a88,"[5cd0ffb5-0cf2-4ecd-8c5b-ca2102e33198, 1228929..."
3,0003534e-6452-43b4-9afd-7d8f01e05ded,"[12289298-d9dc-4b1d-bc27-16480829de75, a26c933..."
4,0005762b-85e7-4283-98ed-cbb402bdda8c,"[5cd0ffb5-0cf2-4ecd-8c5b-ca2102e33198, 1228929..."
