In [1]:
import boto3
import numpy
import pandas

import constants
from utils.io.s3 import download_dataframe, upload_dataframe

In [2]:
import logging

# Configure the root logger at INFO level so that INFO records from botocore
# and from the project's feature utilities are shown in the cell outputs.
logging.basicConfig(level=logging.INFO)

In [3]:
# Default boto3 session: no explicit credentials are passed here, so they are
# resolved by boto3's own lookup chain.
s3_session = boto3.session.Session()

# S3 client pointed at a custom (non-AWS) endpoint.
s3_client = s3_session.client(
    service_name='s3',
    endpoint_url='https://storage.yandexcloud.net',
)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [4]:
from utils.features.calculation.wareshouse import S3DataWareHouse

# Data warehouse backed by the shared S3 client.
dwh = S3DataWareHouse(s3_client)

# Register the ad-features parquet file under the key 'test'.
# lazy_loading=False is passed explicitly; presumably this makes the warehouse
# fetch the data at registration time rather than on first access
# (TODO confirm against S3DataWareHouse.register).
dwh.register(
    'test',
    constants.S3_BUCKET,
    constants.DATA_PATH / 'ad_features.parquet',
    lazy_loading=False,
)

INFO:root:Register metadata for "test" data
INFO:root:Download "test" data to cache


In [6]:
from pandas import DataFrame
from utils.features.calculation.builder import PandasFeaturesBuilder
from utils.features.calculation.calcer import PrecalculatedFeatureCalcer
from utils.features.calculation.wareshouse import DataWarehouseBase

import columns


class TestCalcer(PrecalculatedFeatureCalcer[pandas.DataFrame]):
    """Feature calcer that looks features up in the warehouse entry named ``'test'``."""

    def __init__(self, calcer_id: str, dwh: DataWarehouseBase) -> None:
        """Pass ``calcer_id`` and ``dwh`` straight through to the base-class initializer."""
        super().__init__(calcer_id, dwh)

    def get_data_on(self, sample: pandas.DataFrame) -> pandas.DataFrame:
        """Return ``sample`` inner-joined with the ``'test'`` warehouse data.

        The join key is ``columns.ITEM_ID_COLUMN``; because the join is an
        inner join, sample rows without a match in the warehouse data are
        dropped from the result.

        Args:
            sample: Frame that contains the ``columns.ITEM_ID_COLUMN`` column.

        Returns:
            The merged frame produced by ``DataFrame.merge``.
        """
        # NOTE(review): ``self.dwh`` is presumably set by the base class from
        # the ``dwh`` constructor argument — confirm in PrecalculatedFeatureCalcer.
        precalculated = self.dwh['test']
        return sample.merge(
            precalculated,
            how='inner',
            on=columns.ITEM_ID_COLUMN,
        )

# Two item ids used as a minimal sample to run the feature pipeline on.
sample_df = pandas.DataFrame({columns.ITEM_ID_COLUMN: [63133, 313401]})

# Single-calcer pipeline: the calcer reads the 'test' entry registered in `dwh`.
calcer = TestCalcer('test', dwh)
builder = PandasFeaturesBuilder([calcer])

# Last expression of the cell, so the resulting frame is rendered as output.
builder.calculate_on(sample_df)

INFO:root:Got features from "test" calcer: Index(['adgroup_id', 'cate_id', 'campaign_id', 'customer', 'brand', 'price'], dtype='object')
INFO:root:Got features with shape: (2, 6)


Unnamed: 0,adgroup_id,cate_id,campaign_id,customer,brand,price
0,63133,6406,83237,1,95471.0,170.0
1,313401,6406,83237,1,87331.0,199.0


In [4]:
# Fetch the training parquet file from the project bucket.
dataset_df = download_dataframe(
    s3_client,
    constants.S3_BUCKET,
    constants.DATA_PATH / 'train.parquet',
)

# Print column dtypes, row count and memory footprint.
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2872562 entries, 0 to 2872561
Data columns (total 2 columns):
 #   Column      Dtype
---  ------      -----
 0   uid         int64
 1   friend_uid  int64
dtypes: int64(2)
memory usage: 43.8 MB


In [6]:
# Summary statistics for the numeric columns of the training frame.
# NOTE(review): the saved output of this cell reports count = 28726, while the
# saved info() output for dataset_df reports 2872562 entries — the outputs look
# stale or come from different kernel states; re-run after a kernel restart to confirm.
dataset_df.describe()

Unnamed: 0,uid,friend_uid
count,28726.0,28726.0
mean,38251.947574,79131.462403
std,27179.773768,28380.548545
min,0.0,718.0
25%,15253.75,58743.0
50%,33598.5,83791.0
75%,57176.0,103094.75
max,117356.0,120060.0
