# LIBRARY LOADS

In [1]:
import sys
# Display the interpreter path so the reader can see which Python environment runs the kernel.
sys.executable

'/beegfs/home/r.zagidullin/RAPIDS_VERSIONS/LightAutoML_GPU/lama_venv/bin/python3'

In [2]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from scipy import sparse

from lightautoml.reader.cudf_reader import CudfReader
from lightautoml.reader.hybrid_reader import HybridReader
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task

import cudf
import dask_cudf

from numba import cuda
import cupy as cp

from lightautoml.transformers.base import SequentialTransformer, UnionTransformer


from lightautoml.transformers import numeric_gpu, categorical_gpu, datetime_gpu
from lightautoml.transformers import numeric, categorical, datetime

from lightautoml.pipelines.utils import get_columns_by_role

from dask.distributed import Client
from dask_cuda import LocalCUDACluster

import os

In [3]:
from lightautoml.reader.daskcudf_reader import DaskCudfReader

In [4]:
# Local Dask-CUDA cluster restricted to devices 0 and 1, using the UCX
# protocol with NVLink enabled and RMM managed memory.
cluster = LocalCUDACluster(
    CUDA_VISIBLE_DEVICES="0, 1",
    protocol="ucx",
    enable_nvlink=True,
    rmm_managed_memory=True,
)
client = Client(cluster)

# Debug aid, kept disabled: client.run(os.getpid)
# Switch cudf on every worker to the "managed" allocator; the per-worker
# return values are shown as this cell's output.
client.run(cudf.set_allocator, "managed")

distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.diskutils - INFO - Found stale lock file and directory '/beegfs/home/r.zagidullin/RAPIDS_VERSIONS/LightAutoML_GPU/jupyter_tests/dask-worker-space/worker-pqcn3kq_', purging
distributed.diskutils - INFO - Found stale lock file and directory '/beegfs/home/r.zagidullin/RAPIDS_VERSIONS/LightAutoML_GPU/jupyter_tests/dask-worker-space/worker-0xire7eq', purging
distributed.preloading - INFO - Import preload module: dask_cuda.initialize


{'ucx://127.0.0.1:42225': None, 'ucx://127.0.0.1:45935': None}

# DATA AND READERS PREPARATION

In [5]:
# Load the training table from the notebook's working directory into a pandas DataFrame.
data = pd.read_csv('./application_train.csv')

In [6]:
# Turn the relative day offsets into calendar-date strings anchored at 2018-01-01.
# pd.to_timedelta(..., unit='D') replaces .astype('timedelta64[D]'): newer pandas
# releases only accept the s/ms/us/ns resolutions and reject a day-resolution cast.
data['BIRTH_DATE'] = (
    np.datetime64('2018-01-01') + pd.to_timedelta(data['DAYS_BIRTH'], unit='D')
).astype(str)
# Positive DAYS_EMPLOYED values are clipped to 0, so EMP_DATE never falls after the anchor date.
data['EMP_DATE'] = (
    np.datetime64('2018-01-01')
    + pd.to_timedelta(np.clip(data['DAYS_EMPLOYED'], None, 0), unit='D')
).astype(str)

# Two degenerate features: a constant column and an all-NaN column
# (presumably there to exercise how the readers treat useless features - confirm).
data['constant'] = 1
data['allnan'] = np.nan

# Remove the raw offset columns; rebinding instead of inplace=True avoids mutating the frame in place.
data = data.drop(columns=['DAYS_BIRTH', 'DAYS_EMPLOYED'])


In [14]:
# Binary-classification task shared by every reader below
# (the original line carried a redundant `task = task = ...` double assignment).
task = Task('binary')
adv_roles = True  # passed as advanced_roles to every reader
parts = 2         # passed as npartitions to the dask-based readers

# Reader used on the pandas frame.
reader = PandasToPandasReader(task, advanced_roles=adv_roles, n_jobs=1)
# Reader used on the cudf frame.
gpu_reader = CudfReader(task, advanced_roles=adv_roles, n_jobs=1)
# Reader used on the dask_cudf frame.
dd_reader = DaskCudfReader(
    task,
    advanced_roles=adv_roles,
    n_jobs=1,
    index_ok=True,
    compute=False,
    npartitions=parts,
)

# Hybrid reader: configured with one CPU reader and two GPU readers,
# gpu_ratio=0.7 and 'mgpu' output.
hy_reader = HybridReader(
    task,
    num_cpu_readers=1,
    num_gpu_readers=2,
    gpu_ratio=0.7,
    output='mgpu',
    advanced_roles=adv_roles,
    index_ok=True,
    compute=False,
    npartitions=parts,
    n_jobs=1,
)

In [8]:
# Copy the pandas frame into a cudf DataFrame; nan_as_null=False is passed so NaNs are not turned into nulls.
gpu_data = cudf.DataFrame.from_pandas(data, nan_as_null=False)
# Split the cudf frame into `parts` partitions as a dask_cudf DataFrame.
dd_data = dask_cudf.from_cudf(gpu_data, npartitions=parts)

# TIMING READERS

In [9]:
%%time
# Time the pandas reader on the pandas frame; 'TARGET' is declared as the target column.
ds = reader.fit_read(data, roles = {'target': 'TARGET'})

CPU times: user 20.8 s, sys: 783 ms, total: 21.6 s
Wall time: 22.2 s


In [15]:
%%time
# Time the cudf reader on the cudf frame; 'TARGET' is declared as the target column.
gpu_ds = gpu_reader.fit_read(gpu_data, roles = {'target': 'TARGET'})

CPU times: user 10.4 s, sys: 2.88 s, total: 13.3 s
Wall time: 13.2 s


In [12]:
%%time
# Time the dask_cudf reader on the partitioned frame; 'TARGET' is declared as the target column.
dd_ds = dd_reader.fit_read(dd_data, roles = {'target': 'TARGET'})

CPU times: user 18.9 s, sys: 4.5 s, total: 23.4 s
Wall time: 37.3 s


In [16]:
%%time
# Time the hybrid reader, which is fed the pandas frame directly.
# NOTE(review): the result is bound to `dd_ds`, replacing the DaskCudfReader dataset;
# every later `dd_*` benchmark therefore runs on the hybrid reader's output - confirm this is intended.
dd_ds = hy_reader.fit_read(data, roles={'target': 'TARGET'})

CPU times: user 6 s, sys: 2.05 s, total: 8.06 s
Wall time: 15.9 s


# TIMING LABEL ENCODER

In [17]:
# Label encoders: one CPU instance plus two independent GPU instances.
trf = categorical.LabelEncoder()
gpu_trf, dd_trf = (categorical_gpu.LabelEncoder_gpu() for _ in range(2))

# Restrict each dataset to the columns that carry the 'Category' role.
cats, gpu_cats, dd_cats = (
    dataset[:, get_columns_by_role(dataset, 'Category')]
    for dataset in (ds, gpu_ds, dd_ds)
)

print(cats.shape, gpu_cats.shape, dd_cats.shape)

(307511, 17) (307511, 17) (307511, 15)


In [19]:
%%time
# CPU run: fit and apply `trf` to the category columns of `ds`.
enc = trf.fit_transform(cats)

CPU times: user 814 ms, sys: 8.08 ms, total: 822 ms
Wall time: 807 ms


In [20]:
%%time
# cudf run: fit and apply `gpu_trf` to the category columns of `gpu_ds`.
enc = gpu_trf.fit_transform(gpu_cats)

CPU times: user 454 ms, sys: 229 ms, total: 684 ms
Wall time: 672 ms


In [21]:
%%time
# Partitioned run: fit and apply `dd_trf` to the category columns of `dd_ds`.
enc = dd_trf.fit_transform(dd_cats)

CPU times: user 378 ms, sys: 36.5 ms, total: 414 ms
Wall time: 884 ms


# TIMING TARGET ENCODER

In [22]:
# CPU pipeline: label encoding followed by target encoding.
trf = SequentialTransformer([
    categorical.LabelEncoder(),
    categorical.TargetEncoder(),
])
# Two independent copies of the GPU pipeline, one per GPU-side dataset.
gpu_trf, dd_trf = (
    SequentialTransformer([
        categorical_gpu.LabelEncoder_gpu(),
        categorical_gpu.TargetEncoder_gpu(),
    ])
    for _ in range(2)
)


In [23]:
%%time
# CPU run: fit and apply the `trf` pipeline to the category columns of `ds`.
enc = trf.fit_transform(cats)

CPU times: user 3.44 s, sys: 35.3 ms, total: 3.48 s
Wall time: 3.41 s


In [24]:
%%time
# cudf run: fit and apply the `gpu_trf` pipeline to the category columns of `gpu_ds`.
enc = gpu_trf.fit_transform(gpu_cats)

CPU times: user 692 ms, sys: 452 ms, total: 1.14 s
Wall time: 1.14 s


In [25]:
%%time
# Partitioned run: fit and apply the `dd_trf` pipeline to the category columns of `dd_ds`.
enc = dd_trf.fit_transform(dd_cats)

0.00027377810329198837  superfit
0.0004821261391043663 preps
0.2752639423124492 prior
0.01825645612552762 persisting
0.17154101701453328 two map partitions with dask add
0.0006037228740751743 folds_prior
0.001797736156731844 before cycling
5.003375689033419 after cycling
5.06318174302578e-05 output
0.03567400388419628 setting data
CPU times: user 4.02 s, sys: 217 ms, total: 4.24 s
Wall time: 6.58 s


# TIMING OTHER CATEGORICAL TRANSFORMERS

## FreqEncoder

In [26]:
# Frequency encoders: one CPU instance plus two independent GPU instances.
trf = categorical.FreqEncoder()
gpu_trf, dd_trf = (categorical_gpu.FreqEncoder_gpu() for _ in range(2))

In [27]:
%%timeit
# CPU run (repeated by %%timeit): fit and apply `trf` to the category columns of `ds`.
enc = trf.fit_transform(cats)

750 ms ± 3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [30]:
%%timeit
# cudf run (repeated by %%timeit): fit and apply `gpu_trf` to the category columns of `gpu_ds`.
enc = gpu_trf.fit_transform(gpu_cats)

595 ms ± 5.59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
%%time
# Partitioned run: fit and apply `dd_trf` to the category columns of `dd_ds`.
# NOTE(review): this cell uses %%time while the two sibling cells use %%timeit,
# so this figure is a single run rather than an average - confirm that is intended.
enc = dd_trf.fit_transform(dd_cats)

CPU times: user 366 ms, sys: 20.7 ms, total: 387 ms
Wall time: 924 ms


## OrdinalEncoder

In [31]:
# Ordinal encoders: one CPU instance plus two independent GPU instances.
trf = categorical.OrdinalEncoder()
gpu_trf, dd_trf = (categorical_gpu.OrdinalEncoder_gpu() for _ in range(2))

In [33]:
%%time
# CPU run: fit and apply `trf` to the category columns of `ds`.
enc = trf.fit_transform(cats)

CPU times: user 778 ms, sys: 5.76 ms, total: 784 ms
Wall time: 771 ms


In [34]:
%%time
# cudf run: fit and apply `gpu_trf` to the category columns of `gpu_ds`.
enc = gpu_trf.fit_transform(gpu_cats)

CPU times: user 451 ms, sys: 207 ms, total: 658 ms
Wall time: 645 ms


In [35]:
%%time
# Partitioned run: fit and apply `dd_trf` to the category columns of `dd_ds`.
enc = dd_trf.fit_transform(dd_cats)

CPU times: user 397 ms, sys: 22.2 ms, total: 419 ms
Wall time: 902 ms


## OHEEncoder

In [36]:
# CPU pipeline: label encoding, then one-hot encoding with make_sparse=True.
trf = SequentialTransformer([
    categorical.LabelEncoder(),
    categorical.OHEEncoder(make_sparse=True),
])
# Two independent GPU pipelines; unlike the CPU pipeline they are built with make_sparse=False.
gpu_trf, dd_trf = (
    SequentialTransformer([
        categorical_gpu.LabelEncoder_gpu(),
        categorical_gpu.OHEEncoder_gpu(make_sparse=False),
    ])
    for _ in range(2)
)

In [37]:
%%time
# CPU run: fit and apply the one-hot pipeline `trf` to the category columns of `ds`.
enc = trf.fit_transform(cats)

CPU times: user 1.27 s, sys: 46.5 ms, total: 1.32 s
Wall time: 1.29 s


In [38]:
%%time
# cudf run: fit and apply the one-hot pipeline `gpu_trf` to the category columns of `gpu_ds`.
# NOTE(review): the recorded run of this cell ended in a CUDA out-of-memory MemoryError,
# presumably because the GPU encoder is configured with make_sparse=False - confirm.
enc = gpu_trf.fit_transform(gpu_cats)

MemoryError: std::bad_alloc: CUDA error at: /beegfs/home/r.zagidullin/RAPIDS_VERSIONS/LightAutoML_GPU/lama_venv/include/rmm/mr/device/cuda_memory_resource.hpp:69: cudaErrorMemoryAllocation out of memory

In [39]:
%%time
# Partitioned run: fit and apply the one-hot pipeline `dd_trf` to the category columns of `dd_ds`.
# NOTE(review): the recorded run of this cell raised NotImplementedError
# ("Series getitem is only supported for other series objects with matching partition structure").
enc = dd_trf.fit_transform(dd_cats)

NotImplementedError: Series getitem is only supported for other series objects with matching partition structure

## CatIntersections

In [40]:
# CPU pipeline: label encoding followed by category intersections.
trf = SequentialTransformer([
    categorical.LabelEncoder(),
    categorical.CatIntersectstions(),
])
# Two independent copies of the GPU pipeline, one per GPU-side dataset.
gpu_trf, dd_trf = (
    SequentialTransformer([
        categorical_gpu.LabelEncoder_gpu(),
        categorical_gpu.CatIntersections_gpu(),
    ])
    for _ in range(2)
)

In [41]:
%%time
# CPU run: fit and apply the intersections pipeline `trf` to the category columns of `ds`.
enc = trf.fit_transform(cats)

CPU times: user 1min 48s, sys: 2.87 s, total: 1min 51s
Wall time: 1min 50s


In [42]:
%%time
# cudf run: fit and apply the intersections pipeline `gpu_trf` to the category columns of `gpu_ds`.
enc = gpu_trf.fit_transform(gpu_cats)

CPU times: user 4.8 s, sys: 3.34 s, total: 8.14 s
Wall time: 8.13 s


In [43]:
%%time
# Partitioned run: fit and apply the intersections pipeline `dd_trf` to the category columns of `dd_ds`.
enc = dd_trf.fit_transform(dd_cats)

CPU times: user 5.09 s, sys: 220 ms, total: 5.31 s
Wall time: 9.4 s


# TIMING DATETIME TRANSFORMERS

## TimeToNum

In [44]:
# `datetime` here is lightautoml.transformers.datetime (imported at the top), not the stdlib module.
# One CPU TimeToNum instance plus two independent GPU instances.
trf = datetime.TimeToNum()
gpu_trf, dd_trf = (datetime_gpu.TimeToNum_gpu() for _ in range(2))

# Restrict each dataset to the columns that carry the 'Datetime' role.
dats, gpu_dats, dd_dats = (
    dataset[:, get_columns_by_role(dataset, 'Datetime')]
    for dataset in (ds, gpu_ds, dd_ds)
)

print(dats.shape, gpu_dats.shape, dd_dats.shape)

(307511, 2) (307511, 2) (307511, 2)


In [45]:
%%time
# CPU run: fit and apply `trf` to the datetime columns of `ds`.
enc = trf.fit_transform(dats)

CPU times: user 14.1 ms, sys: 10 µs, total: 14.1 ms
Wall time: 14.6 ms


In [47]:
%%time
# cudf run: fit and apply `gpu_trf` to the datetime columns of `gpu_ds`.
enc = gpu_trf.fit_transform(gpu_dats)

CPU times: user 5.27 ms, sys: 3.91 ms, total: 9.18 ms
Wall time: 8.15 ms


In [48]:
%%time
# Partitioned run: fit and apply `dd_trf` to the datetime columns of `dd_ds`.
enc = dd_trf.fit_transform(dd_dats)

CPU times: user 22.8 ms, sys: 2.98 ms, total: 25.8 ms
Wall time: 25.4 ms


## BaseDiff

In [49]:
# BaseDiff with the first datetime feature as base and the second as the diffed column.
# All three transformers take the feature names from the CPU dataset `dats`.
trf = datetime.BaseDiff(
    base_names=[dats.features[0]],
    diff_names=[dats.features[1]],
)
gpu_trf, dd_trf = (
    datetime_gpu.BaseDiff_gpu(
        base_names=[dats.features[0]],
        diff_names=[dats.features[1]],
    )
    for _ in range(2)
)

In [50]:
%%time
# CPU run: fit and apply the BaseDiff transformer `trf` to the datetime columns of `ds`.
enc = trf.fit_transform(dats)

CPU times: user 8.68 ms, sys: 0 ns, total: 8.68 ms
Wall time: 7.6 ms


In [52]:
%%time
# cudf run: fit and apply the BaseDiff transformer `gpu_trf` to the datetime columns of `gpu_ds`.
enc = gpu_trf.fit_transform(gpu_dats)

CPU times: user 6.58 ms, sys: 7.04 ms, total: 13.6 ms
Wall time: 12.5 ms


In [53]:
%%time
# Partitioned run: fit and apply the BaseDiff transformer `dd_trf` to the datetime columns of `dd_ds`.
enc = dd_trf.fit_transform(dd_dats)

CPU times: user 18.2 ms, sys: 1.12 ms, total: 19.3 ms
Wall time: 18.9 ms


## DateSeasons

In [54]:
# DateSeasons transformers: one CPU instance plus two independent GPU instances.
trf = datetime.DateSeasons()
gpu_trf, dd_trf = (datetime_gpu.DateSeasons_gpu() for _ in range(2))

In [55]:
%%time
# CPU run: fit and apply the DateSeasons transformer `trf` to the datetime columns of `ds`.
enc = trf.fit_transform(dats)

CPU times: user 198 ms, sys: 1.96 ms, total: 200 ms
Wall time: 196 ms


In [56]:
%%time
# cudf run: fit and apply the DateSeasons transformer `gpu_trf` to the datetime columns of `gpu_ds`.
enc = gpu_trf.fit_transform(gpu_dats)

CPU times: user 15.1 ms, sys: 3.88 ms, total: 18.9 ms
Wall time: 23.6 ms


In [57]:
%%time
# Partitioned run: fit and apply the DateSeasons transformer `dd_trf` to the datetime columns of `dd_ds`.
enc = dd_trf.fit_transform(dd_dats)

CPU times: user 19.5 ms, sys: 2.67 ms, total: 22.2 ms
Wall time: 21.3 ms


# TIMING NUMERICAL TRANSFORMERS

## NaNFlags

In [58]:
# NaNFlags transformers: one CPU instance plus two independent GPU instances.
trf = numeric.NaNFlags()
gpu_trf, dd_trf = (numeric_gpu.NaNFlags_gpu() for _ in range(2))

# Restrict each dataset to the columns that carry the 'Numeric' role.
nums, gpu_nums, dd_nums = (
    dataset[:, get_columns_by_role(dataset, 'Numeric')]
    for dataset in (ds, gpu_ds, dd_ds)
)

print(nums.shape, gpu_nums.shape, dd_nums.shape)

(307511, 77) (307511, 84) (307511, 78)


In [59]:
%%time
# CPU run: fit and apply the NaNFlags transformer `trf` to the numeric columns of `ds`.
enc = trf.fit_transform(nums)

CPU times: user 54.8 ms, sys: 27.9 ms, total: 82.7 ms
Wall time: 80.3 ms


In [60]:
%%time
# cudf run: fit and apply the NaNFlags transformer `gpu_trf` to the numeric columns of `gpu_ds`.
enc = gpu_trf.fit_transform(gpu_nums)

CPU times: user 62.3 ms, sys: 35.8 ms, total: 98 ms
Wall time: 147 ms


In [61]:
%%time
# Partitioned run: fit and apply the NaNFlags transformer `dd_trf` to the numeric columns of `dd_ds`.
enc = dd_trf.fit_transform(dd_nums)

CPU times: user 162 ms, sys: 9.69 ms, total: 171 ms
Wall time: 276 ms


## FillnaMedian

In [62]:
# FillnaMedian transformers: one CPU instance plus two independent GPU instances.
trf = numeric.FillnaMedian()
gpu_trf, dd_trf = (numeric_gpu.FillnaMedian_gpu() for _ in range(2))

In [63]:
%%time
# CPU run: fit and apply the FillnaMedian transformer `trf` to the numeric columns of `ds`.
enc = trf.fit_transform(nums)

CPU times: user 425 ms, sys: 20.9 ms, total: 446 ms
Wall time: 434 ms


In [64]:
%%time
# cudf run: fit and apply the FillnaMedian transformer `gpu_trf` to the numeric columns of `gpu_ds`.
enc = gpu_trf.fit_transform(gpu_nums)

CPU times: user 99 ms, sys: 72.3 ms, total: 171 ms
Wall time: 405 ms


In [65]:
%%time
# Partitioned run: fit and apply the FillnaMedian transformer `dd_trf` to the numeric columns of `dd_ds`.
enc = dd_trf.fit_transform(dd_nums)

CPU times: user 1.84 s, sys: 34.9 ms, total: 1.87 s
Wall time: 2.07 s


## FillInf

In [66]:
# FillInf transformers: one CPU instance plus two independent GPU instances.
trf = numeric.FillInf()
gpu_trf, dd_trf = (numeric_gpu.FillInf_gpu() for _ in range(2))

In [67]:
%%time
# CPU run: fit and apply the FillInf transformer `trf` to the numeric columns of `ds`.
enc = trf.fit_transform(nums)

CPU times: user 38.2 ms, sys: 22.1 ms, total: 60.3 ms
Wall time: 56 ms


In [75]:
%%time
# cudf run: fit and apply the FillInf transformer `gpu_trf` to the numeric columns of `gpu_ds`.
enc = gpu_trf.fit_transform(gpu_nums)

CPU times: user 29.2 ms, sys: 66 ms, total: 95.2 ms
Wall time: 92.6 ms


In [76]:
%%time
# Partitioned run: fit and apply the FillInf transformer `dd_trf` to the numeric columns of `dd_ds`.
enc = dd_trf.fit_transform(dd_nums)

CPU times: user 61.3 ms, sys: 16.1 ms, total: 77.3 ms
Wall time: 76.6 ms


## LogOdds

In [70]:
# LogOdds transformers: one CPU instance plus two independent GPU instances.
trf = numeric.LogOdds()
gpu_trf, dd_trf = (numeric_gpu.LogOdds_gpu() for _ in range(2))

In [71]:
%%time
# CPU run: fit and apply the LogOdds transformer `trf` to the numeric columns of `ds`.
enc = trf.fit_transform(nums)

CPU times: user 84.3 ms, sys: 72.2 ms, total: 157 ms
Wall time: 153 ms


In [77]:
%%time
# cudf run: fit and apply the LogOdds transformer `gpu_trf` to the numeric columns of `gpu_ds`.
enc = gpu_trf.fit_transform(gpu_nums)

CPU times: user 30.2 ms, sys: 52.1 ms, total: 82.3 ms
Wall time: 80 ms


In [78]:
%%time
# Partitioned run: fit and apply the LogOdds transformer `dd_trf` to the numeric columns of `dd_ds`.
enc = dd_trf.fit_transform(dd_nums)

CPU times: user 63.8 ms, sys: 13.4 ms, total: 77.2 ms
Wall time: 77.2 ms


## StandardScaler

In [79]:
# StandardScaler transformers: one CPU instance plus two independent GPU instances.
trf = numeric.StandardScaler()
gpu_trf, dd_trf = (numeric_gpu.StandardScaler_gpu() for _ in range(2))

In [80]:
%%time
# CPU run: fit and apply the StandardScaler `trf` to the numeric columns of `ds`.
enc = trf.fit_transform(nums)

CPU times: user 498 ms, sys: 70.2 ms, total: 568 ms
Wall time: 554 ms


In [82]:
%%time
# cudf run: fit and apply the StandardScaler `gpu_trf` to the numeric columns of `gpu_ds`.
enc = gpu_trf.fit_transform(gpu_nums)

CPU times: user 56.9 ms, sys: 61.2 ms, total: 118 ms
Wall time: 115 ms


In [83]:
%%time
# Partitioned run: fit and apply the StandardScaler `dd_trf` to the numeric columns of `dd_ds`.
enc = dd_trf.fit_transform(dd_nums)

CPU times: user 247 ms, sys: 24.6 ms, total: 272 ms
Wall time: 2.08 s


## QuantileBinning

In [84]:
# QuantileBinning transformers: one CPU instance plus two independent GPU instances.
trf = numeric.QuantileBinning()
gpu_trf, dd_trf = (numeric_gpu.QuantileBinning_gpu() for _ in range(2))

In [85]:
%%time
# CPU run: fit and apply the QuantileBinning transformer `trf` to the numeric columns of `ds`.
enc = trf.fit_transform(nums)

CPU times: user 1.31 s, sys: 41.3 ms, total: 1.35 s
Wall time: 1.32 s


In [86]:
%%time
# cudf run: fit and apply the QuantileBinning transformer `gpu_trf` to the numeric columns of `gpu_ds`.
enc = gpu_trf.fit_transform(gpu_nums)

CPU times: user 211 ms, sys: 169 ms, total: 380 ms
Wall time: 379 ms


In [87]:
%%time
# Partitioned run, fit step only: unlike the other benchmarks, fit and transform are timed in separate cells.
# `enc` receives whatever fit() returns here, not a transformed dataset.
enc = dd_trf.fit(dd_nums)

CPU times: user 2.48 s, sys: 156 ms, total: 2.64 s
Wall time: 5.4 s


In [88]:
%%time
# Partitioned run, transform step only: applies `dd_trf` to the numeric columns of `dd_ds`.
enc = dd_trf.transform(dd_nums)

CPU times: user 70.3 ms, sys: 2.06 ms, total: 72.4 ms
Wall time: 81.8 ms


In [None]:
# TIMING MULTICLASS TARGET ENCODER

In [97]:
# Multiclass task shared by every reader below
# (the original line carried a redundant `task = task = ...` double assignment).
task = Task('multiclass')
adv_roles = True  # passed as advanced_roles to every reader
parts = 2         # passed as npartitions to the dask-based readers

# Reader used on the pandas frame.
reader = PandasToPandasReader(task, advanced_roles=adv_roles, n_jobs=1)
# Reader used on the cudf frame.
gpu_reader = CudfReader(task, advanced_roles=adv_roles, n_jobs=1)
# Reader used on the dask_cudf frame.
dd_reader = DaskCudfReader(
    task,
    advanced_roles=adv_roles,
    n_jobs=1,
    index_ok=True,
    compute=False,
    npartitions=parts,
)

# Hybrid reader: configured with one CPU reader and two GPU readers,
# gpu_ratio=0.6 and 'mgpu' output.
hy_reader = HybridReader(
    task,
    num_cpu_readers=1,
    num_gpu_readers=2,
    gpu_ratio=0.6,
    output='mgpu',
    advanced_roles=adv_roles,
    index_ok=True,
    compute=False,
    npartitions=parts,
    n_jobs=1,
)

In [90]:
data_len = data.shape[0]
# Replace TARGET with random labels drawn from {0, 1, 2, 3} for the multiclass benchmark.
# A fixed seed keeps the labels identical across kernel restarts, and assigning the
# array directly (rather than a Series) makes the assignment positional, independent of
# the frame's index.
# NOTE: this overwrites the TARGET column that `data` held before this cell.
data['TARGET'] = np.random.default_rng(42).integers(0, 4, size=data_len).astype('i')

In [91]:
# Rebuild the cudf frame from the updated pandas frame; nan_as_null=False is passed so NaNs are not turned into nulls.
gpu_data = cudf.DataFrame.from_pandas(data, nan_as_null=False)
# Split the cudf frame into `parts` partitions as a dask_cudf DataFrame.
dd_data = dask_cudf.from_cudf(gpu_data, npartitions=parts)

In [92]:
%%time
# Time the pandas reader on the pandas frame; 'TARGET' is declared as the target column.
ds = reader.fit_read(data, roles = {'target': 'TARGET'})

CPU times: user 40 s, sys: 2.37 s, total: 42.4 s
Wall time: 41.6 s


In [94]:
%%time
# Time the cudf reader on the cudf frame; 'TARGET' is declared as the target column.
gpu_ds = gpu_reader.fit_read(gpu_data, roles = {'target': 'TARGET'})

CPU times: user 22.4 s, sys: 9.5 s, total: 31.9 s
Wall time: 31.7 s


In [95]:
%%time
# Time the dask_cudf reader on the partitioned frame; 'TARGET' is declared as the target column.
dd_ds = dd_reader.fit_read(dd_data, roles = {'target': 'TARGET'})

CPU times: user 28.6 s, sys: 10.4 s, total: 39 s
Wall time: 40.7 s


In [98]:
%%time
# Time the hybrid reader, which is fed the pandas frame directly.
# NOTE(review): the result is bound to `dd_ds`, replacing the DaskCudfReader dataset;
# the `dd_*` benchmarks that follow therefore run on the hybrid reader's output - confirm this is intended.
dd_ds = hy_reader.fit_read(data, roles =  {'target': 'TARGET'})

CPU times: user 6.18 s, sys: 2.18 s, total: 8.36 s
Wall time: 40.8 s


In [99]:
# Restrict each multiclass dataset to the columns that carry the 'Category' role.
# NOTE(review): an earlier comment here said the CPU dataset has zero category columns,
# but the recorded output lists one column for every dataset - confirm which is current.
cats, gpu_cats, dd_cats = (
    dataset[:, get_columns_by_role(dataset, 'Category')]
    for dataset in (ds, gpu_ds, dd_ds)
)

print(cats.shape, gpu_cats.shape, dd_cats.shape)

(307511, 1) (307511, 1) (307511, 1)


In [100]:
# CPU pipeline: label encoding followed by multiclass target encoding.
trf = SequentialTransformer([
    categorical.LabelEncoder(),
    categorical.MultiClassTargetEncoder(),
])
# Two independent copies of the GPU pipeline, one per GPU-side dataset.
gpu_trf, dd_trf = (
    SequentialTransformer([
        categorical_gpu.LabelEncoder_gpu(),
        categorical_gpu.MultiClassTargetEncoder_gpu(),
    ])
    for _ in range(2)
)


In [101]:
%time
enc = trf.fit_transform(cats)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.01 µs


In [104]:
%%time
# cudf run: fit and apply the multiclass target-encoding pipeline `gpu_trf` to the category columns of `gpu_ds`.
enc = gpu_trf.fit_transform(gpu_cats)

CPU times: user 61.4 ms, sys: 43.1 ms, total: 105 ms
Wall time: 101 ms


In [105]:
%%time
# Partitioned run: fit and apply the multiclass target-encoding pipeline `dd_trf` to the category columns of `dd_ds`.
enc = dd_trf.fit_transform(dd_cats)

CPU times: user 476 ms, sys: 17.8 ms, total: 493 ms
Wall time: 741 ms


In [106]:
# Display the final (rows, columns) shape of the pandas frame.
data.shape

(307511, 124)