In [1]:
# Show which Python interpreter the kernel is running, to confirm the active environment.
import sys
sys.executable

'/home/user/miniconda3/bin/python'

In [2]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from scipy import sparse

from lightautoml.reader.cudf_reader import CudfReader
from lightautoml.reader.daskcudf_reader import DaskCudfReader
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task

import cudf
import dask_cudf

from numba import cuda
import cupy as cp

from lightautoml.transformers.base import SequentialTransformer, UnionTransformer


from lightautoml.transformers import numeric_gpu, categorical_gpu, datetime_gpu
from lightautoml.transformers import numeric, categorical, datetime

from lightautoml.pipelines.utils import get_columns_by_role

from dask.distributed import Client
from dask_cuda import LocalCUDACluster

import os

In [3]:
# Start a local Dask CUDA cluster limited to the devices listed in
# CUDA_VISIBLE_DEVICES ("0, 1"), with the UCX protocol, NVLink enabled,
# RMM managed memory switched on and an 8GB memory_limit.
cluster = LocalCUDACluster(rmm_managed_memory=True, CUDA_VISIBLE_DEVICES="0, 1",
                           protocol="ucx", enable_nvlink=True,
                           memory_limit="8GB")

# Connect a distributed client to that cluster.
client = Client(cluster)
# Disabled alternative: set the cudf allocator to "managed" on every worker.
# client.run(cudf.set_allocator, "managed")
# Run os.getpid on each worker; the result maps worker address -> process id,
# which doubles as a quick check that both workers are up.
client.run(os.getpid)

{'ucx://127.0.0.1:34115': 10653, 'ucx://127.0.0.1:54567': 10656}

In [4]:
# NOTE(review): IPython-only help lookup left over from interactive exploration;
# it produces no result used by later cells and can be dropped from the final notebook.
?client.run

In [5]:
# Load the training table into a pandas DataFrame (host memory).
# NOTE(review): the path is relative to the notebook's working directory — confirm it
# resolves on your machine.
data = pd.read_csv('../PyCharmProj/lightautoml/application_train.csv')
# Disabled alternative: read the same file straight into a cudf DataFrame.
# data = cudf.read_csv('../PyCharmProj/lightautoml/application_train.csv')


In [6]:
# Turn the day-offset columns into date strings relative to a fixed reference date.
#
# pd.to_timedelta(..., unit='D') replaces `.astype(np.dtype('timedelta64[D]'))`:
# pandas >= 2.0 only supports s/ms/us/ns timedelta resolutions in astype and
# raises for 'D', while to_timedelta works on old and new pandas alike.
REFERENCE_DATE = np.datetime64('2018-01-01')

data['BIRTH_DATE'] = (
    REFERENCE_DATE + pd.to_timedelta(data['DAYS_BIRTH'], unit='D')
).astype(str)

# Offsets above 0 are clipped to 0 first, so they map onto the reference date itself.
data['EMP_DATE'] = (
    REFERENCE_DATE + pd.to_timedelta(data['DAYS_EMPLOYED'].clip(upper=0), unit='D')
).astype(str)

# Two degenerate columns: one constant, one entirely NaN.
data['constant'] = 1
data['allnan'] = np.nan

# The raw offset columns are no longer needed once the date strings exist.
# Rebinding `data` (instead of inplace=True) keeps the statement side-effect free
# for any other reference to the original frame.
data = data.drop(columns=['DAYS_BIRTH', 'DAYS_EMPLOYED'])


In [7]:
# Build a cudf DataFrame from the pandas frame.
gpu_data = cudf.DataFrame(data)
# Wrap the cudf frame as a dask_cudf collection split into 2 partitions
# (the cluster above was started with two visible devices).
dd_data = dask_cudf.from_cudf(gpu_data, npartitions=2)

In [8]:
# DaskCudf running time
# advanced with both
# low utilization

In [9]:
# One binary task object is shared by all three readers so that the pandas,
# cudf and dask_cudf paths are configured the same way.
task = Task('binary')

# Forwarded as `advanced_roles=` to every reader below.
adv_roles = False

reader = PandasToPandasReader(task, advanced_roles=adv_roles)
gpu_reader = CudfReader(task, advanced_roles=adv_roles)
dd_reader = DaskCudfReader(task, advanced_roles=adv_roles)


In [10]:
%%time
# Fit the pandas reader on the host DataFrame, using the 'TARGET' column as the target role.
ds = reader.fit_read(data, roles = {'target': 'TARGET'})

CPU times: user 1.25 s, sys: 261 ms, total: 1.51 s
Wall time: 1.49 s


In [11]:
%%time
# Same read as the pandas cell, but through the cudf reader on the cudf DataFrame.
gpu_ds = gpu_reader.fit_read(gpu_data, roles = {'target': 'TARGET'})

CPU times: user 1.29 s, sys: 118 ms, total: 1.41 s
Wall time: 1.36 s


In [31]:
%%time
# Same read again, through the dask_cudf reader on the partitioned dask_cudf collection.
dd_ds = dd_reader.fit_read(dd_data, roles = {'target': 'TARGET'})

CPU times: user 16.1 s, sys: 3.6 s, total: 19.7 s
Wall time: 20.3 s


In [13]:
# Inspect the role assigned to each column by the pandas reader.
ds.roles 

{'SK_ID_CURR': Numeric role, dtype <class 'numpy.float32'>. Additional params: [('force_input', False), ('prob', False), ('discretization', False)],
 'NAME_CONTRACT_TYPE': Category role, dtype <class 'object'>. Additional params: [('encoding_type', 'auto'), ('unknown', 5), ('force_input', False), ('label_encoded', False), ('ordinal', False)],
 'CODE_GENDER': Category role, dtype <class 'object'>. Additional params: [('encoding_type', 'auto'), ('unknown', 5), ('force_input', False), ('label_encoded', False), ('ordinal', False)],
 'FLAG_OWN_CAR': Category role, dtype <class 'object'>. Additional params: [('encoding_type', 'auto'), ('unknown', 5), ('force_input', False), ('label_encoded', False), ('ordinal', False)],
 'FLAG_OWN_REALTY': Category role, dtype <class 'object'>. Additional params: [('encoding_type', 'auto'), ('unknown', 5), ('force_input', False), ('label_encoded', False), ('ordinal', False)],
 'CNT_CHILDREN': Numeric role, dtype <class 'numpy.float32'>. Additional params: [(

In [12]:
# Label encoders to benchmark: one from the `categorical` module for the pandas dataset,
# and two separate LabelEncoder_gpu instances (one for the cudf dataset, one for the
# dask_cudf dataset) so that fitting one does not touch the other.
trf = categorical.LabelEncoder()
gpu_trf = categorical_gpu.LabelEncoder_gpu()
dd_trf = categorical_gpu.LabelEncoder_gpu()

In [14]:
# Pick the categorical columns once, based on the roles found by the pandas reader,
# and take the same column slice from every dataset so all encoders see identical input.
cat_cols = get_columns_by_role(ds, 'Category')

cats = ds[:, cat_cols]
gpu_cats = gpu_ds[:, cat_cols]
# dd_cats is consumed by the dask_cudf timing cells further down; leaving it
# commented out makes a fresh top-to-bottom run fail with NameError there.
dd_cats = dd_ds[:, cat_cols]

In [15]:
# List the column names that carry the 'Category' role in the pandas dataset.
get_columns_by_role(ds, 'Category')

['CODE_GENDER',
 'EMERGENCYSTATE_MODE',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'NAME_CONTRACT_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'NAME_INCOME_TYPE',
 'NAME_TYPE_SUITE',
 'OCCUPATION_TYPE',
 'ORGANIZATION_TYPE',
 'WALLSMATERIAL_MODE',
 'WEEKDAY_APPR_PROCESS_START']

In [16]:
# Check that the sliced cudf dataset kept the Category roles for the selected columns.
gpu_cats.roles

{'CODE_GENDER': Category role, dtype <class 'object'>. Additional params: [('encoding_type', 'auto'), ('unknown', 5), ('force_input', False), ('label_encoded', False), ('ordinal', False)],
 'EMERGENCYSTATE_MODE': Category role, dtype <class 'object'>. Additional params: [('encoding_type', 'auto'), ('unknown', 5), ('force_input', False), ('label_encoded', False), ('ordinal', False)],
 'FLAG_OWN_CAR': Category role, dtype <class 'object'>. Additional params: [('encoding_type', 'auto'), ('unknown', 5), ('force_input', False), ('label_encoded', False), ('ordinal', False)],
 'FLAG_OWN_REALTY': Category role, dtype <class 'object'>. Additional params: [('encoding_type', 'auto'), ('unknown', 5), ('force_input', False), ('label_encoded', False), ('ordinal', False)],
 'FONDKAPREMONT_MODE': Category role, dtype <class 'object'>. Additional params: [('encoding_type', 'auto'), ('unknown', 5), ('force_input', False), ('label_encoded', False), ('ordinal', False)],
 'HOUSETYPE_MODE': Category role, d

In [17]:
%%timeit
# Timed: fit + transform of the pandas label encoder on the categorical slice.
enc = trf.fit_transform(cats)

782 ms ± 5.44 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
# Display the categorical slice of the cudf dataset (the repr prints the underlying table).
gpu_cats

       CODE_GENDER EMERGENCYSTATE_MODE FLAG_OWN_CAR FLAG_OWN_REALTY  \
0                M                  No            N               Y   
1                F                  No            N               N   
2                M                <NA>            Y               Y   
3                F                <NA>            N               Y   
4                M                <NA>            N               Y   
...            ...                 ...          ...             ...   
307506           M                  No            N               N   
307507           F                  No            N               Y   
307508           F                  No            N               Y   
307509           F                  No            N               Y   
307510           F                  No            N               N   

       FONDKAPREMONT_MODE  HOUSETYPE_MODE NAME_CONTRACT_TYPE  \
0        reg oper account  block of flats         Cash loans   
1        reg oper a

In [19]:
%%timeit
# Timed: fit + transform of the GPU label encoder on the cudf categorical slice.
enc = gpu_trf.fit_transform(gpu_cats)
# Wait for outstanding GPU work so the measurement includes kernel execution,
# not just the launch.
cuda.synchronize()

563 ms ± 9.31 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
%%timeit
# Timed: fit + transform of the GPU label encoder on the dask_cudf categorical slice.
enc = dd_trf.fit_transform(dd_cats)
# NOTE(review): cuda.synchronize() is issued from the notebook process; whether it
# also waits for work running on the dask worker processes is not visible here — confirm.
cuda.synchronize()

1.18 s ± 36.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
# Same listing of 'Category' columns as shown earlier in the notebook.
get_columns_by_role(ds, 'Category')

['CODE_GENDER',
 'EMERGENCYSTATE_MODE',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'NAME_CONTRACT_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'NAME_INCOME_TYPE',
 'NAME_TYPE_SUITE',
 'OCCUPATION_TYPE',
 'ORGANIZATION_TYPE',
 'WALLSMATERIAL_MODE',
 'WEEKDAY_APPR_PROCESS_START']

In [21]:
def _gpu_label_target_pipeline():
    """Return a new LabelEncoder_gpu -> TargetEncoder_gpu sequential pipeline."""
    stages = [
        categorical_gpu.LabelEncoder_gpu(),
        categorical_gpu.TargetEncoder_gpu(),
    ]
    return SequentialTransformer(stages)


# pandas pipeline: label encoding followed by target encoding.
trf = SequentialTransformer([
    categorical.LabelEncoder(),
    categorical.TargetEncoder(),
])

# Separate pipeline instances for the cudf and dask_cudf datasets,
# so neither shares fitted state with the other.
gpu_trf = _gpu_label_target_pipeline()
dd_trf = _gpu_label_target_pipeline()


In [22]:
%%timeit
# Timed: label encoding + target encoding pipeline on the pandas categorical slice.
enc = trf.fit_transform(cats)

3 s ± 18.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [32]:
%%timeit
# Timed: label encoding + target encoding pipeline on the cudf categorical slice.
enc = gpu_trf.fit_transform(gpu_cats)
# Wait for outstanding GPU work before the timer stops.
cuda.synchronize()

860 ms ± 7.39 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
%%timeit
# Timed: label encoding + target encoding pipeline on the dask_cudf categorical slice.
enc = dd_trf.fit_transform(dd_cats)
# NOTE(review): cuda.synchronize() is issued from the notebook process; whether it
# also waits for work running on the dask worker processes is not visible here — confirm.
cuda.synchronize()

21.4 s ± 276 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
# (rows, columns) of the pandas frame the benchmarks above were run on.
data.shape

(307511, 124)