In [1]:
import os
import pickle
import time

from lightautoml.dataset.np_pd_dataset_cupy import *
from lightautoml.dataset.roles import *
from lightautoml.dataset.utils import roles_parser

from lightautoml.tasks import Task

In [2]:
from lightautoml.transformers import numeric_gpu, categorical_gpu, datetime_gpu

# Initial data load

In [3]:
data = pd.read_csv('./example_data/test_data_files/sampled_app_train.csv',
                       usecols=['TARGET', 'NAME_CONTRACT_TYPE', 'AMT_CREDIT',
                                'NAME_TYPE_SUITE', 'AMT_GOODS_PRICE',
                                'DAYS_BIRTH', 'DAYS_EMPLOYED'])

# Fix dates and convert to date type

data['BIRTH_DATE'] = np.datetime64('2018-01-01') + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))
data['EMP_DATE'] = np.datetime64('2018-01-01') + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))
data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

# Create folds
data['__fold__'] = np.random.randint(0, 5, len(data))

# Print data head
print(data.head())

# # Set roles for columns

check_roles = {
    TargetRole(): 'TARGET',
    CategoryRole(dtype=str): ['NAME_CONTRACT_TYPE', 'NAME_TYPE_SUITE'],
    NumericRole(np.float32): ['AMT_CREDIT', 'AMT_GOODS_PRICE'],
    DatetimeRole(seasonality=['y', 'm', 'wd']): ['BIRTH_DATE', 'EMP_DATE'],
    FoldsRole(): '__fold__'
}

# create Task
task = Task('binary')

# # Creating PandasDataSet
pd_dataset = PandasDataset(data, roles_parser(check_roles), task=task)

cudf_dataset = pd_dataset.to_cudf()

   TARGET NAME_CONTRACT_TYPE  AMT_CREDIT  AMT_GOODS_PRICE NAME_TYPE_SUITE  \
0       0         Cash loans    327024.0         270000.0   Unaccompanied   
1       0         Cash loans    675000.0         675000.0   Unaccompanied   
2       0    Revolving loans    270000.0         270000.0   Unaccompanied   
3       0         Cash loans    142200.0         112500.0   Unaccompanied   
4       0         Cash loans   1483231.5        1354500.0          Family   

  BIRTH_DATE   EMP_DATE  __fold__  
0 1967-02-08 2017-05-20         3  
1 1962-05-19 2007-07-16         4  
2 1991-11-14 2015-01-21         2  
3 1986-09-25 2011-10-13         4  
4 1981-11-08 2013-02-21         4  


In [4]:
np.isnan(data.NAME_TYPE_SUITE.iloc[180])

True

In [5]:
# check for cudf to cupy values conversion
print(cudf_dataset.data['AMT_GOODS_PRICE'].values)

# check for nan correct representation
print(cudf_dataset.data['AMT_GOODS_PRICE'].values.sum())

[ 270000.  675000.  270000. ...  540000.  436500. 1800000.]
nan


# Numeric dataset creation & numerical transformers test

In [6]:
check_roles = {
    NumericRole(np.float32): ['AMT_CREDIT', 'AMT_GOODS_PRICE'],
}

numeric_dataset = CudfDataset(data=cudf_dataset.data[['AMT_CREDIT',\
                                                      'AMT_GOODS_PRICE']].copy(),
                        roles=roles_parser(check_roles),
                        task=task)

In [7]:
numeric_dataset.folds

## FillNA transformer

In [8]:
filler = numeric_gpu.FillnaMedian()

In [9]:
filled_dataset = filler.fit_transform(numeric_dataset)
print(filled_dataset)

array([[ 327024. ,  270000. ],
       [ 675000. ,  675000. ],
       [ 270000. ,  270000. ],
       ...,
       [ 540000. ,  540000. ],
       [ 505642.5,  436500. ],
       [2013840. , 1800000. ]], dtype=float32)


## Standard scaler transformer

In [10]:
scaler = numeric_gpu.StandardScaler()

In [11]:
scaler.fit_transform(filled_dataset)

array([[-0.6588682 , -0.71132535],
       [ 0.19484693,  0.37249017],
       [-0.7987694 , -0.71132535],
       ...,
       [-0.1363585 ,  0.01121833],
       [-0.22065029, -0.26575673],
       [ 3.4795218 ,  3.3830886 ]], dtype=float32)

## NANRate

In [12]:
nan_rate = numeric_gpu.NaNFlags()

In [13]:
nan_rate.fit_transform(numeric_dataset)

array([], shape=(10000, 0), dtype=float64)

## FillInf transformer

In [14]:
fill_inf = numeric_gpu.FillInf()

In [15]:
test_dataset = numeric_dataset.to_numpy().to_cupy()

In [16]:
test_dataset.data[-1,0] = cp.inf

In [17]:
fill_inf.fit_transform(test_dataset)

array([[ 327024. ,  270000. ],
       [ 675000. ,  675000. ],
       [ 270000. ,  270000. ],
       ...,
       [ 540000. ,  540000. ],
       [ 505642.5,  436500. ],
       [      nan, 1800000. ]], dtype=float32)

In [18]:
test_dataset

array([[ 327024. ,  270000. ],
       [ 675000. ,  675000. ],
       [ 270000. ,  270000. ],
       ...,
       [ 540000. ,  540000. ],
       [ 505642.5,  436500. ],
       [      inf, 1800000. ]], dtype=float32)

## QuantileBinning transformer

In [19]:
qb = numeric_gpu.QuantileBinning()

In [20]:
qb.fit_transform(filled_dataset)

array([[ 4,  3],
       [ 7,  7],
       [ 3,  3],
       ...,
       [ 6,  7],
       [ 5,  5],
       [10, 10]], dtype=int32)

In [21]:
filled_dataset

array([[ 327024. ,  270000. ],
       [ 675000. ,  675000. ],
       [ 270000. ,  270000. ],
       ...,
       [ 540000. ,  540000. ],
       [ 505642.5,  436500. ],
       [2013840. , 1800000. ]], dtype=float32)

# Categorical dataset creation & categorical transformers test

In [22]:
check_roles = {
    CategoryRole(dtype=str): ['NAME_CONTRACT_TYPE', 'NAME_TYPE_SUITE'],
}

In [23]:
categorical_dataset = CudfDataset(data=cudf_dataset.data[['NAME_CONTRACT_TYPE',\
                                                      'NAME_TYPE_SUITE']].copy(),
                        roles=roles_parser(check_roles),
                        task=task)

In [24]:
categorical_dataset.data.NAME_TYPE_SUITE.isna().sum()

0

## Label Encoder transformer

In [25]:
le = categorical_gpu.LabelEncoder()

In [26]:
a = le.fit_transform(categorical_dataset).data

In [27]:
encoded_dataset = le.transform(categorical_dataset)

In [28]:
from lightautoml.transformers.categorical import LabelEncoder as LE

In [29]:
label = LE()
b = label.fit_transform(encoded_dataset.to_cudf().to_pandas()).data

In [30]:
(b == cp.asnumpy(a)).all()

True

## OneHot Encoder transformer

In [31]:
ohe = categorical_gpu.OHEEncoder()

In [32]:
ohe.fit_transform(encoded_dataset)

array([[1., 0., 1., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 1., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [33]:
from cuml.preprocessing import OneHotEncoder

In [34]:
ohe_ = OneHotEncoder(categories='auto',
                                 dtype=cp.float32, sparse=False,
                                 handle_unknown='ignore')

In [35]:
a = ohe_.fit_transform(encoded_dataset.data)

In [36]:
b = ohe.fit_transform(encoded_dataset)

In [37]:
(a == b.data).all()

array(True)

## FreqEncoder transformer

In [38]:
freq_enc = categorical_gpu.FreqEncoder()

In [39]:
freq_enc.fit_transform(encoded_dataset)

array([[8972., 8045.],
       [8972., 8045.],
       [1028., 8045.],
       ...,
       [1028., 8045.],
       [8972., 8045.],
       [8972.,  387.]], dtype=float32)

In [40]:
from lightautoml.transformers.categorical import FreqEncoder as fe

In [41]:
fe_ = fe()

In [42]:
fe_.fit_transform(encoded_dataset.to_cudf().to_pandas())

array([[8972., 8045.],
       [8972., 8045.],
       [1028., 8045.],
       ...,
       [1028., 8045.],
       [8972., 8045.],
       [8972.,  387.]], dtype=float32)

In [43]:
(freq_enc.fit_transform(encoded_dataset).to_numpy().data ==\
fe_.fit_transform(encoded_dataset.to_cudf().to_pandas()).data).all()

True

## TargetEncoder transformer

In [44]:
encoded_dataset.folds = cudf_dataset.folds
encoded_dataset.target = cudf_dataset.target

In [45]:
target_enc = categorical_gpu.TargetEncoder()

In [46]:
a = target_enc.fit_transform(encoded_dataset)

In [47]:
from lightautoml.transformers.categorical import TargetEncoder as te

In [48]:
target_cpu = te()

encoded_cpu = encoded_dataset.to_cudf().to_pandas().to_numpy()
encoded_cpu.folds = cp.asnumpy(encoded_dataset.folds)
encoded_cpu.target = cp.asnumpy(encoded_dataset.target)
b = target_cpu.fit_transform(encoded_cpu)

In [49]:
np.allclose(cp.asnumpy(a.data), b.data)

True

## Multiclass TargetEncoder transformer

In [50]:
task = Task('multiclass')
check_roles = {
    CategoryRole(dtype=int): ['le__NAME_CONTRACT_TYPE', 'le__NAME_TYPE_SUITE'],
}

encoded_multiclass = CupyDataset(data=encoded_dataset.data.copy(),
                        features=['le__NAME_CONTRACT_TYPE', 'le__NAME_TYPE_SUITE'],
                        roles=roles_parser(check_roles),
                        task=task)

encoded_multiclass.target = cudf.Series(cp.random.randint(0,5,10000))

In [51]:
le = categorical_gpu.LabelEncoder()
multi_te = categorical_gpu.MultiClassTargetEncoder()

In [52]:
encoded_le = le.fit_transform(encoded_multiclass)
encoded_le.target = encoded_multiclass.target
encoded_le.folds = encoded_dataset.folds
a = multi_te.fit_transform(encoded_le).data

In [53]:
encoded_cpu = encoded_le.to_cudf().to_pandas().to_numpy()
encoded_cpu.target = encoded_multiclass.target.to_pandas().values
encoded_cpu.folds = encoded_dataset.folds.to_pandas().values

In [54]:
from lightautoml.transformers.categorical import MultiClassTargetEncoder as MCTE

In [55]:
mlc = MCTE()

In [56]:
b = mlc.fit_transform(encoded_cpu).data

In [57]:
np.allclose(a, cp.asarray(b))

array(True)

In [58]:
(np.abs(cp.asnumpy(a)-b).max()/np.abs(b)).max()

7.868959e-08

## CatIntersections transformer

In [59]:
cat_intersect_transformer = categorical_gpu.CatIntersectstions()
a = cat_intersect_transformer.fit_transform(encoded_dataset).data

In [60]:
from lightautoml.transformers.categorical import CatIntersectstions as CI
cat_int = CI()
b = cat_int.fit_transform(encoded_dataset.to_cudf().to_pandas()).data

In [61]:
print((cp.asnumpy(a)==b).mean())

0.9992


## Ordinal Encoder transformer

In [62]:
ordinal_encoder = categorical_gpu.OrdinalEncoder()

In [63]:
a = ordinal_encoder.fit_transform(categorical_dataset).data

In [64]:
from lightautoml.transformers.categorical import OrdinalEncoder as OE

In [65]:
ordinal = OE()
b = ordinal.fit_transform(categorical_dataset.to_cudf().to_pandas()).data

In [66]:
(cp.asnumpy(a) == b).all()

True

# Datetime transformers

In [67]:
check_roles = {
    DatetimeRole(seasonality=['y', 'm', 'wd']): ['BIRTH_DATE', 'EMP_DATE'],
}

datetime_dataset = CudfDataset(data=cudf_dataset.data[['BIRTH_DATE',\
                                                      'EMP_DATE']].copy(),
                        roles=roles_parser(check_roles),
                        task=task)

In [68]:
datetime_dataset.data

Unnamed: 0,BIRTH_DATE,EMP_DATE
0,1967-02-08,2017-05-20
1,1962-05-19,2007-07-16
2,1991-11-14,2015-01-21
3,1986-09-25,2011-10-13
4,1981-11-08,2013-02-21
...,...,...
9995,1958-09-28,2018-01-01
9996,1984-02-09,2017-07-19
9997,1974-10-14,1997-01-12
9998,1961-01-02,2018-01-01


## TimeToNum transformer

In [69]:
time_to_num = datetime_gpu.TimeToNum()

In [70]:
time_to_num.fit_transform(datetime_dataset)

array([[-19320.,   -956.],
       [-21046.,  -4552.],
       [-10275.,  -1806.],
       ...,
       [-16515.,  -8389.],
       [-21548.,   -730.],
       [-13896.,  -1653.]], dtype=float32)

## BaseDiff transformer

In [71]:
basediff = datetime_gpu.BaseDiff(base_names=['EMP_DATE'], diff_names=['BIRTH_DATE'])

In [72]:
basediff.fit_transform(datetime_dataset)

array([[-18364.],
       [-16494.],
       [ -8469.],
       ...,
       [ -8126.],
       [-20818.],
       [-12243.]], dtype=float32)

## DateSeasons transformer

In [73]:
date_seasons = datetime_gpu.DateSeasons()

date_seasons.fit_transform(datetime_dataset)

array([[1967,    2,    2, 2017,    5,    5],
       [1962,    5,    5, 2007,    7,    0],
       [1991,   11,    3, 2015,    1,    2],
       ...,
       [1974,   10,    0, 1997,    1,    6],
       [1961,    1,    0, 2018,    1,    0],
       [1981,   12,    1, 2015,    6,    1]], dtype=int32)