In [None]:
import os
import pickle
import time

from lightautoml.dataset.np_pd_dataset_cupy import *
from lightautoml.dataset.roles import *
from lightautoml.dataset.utils import roles_parser

from lightautoml.tasks import Task

In [None]:
from lightautoml.transformers import numeric_gpu, categorical_gpu, datetime_gpu

# Initial data load

In [None]:
# Load only the columns this notebook exercises.
data = pd.read_csv(
    './example_data/test_data_files/sampled_app_train.csv',
    usecols=['TARGET', 'NAME_CONTRACT_TYPE', 'AMT_CREDIT',
             'NAME_TYPE_SUITE', 'AMT_GOODS_PRICE',
             'DAYS_BIRTH', 'DAYS_EMPLOYED'],
)

# Fix dates and convert to date type
# The DAYS_* columns are added, as day offsets, to a fixed reference date.
# pd.to_timedelta(..., unit='D') replaces astype('timedelta64[D]'): day
# resolution is not a supported pandas timedelta resolution in pandas 2.x,
# whereas to_timedelta yields the same day offsets on old and new versions.
reference_date = np.datetime64('2018-01-01')
data['BIRTH_DATE'] = reference_date + pd.to_timedelta(data['DAYS_BIRTH'], unit='D')
# Positive DAYS_EMPLOYED values are clipped to 0, i.e. mapped onto the reference date.
data['EMP_DATE'] = reference_date + pd.to_timedelta(
    np.clip(data['DAYS_EMPLOYED'], None, 0), unit='D'
)
# Reassign instead of dropping in place so the statement has no hidden side
# effect on other references to the frame.
data = data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1)

# Create folds
# A dedicated, seeded generator keeps the fold assignment identical across
# "Restart & Run All" without touching NumPy's global random state.
fold_rng = np.random.RandomState(42)
data['__fold__'] = fold_rng.randint(0, 5, len(data))

# Print data head
print(data.head())

# Set roles for columns
check_roles = {
    TargetRole(): 'TARGET',
    CategoryRole(dtype=str): ['NAME_CONTRACT_TYPE', 'NAME_TYPE_SUITE'],
    NumericRole(np.float32): ['AMT_CREDIT', 'AMT_GOODS_PRICE'],
    DatetimeRole(seasonality=['y', 'm', 'wd']): ['BIRTH_DATE', 'EMP_DATE'],
    FoldsRole(): '__fold__'
}

# create Task
task = Task('binary')

# Create the CPU PandasDataset first ...
pd_dataset = PandasDataset(data, roles_parser(check_roles), task=task)

# ... then convert it to its cudf counterpart, which the GPU cells below use.
cudf_dataset = pd_dataset.to_cudf()

In [None]:
# NAME_TYPE_SUITE is a string column: np.isnan raises TypeError when the
# entry is an actual string, while pd.isna handles both strings and missing values.
pd.isna(data.NAME_TYPE_SUITE.iloc[180])

In [None]:
# Pull the column's values out of the cudf frame once and reuse them for both checks.
goods_price_values = cudf_dataset.data['AMT_GOODS_PRICE'].values

# check for cudf to cupy values conversion
print(goods_price_values)

# check for nan correct representation
print(goods_price_values.sum())

# Numeric dataset creation & numerical transformers test

In [None]:
# Columns that make up the numeric-only dataset.
numeric_columns = ['AMT_CREDIT', 'AMT_GOODS_PRICE']

# Both columns share a single float32 numeric role.
check_roles = {NumericRole(np.float32): list(numeric_columns)}

# Copy the selected columns so the transformers below work on a frame that is
# independent of cudf_dataset.
numeric_dataset = CudfDataset(
    data=cudf_dataset.data[numeric_columns].copy(),
    roles=roles_parser(check_roles),
    task=task,
)

In [None]:
numeric_dataset.folds

## FillNA transformer

In [None]:
filler = numeric_gpu.FillnaMedian()

In [None]:
filled_dataset = filler.fit_transform(numeric_dataset)
print(filled_dataset)

## Standard scaler transformer

In [None]:
scaler = numeric_gpu.StandardScaler()

In [None]:
scaler.fit_transform(filled_dataset)

## NaNFlags transformer

In [None]:
nan_rate = numeric_gpu.NaNFlags()

In [None]:
nan_rate.fit_transform(numeric_dataset)

## FillInf transformer

In [None]:
fill_inf = numeric_gpu.FillInf()

In [None]:
test_dataset = numeric_dataset.to_numpy().to_cupy()

In [None]:
test_dataset.data[-1,0] = cp.inf

In [None]:
fill_inf.fit_transform(test_dataset)

In [None]:
test_dataset

## QuantileBinning transformer

In [None]:
qb = numeric_gpu.QuantileBinning()

In [None]:
qb.fit_transform(filled_dataset)

In [None]:
filled_dataset

# Categorical dataset creation & categorical transformers test

In [None]:
check_roles = {
    CategoryRole(dtype=str): ['NAME_CONTRACT_TYPE', 'NAME_TYPE_SUITE'],
}

In [None]:
# Columns that make up the categorical-only dataset.
categorical_columns = ['NAME_CONTRACT_TYPE', 'NAME_TYPE_SUITE']

# Copy the selected columns so the encoders below work on a frame that is
# independent of cudf_dataset.
categorical_dataset = CudfDataset(
    data=cudf_dataset.data[categorical_columns].copy(),
    roles=roles_parser(check_roles),
    task=task,
)

In [None]:
categorical_dataset.data.NAME_TYPE_SUITE.isna().sum()

## Label Encoder transformer

In [None]:
le = categorical_gpu.LabelEncoder()

In [None]:
type(categorical_dataset) == CudfDataset

In [None]:
a = le.fit_transform(categorical_dataset).data

In [None]:
encoded_dataset = le.transform(categorical_dataset)

In [None]:
from lightautoml.transformers.categorical import LabelEncoder as LE

In [None]:
# CPU LabelEncoder, used as the reference for the GPU encoder.
label = LE()
# Round-trip the GPU-encoded dataset through cudf to pandas and fit the CPU
# encoder on it; `b` holds the resulting raw data.
# NOTE(review): the input here is the already label-encoded dataset, whereas
# `a` was produced from the raw categorical dataset — confirm this is intended.
b = label.fit_transform(encoded_dataset.to_cudf().to_pandas()).data

In [None]:
(b == cp.asnumpy(a)).all()

## OneHot Encoder transformer

In [None]:
ohe = categorical_gpu.OHEEncoder()

In [None]:
ohe.fit_transform(encoded_dataset)

In [None]:
from cuml.preprocessing import OneHotEncoder

In [None]:
# Reference cuML one-hot encoder. Dense float32 output is requested
# (sparse=False) so the result can be compared elementwise further down.
# NOTE(review): newer cuML releases may expect `sparse_output` instead of
# `sparse` — confirm against the installed version.
ohe_ = OneHotEncoder(categories='auto',
                                 dtype=cp.float32, sparse=False,
                                 handle_unknown='ignore')

In [None]:
a = ohe_.fit_transform(encoded_dataset.data)

In [None]:
b = ohe.fit_transform(encoded_dataset)

In [None]:
(a == b.data).all()

In [None]:
b.data

## FreqEncoder transformer

In [None]:
freq_enc = categorical_gpu.FreqEncoder()

In [None]:
freq_enc.fit_transform(encoded_dataset)

In [None]:
from lightautoml.transformers.categorical import FreqEncoder as fe

In [None]:
fe_ = fe()

In [None]:
fe_.fit_transform(encoded_dataset.to_cudf().to_pandas())

In [None]:
# GPU result, brought back to a NumPy-backed dataset.
gpu_freq = freq_enc.fit_transform(encoded_dataset).to_numpy().data
# CPU result, computed on the same data after a cudf -> pandas round-trip.
cpu_freq = fe_.fit_transform(encoded_dataset.to_cudf().to_pandas()).data
# True only if every encoded value matches exactly.
(gpu_freq == cpu_freq).all()

## TargetEncoder transformer

In [None]:
# The label-encoded dataset was built from a categorical-only subset, so copy
# folds and target over from the full cudf dataset before target encoding.
# This mutates encoded_dataset in place.
encoded_dataset.folds = cudf_dataset.folds
encoded_dataset.target = cudf_dataset.target

In [None]:
target_enc = categorical_gpu.TargetEncoder()

In [None]:
a = target_enc.fit_transform(encoded_dataset)

In [None]:
from lightautoml.transformers.categorical import TargetEncoder as te

In [None]:
# Build a NumPy-backed copy of the encoded dataset (cudf -> pandas -> numpy)
# and move folds/target to host memory alongside it.
encoded_cpu = encoded_dataset.to_cudf().to_pandas().to_numpy()
encoded_cpu.folds = cp.asnumpy(encoded_dataset.folds)
encoded_cpu.target = cp.asnumpy(encoded_dataset.target)

# Fit the CPU TargetEncoder on the host copy; `b` is the reference result.
target_cpu = te()
b = target_cpu.fit_transform(encoded_cpu)

In [None]:
np.allclose(cp.asnumpy(a.data), b.data)

## Multiclass TargetEncoder transformer

In [None]:
# NOTE: this rebinds the notebook-wide `task`; later cells that pass
# `task=task` see the multiclass task until it is rebound again.
task = Task('multiclass')
check_roles = {
    CategoryRole(dtype=int): ['le__NAME_CONTRACT_TYPE', 'le__NAME_TYPE_SUITE'],
}

# Wrap a copy of the label-encoded values in a CupyDataset with integer category roles.
encoded_multiclass = CupyDataset(data=encoded_dataset.data.copy(),
                        features=['le__NAME_CONTRACT_TYPE', 'le__NAME_TYPE_SUITE'],
                        roles=roles_parser(check_roles),
                        task=task)

# Synthetic 5-class target. Its length is taken from the data instead of a
# hard-coded 10000 so the cell keeps working if the input file changes size,
# and a seeded generator makes the GPU/CPU comparison below reproducible.
n_rows = encoded_dataset.data.shape[0]
target_rng = cp.random.RandomState(42)
encoded_multiclass.target = cudf.Series(target_rng.randint(0, 5, n_rows))

In [None]:
le = categorical_gpu.LabelEncoder()
multi_te = categorical_gpu.MultiClassTargetEncoder()

In [None]:
# Label-encode the multiclass dataset on the GPU.
encoded_le = le.fit_transform(encoded_multiclass)
# Re-attach target and folds to the transformed dataset before target encoding.
encoded_le.target = encoded_multiclass.target
encoded_le.folds = encoded_dataset.folds
# `a` holds the raw output of the GPU multiclass target encoder.
a = multi_te.fit_transform(encoded_le).data

In [None]:
# NumPy-backed copy of the GPU-encoded dataset (cudf -> pandas -> numpy) for the CPU encoder.
encoded_cpu = encoded_le.to_cudf().to_pandas().to_numpy()
# Move target and folds to host arrays via pandas.
encoded_cpu.target = encoded_multiclass.target.to_pandas().values
encoded_cpu.folds = encoded_dataset.folds.to_pandas().values

In [None]:
from lightautoml.transformers.categorical import MultiClassTargetEncoder as MCTE

In [None]:
mlc = MCTE()

In [None]:
b = mlc.fit_transform(encoded_cpu).data

In [None]:
# Compare on the host, mirroring the binary TargetEncoder check: bring the GPU
# result down with cp.asnumpy instead of pushing the CPU result to the device
# and calling a NumPy function on cupy arrays.
np.allclose(cp.asnumpy(a), b)

In [None]:
# Maximum elementwise relative error between the GPU and CPU encodings.
# The division happens per element before taking the maximum; reducing the
# absolute difference first would divide one global maximum by every |b|.
# NOTE(review): entries where b == 0 yield inf/nan — confirm b has no zeros.
(np.abs(cp.asnumpy(a) - b) / np.abs(b)).max()

## CatIntersections transformer

In [None]:
# GPU category-intersection transformer (the class name is spelled
# `CatIntersectstions` in the library, so it is kept verbatim).
cat_intersect_transformer = categorical_gpu.CatIntersectstions()
# Fit on the cudf view of the label-encoded dataset; `a` holds the raw output.
a = cat_intersect_transformer.fit_transform(encoded_dataset.to_cudf()).data

In [None]:
from lightautoml.transformers.categorical import CatIntersectstions as CI
cat_int = CI()
b = cat_int.fit_transform(encoded_dataset.to_cudf().to_pandas()).data

In [None]:
print((cp.asnumpy(a)==b).mean())

## Ordinal Encoder transformer

In [None]:
ordinal_encoder = categorical_gpu.OrdinalEncoder()

In [None]:
a = ordinal_encoder.fit_transform(categorical_dataset).data

In [None]:
from lightautoml.transformers.categorical import OrdinalEncoder as OE

In [None]:
ordinal = OE()
b = ordinal.fit_transform(categorical_dataset.to_cudf().to_pandas()).data

In [None]:
(cp.asnumpy(a) == b).all()

# Datetime transformers

In [None]:
# Columns that make up the datetime-only dataset.
datetime_columns = ['BIRTH_DATE', 'EMP_DATE']

# Both columns share one datetime role with year / month / weekday seasonality.
check_roles = {DatetimeRole(seasonality=['y', 'm', 'wd']): list(datetime_columns)}

# NOTE(review): `task` is whatever the most recently executed cell bound it to
# (the multiclass task when the notebook is run top to bottom) — confirm that
# this is irrelevant for the datetime transformers.
datetime_dataset = CudfDataset(
    data=cudf_dataset.data[datetime_columns].copy(),
    roles=roles_parser(check_roles),
    task=task,
)

In [None]:
datetime_dataset.data

## TimeToNum transformer

In [None]:
time_to_num = datetime_gpu.TimeToNum()

In [None]:
time_to_num.fit_transform(datetime_dataset)

## BaseDiff transformer

In [None]:
basediff = datetime_gpu.BaseDiff(base_names=['EMP_DATE'], diff_names=['BIRTH_DATE'])

In [None]:
basediff.fit_transform(datetime_dataset)

## DateSeasons transformer

In [None]:
# GPU seasonality transformer with default settings.
date_seasons = datetime_gpu.DateSeasons()

# Seasonal features derived from the datetime dataset; reused later when the
# full training dataset is assembled.
date_dataset = date_seasons.fit_transform(datetime_dataset)

In [None]:
type(numeric_dataset)

In [None]:
date_dataset.to_cudf()

# ML Algos test

In [None]:
from lightautoml.ml_algo import linear_gpu, linear_sklearn

In [None]:
# Role mapping for a dataset that carries only the target column.
check_roles = {
    TargetRole(): 'TARGET',
}

# create Task
# Rebinds the notebook-wide `task` to a binary task.
task = Task('binary')

# Create a CudfDataset around the TARGET column.
# NOTE(review): `data` is the pandas frame loaded at the top of the notebook,
# not a cudf frame — confirm CudfDataset accepts pandas input here.
target_dataset = CudfDataset(data[['TARGET']], roles_parser(check_roles), task=task)

In [None]:
# Convert the median-filled numeric dataset to cudf and append the seasonal
# date features and the target dataset.
# NOTE: this cell rebinds `filled_dataset` to the concatenated result, so it is
# not idempotent — re-running it concatenates onto the already-concatenated data.
filled_dataset = filled_dataset.to_cudf()
filled_dataset = filled_dataset.concat([filled_dataset, date_dataset.to_cudf(), target_dataset])
# Alias only; `full_dataset` is rebuilt as a CupyDataset a few cells below.
full_dataset = filled_dataset

In [None]:
check_roles = {
    TargetRole(cp.float32): 'TARGET',
    NumericRole(cp.float32): ['fillnamed__AMT_CREDIT', 'fillnamed__AMT_GOODS_PRICE',
       'season_y__BIRTH_DATE', 'season_m__BIRTH_DATE', 'season_wd__BIRTH_DATE',
       'season_y__EMP_DATE', 'season_m__EMP_DATE', 'season_wd__EMP_DATE'],
}

In [None]:
task = Task('binary', device='gpu')

In [None]:
# Feature names come straight from the concatenated frame's columns.
feature_names = filled_dataset.data.columns.to_list()

# Rebuild the training data as a CupyDataset over the frame's raw values.
# NOTE(review): the values and feature list include the TARGET column itself —
# confirm CupyDataset excludes target-role columns from the features.
full_dataset = CupyDataset(
    filled_dataset.data.values,
    features=feature_names,
    roles=roles_parser(check_roles),
    task=task,
    target=filled_dataset.data['TARGET'],
)

In [None]:
from lightautoml.validation.utils import create_validation_iterator

In [None]:
# Rows before this index are used for training, the rest for validation.
train_size = 9000
train_part = full_dataset[:train_size]
valid_part = full_dataset[train_size:]
train_valid = create_validation_iterator(train_part, valid_part, n_folds=10)

In [None]:
linear_one = linear_gpu.LinearLBFGS()

In [None]:
preds = linear_one.fit_predict(train_valid)

In [None]:
# Target of the hold-out slice (rows from 9000 on) and the LinearLBFGS predictions.
# Neither name is referenced again in the visible part of the notebook.
y_true = full_dataset[9000:].target
y_pred = preds

In [None]:
linear_cuml = linear_gpu.LinearL1CD()

In [None]:
_=linear_cuml.fit_predict(train_valid)

In [None]:
from cupyx.scipy.sparse import csr_matrix, hstack, coo_matrix

In [None]:
a = csr_matrix(cp.eye(5))

In [None]:
type(csr_matrix(a).toarray())

In [None]:
a.shape

In [None]:
numeric_dataset.to_numpy().to_sparse_gpu().data

In [None]:
matrix = coo_matrix(a, dtype=cp.float32)

In [None]:
matrix.data

In [None]:
import torch

In [None]:
b = torch.as_tensor(matrix.data, device='cuda')

In [None]:
matrix.data[0] = 0

In [None]:
b

In [None]:
matrix.data

In [None]:
a = [np.mean, np.sqrt]

In [None]:
for f in a:
    print(f(np.array([2])))

In [None]:
# Bootstrap 100000 rows with replacement and renumber them from 0.
# reset_index(drop=True) discards the old index directly instead of
# materialising it as an 'index' column and dropping that column afterwards.
categorical_dataset.data.sample(100000, replace=True).reset_index(drop=True)

In [None]:
cudf_dataset.data.head()