In [None]:
import numpy as np
import pandas as pd
import dask_cudf
import cudf
import cupy as cp
import random

from time import perf_counter

from dask.distributed import Client
from dask_cuda import LocalCUDACluster

from lightautoml.tasks import Task
from lightautoml.reader.hybrid_reader import HybridReader
from lightautoml.reader.daskcudf_reader import DaskCudfReader
from lightautoml.reader.cudf_reader import CudfReader

# Standard python libraries
import logging
import os
import time
import requests
# Configure logging once for the whole notebook: every record is rendered as
# "[timestamp] (LEVEL): message" and records below INFO are dropped.
logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.INFO)

# Installed libraries
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
import torch

# Imports from our package
from lightautoml.automl.base import AutoML



from lightautoml.pipelines.features.lgb_pipeline_gpu import LGBSimpleFeatures_gpu, LGBAdvancedPipeline_gpu
from lightautoml.pipelines.features.linear_pipeline_gpu import LinearFeatures_gpu
from lightautoml.ml_algo.boost_xgb_gpu import BoostXGB, BoostXGB_dask
from lightautoml.ml_algo.boost_cb_gpu import BoostCB_gpu

from lightautoml.ml_algo.linear_gpu import LinearLBFGS_gpu
from lightautoml.ml_algo.tuning.optuna import OptunaTuner

from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ModelBasedImportanceEstimator, ImportanceCutoffSelector

from lightautoml.automl.blend_gpu import WeightedBlender_gpu

from lightautoml.utils.profiler import Profiler
from lightautoml.utils.timer import PipelineTimer

from numba import jit
import string

In [None]:
from os import listdir

# Collect the CSV datasets located in the current working directory.
# os.listdir() returns entries in arbitrary, filesystem-dependent order, so the
# names are sorted: any positional lookup into csv_files then resolves to the
# same file on every machine and on every run.
files = listdir('.')
csv_files = sorted(elem for elem in files if elem.endswith(".csv"))

print(csv_files)
print(len(csv_files))

In [None]:
# ---- Run configuration ----
N_THREADS = 8        # threads cnt for lgbm and linear models
N_FOLDS = 5          # folds cnt for AutoML
RANDOM_STATE = 42    # fixed random state for various reasons
TEST_SIZE = 0.2      # Test size for metric check
TIMEOUT = 600        # Time in seconds for automl run

# One row per supported dataset: (file name, target column, task type).
# Both lookup tables below are derived from this single table so that they
# always cover exactly the same files, in the same order.
_DATASETS = (
    ('covertype.csv', 'class', 'multiclass'),
    ('albert.csv', 'class', 'binary'),
    ('higgs.csv', 'class', 'binary'),
    ('guillermo.csv', 'class', 'binary'),
    ('bank-marketing.csv', 'Class', 'binary'),
    ('numerai28.6.csv', 'attribute_21', 'binary'),
    ('volkert.csv', 'class', 'multiclass'),
)

# file name -> name of the target column
TARGETS_DICT = {fname: target for fname, target, _ in _DATASETS}
# file name -> task type ('binary' or 'multiclass')
task_types = {fname: kind for fname, _, kind in _DATASETS}

In [None]:
# Select cuDF's 'managed' allocator; this cell runs before the dataset is
# loaded and before any reader or model is built.
# NOTE(review): presumably this means CUDA managed (unified) memory — confirm
# against the documentation of the installed cuDF/RMM version.
cudf.set_allocator('managed')

In [None]:
# Choose the dataset for this run by its position in csv_files, then look up
# its target column and task type in the configuration tables.
DATASET_INDEX = 6  # position of the dataset inside csv_files

# Fail with an explanatory message (same exception types as a bare lookup
# would raise) when the directory content does not match the configuration.
if DATASET_INDEX >= len(csv_files):
    raise IndexError(
        f"DATASET_INDEX={DATASET_INDEX} is out of range: only "
        f"{len(csv_files)} CSV file(s) were found: {csv_files}"
    )
current_file = csv_files[DATASET_INDEX]
if current_file not in TARGETS_DICT or current_file not in task_types:
    raise KeyError(
        f"{current_file!r} has no entry in TARGETS_DICT/task_types; "
        f"configured datasets: {sorted(TARGETS_DICT)}"
    )

TARGET_NAME = TARGETS_DICT[current_file] # Target column name
task_type = task_types[current_file]
print(task_type)
print(current_file)
print(TARGET_NAME)

In [None]:
# Load the selected dataset into a pandas frame and turn the literal '?'
# placeholder into a real missing value (NaN).
data = pd.read_csv(current_file)
for col in data.columns:
    placeholder_mask = data[col].isin(['?'])
    if not placeholder_mask.any():
        continue
    cleaned = data[col].replace('?', np.nan)
    try:
        # The remaining values parse as numbers: store the column as float32.
        data[col] = cleaned.astype(np.float32)
    except (ValueError, TypeError):
        # The column holds non-numeric values besides the placeholder; keep
        # them as they are (with NaN for the placeholder) instead of aborting
        # the whole load on the failed cast.
        data[col] = cleaned
print(data.shape)

In [None]:
# Seed NumPy's global RNG and limit the number of CPU threads torch may use.
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

# Pipeline-wide time budget. The TIMEOUT constant from the configuration cell
# is used instead of repeating the literal 600, so changing TIMEOUT there
# actually changes the budget here.
timer = PipelineTimer(TIMEOUT, mode=2)
timer_gbm = timer.get_task_timer('gbm') # Get task timer from pipeline timer

# Pre-selection stage: ImportanceCutoffSelector receives the simple feature
# pipeline, a BoostCB_gpu model (running on the 'gbm' task timer) and a
# model-based importance estimator, with cutoff=0.
# Alternative model left by the author: BoostXGB(timer=timer_gbm)
feat_sel_0 = LGBSimpleFeatures_gpu()
mod_sel_0 = BoostCB_gpu(timer=timer_gbm)
imp_sel_0 = ModelBasedImportanceEstimator()
selector_0 = ImportanceCutoffSelector(
    feat_sel_0,
    mod_sel_0,
    imp_sel_0,
    cutoff=0,
)

# Feature pipeline for the boosting models; it is handed the same importance
# estimator object that the selector above uses.
feats_gbm_0 = LGBAdvancedPipeline_gpu(
    top_intersections=4,
    output_categories=True,
    feats_imp=imp_sel_0,
)

# Each boosting model gets its own task timer taken from the pipeline timer.
timer_gbm_0 = timer.get_task_timer('gbm')
timer_gbm_1 = timer.get_task_timer('gbm')

# Alternatives left by the author: BoostXGB(timer=timer_gbm_0) and
# BoostXGB(timer=timer_gbm_1)
gbm_0 = BoostCB_gpu(timer=timer_gbm_0)
gbm_1 = BoostCB_gpu(timer=timer_gbm_1)

# Only gbm_0 is paired with a tuner; gbm_1 is passed to the pipeline as is.
tuner_0 = OptunaTuner(n_trials=20, timeout=30, fit_on_holdout=True)

# Boosting ML pipeline: pre-selection -> GBM feature pipeline -> two models.
gbm_lvl0 = MLPipeline(
    [(gbm_0, tuner_0), gbm_1],
    pre_selection=selector_0,
    features_pipeline=feats_gbm_0,
    post_selection=None,
)
    
# Feature pipeline for the linear model.
feats_reg_0 = LinearFeatures_gpu(
    output_categories=True,
    sparse_ohe='auto',
)

# The linear model runs on its own 'reg' task timer.
timer_reg = timer.get_task_timer('reg')
reg_0 = LinearLBFGS_gpu(timer=timer_reg)

# Linear ML pipeline: a single model, no feature selection before or after.
reg_lvl0 = MLPipeline(
    [reg_0],
    pre_selection=None,
    features_pipeline=feats_reg_0,
    post_selection=None,
)

# Metric name handed to Task for each supported task type.
metrics = {'binary':'logloss', 'multiclass':'crossentropy'}

# Task description: problem type, its metric, and the 'gpu' device.
task = Task(task_type, metric = metrics[task_type], device='gpu')

# Reader that turns the raw frame into the dataset consumed by the pipelines.
reader = HybridReader(task = task, num_cpu_readers=1, num_gpu_readers=1,
                          gpu_ratio=0.4, output='gpu',
                          samples = 100000 , max_nan_rate = 1,
                          max_constant_rate = 1, advanced_roles = True,
                          drop_score_co = -1, n_jobs = 1)

# Alternative reader configuration, disabled. It is kept as a real comment:
# a bare triple-quoted string is an expression that is evaluated on every run
# and would be echoed as cell output if it ended up last in the cell.
# reader = CudfReader(task = task, device_num = 0,
#                     samples = 100000 , max_nan_rate = 1,
#                     max_constant_rate = 1, advanced_roles = True,
#                     drop_score_co = -1, n_jobs = 1)

# Assemble the AutoML object: one level holding the boosting and the linear
# pipelines, combined by a weighted blender and sharing the pipeline timer.
blender = WeightedBlender_gpu()
automl = AutoML(
    reader=reader,
    levels=[[gbm_lvl0, reg_lvl0]],
    timer=timer,
    skip_conn=False,
    blender=blender,
)

In [None]:
%%time

# Fit the assembled AutoML object on the loaded frame, marking TARGET_NAME as
# the target column. NOTE(review): the result is presumably the out-of-fold
# predictions — confirm against the LightAutoML documentation.
oof_pred = automl.fit_predict(data, roles=dict(target=TARGET_NAME))