In [1]:
# https://docs.rapids.ai/api/cuml/stable/api.html#random-forest
# https://docs.rapids.ai/api/cuml/stable/api.html#support-vector-machines
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [2]:
import time
import datetime
import psutil
from cuml.model_selection import train_test_split as tts_cuml
from sklearn.metrics import accuracy_score as acc_sk
from cuml.metrics import accuracy_score as acc_cuml
import cuml
import cupy
import sklearn
from cupy import asnumpy
from sklearn.ensemble import RandomForestClassifier
import cudf
from loguru import logger
import platform

logger.add("medium1-cuml.log")
logger.info(f"psutil.__version__: {psutil.__version__}")
logger.info(f"cuml.__version__: {cuml.__version__}")
logger.info(f"cudf.__version__: {cudf.__version__}")
logger.info(f"cupy.__version__: {cupy.__version__}")
logger.info(f"sklearn.__version__: {sklearn.__version__}")

from pynvml import *
nvmlInit()

2023-01-07 18:08:07.204 | INFO     | __main__:<module>:20 - psutil.__version__: 5.9.4
2023-01-07 18:08:07.205 | INFO     | __main__:<module>:21 - cuml.__version__: 22.12.00
2023-01-07 18:08:07.206 | INFO     | __main__:<module>:22 - cudf.__version__: 22.12.01
2023-01-07 18:08:07.206 | INFO     | __main__:<module>:23 - cupy.__version__: 11.4.0
2023-01-07 18:08:07.207 | INFO     | __main__:<module>:24 - sklearn.__version__: 1.2.0


In [3]:
logger.info(f"Driver Version: {nvmlSystemGetDriverVersion()}")

2023-01-07 18:08:07.247 | INFO     | __main__:<module>:1 - Driver Version: b'470.161.03'


In [4]:
deviceCount = nvmlDeviceGetCount()
for i in range(deviceCount):
    handle = nvmlDeviceGetHandleByIndex(i)
    logger.info(f"Device {i}: {nvmlDeviceGetName(handle)}")

2023-01-07 18:08:07.282 | INFO     | __main__:<module>:4 - Device 0: b'NVIDIA GeForce RTX 2060'


In [5]:
# !nvidia-smi # CUDA Version: 11.4

In [6]:
logger.info(f"Python version: {platform.python_version()}")

2023-01-07 18:08:07.333 | INFO     | __main__:<module>:1 - Python version: 3.9.15


In [7]:
f = open("/etc/os-release", "r")
logger.info(f"Sistema operacional:\n {f.read()}")
f.close()

2023-01-07 18:08:07.357 | INFO     | __main__:<module>:2 - Sistema operacional:
 NAME="Ubuntu"
VERSION="18.04.6 LTS (Bionic Beaver)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 18.04.6 LTS"
VERSION_ID="18.04"
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
VERSION_CODENAME=bionic
UBUNTU_CODENAME=bionic



In [8]:
meminfo = dict((i.split()[0].rstrip(':'),int(i.split()[1])) for i in open('/proc/meminfo').readlines())
mem_kib = meminfo['MemTotal']
logger.info(f"Memória total: {str(mem_kib)[:2]} GB")

2023-01-07 18:08:07.402 | INFO     | __main__:<module>:3 - Memória total: 16 GB


In [9]:
!lsmem

RANGE                                 SIZE  STATE REMOVABLE  BLOCK
0x0000000000000000-0x000000007fffffff   2G online       yes   0-15
0x0000000100000000-0x000000047fffffff  14G online       yes 32-143

Memory block size:       128M
Total online memory:      16G
Total offline memory:      0B


In [10]:
# !lscpu # Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz

In [11]:
# https://psutil.readthedocs.io/en/latest/
import psutil
logger.info(f"Total de núcleos do processador: {psutil.cpu_count(logical=True)}")

2023-01-07 18:08:08.140 | INFO     | __main__:<module>:3 - Total de núcleos do processador: 12


In [12]:
def time_init(inductor):
    logger.info(f"INDUTOR: {inductor}")
    time_start = time.time()
    datetime_start = datetime.datetime.now()
    return datetime_start, time_start

In [13]:
def time_print(datetime_start, time_start):
    time_end = time.time()
    datetime_end = datetime.datetime.now()
    elapsed_datetime = (datetime_end - datetime_start)
    elapsed_time = (time_end - time_start)
    logger.warning(f"Tempo de execução (datetime): {str(elapsed_datetime)} segundos")
    logger.warning(f"Tempo de execução (time): {str(elapsed_time)} segundos")
    return elapsed_time

In [14]:
def print_performance(rapids, sklearn):
    exp1 = 'SKLEARN' if sklearn < rapids else 'CUML'
    exp2 = 'SKLEARN' if sklearn > rapids else 'CUML'
    exp3 = (sklearn/rapids if sklearn > rapids else rapids/sklearn)
    exp4 = exp3 >= 2
    exp5 = ('vezes' if exp4 else 'vez')
    logger.success(f"RESULTADO: {exp1} foi mais rápido que o {exp2} {exp3} {exp5}")

In [15]:
def metrics_cuml_sklearn(framework, inductor, y_test_cuml, predictions_cuml, y_test_sk, predictions_sk):
    cu_score = acc_cuml(y_test_cuml, predictions_cuml)
    sk_score = acc_sk(y_test_sk, predictions_sk)
    logger.debug(f"{framework} - cuml accuracy: {cu_score}")
    logger.debug(f"{framework} - sklearn accuracy: {sk_score}")
    logger.trace(f"PARAMETROS: {str(inductor.get_params())}")

In [17]:
def load_prepare_dataset(filename):
    '''
    Carregando os dados usando o CUDF.
    '''
    logger.info(f"filename: {filename}")
    cu_df = cudf.read_csv(filename)
    logger.info(f"cu_df.shape: {cu_df.shape}")
    y = cu_df['class']
    X = cu_df.drop(columns=['class'])
    logger.info(f"X.shape: {X.shape}")
    logger.info(f"y.shape: {y.shape}")
    return tts_cuml( X, y, random_state = 3, test_size=0.3)

<h1>Dataset 1</h1>
<h3>250.000 amostras</h3>

In [18]:
filename = "medium1_dt1.csv"
X_train, X_test, y_train, y_test = load_prepare_dataset(filename)

2023-01-07 18:08:08.382 | INFO     | __main__:load_prepare_dataset:5 - filename: medium1_dt1.csv
2023-01-07 18:08:11.964 | INFO     | __main__:load_prepare_dataset:7 - cu_df.shape: (250000, 102)
2023-01-07 18:08:11.984 | INFO     | __main__:load_prepare_dataset:10 - X.shape: (250000, 101)
2023-01-07 18:08:11.985 | INFO     | __main__:load_prepare_dataset:11 - y.shape: (250000,)


<h1>Random Forest</h1>

In [19]:
params_rf = {'max_features':'sqrt', 'max_depth':16, 'random_state':7, 'verbose':0}
logger.info(f"params_rf: {params_rf}")

2023-01-07 18:08:12.120 | INFO     | __main__:<module>:2 - params_rf: {'max_features': 'sqrt', 'max_depth': 16, 'random_state': 7, 'verbose': 0}


In [20]:
from cuml.ensemble import RandomForestClassifier as cuRFC
datetime_start, time_start = time_init(cuRFC)
cuml_model = cuRFC(**params_rf)
cuml_model.fit(X_train,y_train)
predictions = cuml_model.predict (X_test)
metrics_cuml_sklearn('CUML', cuml_model, y_test, predictions, asnumpy(y_test), asnumpy(predictions))
rap = time_print(datetime_start, time_start)

2023-01-07 18:08:12.139 | INFO     | __main__:time_init:2 - INDUTOR: <class 'cuml.ensemble.randomforestclassifier.RandomForestClassifier'>
  return func(**kwargs)
  ret_val = func(*args, **kwargs)
2023-01-07 18:08:15.768 | DEBUG    | __main__:metrics_cuml_sklearn:4 - CUML - cuml accuracy: 0.7810800075531006
2023-01-07 18:08:15.768 | DEBUG    | __main__:metrics_cuml_sklearn:5 - CUML - sklearn accuracy: 0.78108


In [21]:
from sklearn.ensemble import RandomForestClassifier as skRF_n_jobs
datetime_start, time_start = time_init(skRF_n_jobs)
clf = skRF_n_jobs(**params_rf, n_jobs=11)
clf.fit(X_train.to_numpy(), y_train.to_numpy())
predictions = clf.predict (X_test.to_numpy())
metrics_cuml_sklearn('SKLEARN', clf, y_test.to_numpy(), predictions, y_test.to_numpy(), predictions)
skl = time_print(datetime_start, time_start)
print_performance(rap, skl)

2023-01-07 18:08:15.788 | INFO     | __main__:time_init:2 - INDUTOR: <class 'sklearn.ensemble._forest.RandomForestClassifier'>
2023-01-07 18:08:49.246 | DEBUG    | __main__:metrics_cuml_sklearn:4 - SKLEARN - cuml accuracy: 0.8966266512870789
2023-01-07 18:08:49.247 | DEBUG    | __main__:metrics_cuml_sklearn:5 - SKLEARN - sklearn accuracy: 0.8966266666666667
2023-01-07 18:08:49.263 | SUCCESS  | __main__:print_performance:7 - RESULTADO: CUML foi mais rápido que o SKLEARN 9.22411990036686 vezes


In [22]:
from sklearn.ensemble import RandomForestClassifier as skRF_single_job
datetime_start, time_start = time_init(skRF_single_job)
clf = skRF_single_job(**params_rf)
clf.fit(X_train.to_numpy(), y_train.to_numpy())
predictions = clf.predict (X_test.to_numpy())
metrics_cuml_sklearn('SKLEARN', clf, y_test.to_numpy(), predictions, y_test.to_numpy(), predictions)
skl = time_print(datetime_start, time_start)
print_performance(rap, skl)

2023-01-07 18:08:49.284 | INFO     | __main__:time_init:2 - INDUTOR: <class 'sklearn.ensemble._forest.RandomForestClassifier'>
2023-01-07 18:11:49.551 | DEBUG    | __main__:metrics_cuml_sklearn:4 - SKLEARN - cuml accuracy: 0.8966266512870789
2023-01-07 18:11:49.552 | DEBUG    | __main__:metrics_cuml_sklearn:5 - SKLEARN - sklearn accuracy: 0.8966266666666667
2023-01-07 18:11:49.554 | SUCCESS  | __main__:print_performance:7 - RESULTADO: CUML foi mais rápido que o SKLEARN 49.677072571208974 vezes


<h1>LinearSVC</h1>

In [23]:
params_linear_svc = {'loss':'squared_hinge', 'penalty':'l2', 'C':1, 'verbose':0}
logger.info(f"params_linear_svc: {params_linear_svc}")

2023-01-07 18:11:49.578 | INFO     | __main__:<module>:2 - params_linear_svc: {'loss': 'squared_hinge', 'penalty': 'l2', 'C': 1, 'verbose': 0}


In [24]:
from cuml.svm import LinearSVC as cuLinearSVC
datetime_start, time_start = time_init(cuLinearSVC)
cu_linear_SVC = cuLinearSVC(**params_linear_svc)
cu_linear_SVC.fit(X_train, y_train)
predictions = cu_linear_SVC.predict (X_test)
metrics_cuml_sklearn('CUML', cu_linear_SVC , y_test, predictions, y_test.to_numpy(), predictions.to_numpy())
rap = time_print(datetime_start, time_start)

2023-01-07 18:11:49.600 | INFO     | __main__:time_init:2 - INDUTOR: <class 'cuml.svm.linear_svc.LinearSVC'>
2023-01-07 18:11:50.337 | DEBUG    | __main__:metrics_cuml_sklearn:4 - CUML - cuml accuracy: 0.873520016670227
2023-01-07 18:11:50.337 | DEBUG    | __main__:metrics_cuml_sklearn:5 - CUML - sklearn accuracy: 0.87352


In [25]:
from sklearn.svm import LinearSVC as skLinearSVC
datetime_start, time_start = time_init(skLinearSVC)
sk_linear_SVC = skLinearSVC(**params_linear_svc)
sk_linear_SVC.fit(X_train.to_numpy(), y_train.to_numpy())
predictions = sk_linear_SVC.predict (X_test.to_numpy())
metrics_cuml_sklearn('SKLEARN', sk_linear_SVC, y_test.to_numpy(), predictions, y_test.to_numpy(), predictions)
skl = time_print(datetime_start, time_start)
print_performance(rap, skl)

2023-01-07 18:11:50.357 | INFO     | __main__:time_init:2 - INDUTOR: <class 'sklearn.svm._classes.LinearSVC'>
2023-01-07 18:15:56.115 | DEBUG    | __main__:metrics_cuml_sklearn:4 - SKLEARN - cuml accuracy: 0.8750799894332886
2023-01-07 18:15:56.117 | DEBUG    | __main__:metrics_cuml_sklearn:5 - SKLEARN - sklearn accuracy: 0.87508
2023-01-07 18:15:56.120 | SUCCESS  | __main__:print_performance:7 - RESULTADO: CUML foi mais rápido que o SKLEARN 333.3467858851044 vezes


<h1>Dataset 2</h1>
<h3>100.000 amostras</h3>

In [26]:
filename = "medium1_dt_100k.csv"
X_train, X_test, y_train, y_test = load_prepare_dataset(filename)

2023-01-07 18:15:56.205 | INFO     | __main__:load_prepare_dataset:5 - filename: medium1_dt_100k.csv
2023-01-07 18:15:56.343 | INFO     | __main__:load_prepare_dataset:7 - cu_df.shape: (100000, 102)
2023-01-07 18:15:56.356 | INFO     | __main__:load_prepare_dataset:10 - X.shape: (100000, 101)
2023-01-07 18:15:56.357 | INFO     | __main__:load_prepare_dataset:11 - y.shape: (100000,)


<h1>SVC</h1>

In [27]:
params_svc = {'kernel':'poly', 'degree':2, 'gamma':'scale', 'C':1, 'random_state':7, 'verbose':0}
logger.info(f"params_svc: {params_svc}")

2023-01-07 18:15:56.399 | INFO     | __main__:<module>:2 - params_svc: {'kernel': 'poly', 'degree': 2, 'gamma': 'scale', 'C': 1, 'random_state': 7, 'verbose': 0}


In [28]:
from cuml.svm import SVC as cumlSVC
datetime_start, time_start = time_init(cumlSVC)
cuml_model = cumlSVC(**params_svc)
cuml_model.fit(X_train, y_train)
predictions = cuml_model.predict (X_test)
metrics_cuml_sklearn('CUML', cuml_model, y_test, predictions, asnumpy(y_test), asnumpy(predictions))
rap = time_print(datetime_start, time_start)

2023-01-07 18:15:56.423 | INFO     | __main__:time_init:2 - INDUTOR: <class 'cuml.svm.svc.SVC'>
2023-01-07 18:16:54.121 | DEBUG    | __main__:metrics_cuml_sklearn:4 - CUML - cuml accuracy: 0.6628999710083008
2023-01-07 18:16:54.122 | DEBUG    | __main__:metrics_cuml_sklearn:5 - CUML - sklearn accuracy: 0.6629


In [29]:
from sklearn.svm import SVC as skSVC
datetime_start, time_start = time_init(skSVC)
clf = skSVC(**params_svc)
clf.fit(X_train.to_numpy(), y_train.to_numpy())
predictions = clf.predict (X_test.to_numpy())
metrics_cuml_sklearn('SKLEARN', clf, y_test.to_numpy(), predictions, y_test.to_numpy(), predictions)
skl = time_print(datetime_start, time_start)
print_performance(rap, skl)

2023-01-07 18:16:54.145 | INFO     | __main__:time_init:2 - INDUTOR: <class 'sklearn.svm._classes.SVC'>
2023-01-07 18:34:01.104 | DEBUG    | __main__:metrics_cuml_sklearn:4 - SKLEARN - cuml accuracy: 0.6615666747093201
2023-01-07 18:34:01.104 | DEBUG    | __main__:metrics_cuml_sklearn:5 - SKLEARN - sklearn accuracy: 0.6615666666666666
2023-01-07 18:34:01.107 | SUCCESS  | __main__:print_performance:7 - RESULTADO: CUML foi mais rápido que o SKLEARN 17.799037449985395 vezes
