In [1]:
# https://docs.rapids.ai/api/cudf/stable/user_guide/10min.html#what-are-these-libraries
# https://nvidia-merlin.github.io/NVTabular/v0.6.1/api/dataset.html
# https://github.com/rapidsai/cudf
# https://rapids.ai/pip.html#install

In [2]:
!nvidia-smi

Fri Jan 13 09:44:16 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0  On |                  N/A |
| N/A   51C    P0    33W /  N/A |    837MiB /  5934MiB |     42%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
!python --version

Python 3.9.15


In [4]:
!cat /etc/os-release

NAME="Ubuntu"
VERSION="18.04.6 LTS (Bionic Beaver)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 18.04.6 LTS"
VERSION_ID="18.04"
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
VERSION_CODENAME=bionic
UBUNTU_CODENAME=bionic


In [5]:
import time
import datetime
from cuml.datasets.classification import make_classification
import pandas as pd
import cupy as cp
import cudf
import cupy
import psutil
from loguru import logger
logger.add("medium2-cudf-loguru.log")
import platform
logger.info(f"cudf.__version__: {cudf.__version__}")
logger.info(f"cupy.__version__: {cupy.__version__}")
logger.info(f"pandas.__version__: {pd.__version__}")
from pynvml import *
nvmlInit()

2023-01-13 09:44:23.848 | INFO     | __main__:<module>:12 - cudf.__version__: 22.12.01
2023-01-13 09:44:23.849 | INFO     | __main__:<module>:13 - cupy.__version__: 11.4.0
2023-01-13 09:44:23.849 | INFO     | __main__:<module>:14 - pandas.__version__: 1.5.2


In [6]:
logger.info(f"Driver Version: {nvmlSystemGetDriverVersion()}")

2023-01-09 21:01:45.127 | INFO     | __main__:<module>:1 - Driver Version: b'470.161.03'


In [7]:
deviceCount = nvmlDeviceGetCount()
for i in range(deviceCount):
    handle = nvmlDeviceGetHandleByIndex(i)
    logger.info(f"Device {i}: {nvmlDeviceGetName(handle)}")

2023-01-09 21:01:45.159 | INFO     | __main__:<module>:4 - Device 0: b'NVIDIA GeForce RTX 2060'


In [8]:
# !nvidia-smi # CUDA Version: 11.4

In [9]:
logger.info(f"Python version: {platform.python_version()}")

2023-01-09 21:01:45.209 | INFO     | __main__:<module>:1 - Python version: 3.9.15


In [10]:
f = open("/etc/os-release", "r")
logger.info(f"Sistema operacional:\n {f.read()}")
f.close()

2023-01-09 21:01:45.235 | INFO     | __main__:<module>:2 - Sistema operacional:
 NAME="Ubuntu"
VERSION="18.04.6 LTS (Bionic Beaver)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 18.04.6 LTS"
VERSION_ID="18.04"
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
VERSION_CODENAME=bionic
UBUNTU_CODENAME=bionic



In [11]:
meminfo = dict((i.split()[0].rstrip(':'),int(i.split()[1])) for i in open('/proc/meminfo').readlines())
mem_kib = meminfo['MemTotal']
logger.info(f"Memória total: {str(mem_kib)[:2]} GB")

2023-01-09 21:01:45.254 | INFO     | __main__:<module>:3 - Memória total: 16 GB


In [12]:
!lsmem

RANGE                                 SIZE  STATE REMOVABLE  BLOCK
0x0000000000000000-0x000000007fffffff   2G online       yes   0-15
0x0000000100000000-0x000000047fffffff  14G online       yes 32-143

Memory block size:       128M
Total online memory:      16G
Total offline memory:      0B


In [13]:
# !lscpu # Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz

In [14]:
logger.info(f"Total de núcleos do processador: {psutil.cpu_count(logical=True)}")

2023-01-09 21:01:45.994 | INFO     | __main__:<module>:3 - Total de núcleos do processador: 12


<h1>Testes</h1>

In [15]:
def time_init(experimento):
    logger.info(f"Nome do experimento: {experimento}")
    time_start = time.time()
    datetime_start = datetime.datetime.now()
    return datetime_start, time_start

In [16]:
def time_print(datetime_start, time_start):
    time_end = time.time()
    datetime_end = datetime.datetime.now()
    elapsed_datetime = (datetime_end - datetime_start)
    elapsed_time = (time_end - time_start)
    logger.warning(f"Tempo de execução (datetime): {str(elapsed_datetime)} segundos")
    logger.warning(f"Tempo de execução (time): {str(elapsed_time)} segundos")
    return elapsed_time

In [17]:
def print_performance(rapids, pandas):
    exp1 = 'PANDAS' if pandas < rapids else 'CUDF'
    exp2 = 'PANDAS' if pandas > rapids else 'CUDF'
    exp3 = (pandas/rapids if pandas > rapids else rapids/pandas)
    exp4 = exp3 >= 2
    exp5 = ('vezes' if exp4 else 'vez')
    logger.success(f"RESULTADO: {exp1} foi mais rápido que o {exp2} {exp3} {exp5}")

In [18]:
def create_dataset(filename):
    '''
    Usado apenas na criação dos datasets usados nos experimentos.
    '''
    n_samples=6000000
    n_features=10
    n_classes=5
    n_informative=4
    random_state=42
    n_redundant=0
    n_clusters_per_class=2
    X, y = make_classification (n_samples = n_samples,
                                n_features = n_features,
                                n_classes = n_classes,
                                n_informative=n_informative,
                                random_state = random_state,
                                n_redundant = n_redundant,
                                n_clusters_per_class=n_clusters_per_class
                                )
    cu_df = cudf.DataFrame(X, columns=[("ft_" + str(col)) for col in range(1, (n_features + 1))])
    cu_df['class'] = y
    cu_df.to_csv(filename, index=False)

In [19]:
# filename = 'medium2_dt1_6m.csv'
# create_dataset(filename)

In [20]:
def load_dataset_cudf(filename):
    '''
    Carregando os dados usando o CUDF.
    '''
    logger.info(f"filename: {filename}")
    cu_df = cudf.read_csv(filename)
    logger.info(f"cu_df.shape: {cu_df.shape}")
    return cu_df

In [21]:
def load_dataset_pandas(filename):
    '''
    Carregando os dados usando o Pandas.
    '''
    logger.info(f"filename: {filename}")
    df = pd.read_csv(filename)
    logger.info(f"cu_df.shape: {df.shape}")
    return df

In [22]:
filename = "medium2_dt1_6m.csv"

1. Dataframe

In [23]:
# cudf
datetime_start, time_start = time_init('1. Dataframe - cudf')
# Carregando o Dataframe CUDF
dfc = load_dataset_cudf(filename)
rap = time_print(datetime_start, time_start)
dfc

2023-01-09 21:01:46.205 | INFO     | __main__:time_init:2 - Nome do experimento: 1. Dataframe - cudf
2023-01-09 21:01:46.206 | INFO     | __main__:load_dataset_cudf:5 - filename: medium2_dt1_6m.csv
2023-01-09 21:01:50.481 | INFO     | __main__:load_dataset_cudf:7 - cu_df.shape: (6000000, 11)


Unnamed: 0,ft_1,ft_2,ft_3,ft_4,ft_5,ft_6,ft_7,ft_8,ft_9,ft_10,class
0,-1.673429,0.028931,1.917717,1.181257,-1.832658,0.343881,0.315469,-0.129159,0.924812,-0.269902,4
1,1.960336,0.326862,1.328991,3.113274,-1.875332,-0.039869,0.342638,-1.433796,-1.281674,0.040962,1
2,2.643136,0.436733,1.328592,-0.553129,0.803483,-1.395908,1.264633,-1.102456,0.151704,-0.878418,0
3,-0.292212,-0.341506,-3.798096,-0.087105,-1.545334,-1.696620,-2.130271,1.668405,0.851985,-0.301172,1
4,-1.867337,1.619357,-0.788673,-0.958960,-0.946285,-0.488466,0.894230,1.593357,0.363961,0.275600,1
...,...,...,...,...,...,...,...,...,...,...,...
5999995,-0.991822,-0.093061,0.011192,-1.414739,0.213473,-1.260788,0.655387,-1.921485,-0.357718,-0.486765,4
5999996,-0.608770,0.292661,2.840556,2.550355,-0.316182,1.327945,-0.716522,-0.930350,-0.194090,0.978567,0
5999997,0.145235,0.266196,-3.626625,-1.706966,-1.539231,0.799616,-1.636984,0.516574,0.532125,-1.229511,4
5999998,-1.666411,-1.134341,1.084400,1.169858,1.132448,-1.962018,0.233862,-0.188001,0.442977,0.009678,0


In [24]:
# Pandas
datetime_start, time_start = time_init('1. Dataframe - Pandas')
# Carregando o Dataframe PANDAS
dfp = load_dataset_pandas(filename)
pan = time_print(datetime_start, time_start)
print_performance(rap, pan)
dfp

2023-01-09 21:01:50.543 | INFO     | __main__:time_init:2 - Nome do experimento: 1. Dataframe - Pandas
2023-01-09 21:01:50.545 | INFO     | __main__:load_dataset_pandas:5 - filename: medium2_dt1_6m.csv
2023-01-09 21:01:55.619 | INFO     | __main__:load_dataset_pandas:7 - cu_df.shape: (6000000, 11)
2023-01-09 21:01:55.621 | SUCCESS  | __main__:print_performance:7 - RESULTADO: CUDF foi mais rápido que o PANDAS 1.1868936035439 vez


Unnamed: 0,ft_1,ft_2,ft_3,ft_4,ft_5,ft_6,ft_7,ft_8,ft_9,ft_10,class
0,-1.673429,0.028931,1.917717,1.181257,-1.832658,0.343881,0.315469,-0.129159,0.924812,-0.269902,4
1,1.960336,0.326862,1.328991,3.113274,-1.875332,-0.039869,0.342638,-1.433796,-1.281674,0.040962,1
2,2.643136,0.436733,1.328592,-0.553129,0.803483,-1.395908,1.264633,-1.102456,0.151704,-0.878418,0
3,-0.292212,-0.341506,-3.798096,-0.087105,-1.545334,-1.696620,-2.130271,1.668405,0.851985,-0.301172,1
4,-1.867337,1.619357,-0.788673,-0.958960,-0.946285,-0.488466,0.894230,1.593357,0.363961,0.275600,1
...,...,...,...,...,...,...,...,...,...,...,...
5999995,-0.991822,-0.093061,0.011192,-1.414739,0.213473,-1.260788,0.655387,-1.921485,-0.357718,-0.486765,4
5999996,-0.608770,0.292661,2.840556,2.550355,-0.316182,1.327945,-0.716522,-0.930350,-0.194090,0.978567,0
5999997,0.145235,0.266196,-3.626625,-1.706966,-1.539231,0.799616,-1.636984,0.516574,0.532125,-1.229511,4
5999998,-1.666411,-1.134341,1.084400,1.169858,1.132448,-1.962018,0.233862,-0.188001,0.442977,0.009678,0


2. head

In [25]:
# Pandas
datetime_start, time_start = time_init('2. head - Pandas')
out1 = dfp.head()
pan = time_print(datetime_start, time_start)
out1

2023-01-09 21:01:55.654 | INFO     | __main__:time_init:2 - Nome do experimento: 2. head - Pandas


Unnamed: 0,ft_1,ft_2,ft_3,ft_4,ft_5,ft_6,ft_7,ft_8,ft_9,ft_10,class
0,-1.673429,0.028931,1.917717,1.181257,-1.832658,0.343881,0.315469,-0.129159,0.924812,-0.269902,4
1,1.960336,0.326862,1.328991,3.113274,-1.875332,-0.039869,0.342638,-1.433796,-1.281674,0.040962,1
2,2.643136,0.436733,1.328592,-0.553129,0.803483,-1.395908,1.264633,-1.102456,0.151704,-0.878418,0
3,-0.292212,-0.341506,-3.798096,-0.087105,-1.545334,-1.69662,-2.130271,1.668405,0.851985,-0.301172,1
4,-1.867337,1.619357,-0.788673,-0.95896,-0.946285,-0.488466,0.89423,1.593357,0.363961,0.2756,1


In [26]:
# cudf
datetime_start, time_start = time_init('2. head - cudf')
out2 = dfc.head()
rap = time_print(datetime_start, time_start)
print_performance(rap, pan)
out2

2023-01-09 21:01:55.695 | INFO     | __main__:time_init:2 - Nome do experimento: 2. head - cudf
2023-01-09 21:01:55.698 | SUCCESS  | __main__:print_performance:7 - RESULTADO: PANDAS foi mais rápido que o CUDF 3.3879173290937996 vezes


Unnamed: 0,ft_1,ft_2,ft_3,ft_4,ft_5,ft_6,ft_7,ft_8,ft_9,ft_10,class
0,-1.673429,0.028931,1.917717,1.181257,-1.832658,0.343881,0.315469,-0.129159,0.924812,-0.269902,4
1,1.960336,0.326862,1.328991,3.113274,-1.875332,-0.039869,0.342638,-1.433796,-1.281674,0.040962,1
2,2.643136,0.436733,1.328592,-0.553129,0.803483,-1.395908,1.264633,-1.102456,0.151704,-0.878418,0
3,-0.292212,-0.341506,-3.798096,-0.087105,-1.545334,-1.69662,-2.130271,1.668405,0.851985,-0.301172,1
4,-1.867337,1.619357,-0.788673,-0.95896,-0.946285,-0.488466,0.89423,1.593357,0.363961,0.2756,1


3. describe

In [27]:
# Pandas
datetime_start, time_start = time_init('3. describe - Pandas')
dfp.describe()
pan = time_print(datetime_start, time_start)

2023-01-09 21:01:55.731 | INFO     | __main__:time_init:2 - Nome do experimento: 3. describe - Pandas


In [28]:
# cudf
datetime_start, time_start = time_init('3. describe - cudf')
dfc.describe()
rap = time_print(datetime_start, time_start)
print_performance(rap, pan)

2023-01-09 21:01:57.633 | INFO     | __main__:time_init:2 - Nome do experimento: 3. describe - cudf
2023-01-09 21:01:58.596 | SUCCESS  | __main__:print_performance:7 - RESULTADO: CUDF foi mais rápido que o PANDAS 1.9558786386271334 vez


4. sort_values 1

In [29]:
# Pandas
arg1 = 'ft_5'
datetime_start, time_start = time_init('4. sort_values 1 - Pandas')
out3 = dfp.sort_values(by=arg1)
pan = time_print(datetime_start, time_start)
out3

2023-01-09 21:01:58.614 | INFO     | __main__:time_init:2 - Nome do experimento: 4. sort_values 1 - Pandas


Unnamed: 0,ft_1,ft_2,ft_3,ft_4,ft_5,ft_6,ft_7,ft_8,ft_9,ft_10,class
66349,1.251122,0.031008,2.314300,-2.352096,-5.044896,-0.832627,-0.354416,0.886708,-0.016729,1.008662,3
4401196,0.060452,0.106606,0.186915,-3.244776,-5.011710,0.888738,-1.116729,-1.493842,-0.007371,0.581319,4
255662,-1.322324,-2.358470,2.052356,-0.323954,-4.958057,-0.431853,0.513341,-0.054642,-0.186852,0.381706,0
2100232,-1.639942,1.079859,1.525725,0.666113,-4.797860,0.304683,0.960619,1.271207,-0.476737,0.671748,4
5738341,-1.558593,0.764814,0.450212,-1.776268,-4.778948,1.394421,-0.282989,0.530719,-0.960041,0.656831,4
...,...,...,...,...,...,...,...,...,...,...,...
4765167,-0.736661,-1.030407,-3.233794,-0.960151,4.959660,-0.835467,-1.249330,-2.173159,1.224484,-1.012288,1
4092545,-0.177324,1.228531,-0.559825,-2.174931,5.113705,0.261894,-0.110893,0.431417,0.519937,0.818975,1
5097619,2.587327,1.363418,-0.364643,2.045875,5.135760,0.387577,0.053138,0.531668,-1.891728,1.321193,1
41379,-0.858833,-2.110422,0.037567,0.071600,5.314511,-0.574012,0.563068,0.622708,0.360834,0.088881,0


In [30]:
# cudf
datetime_start, time_start = time_init('4. sort_values 1 - cudf')
out4 = dfc.sort_values(by=arg1)
rap = time_print(datetime_start, time_start)
print_performance(rap, pan)
out4

2023-01-09 21:02:00.071 | INFO     | __main__:time_init:2 - Nome do experimento: 4. sort_values 1 - cudf
2023-01-09 21:02:00.201 | SUCCESS  | __main__:print_performance:7 - RESULTADO: CUDF foi mais rápido que o PANDAS 11.244810244575328 vezes


Unnamed: 0,ft_1,ft_2,ft_3,ft_4,ft_5,ft_6,ft_7,ft_8,ft_9,ft_10,class
66349,1.251122,0.031008,2.314300,-2.352096,-5.044896,-0.832627,-0.354416,0.886708,-0.016729,1.008662,3
4401196,0.060452,0.106606,0.186915,-3.244776,-5.011710,0.888738,-1.116729,-1.493842,-0.007371,0.581319,4
255662,-1.322324,-2.358470,2.052356,-0.323954,-4.958057,-0.431853,0.513341,-0.054642,-0.186852,0.381706,0
2100232,-1.639942,1.079859,1.525725,0.666113,-4.797860,0.304683,0.960619,1.271207,-0.476737,0.671748,4
5738341,-1.558593,0.764814,0.450212,-1.776268,-4.778948,1.394421,-0.282989,0.530719,-0.960041,0.656831,4
...,...,...,...,...,...,...,...,...,...,...,...
4765167,-0.736661,-1.030407,-3.233794,-0.960151,4.959660,-0.835467,-1.249330,-2.173159,1.224484,-1.012288,1
4092545,-0.177324,1.228531,-0.559825,-2.174931,5.113705,0.261894,-0.110893,0.431417,0.519937,0.818975,1
5097619,2.587327,1.363418,-0.364643,2.045875,5.135760,0.387577,0.053138,0.531668,-1.891728,1.321193,1
41379,-0.858833,-2.110422,0.037567,0.071600,5.314511,-0.574012,0.563068,0.622708,0.360834,0.088881,0


5. query

In [31]:
# Pandas
arg2 = "ft_3 > 0.7"
datetime_start, time_start = time_init('5. query - Pandas')
out5 = dfp.query(arg2)
pan = time_print(datetime_start, time_start)
out5

2023-01-09 21:02:00.256 | INFO     | __main__:time_init:2 - Nome do experimento: 5. query - Pandas


Unnamed: 0,ft_1,ft_2,ft_3,ft_4,ft_5,ft_6,ft_7,ft_8,ft_9,ft_10,class
0,-1.673429,0.028931,1.917717,1.181257,-1.832658,0.343881,0.315469,-0.129159,0.924812,-0.269902,4
1,1.960336,0.326862,1.328991,3.113274,-1.875332,-0.039869,0.342638,-1.433796,-1.281674,0.040962,1
2,2.643136,0.436733,1.328592,-0.553129,0.803483,-1.395908,1.264633,-1.102456,0.151704,-0.878418,0
6,0.297689,2.099486,0.935538,0.926482,0.294587,-0.426348,0.096765,-0.842222,-0.105171,0.478024,2
10,-1.141669,-0.564742,1.347572,-2.159665,-1.064450,-2.510482,-0.991493,-0.260903,0.975123,0.178943,3
...,...,...,...,...,...,...,...,...,...,...,...
5999992,2.302957,0.802433,2.302924,-0.109104,-0.451002,-1.513221,-0.459758,0.096395,-1.063046,1.157257,3
5999993,-0.373057,1.015767,2.021767,0.181251,-0.669629,-0.758743,0.833227,-1.131751,0.500282,-1.137749,3
5999996,-0.608770,0.292661,2.840556,2.550355,-0.316182,1.327945,-0.716522,-0.930350,-0.194090,0.978567,0
5999998,-1.666411,-1.134341,1.084400,1.169858,1.132448,-1.962018,0.233862,-0.188001,0.442977,0.009678,0


In [32]:
# cudf
datetime_start, time_start = time_init('5. query - cudf')
out6 = dfc.query(arg2)
rap = time_print(datetime_start, time_start)
print_performance(rap, pan)
out6

2023-01-09 21:02:00.405 | INFO     | __main__:time_init:2 - Nome do experimento: 5. query - cudf
2023-01-09 21:02:00.650 | SUCCESS  | __main__:print_performance:7 - RESULTADO: PANDAS foi mais rápido que o CUDF 2.0468155050016135 vezes


Unnamed: 0,ft_1,ft_2,ft_3,ft_4,ft_5,ft_6,ft_7,ft_8,ft_9,ft_10,class
0,-1.673429,0.028931,1.917717,1.181257,-1.832658,0.343881,0.315469,-0.129159,0.924812,-0.269902,4
1,1.960336,0.326862,1.328991,3.113274,-1.875332,-0.039869,0.342638,-1.433796,-1.281674,0.040962,1
2,2.643136,0.436733,1.328592,-0.553129,0.803483,-1.395908,1.264633,-1.102456,0.151704,-0.878418,0
6,0.297689,2.099486,0.935538,0.926482,0.294587,-0.426348,0.096765,-0.842222,-0.105171,0.478024,2
10,-1.141669,-0.564742,1.347572,-2.159665,-1.064450,-2.510482,-0.991493,-0.260903,0.975123,0.178943,3
...,...,...,...,...,...,...,...,...,...,...,...
5999992,2.302957,0.802433,2.302924,-0.109104,-0.451002,-1.513221,-0.459758,0.096395,-1.063046,1.157257,3
5999993,-0.373057,1.015767,2.021767,0.181251,-0.669629,-0.758743,0.833227,-1.131751,0.500282,-1.137749,3
5999996,-0.608770,0.292661,2.840556,2.550355,-0.316182,1.327945,-0.716522,-0.930350,-0.194090,0.978567,0
5999998,-1.666411,-1.134341,1.084400,1.169858,1.132448,-1.962018,0.233862,-0.188001,0.442977,0.009678,0


6. apply

In [46]:
dfp['ft_7']

0          0.315469
1          0.342638
2          1.264633
3         -2.130271
4          0.894230
             ...   
5999995    0.655387
5999996   -0.716522
5999997   -1.636984
5999998    0.233862
5999999    0.588715
Name: ft_7, Length: 6000000, dtype: float64

In [47]:
def add_ten(num):
    return num + 10

In [34]:
# Pandas
datetime_start, time_start = time_init('6. apply - Pandas')
out7 = dfp['ft_7'].apply(add_ten)
pan = time_print(datetime_start, time_start)
out7

2023-01-09 21:02:00.737 | INFO     | __main__:time_init:2 - Nome do experimento: 6. apply - Pandas


0          10.315469
1          10.342638
2          11.264633
3           7.869729
4          10.894230
             ...    
5999995    10.655387
5999996     9.283478
5999997     8.363016
5999998    10.233862
5999999    10.588715
Name: ft_7, Length: 6000000, dtype: float64

In [35]:
# cudf
datetime_start, time_start = time_init('6. apply - cudf')
out8 = dfc['ft_7'].apply(add_ten)
rap = time_print(datetime_start, time_start)
print_performance(rap, pan)
out8

2023-01-09 21:02:01.821 | INFO     | __main__:time_init:2 - Nome do experimento: 6. apply - cudf
2023-01-09 21:02:01.901 | SUCCESS  | __main__:print_performance:7 - RESULTADO: CUDF foi mais rápido que o PANDAS 13.681464782130003 vezes


0          10.315469
1          10.342638
2          11.264633
3           7.869729
4          10.894230
             ...    
5999995    10.655387
5999996     9.283478
5999997     8.363016
5999998    10.233862
5999999    10.588715
Name: ft_7, Length: 6000000, dtype: float64

7. value_counts

In [36]:
# Pandas
datetime_start, time_start = time_init('7. value_counts - Pandas')
dfp.ft_9.value_counts()
pan = time_print(datetime_start, time_start)

2023-01-09 21:02:01.926 | INFO     | __main__:time_init:2 - Nome do experimento: 7. value_counts - Pandas


In [37]:
# cudf
datetime_start, time_start = time_init('7. value_counts - cudf')
dfc.ft_9.value_counts()
rap = time_print(datetime_start, time_start)
print_performance(rap, pan)

2023-01-09 21:02:02.924 | INFO     | __main__:time_init:2 - Nome do experimento: 7. value_counts - cudf
2023-01-09 21:02:02.970 | SUCCESS  | __main__:print_performance:7 - RESULTADO: CUDF foi mais rápido que o PANDAS 22.335713858028555 vezes


8. groupby

In [38]:
# Pandas
datetime_start, time_start = time_init('8. groupby - Pandas')
dfp.groupby(['ft_2', 'ft_7']).size().sort_values(ascending=True)
pan = time_print(datetime_start, time_start)

2023-01-09 21:02:02.991 | INFO     | __main__:time_init:2 - Nome do experimento: 8. groupby - Pandas


In [39]:
# cudf
datetime_start, time_start = time_init('8. groupby - cudf')
dfc.groupby(['ft_2', 'ft_7']).size().sort_values(ascending=True)
rap = time_print(datetime_start, time_start)
print_performance(rap, pan)

2023-01-09 21:02:09.408 | INFO     | __main__:time_init:2 - Nome do experimento: 8. groupby - cudf
2023-01-09 21:02:09.468 | SUCCESS  | __main__:print_performance:7 - RESULTADO: CUDF foi mais rápido que o PANDAS 112.34539377235677 vezes


9. sort_values 2

In [40]:
arg3 = ['class', 'ft_1', 'ft_2', 'ft_3', 'ft_4', 'ft_5', 'ft_6', 'ft_7', 'ft_9']
# Pandas
datetime_start, time_start = time_init('9. sort_values 2 - Pandas')
out9 = dfp.sort_values(by=arg3)
pan = time_print(datetime_start, time_start)
out9

2023-01-09 21:02:09.487 | INFO     | __main__:time_init:2 - Nome do experimento: 9. sort_values 2 - Pandas


Unnamed: 0,ft_1,ft_2,ft_3,ft_4,ft_5,ft_6,ft_7,ft_8,ft_9,ft_10,class
5745497,-6.705965,-3.116148,2.246517,3.516951,-1.344824,-0.928899,0.398071,0.123188,0.517977,-0.873001,0
1467174,-6.169100,-3.945273,2.369984,3.673700,0.374771,0.613349,-1.333973,1.381892,-0.927933,-0.580449,0
1619313,-6.166262,-4.356469,3.131561,4.495654,1.154358,0.410570,-0.375184,-0.988066,0.054495,0.680515,0
3698215,-6.103222,-2.554064,2.199815,3.268783,-1.485152,0.465348,-0.069782,-0.334472,3.041847,0.720869,0
3895274,-6.069351,-2.232819,5.280644,6.258126,-1.746020,-0.446681,0.249382,-1.170675,0.428727,2.108019,0
...,...,...,...,...,...,...,...,...,...,...,...
1992630,4.518172,1.883084,0.211026,-0.596834,-0.454662,0.547503,-0.900603,0.386741,-0.226053,-0.318368,4
1752797,4.583209,-0.643293,1.705554,0.341851,-0.914944,-0.661866,1.929245,0.038765,-1.313252,1.177404,4
5377540,4.760997,-0.704266,-0.728732,-3.109657,1.427327,0.284508,0.830522,1.021182,2.088685,0.126431,4
3866644,4.848000,0.249280,0.653808,-2.057456,0.379916,-0.649494,1.403759,1.518423,1.266511,-0.023088,4


In [41]:
# cudf
datetime_start, time_start = time_init('9. sort_values 2 - cudf')
out10 = dfc.sort_values(by=arg3)
rap = time_print(datetime_start, time_start)
print_performance(rap, pan)
out10

2023-01-09 21:02:44.092 | INFO     | __main__:time_init:2 - Nome do experimento: 9. sort_values 2 - cudf
2023-01-09 21:02:44.391 | SUCCESS  | __main__:print_performance:7 - RESULTADO: CUDF foi mais rápido que o PANDAS 116.67482540572232 vezes


Unnamed: 0,ft_1,ft_2,ft_3,ft_4,ft_5,ft_6,ft_7,ft_8,ft_9,ft_10,class
5745497,-6.705965,-3.116148,2.246517,3.516951,-1.344824,-0.928899,0.398071,0.123188,0.517977,-0.873001,0
1467174,-6.169100,-3.945273,2.369984,3.673700,0.374771,0.613349,-1.333973,1.381892,-0.927933,-0.580449,0
1619313,-6.166262,-4.356469,3.131561,4.495654,1.154358,0.410570,-0.375184,-0.988066,0.054495,0.680515,0
3698215,-6.103222,-2.554064,2.199815,3.268783,-1.485152,0.465348,-0.069782,-0.334472,3.041847,0.720869,0
3895274,-6.069351,-2.232819,5.280644,6.258126,-1.746020,-0.446681,0.249382,-1.170675,0.428727,2.108019,0
...,...,...,...,...,...,...,...,...,...,...,...
1992630,4.518172,1.883084,0.211026,-0.596834,-0.454662,0.547503,-0.900603,0.386741,-0.226053,-0.318368,4
1752797,4.583209,-0.643294,1.705554,0.341851,-0.914944,-0.661866,1.929245,0.038765,-1.313252,1.177404,4
5377540,4.760997,-0.704266,-0.728732,-3.109657,1.427327,0.284508,0.830522,1.021182,2.088685,0.126431,4
3866644,4.848000,0.249280,0.653808,-2.057456,0.379916,-0.649494,1.403759,1.518423,1.266511,-0.023088,4
