In [2]:
# Example from https://github.com/rapidsai/cudf/blob/branch-25.06/docs/cudf/source/user_guide/performance-comparisons/performance-comparisons.ipynb


import timeit

import cudf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Print the imported cuDF version; the `=` f-string specifier echoes the
# expression text alongside its value.
print(f"{cudf.__version__=}")

cudf.__version__='25.10.00'


In [3]:
# NumPy random generator with a fixed seed, so the synthetic data drawn from
# it is repeatable when the notebook is run top to bottom on a fresh kernel.
rng = np.random.default_rng(seed=0)

Concat, count & joins performance

In [4]:
# Benchmark sizing for this section:
#   num_rows      - row count of the full-size frames
#   timeit_number - repetitions handed to timeit for each measurement
#   sub_sample    - reduced row count, one thirtieth of num_rows
num_rows = 300_000_000
timeit_number = 30
sub_sample = num_rows // 30

In [5]:
# Synthetic pandas frame with `num_rows` rows: an int64 column drawn from
# [-1000, 1000) and a string column picked from four fixed labels.
# Both columns come from the shared generator `rng`; keep the draw order
# (integers first, then choice) because it determines the generated values.
pdf = pd.DataFrame(
    {
        "numbers": rng.integers(
            low=-1000,
            high=1000,
            size=num_rows,
            dtype="int64",
        ),
        "business": rng.choice(
            ["McD", "Buckees", "Walmart", "Costco"],
            size=num_rows,
        ),
    }
)
pdf

Unnamed: 0,numbers,business
0,701,Walmart
1,273,Walmart
2,22,Buckees
3,-461,Buckees
4,-385,McD
...,...,...
299999995,425,McD
299999996,-916,Costco
299999997,538,Walmart
299999998,-87,Costco


In [6]:
# Build the cuDF counterpart of `pdf` with cudf.from_pandas so both libraries
# are benchmarked on the same data, then display it.
gdf = cudf.from_pandas(pdf)
gdf

Unnamed: 0,numbers,business
0,701,Walmart
1,273,Walmart
2,22,Buckees
3,-461,Buckees
4,-385,McD
...,...,...
299999995,425,McD
299999996,-916,Costco
299999997,538,Walmart
299999998,-87,Costco


In [7]:
def timeit_pandas_cudf(pd_obj, gd_obj, func, **kwargs):
    """
    Time one operation against a pandas object and its cuDF counterpart.

    Parameters
    ----------
    pd_obj : Pandas object
    gd_obj : cuDF object
    func : callable
        Called with a single argument: ``pd_obj`` for the first
        measurement, ``gd_obj`` for the second.
    **kwargs
        Forwarded unchanged to ``timeit.timeit`` (for example ``number``).

    Returns
    -------
    tuple
        ``(pandas_time, cudf_time)``, each the value returned by
        ``timeit.timeit`` for the corresponding object.
    """

    def _elapsed(obj):
        # One timeit measurement of `func` applied to `obj`.
        return timeit.timeit(lambda: func(obj), **kwargs)

    # Tuple elements are evaluated left to right: pandas is timed first.
    return _elapsed(pd_obj), _elapsed(gd_obj)

In [None]:
# Time value_counts() on `pdf` and `gdf`, `timeit_number` repetitions each.
pandas_value_counts, cudf_value_counts = timeit_pandas_cudf(
    pd_obj=pdf,
    gd_obj=gdf,
    func=lambda frame: frame.value_counts(),
    number=timeit_number,
)

In [None]:
# Keep only the first `sub_sample` rows of each frame for the benchmarks below.
# The names `pdf`/`gdf` are rebound here, so the cells that created the
# full-size frames must be rerun to get those back under these names.
pdf = pdf.head(sub_sample)
gdf = gdf.head(sub_sample)

In [None]:
# pandas side of the concat benchmark: total time for `timeit_number` runs
# of concatenating three references to the same frame.
pandas_concat = timeit.timeit(
    stmt=lambda: pd.concat([pdf] * 3),
    number=timeit_number,
)

In [None]:
# cuDF side of the concat benchmark: total time for `timeit_number` runs
# of concatenating three references to the same frame.
cudf_concat = timeit.timeit(
    stmt=lambda: cudf.concat([gdf] * 3),
    number=timeit_number,
)

In [None]:
# Time a group-by on "business" with min/max/mean aggregations in both
# libraries, `timeit_number` repetitions each.
pandas_groupby, cudf_groupby = timeit_pandas_cudf(
    pd_obj=pdf,
    gd_obj=gdf,
    func=lambda frame: frame.groupby("business").agg(["min", "max", "mean"]),
    number=timeit_number,
)

In [None]:
# Fresh, smaller frames for the merge benchmark: one tenth of `sub_sample`
# rows. Keep the draw order (integers first, then choice) because both
# columns consume the shared generator `rng`.
merge_rows = sub_sample // 10
pdf = pd.DataFrame(
    {
        "numbers": rng.integers(
            low=-1000,
            high=1000,
            size=merge_rows,
            dtype="int64",
        ),
        "business": rng.choice(
            ["McD", "Buckees", "Walmart", "Costco"],
            size=merge_rows,
        ),
    }
)
gdf = cudf.from_pandas(pdf)

In [None]:
# Time a self-merge of each frame. This measurement uses a hard-coded
# 10 repetitions instead of `timeit_number`.
pandas_merge, cudf_merge = timeit_pandas_cudf(
    pd_obj=pdf,
    gd_obj=gdf,
    func=lambda frame: frame.merge(frame),
    number=10,
)

In [None]:
# Speedup of cuDF over pandas per operation: pandas time divided by cuDF
# time, so values above 1 mean cuDF was faster. Building a labelled Series
# first keeps each ratio next to its row label.
performance_df = pd.Series(
    {
        "value_counts": pandas_value_counts / cudf_value_counts,
        "concat": pandas_concat / cudf_concat,
        "groupby": pandas_groupby / cudf_groupby,
        "merge": pandas_merge / cudf_merge,
    },
    name="cudf speedup vs. pandas",
).to_frame()
performance_df

In [None]:
def performance_plot(df, xlabel=None):
    """
    Draw a bar chart of the "cudf speedup vs. pandas" column and show it.

    Parameters
    ----------
    df : DataFrame
        Must contain a "cudf speedup vs. pandas" column. Plotted with
        ``df.plot.bar``.
    xlabel : str, optional
        Label for the x-axis.
    """
    max_speedup = df["cudf speedup vs. pandas"].max()
    # Upper y-limit sits 5% above the largest value (max + max / 20).
    ylim_max = max_speedup + (max_speedup / 20)
    ax = df.plot.bar(
        color="#7400ff",
        # The y-axis starts at 1, so values below 1 fall outside the view.
        ylim=(1, ylim_max),
        rot=0,
        xlabel=xlabel,
        ylabel="Speedup factor",
    )
    # Annotate each bar with its value rounded to a whole number.
    ax.bar_label(ax.containers[0], fmt="%.0f")
    plt.show()

In [None]:
# Plot the speedup table, labelling the x-axis "Operation".
performance_plot(performance_df, xlabel="Operation")

In [None]:
import gc

# Release the benchmark frames before the later benchmarks allocate new data.
# NOTE(review): `del` only removes these two names. Frames that earlier cells
# displayed may still be referenced by IPython's output cache (`Out`, `_`),
# so confirm that the memory is actually freed.
del pdf, gdf

# Assign the result so the collected-object count is not echoed as cell output.
_ = gc.collect()

In [None]:
# Sizing for the next benchmarks: row count of the generated data and the
# number of timeit repetitions per measurement.
num_rows = 300_000_000
timeit_number = 20

In [None]:
# pandas Series of `num_rows` strings, each picked by `rng` from five fixed
# values (two numeric-looking strings, two words, one with a space).
pd_series = pd.Series(
    data=rng.choice(
        ["123", "56.234", "Walmart", "Costco", "rapids ai"],
        size=num_rows,
    )
)

In [None]:
# Build the cuDF counterpart of `pd_series` with cudf.from_pandas.
gd_series = cudf.from_pandas(pd_series)