# Delta-rs library Benchmarks

## Setup

### Downloading the data

Run this step only once per project.

In [None]:
## Choose one of the available dataset files:
# supplier_tpch_sf1.parquet
# supplier_tpch_sf10.parquet
# supplier_tpch_sf100.parquet
# lineitem_tpch_sf1.parquet
# lineitem_tpch_sf10.parquet
## Replace both *** placeholders below with the chosen file name, then
## uncomment the line and run it (repeat for every dataset you want to benchmark)

#!curl "https://repo.hops.works/dev/gio-hopsworks/***" -o "/home/yarnapp/hopsfs/Resources/***"

### Installing and importing the libraries

In [None]:
# Verify that the custom delta-rs library is installed
%pip install deltalake
# Verify that all other libraries are installed
%pip install pyarrow
%pip install pandas
%pip install timeit

## For debugging
#%pip install ipython

In [None]:
import shutil
import timeit

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from deltalake import DeltaTable, write_deltalake

## For debugging
# from IPython.display import display

## Benchmarking

### Supplier table

#### SF 1 - 10000 rows

In [None]:
## Benchmark parameters - the only values that change between benchmark cells
TABLE_NAME = "supplier"
SCALE_FACTOR = 1

## Iterate benchmark for - Choose a number
number_of_iterations = 50

## Paths derived from the parameters above
# URI the Delta table is written to and read from
HDFS_DATA_PATH = f"hdfs://rpc.namenode.service.consul:8020/Projects/benchmarks_offline_FS/Experiments/{TABLE_NAME}"
# Parquet file providing the rows to write
LOCAL_PATH = f"/home/yarnapp/hopsfs/Resources/{TABLE_NAME}_tpch_sf{SCALE_FACTOR}.parquet"
# Directory deleted between iterations (presumably the same table seen through
# the local hopsfs mount - TODO confirm the mapping)
TABLE_DIR = f"/home/yarnapp/hopsfs/Experiments/{TABLE_NAME}"
# Destination of the per-iteration timings
RESULTS_CSV = f"/home/yarnapp/hopsfs/Resources/benchmark_results_{TABLE_NAME}_tpch_sf{SCALE_FACTOR}.csv"

## Load the source data once, outside the timed region, so the write timing
## covers only write_deltalake and the file is not re-read on every iteration
pa_table = pq.read_table(LOCAL_PATH)

## Remove any table left behind by an interrupted run; write_deltalake is
## called with its default mode, which is expected to reject an existing table
shutil.rmtree(TABLE_DIR, ignore_errors=True)

## Initialise results list of rows
results = []

for iteration in range(1, number_of_iterations + 1):
    ###################### (1) WRITE BENCHMARK #####################

    # time a single write of the in-memory table
    write_result = timeit.timeit(lambda: write_deltalake(HDFS_DATA_PATH, pa_table),
                                 number=1)

    # printing exec. time
    print('{0} iteration - Write DeltaLake Table - Time taken: {1} s'.format(iteration, write_result))

    ###################### (2) READ BENCHMARK #####################

    # NOTE(review): this times only the construction of the DeltaTable object;
    # if the intent is to time loading the rows, confirm whether a call such as
    # .to_pyarrow_table() should be part of the timed statement
    read_result = timeit.timeit(lambda: DeltaTable(HDFS_DATA_PATH),
                                number=1)

    # printing exec. time
    print('{0} iteration - Read  DeltaLake Table - Time taken: {1} s'.format(iteration, read_result))

    ############################## (END) ############################

    ## Save row of results in results
    results.append([write_result, read_result])

    ## Erase data from created folder; raises if the cleanup fails instead of
    ## letting the next write trip over the stale table
    shutil.rmtree(TABLE_DIR)

## Release the source table so it does not linger in kernel memory
del pa_table

## Create and then save a dataframe with the results in .csv
df = pd.DataFrame(results, columns=["Write", "Read"])
df.to_csv(RESULTS_CSV)

#### SF 10 - 100000 rows

In [None]:
## Benchmark parameters - the only values that change between benchmark cells
TABLE_NAME = "supplier"
SCALE_FACTOR = 10

## Iterate benchmark for - Choose a number
number_of_iterations = 50

## Paths derived from the parameters above
# URI the Delta table is written to and read from
HDFS_DATA_PATH = f"hdfs://rpc.namenode.service.consul:8020/Projects/benchmarks_offline_FS/Experiments/{TABLE_NAME}"
# Parquet file providing the rows to write
LOCAL_PATH = f"/home/yarnapp/hopsfs/Resources/{TABLE_NAME}_tpch_sf{SCALE_FACTOR}.parquet"
# Directory deleted between iterations (presumably the same table seen through
# the local hopsfs mount - TODO confirm the mapping)
TABLE_DIR = f"/home/yarnapp/hopsfs/Experiments/{TABLE_NAME}"
# Destination of the per-iteration timings
RESULTS_CSV = f"/home/yarnapp/hopsfs/Resources/benchmark_results_{TABLE_NAME}_tpch_sf{SCALE_FACTOR}.csv"

## Load the source data once, outside the timed region, so the write timing
## covers only write_deltalake and the file is not re-read on every iteration
pa_table = pq.read_table(LOCAL_PATH)

## Remove any table left behind by an interrupted run; write_deltalake is
## called with its default mode, which is expected to reject an existing table
shutil.rmtree(TABLE_DIR, ignore_errors=True)

## Initialise results list of rows
results = []

for iteration in range(1, number_of_iterations + 1):
    ###################### (1) WRITE BENCHMARK #####################

    # time a single write of the in-memory table
    write_result = timeit.timeit(lambda: write_deltalake(HDFS_DATA_PATH, pa_table),
                                 number=1)

    # printing exec. time
    print('{0} iteration - Write DeltaLake Table - Time taken: {1} s'.format(iteration, write_result))

    ###################### (2) READ BENCHMARK ######################

    # NOTE(review): this times only the construction of the DeltaTable object;
    # if the intent is to time loading the rows, confirm whether a call such as
    # .to_pyarrow_table() should be part of the timed statement
    read_result = timeit.timeit(lambda: DeltaTable(HDFS_DATA_PATH),
                                number=1)

    # printing exec. time
    print('{0} iteration - Read  DeltaLake Table - Time taken: {1} s'.format(iteration, read_result))

    ############################## (END) ############################

    ## Save row of results in results
    results.append([write_result, read_result])

    ## Erase data from created folder; raises if the cleanup fails instead of
    ## letting the next write trip over the stale table
    shutil.rmtree(TABLE_DIR)

## Release the source table so it does not linger in kernel memory
del pa_table

## Create and then save a dataframe with the results in .csv
df = pd.DataFrame(results, columns=["Write", "Read"])
df.to_csv(RESULTS_CSV)

#### SF 100 - 1000000 rows

In [None]:
## Benchmark parameters - the only values that change between benchmark cells
TABLE_NAME = "supplier"
SCALE_FACTOR = 100

## Iterate benchmark for - Choose a number
number_of_iterations = 50

## Paths derived from the parameters above
# URI the Delta table is written to and read from
HDFS_DATA_PATH = f"hdfs://rpc.namenode.service.consul:8020/Projects/benchmarks_offline_FS/Experiments/{TABLE_NAME}"
# Parquet file providing the rows to write
LOCAL_PATH = f"/home/yarnapp/hopsfs/Resources/{TABLE_NAME}_tpch_sf{SCALE_FACTOR}.parquet"
# Directory deleted between iterations (presumably the same table seen through
# the local hopsfs mount - TODO confirm the mapping)
TABLE_DIR = f"/home/yarnapp/hopsfs/Experiments/{TABLE_NAME}"
# Destination of the per-iteration timings
RESULTS_CSV = f"/home/yarnapp/hopsfs/Resources/benchmark_results_{TABLE_NAME}_tpch_sf{SCALE_FACTOR}.csv"

## Load the source data once, outside the timed region, so the write timing
## covers only write_deltalake and the file is not re-read on every iteration
pa_table = pq.read_table(LOCAL_PATH)

## Remove any table left behind by an interrupted run; write_deltalake is
## called with its default mode, which is expected to reject an existing table
shutil.rmtree(TABLE_DIR, ignore_errors=True)

## Initialise results list of rows
results = []

for iteration in range(1, number_of_iterations + 1):
    ###################### (1) WRITE BENCHMARK #####################

    # time a single write of the in-memory table
    write_result = timeit.timeit(lambda: write_deltalake(HDFS_DATA_PATH, pa_table),
                                 number=1)

    # printing exec. time
    print('{0} iteration - Write DeltaLake Table - Time taken: {1} s'.format(iteration, write_result))

    ###################### (2) READ BENCHMARK #####################

    # NOTE(review): this times only the construction of the DeltaTable object;
    # if the intent is to time loading the rows, confirm whether a call such as
    # .to_pyarrow_table() should be part of the timed statement
    read_result = timeit.timeit(lambda: DeltaTable(HDFS_DATA_PATH),
                                number=1)

    # printing exec. time
    print('{0} iteration - Read  DeltaLake Table - Time taken: {1} s'.format(iteration, read_result))

    ############################## (END) ############################

    ## Save row of results in results
    results.append([write_result, read_result])

    ## Erase data from created folder; raises if the cleanup fails instead of
    ## letting the next write trip over the stale table
    shutil.rmtree(TABLE_DIR)

## Release the source table so it does not linger in kernel memory
del pa_table

## Create and then save a dataframe with the results in .csv
df = pd.DataFrame(results, columns=["Write", "Read"])
df.to_csv(RESULTS_CSV)

### Lineitem table

#### SF 1 - 6000000 rows

In [None]:
## Benchmark parameters - the only values that change between benchmark cells
TABLE_NAME = "lineitem"
SCALE_FACTOR = 1

## Iterate benchmark for - Choose a number
number_of_iterations = 50

## Paths derived from the parameters above
# URI the Delta table is written to and read from
HDFS_DATA_PATH = f"hdfs://rpc.namenode.service.consul:8020/Projects/benchmarks_offline_FS/Experiments/{TABLE_NAME}"
# Parquet file providing the rows to write
LOCAL_PATH = f"/home/yarnapp/hopsfs/Resources/{TABLE_NAME}_tpch_sf{SCALE_FACTOR}.parquet"
# Directory deleted between iterations (presumably the same table seen through
# the local hopsfs mount - TODO confirm the mapping)
TABLE_DIR = f"/home/yarnapp/hopsfs/Experiments/{TABLE_NAME}"
# Destination of the per-iteration timings
RESULTS_CSV = f"/home/yarnapp/hopsfs/Resources/benchmark_results_{TABLE_NAME}_tpch_sf{SCALE_FACTOR}.csv"

## Load the source data once, outside the timed region, so the write timing
## covers only write_deltalake and the file is not re-read on every iteration
pa_table = pq.read_table(LOCAL_PATH)

## Remove any table left behind by an interrupted run; write_deltalake is
## called with its default mode, which is expected to reject an existing table
shutil.rmtree(TABLE_DIR, ignore_errors=True)

## Initialise results list of rows
results = []

for iteration in range(1, number_of_iterations + 1):
    ###################### (1) WRITE BENCHMARK #####################

    # time a single write of the in-memory table
    write_result = timeit.timeit(lambda: write_deltalake(HDFS_DATA_PATH, pa_table),
                                 number=1)

    # printing exec. time
    print('{0} iteration - Write DeltaLake Table - Time taken: {1} s'.format(iteration, write_result))

    ###################### (2) READ BENCHMARK #####################

    # NOTE(review): this times only the construction of the DeltaTable object;
    # if the intent is to time loading the rows, confirm whether a call such as
    # .to_pyarrow_table() should be part of the timed statement
    read_result = timeit.timeit(lambda: DeltaTable(HDFS_DATA_PATH),
                                number=1)

    # printing exec. time
    print('{0} iteration - Read  DeltaLake Table - Time taken: {1} s'.format(iteration, read_result))

    ############################## (END) ############################

    ## Save row of results in results
    results.append([write_result, read_result])

    ## Erase data from created folder; raises if the cleanup fails instead of
    ## letting the next write trip over the stale table
    shutil.rmtree(TABLE_DIR)

## Release the source table so it does not linger in kernel memory
del pa_table

## Create and then save a dataframe with the results in .csv
df = pd.DataFrame(results, columns=["Write", "Read"])
df.to_csv(RESULTS_CSV)

#### SF 10 - 60000000 rows

In [None]:
## Benchmark parameters - the only values that change between benchmark cells
TABLE_NAME = "lineitem"
SCALE_FACTOR = 10

## Iterate benchmark for - Choose a number
number_of_iterations = 50

## Paths derived from the parameters above
# URI the Delta table is written to and read from
HDFS_DATA_PATH = f"hdfs://rpc.namenode.service.consul:8020/Projects/benchmarks_offline_FS/Experiments/{TABLE_NAME}"
# Parquet file providing the rows to write
LOCAL_PATH = f"/home/yarnapp/hopsfs/Resources/{TABLE_NAME}_tpch_sf{SCALE_FACTOR}.parquet"
# Directory deleted between iterations (presumably the same table seen through
# the local hopsfs mount - TODO confirm the mapping)
TABLE_DIR = f"/home/yarnapp/hopsfs/Experiments/{TABLE_NAME}"
# Destination of the per-iteration timings
RESULTS_CSV = f"/home/yarnapp/hopsfs/Resources/benchmark_results_{TABLE_NAME}_tpch_sf{SCALE_FACTOR}.csv"

## Load the source data once, outside the timed region, so the write timing
## covers only write_deltalake and the file is not re-read on every iteration
pa_table = pq.read_table(LOCAL_PATH)

## Remove any table left behind by an interrupted run; write_deltalake is
## called with its default mode, which is expected to reject an existing table
shutil.rmtree(TABLE_DIR, ignore_errors=True)

## Initialise results list of rows
results = []

for iteration in range(1, number_of_iterations + 1):
    ###################### (1) WRITE BENCHMARK #####################

    # time a single write of the in-memory table
    write_result = timeit.timeit(lambda: write_deltalake(HDFS_DATA_PATH, pa_table),
                                 number=1)

    # printing exec. time
    print('{0} iteration - Write DeltaLake Table - Time taken: {1} s'.format(iteration, write_result))

    ###################### (2) READ BENCHMARK #####################

    # NOTE(review): this times only the construction of the DeltaTable object;
    # if the intent is to time loading the rows, confirm whether a call such as
    # .to_pyarrow_table() should be part of the timed statement
    read_result = timeit.timeit(lambda: DeltaTable(HDFS_DATA_PATH),
                                number=1)

    # printing exec. time
    print('{0} iteration - Read  DeltaLake Table - Time taken: {1} s'.format(iteration, read_result))

    ############################## (END) ############################

    ## Save row of results in results
    results.append([write_result, read_result])

    ## Erase data from created folder; raises if the cleanup fails instead of
    ## letting the next write trip over the stale table
    shutil.rmtree(TABLE_DIR)

## Release the source table so it does not linger in kernel memory
del pa_table

## Create and then save a dataframe with the results in .csv
df = pd.DataFrame(results, columns=["Write", "Read"])
df.to_csv(RESULTS_CSV)