# Read/Write Test with Apache Iceberg (PyIceberg)

In [None]:
#!pip install pyiceberg[pyarrow,duckdb,sql-sqlite]
!pip install pyiceberg[pyarrow,duckdb,sql-sqlite]==0.6.*
!pip install sqlalchemy --upgrade
!pip install pandas --upgrade
!pip install polars

In [None]:
# Import the needed libraries
import importlib
import math
import os
import random
import re
import shutil
import string
import sys
import time
import timeit
import warnings

import hopsworks
import numpy as np
import pandas as pd
import polars as pl
import pyarrow
import pyarrow.compute as pc
import pyarrow.parquet as pq
from pyiceberg.catalog import load_catalog
from pyiceberg.catalog.sql import SqlCatalog
from pyiceberg.exceptions import CommitFailedException

## Console 🕹️
**Modify something here if** you want to:
- Change the test repetitions
- Define the number of used cores
- Change the file names
- Save subreports, and define how often
- Change the source from which the data is downloaded
- Change the path where the data is saved
- Change the file type

In [2]:
# Number of repetitions of each test, keyed by test name, and the number of cores in use.
# The benchmark loop only runs tests whose repetition count is greater than 0.
reps = {
    "sup_1": 50,
    "sup_10": 50,
    "sup_100": 50,
    "line_1": 50,
    "line_10": 50,
}
number_of_cores = 1

# Base name (without extension) of the dataset file used by each test
file_names = {
    "sup_1": 'supplier_tpch_sf1',
    "sup_10": 'supplier_tpch_sf10',
    "sup_100": 'supplier_tpch_sf100',
    "line_1": 'lineitem_tpch_sf1',
    "line_10": 'lineitem_tpch_sf10',
}

# @optional
# Sub-reports: when save_subreports is True, an intermediate CSV is written every
# save_how_often repetitions. save_how_often is used as a modulus on the repetition
# counter, so it must be a positive integer whenever save_subreports is True.
save_subreports = False
save_how_often = 0

# Locations: where the catalog and the table data are written, where the datasets are
# stored locally, and where they are downloaded from
ICEBERG_PATH = "/home/yarnapp/hopsfs/Resources/test/"
CATALOG_PATH = f"{ICEBERG_PATH}catalog.db"
HDFS_PATH = "/tmp/test/"
LOCAL_PATH = "/home/yarnapp/hopsfs/Resources/"
REMOTE_PATH = "https://repo.hops.works/dev/gio-hopsworks/"
FILE_TYPE = ".parquet"
FILENAME_PREFIX = f"iceberg_benchmark_results_{number_of_cores}core"

In [3]:
def get_local_paths(file_names):
    """Map every test name to the local path of its dataset file.

    Each path is LOCAL_PATH + base file name + FILE_TYPE, for example
    "/home/yarnapp/hopsfs/Resources/supplier_tpch_sf1.parquet".

    Args:
        file_names: dictionary from test name to base file name (no extension).

    Returns:
        A new dictionary with the same keys and the full local paths as values.
    """
    return {
        test_name: LOCAL_PATH + file_names[test_name] + FILE_TYPE
        for test_name in file_names
    }


def get_remote_paths(file_names):
    """Map every test name to the remote URL of its dataset file.

    Each URL is REMOTE_PATH + base file name + FILE_TYPE, for example
    "https://repo.hops.works/dev/gio-hopsworks/supplier_tpch_sf1.parquet".

    Args:
        file_names: dictionary from test name to base file name (no extension).

    Returns:
        A new dictionary with the same keys and the full remote URLs as values.
    """
    return {
        test_name: REMOTE_PATH + file_names[test_name] + FILE_TYPE
        for test_name in file_names
    }

In [4]:
# Build the two lookup tables used below: test name -> local file path, and
# test name -> remote download URL
local_paths, remote_paths = get_local_paths(file_names), get_remote_paths(file_names)

In [None]:
# Download all the datasets from the remote_paths to the local_paths
for file in file_names:
    !curl {remote_paths[file]} -o {local_paths[file]}

In [5]:
# Run the write/read benchmark: for every dataset with a positive repetition count, repeat
# "create catalog + table -> append -> load + scan -> clean up" and store the two timings.
def _write_report(rows, path_csv):
    """Save the collected [write_time, read_time] rows as a CSV file and return the DataFrame."""
    report = pd.DataFrame(rows, columns=["Write", "Read"])
    report.to_csv(path_csv)
    return report


def _remove_storage():
    """Delete the catalog folder and the data folder; folders that do not exist are ignored."""
    shutil.rmtree(ICEBERG_PATH, ignore_errors=True)
    shutil.rmtree(HDFS_PATH, ignore_errors=True)


for dataset in reps:
    if reps[dataset] <= 0:
        continue

    print('\n\n\n\n\n *** STARTED THE TEST OF ' + str(dataset) + ' FOR ' + str(reps[dataset]) + ' TIME(S) *** \n\n\n\n\n')

    # One [write_time, read_time] row per repetition; the dataset is loaded once per test
    results   = []
    dataframe = pq.read_table(local_paths[dataset])

    for rep in range(reps[dataset]):

        ################ BEFORE MEASUREMENT ###################################################
        # Start from a clean state: leftovers of an interrupted run (in particular an old
        # catalog.db) would make create_namespace / create_table fail.
        _remove_storage()

        # Create the folder where the SQLite-backed PyIceberg catalog lives, and the folder
        # used as warehouse / table location. exist_ok avoids failing on existing folders.
        os.makedirs(ICEBERG_PATH, exist_ok=True)
        os.makedirs(HDFS_PATH, exist_ok=True)

        # Create Catalog
        catalog = SqlCatalog(
            "default",
            **{
                "uri": f"sqlite:///{CATALOG_PATH}",
                "warehouse": f"{HDFS_PATH}",
                "hdfs.host": 'namenode.service.consul',
            },
        )

        try:
            # Create Namespace
            catalog.create_namespace("namespace")

            # Create Table, in order to save data using PyIceberg
            table = catalog.create_table(
                "namespace.table",
                schema=dataframe.schema,
                location=HDFS_PATH,
            )

            ############### WRITE DATA ON THE PYICEBERG TABLE #################################
            # Time the append. perf_counter is a monotonic, high-resolution clock, so the
            # measurement cannot be distorted by system clock adjustments.
            before_write = time.perf_counter()

            table.append(dataframe)

            after_write  = time.perf_counter()
            write_time   = after_write - before_write
            print("\n*** Time needed to write with PyArrow: " + str(write_time) + ' seconds ***')

            ############### READ FROM THE PYICEBERG TABLE FROM THE CATALOG ####################
            # Time loading the table from the catalog plus the full scan into Arrow
            before_read    = time.perf_counter()

            table          = catalog.load_table("namespace.table")
            read_dataframe = table.scan().to_arrow()

            after_read     = time.perf_counter()
            read_time      = after_read - before_read
            print("\n*** Time needed to read: " + str(read_time) + ' seconds ***')

            ################ END OF MEASUREMENTS ##############################################
            # Sanity check outside the timed sections: a read that returns a different number
            # of rows than was written makes the measured read time meaningless.
            if read_dataframe.num_rows != dataframe.num_rows:
                warnings.warn(
                    f"{dataset} rep {rep + 1}: read {read_dataframe.num_rows} rows "
                    f"but wrote {dataframe.num_rows}"
                )

            # Purge and drop both table and namespace, in order to repeat the test
            catalog.purge_table('namespace.table')
            catalog.drop_namespace('namespace')
        finally:
            # Delete the folders where data, metadata, and catalog are saved, even when the
            # repetition failed, so that the next run starts clean
            _remove_storage()

        ## Save the results of the current test (write, read)
        results.append([write_time, read_time])

        # Wait in order to be sure that folder have been deleted on the machine
        time.sleep(5)

        # Intermediate report every save_how_often repetitions (never after the last one,
        # which is covered by the final report). The `> 0` guard prevents a ZeroDivisionError
        # when save_subreports is enabled but save_how_often was left at 0.
        is_subreport_due = (
            save_subreports
            and save_how_often > 0
            and (rep + 1) % save_how_often == 0
            and (rep + 1) < reps[dataset]
        )
        if is_subreport_due:
            path_csv = f"{LOCAL_PATH}{FILENAME_PREFIX}_SUB{rep + 1}_{file_names[dataset]}.csv"
            df = _write_report(results, path_csv)

    ## Save the test's results in a .csv file with specified name
    path_csv = LOCAL_PATH + FILENAME_PREFIX + '_' + file_names[dataset] + '.csv'
    df = _write_report(results, path_csv)






 *** STARTED THE TEST OF line_1 FOR 50 TIME(S) *** 





2024-09-17 05:18:58,640 INFO: Defaulting to PyArrow FileIO
2024-09-17 05:19:00,180 INFO: Defaulting to PyArrow FileIO
2024-09-17 05:19:00,181 INFO: Defaulting to PyArrow FileIO
2024-09-17 05:19:02,507 INFO: Defaulting to PyArrow FileIO
2024-09-17 05:19:02,508 INFO: Defaulting to PyArrow FileIO

*** Time needed to write with PyArrow: 3.520193099975586 seconds ***
2024-09-17 05:19:03,707 INFO: Defaulting to PyArrow FileIO
2024-09-17 05:19:03,708 INFO: Defaulting to PyArrow FileIO

*** Time needed to read: 0.006920814514160156 seconds ***
2024-09-17 05:19:03,713 INFO: Defaulting to PyArrow FileIO
2024-09-17 05:19:03,714 INFO: Defaulting to PyArrow FileIO
2024-09-17 05:19:04,909 INFO: Defaulting to PyArrow FileIO
2024-09-17 05:19:15,404 INFO: Defaulting to PyArrow FileIO
2024-09-17 05:19:16,322 INFO: Defaulting to PyArrow FileIO
2024-09-17 05:19:16,323 INFO: Defaulting to PyArrow FileIO
2024-09-17 05:19:18,657 INFO: Defaulting t

In [None]:
# Manual clean-up: delete the folders where data, metadata, and catalog are saved.
# The benchmark loop removes both folders itself at the end of every repetition, so this
# cell is only needed when a run stopped before reaching its own clean-up.
!rm -r {ICEBERG_PATH}
!rm -r {HDFS_PATH}

In [None]:
###### 💀💀💀 DON'T USE THIS IF NOT EXTREMELY NEEDED 💀💀💀 #######

# Purge and drop both table and namespace, in order to repeat the test.
# This cell relies on the `catalog` object left behind by the benchmark cell, so that cell
# must have been executed in the current kernel session first.
# NOTE(review): the benchmark loop already performs these two calls at the end of every
# repetition; presumably they raise here when the table or namespace no longer exists
# (or when the catalog file has already been deleted) — confirm before relying on it.
catalog.purge_table('namespace.table')
catalog.drop_namespace('namespace')