In [None]:
# Install the PyIceberg library, then fix the dependency problem with SQLAlchemy library
!pip install pyiceberg[pyarrow,duckdb,sql-sqlite] --upgrade
!pip install sqlalchemy --upgrade
#!pip install sqlalchemy==2.0.28 --upgrade

In [1]:
# Import the needed libraries
from pyiceberg.catalog import load_catalog
from pyiceberg.catalog.hive import HiveCatalog
import pyarrow.parquet as pq
import pyarrow.compute as pc
import pyarrow
import pandas as pd
import os
import importlib
from urllib.parse import urlparse
from typing import Dict, List
from pyarrow.fs import HadoopFileSystem
from functools import lru_cache
import time
import math
from pyiceberg.exceptions import CommitFailedException

In [None]:
# 🧪🧪 ONLY ONCE! 🧪🧪
# Get the NYC Taxi dataset from the network
!curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet -o "/home/yarnapp/hopsfs/Resources/nyc_taxiparquet"

In [2]:
# Create a folder where to save on HopsFS
!mkdir /home/yarnapp/hopsfs/Resources/test_dir/
!mkdir /tmp/test_data/

mkdir: cannot create directory ‘/home/yarnapp/hopsfs/Resources/test_dir/’: File exists
mkdir: cannot create directory ‘/tmp/test_data/’: File exists


In [11]:
#catalog_file_path = "/home/hdfs/iceberg/catalog/pyiceberg_catalog.db"
#hdfs_path_uri = "hdfs://namenode.service.consul:8020/tmp/test_data"

# Create a catalog
test_catalog = HiveCatalog(
    "default",
    **{
        "uri":"thrift://metastore.hive.service.consul:9083",
        "hive.hive2-compatible":True,
        #"hive.hostname": "hive",
        #"hive.port":"9083"
    },
)

# Print the object catalog, to show the catalog type
print(test_catalog)

default (<class 'pyiceberg.catalog.hive.HiveCatalog'>)


In [12]:
test_catalog.properties

{'uri': 'thrift://metastore.hive.service.consul:9083',
 'hive.hive2-compatible': True,
 'hive.hostname': 'hive',
 'hive.port': '9083'}

In [13]:
# Load the data previously downloaded into a Parquet DataFrame (df)
nyc_data_path = "/home/yarnapp/hopsfs/Resources/nyc_taxiparquet"
df = pq.read_table(nyc_data_path)

In [14]:
# Create a new namespace
test_catalog.create_namespace("test_ns")

TTransportException: TSocket read 0 bytes

In [8]:
# Create a new table "test_table", specifying the schema according to the df's schema
test_table = test_catalog.create_table(
    "test_ns.nyc_taxi",
    schema=df.schema,
    # 🧪🧪 TESTING 🧪🧪
    # The location now should be added, since in the creation of the catalog we are both specifying the host and the warehouse!
    location="/tmp/test_data"
)

2024-07-04 14:59:50,659 INFO: Defaulting to PyArrow FileIO


AttributeError: 'bool' object has no attribute 'lower'

#### Test the data insertion

Insert the full NYC taxi dataframe in the empy table created above.

In [None]:
# Append the dataframe to the test_table, showing the difference between before and after the operation
print("Start APPEND")
before_len = len(test_table.scan().to_arrow())

test_table.append(df)

print('End APPEND')
after_len  = len(test_table.scan().to_arrow())
print("Before the append operation, there were " + str(before_len) + "rows in the table")
print("After  the append operation, there were " + str(after_len)  + "rows in the table")

#### Test multiple APPEND operations

In order to test several consecutive APPEND operations, the Arrow Dataframe containing the NYC Taxi data is transformed in Pandas Dataframe, then divided in small part of 1000 rows each.
⚠️ Depending on "how_many" APPEND operations you want to perform, change the former parameters in the following cell.

Several errors might arise, but those should not be related to the functioning of the PyIceberg library: the problem should instead reside in the underlying infrastructure (Jupyter, Hopsworks UI, VM, File access permissions ...)

In [None]:
catalog = load_catalog("default",**{"uri":"sqlite:////home/yarnapp/hopsfs/Resources/test_dir/pyiceberg_catalog.db"})

In [None]:
# Load the data previously downloaded into a Parquet DataFrame (df)
nyc_data_path = "/home/yarnapp/hopsfs/Resources/nyc_taxiparquet"
arrow_df      = pq.read_table(nyc_data_path)

# Create a set for randomizing the insertion
insert_set = set()
for i in range(1, math.floor(arrow_df.shape[0]/1000), 2):
    insert_set.add(i)
    
# Transform the arrow dataframe into a pandas DataFrame
df_append = pd.DataFrame()
df_append = arrow_df.to_pandas()

# Set how many times you want to repeat the APPEND operation
how_many = 10

In [None]:
# Load the table where to append the new data
table_append = catalog.load_table("test_ns.nyc_taxi")

for i in range(how_many):
    elem = insert_set.pop()
    partial_df = df_append[elem*1000:1000*(elem + 1)]
    partial_table = pyarrow.Table.from_pandas(partial_df)

    # Append the dataframe to the test_table, showing the difference between before and after the operation
    print("Start APPEND")
    before_len = len(table_append.scan().to_arrow())
    
    table_append.append(partial_table)
    
    print('End APPEND')
    after_len  = len(table_append.scan().to_arrow())
    print("Before the append operation, there were " + str(before_len) + "rows in the table")
    print("After  the append operation, there were " + str(after_len)  + "rows in the table")
    
    if i == how_many - 1:
        print('\n\n ** All the APPEND operations have been completed **')

#### Test the schema evolution

In [None]:
# Create a new dataframe, equal to df but with a new column
updated_df = df.append_column("tip_per_mile", pc.divide(df["tip_amount"], df["trip_distance"]))

In [None]:
catalog = load_catalog("default",**{"uri":"sqlite:////home/yarnapp/hopsfs/Resources/test_dir/pyiceberg_catalog.db"})
table   = catalog.load_table("test_ns.nyc_taxi")

In [None]:
# Extract then the new schema information and save them in a new file
with table.update_schema() as update_schema:
    update_schema.union_by_name(updated_df.schema)
    
# Overwrite the previous table, replacing the old dataframe with a new one
table.overwrite(updated_df)
print(table.scan().to_arrow())

In [None]:
# 🧪🧪 TESTING 🧪🧪

# Get the table into a pandas DataFrame, in order to verift its length and integrity.
prova = table.scan().to_pandas()

#### Test the table scan and file retrieval

In [None]:
df = table.scan(row_filter="tip_per_mile > 0").to_arrow()
len(df)

---
#### @FINAL Delete all the data and files created

In [None]:
# Just call it if you are at the end of your own test
! rm -r /home/yarnapp/hopsfs/Resources/test_dir
! rm -r /tmp/test_data/

In [None]:
# 🧪🧪 TESTING 🧪🧪

# Sometimes a file "default" is created here. This could cause problem, so run this cell to remove it.
! rm -r /home/yarnapp/hopsfs/Experiments/default

In [None]:
# 🧪🧪 TESTING 🧪🧪

!echo '.databases'|sqlite3 default
#!echo '.databases'|sqlite3 default