In [None]:
# Install the PyIceberg library, then fix the dependency problem with SQLAlchemy library
!pip install pyiceberg[pyarrow,duckdb,sql-sqlite]
!pip install sqlalchemy==2.0.28 --upgrade

In [None]:
# Create a folder where to save on HopsFS
!mkdir /home/yarnapp/hopsfs/Resources/test_dir
!mkdir /home/yarnapp/hopsfs/Resources/test_dir/test_data

In [None]:
# Import the needed libraries
from pyiceberg.catalog.sql import SqlCatalog
import pyarrow.parquet as pq
import pyarrow.compute as pc
import os
import importlib
from urllib.parse import urlparse
from typing import Dict, List
from pyarrow.fs import HadoopFileSystem
from functools import lru_cache

In [None]:
# Directories used by the test: the base folder and its data sub-folder
test_dir_path = "/home/yarnapp/hopsfs/Resources/test_dir"
test_data_path = f"{test_dir_path}/test_data"

In [None]:
# Location of the test data on HDFS: bare path, and the same path prefixed with the namenode address
hdfs_path = "/Projects/iceberg/Resources/test_dir/test_data"
test_hdfs_path = "namenode.service.consul:8020/Projects/iceberg/Resources/test_dir/test_data"

# Catalog properties: the SQLite database that stores the catalog lives in the
# test data directory, while the warehouse is addressed through an hdfs:// URI
catalog_properties = {
    "uri": f"sqlite:///{test_data_path}/pyiceberg_test1_catalog.db",
    "warehouse": f"hdfs://{test_hdfs_path}",
}

# Create the catalog named "default" (original note: only the first time)
test_catalog = SqlCatalog("default", **catalog_properties)

# Print the catalog object to show which catalog type was created
print(test_catalog)

In [None]:
# Get the NYC Taxi dataset from the network
nyc_data_path = test_dir_path + "/nyc_taxi_data.parquet"
!curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet -o "/home/yarnapp/hopsfs/Resources/test_dir/nyc_taxi_data.parquet"
    
# Then, load the data into a Parquet DataFrame (df)
df = pq.read_table(nyc_data_path)

In [None]:
# Create the "test_ns" namespace, unless it already exists in the catalog
# (keeps the cell re-runnable against a catalog left over from a previous run)
test_catalog.create_namespace_if_not_exists("test_ns")

In [None]:
# Create the table "nyc_taxiii" inside the "test_ns" namespace, reusing the schema of df;
# the resulting table object is kept in test_table
test_table = test_catalog.create_table(identifier="test_ns.nyc_taxiii", schema=df.schema)

#### Test the data insertion

In [None]:
# Append df to test_table and report the row count before and after the operation.
# Each count materialises a full table scan as an Arrow table and takes its length.
rows_before = len(test_table.scan().to_arrow())
print(f"Before the append operation, there are {rows_before} rows in the table")

test_table.append(df)

rows_after = len(test_table.scan().to_arrow())
print(f"After the append operation, there are {rows_after} rows in the table")

#### Test the schema evolution

In [None]:
# Build a new table equal to df plus one extra column, "tip_per_mile",
# computed as tip_amount divided by trip_distance
tip_per_mile = pc.divide(df["tip_amount"], df["trip_distance"])
updated_df = df.append_column("tip_per_mile", tip_per_mile)

# Evolve the Iceberg table schema by merging in the extended schema, matching fields by name
with test_table.update_schema() as schema_update:
    schema_update.union_by_name(updated_df.schema)

In [None]:
# Replace the table contents with the extended dataset, then print a fresh scan of the table
test_table.overwrite(updated_df)
full_scan = test_table.scan()
print(full_scan.to_arrow())

#### Test the table scan and file retrieval

In [None]:
# Scan only the rows with a positive tip per mile.
# The result is stored under its own name instead of overwriting `df`, so the raw
# dataset loaded earlier stays available if earlier cells are re-run in this kernel.
tipped_df = test_table.scan(row_filter="tip_per_mile > 0").to_arrow()
len(tipped_df)

In [None]:
# Recursively list everything found under the test data directory
!find /home/yarnapp/hopsfs/Resources/test_dir/test_data

---
#### @FINAL Delete all the data and files created

In [None]:
# Run this cell only once you have finished your own tests:
# it recursively deletes the whole test directory and everything inside it
! rm -r /home/yarnapp/hopsfs/Resources/test_dir