In [None]:
-- Create the external volume where the Iceberg data and metadata files will be stored.
-- Every value below is a placeholder: substitute your own bucket, IAM role ARN, and KMS key.
CREATE OR REPLACE EXTERNAL VOLUME iceberg_ext_vol
    STORAGE_LOCATIONS = (
        (
            NAME                 = 'my-s3-us-west-2'
            STORAGE_PROVIDER     = 'S3'
            STORAGE_BASE_URL     = 's3://MY_EXAMPLE_BUCKET/ICEBERG/'
            STORAGE_AWS_ROLE_ARN = 'arn:aws:iam::123456789012:role/myrole'
            ENCRYPTION           = (
                TYPE       = 'AWS_SSE_KMS'
                KMS_KEY_ID = '1234abcd-12ab-34cd-56ef-1234567890ab'
            )
        )
    )
    -- Writes must be allowed so Snowflake can create files under the base URL.
    ALLOW_WRITES = TRUE;


In [None]:
-- Demo only: point a stage at Snowflake's read-only quickstart bucket.
CREATE OR REPLACE STAGE SFQUICKSTARTS
    URL = 's3://sfquickstarts/';

In [None]:
-- List the 2016 Citibike CSV files, then summarize that listing.
-- Expected result: 12 files totaling 444.77 MB.
LIST @SFQUICKSTARTS/vhol_citibike_ml_snowpark_python/data/schema1/2016;

-- Aggregates the output of the LIST statement directly above; run the two together,
-- with no other statement in between, so LAST_QUERY_ID() still refers to the LIST.
SELECT
    COUNT("name")                       AS numb_files,
    ROUND(SUM("size") / 1024 / 1024, 2) AS sum_filesize_mb,  -- "size" is divided by 1024 twice to get MB
    ARRAY_AGG("name")                   AS array_files
FROM TABLE(RESULT_SCAN(LAST_QUERY_ID()));

In [None]:
-- CSV file format with convenience options; referenced by the schema-inference
-- query and by the COPY INTO load.
CREATE OR REPLACE FILE FORMAT my_csv
    TYPE                           = CSV
    PARSE_HEADER                   = TRUE   -- take column names from the header row
    FIELD_OPTIONALLY_ENCLOSED_BY   = '"'
    ERROR_ON_COLUMN_COUNT_MISMATCH = FALSE  -- tolerate rows with a different field count
    REPLACE_INVALID_CHARACTERS     = TRUE
    SKIP_BLANK_LINES               = TRUE;


In [None]:
-- Derive the column definitions for the CSV files by sampling at most 2 files
-- and 1000 records from each.
SELECT
    GENERATE_COLUMN_DESCRIPTION(
        ARRAY_AGG(OBJECT_CONSTRUCT(*)) WITHIN GROUP (ORDER BY ORDER_ID ASC),
        'table'
    ) AS COLUMNS
FROM TABLE(
    INFER_SCHEMA(
        LOCATION             => '@SFQUICKSTARTS/vhol_citibike_ml_snowpark_python/data/schema1/2016',
        FILE_FORMAT          => 'MY_CSV',
        IGNORE_CASE          => FALSE,
        MAX_FILE_COUNT       => 2,
        MAX_RECORDS_PER_FILE => 1000
    )
);

In [None]:
-- Create the Snowflake-managed Iceberg table from the detected schema.
-- Column names are double-quoted because they are lowercase and some contain spaces;
-- they must match the CSV header names exactly.
CREATE OR REPLACE ICEBERG TABLE citibike (
    "tripduration"            NUMBER(9, 0),
    "starttime"               TIMESTAMP_NTZ,
    "stoptime"                TIMESTAMP_NTZ,
    "start station id"        NUMBER(4, 0),
    "start station name"      TEXT,
    "start station latitude"  NUMBER(17, 15),
    "start station longitude" NUMBER(16, 14),
    "end station id"          NUMBER(4, 0),
    "end station name"        TEXT,
    "end station latitude"    NUMBER(17, 15),
    "end station longitude"   NUMBER(16, 14),
    "bikeid"                  NUMBER(5, 0),
    "usertype"                TEXT,
    "birth year"              TEXT,
    "gender"                  NUMBER(1, 0)
)
    CATALOG         = 'SNOWFLAKE'
    EXTERNAL_VOLUME = 'ICEBERG_EXT_VOL'
    BASE_LOCATION   = 'iceberg_demo/citibike/';

In [None]:
-- Load the CSV files into the Iceberg table. Snowflake writes the Parquet data
-- files and the metadata files under the table's BASE_LOCATION.
COPY INTO CITIBIKE
    FROM @SFQUICKSTARTS/vhol_citibike_ml_snowpark_python/data/schema1/2016
    FILE_FORMAT          = my_csv
    MATCH_BY_COLUMN_NAME = 'case_sensitive'  -- map CSV header names to the quoted column names
    ON_ERROR             = CONTINUE          -- keep loading when a row fails
    LOAD_MODE            = FULL_INGEST;

In [None]:
-- Preview up to 100 loaded trips. Optionally inspect your S3 bucket as well to
-- confirm that the Iceberg files were written.
SELECT *
FROM citibike
WHERE "tripduration" > 0
LIMIT 100;

In [None]:
-- Check the size of the Iceberg table. It comes to only 274.81 MB because the
-- original CSV files were rewritten as compressed Parquet files.
SELECT
    table_schema,
    table_name,
    ROUND(bytes / 1024 / 1024, 2) AS table_size_mb
FROM information_schema.tables
WHERE table_name = 'CITIBIKE'
ORDER BY table_size_mb DESC;