### Demo: Data Ingestion with CREATE TABLE AS and COPY INTO

The original lab notebook isn't accessible, so we load the nyctaxi sample dataset and convert it to Parquet for testing.

##### Streaming tables with Auto Loader are preferred over COPY INTO

In [0]:
%fs ls "/databricks-datasets/nyctaxi/tables/nyctaxi_yellow"


In [0]:
%sql
-- Set the default catalog and schema for this session, so that unqualified
-- object names in the following statements resolve to workspace.learning
USE CATALOG workspace;
USE SCHEMA learning;

-- Confirm that the session now points at the expected catalog and schema
SELECT current_catalog(), current_schema()

In [0]:
%sql
-- List the contents of the nyctaxi_yellow sample dataset directory
LIST '/databricks-datasets/nyctaxi/tables/nyctaxi_yellow/'

In [0]:
%sql
-- Show the volumes that exist in the workspace.learning schema
SHOW VOLUMES IN workspace.learning;

As the dataset is in Delta format, we will convert it to Parquet and save it in our volume.

In [0]:
# Convert the Delta sample dataset into Parquet files stored in our volume.

# Read the source dataset, which is stored in Delta format.
df_delta = spark.read.format("delta").load("/databricks-datasets/nyctaxi/tables/nyctaxi_yellow/")

# Target directory inside the volume.
output_path = "dbfs:/Volumes/workspace/learning/learn/nyctaxi_yellow_parquet/"

# "overwrite" replaces any previous output, so the cell can be re-run safely.
df_delta.write.format("parquet").mode("overwrite").save(output_path)

# This writes plain Parquet files to a path; no table is registered here.
print("Data saved as Parquet files in volume: ", output_path)


In [0]:
%sql
-- List the contents of the Parquet output directory in the volume
LIST 'dbfs:/Volumes/workspace/learning/learn/nyctaxi_yellow_parquet/'

In [0]:
%sql
-- read_files() is a table-valued function: it reads the files under the given
-- path and exposes their contents as rows that can be queried like a table.
-- Preview the first 10 rows; the explicit SELECT list avoids relying on the
-- FROM-first shorthand, which is not accepted by every runtime version.
SELECT *
FROM read_files(
  'dbfs:/Volumes/workspace/learning/learn/nyctaxi_yellow_parquet/',
  format => 'parquet'
)
LIMIT 10



In [0]:
%sql
-- CREATE TABLE AS SELECT (CTAS): create the table and populate it in one
-- statement from the Parquet files in the volume.

-- Drop any previous copy so that the cell can be re-run from scratch.
DROP TABLE IF EXISTS nyctaxi_yellow;

-- The table schema is taken from the query result; SELECT * is intentional
-- here because the goal is to ingest the files as they are.
CREATE TABLE nyctaxi_yellow AS
SELECT *
FROM read_files(
  'dbfs:/Volumes/workspace/learning/learn/nyctaxi_yellow_parquet/',
  format => 'parquet'
);

-- Preview the newly created table.
SELECT *
FROM nyctaxi_yellow
LIMIT 10

In [0]:
%sql
-- Show the columns and the extended table metadata of learning.nyctaxi_yellow
DESCRIBE TABLE EXTENDED learning.nyctaxi_yellow;

In [0]:
# CREATE TABLE AS in Python

# Read the Parquet files from the volume.
# Note the single slash after "dbfs:": with "dbfs://Volumes/..." the segment
# "Volumes" is parsed as the URI authority instead of the first path component,
# so the path would not point at the volume directory.
df = (spark
      .read
      .format("parquet")
      .load('dbfs:/Volumes/workspace/learning/learn/nyctaxi_yellow_parquet')
      )

# Write the DataFrame to a table; "overwrite" replaces any existing contents,
# so the cell can be re-run safely.
(df
 .write
 .mode("overwrite")
 .saveAsTable("learning.nyctaxi_yellow_parquet_python")
 )

# Read the table back and display it
nyctaxi_bronze_table = spark.table("learning.nyctaxi_yellow_parquet_python")
nyctaxi_bronze_table.display()


### COPY INTO
- Incremental batch
- Idempotency

In [0]:
%sql
-- Start from a clean slate so that the initial load below is reproducible.
DROP TABLE IF EXISTS nyctaxi_yellow_delta_ci;

-- Target table for COPY INTO, declared with an explicit starting schema.
CREATE TABLE nyctaxi_yellow_delta_ci (
  vendor_id        STRING,
  pickup_datetime  TIMESTAMP,
  dropoff_datetime TIMESTAMP,
  passenger_count  INT,
  trip_distance    DOUBLE
);

-- Initial load of the Parquet files from the volume.
-- mergeSchema lets the target schema evolve when the files carry columns
-- beyond the ones declared above (presumably the case here; confirm against
-- the Parquet schema).
COPY INTO nyctaxi_yellow_delta_ci
  FROM 'dbfs:/Volumes/workspace/learning/learn/nyctaxi_yellow_parquet/'
  FILEFORMAT = parquet
  COPY_OPTIONS ('mergeSchema' = 'true')

In [0]:
%sql
-- Same COPY INTO as in the previous cell, re-run to demonstrate idempotency:
-- the source files were already loaded into the table, so this run is
-- expected to add no new rows.
COPY INTO nyctaxi_yellow_delta_ci
FROM 'dbfs:/Volumes/workspace/learning/learn/nyctaxi_yellow_parquet/'
FILEFORMAT = parquet
COPY_OPTIONS ('mergeSchema' = 'true')