
## Setting up the project environment for the GizmoBox Data Lakehouse

1. Access the container gizmobox


In [0]:
%fs ls 'abfss://gizmobox@deacourseextdld.dfs.core.windows.net/Landing'

In [0]:
%sql
-- Register the root of the gizmobox container as an external location that is
-- accessed through the dea_course_ext_sc storage credential.
-- IF NOT EXISTS keeps the cell safe to re-run.
-- Fix: the statement terminator used to sit inside the COMMENT literal, so it
-- became part of the stored description and the statement was unterminated.
CREATE EXTERNAL LOCATION IF NOT EXISTS dea_course_ext_dl_gizmobox
  URL 'abfss://gizmobox@deacourseextdld.dfs.core.windows.net/'
  WITH (STORAGE CREDENTIAL dea_course_ext_sc)
  COMMENT 'External location for Gizmo Box data lakehouse';

### 3. Create the GizmoBox Catalog



In [0]:
%sql

-- List the available catalogs.
SHOW CATALOGS;

In [0]:
%sql

-- Create the project catalog; IF NOT EXISTS makes this a no-op on re-runs.
-- MANAGED LOCATION is optional: when it is omitted, the catalog's managed data
-- is stored under the metastore's root storage location instead.
CREATE CATALOG IF NOT EXISTS gizmobox
  MANAGED LOCATION 'abfss://gizmobox@deacourseextdld.dfs.core.windows.net/'
  COMMENT 'This is the catalog for Gizmo Box';


#### 4. Create Schemas

1. Landing
2. Bronze
3. Silver
4. Gold

In [0]:
%sql
-- Show which catalog the session is currently using.
SELECT
  current_catalog();

In [0]:
%sql
-- Make gizmobox the current catalog for the session ...
USE CATALOG gizmobox;

-- ... then list the schemas it contains.
SHOW SCHEMAS;

In [0]:
%sql
-- Create the four schemas, each with its own managed location under the
-- gizmobox container. Every statement is idempotent (IF NOT EXISTS).
-- The schemas are created in the session's current catalog.

-- Landing
CREATE SCHEMA IF NOT EXISTS Landing
    COMMENT 'This is the landing schema for Gizmo Box'
    MANAGED LOCATION 'abfss://gizmobox@deacourseextdld.dfs.core.windows.net/Landing';

-- Bronze
CREATE SCHEMA IF NOT EXISTS bronze
    COMMENT 'This is the bronze schema for Gizmo Box'
    MANAGED LOCATION 'abfss://gizmobox@deacourseextdld.dfs.core.windows.net/bronze';

-- Silver
CREATE SCHEMA IF NOT EXISTS silver
    COMMENT 'This is the silver schema for Gizmo Box'
    MANAGED LOCATION 'abfss://gizmobox@deacourseextdld.dfs.core.windows.net/silver';

-- Gold
CREATE SCHEMA IF NOT EXISTS gold
    COMMENT 'This is the gold schema for Gizmo Box'
    MANAGED LOCATION 'abfss://gizmobox@deacourseextdld.dfs.core.windows.net/gold';

-- Confirm the result.
SHOW SCHEMAS;

### 5. Create Volume


In [0]:
%sql
-- Switch to gizmobox.landing so the volume can be created by its short name.
USE CATALOG gizmobox;
USE SCHEMA landing;

-- External volume over the Landing/operational_data folder of the container.
CREATE EXTERNAL VOLUME IF NOT EXISTS operational_data
    COMMENT 'This is the operational data volume for Gizmo Box'
    LOCATION 'abfss://gizmobox@deacourseextdld.dfs.core.windows.net/Landing/operational_data';





In [0]:
%fs ls /Volumes/gizmobox/landing/operational_data

Querying the data 

In [0]:
%sql
-- Query a single JSON file directly by its path in the volume.
SELECT *
FROM json.`/Volumes/gizmobox/landing/operational_data/customers/customers_2024_10.json`;

In [0]:
%sql
-- Query several files at once by using a wildcard in the file name.
SELECT *
FROM json.`/Volumes/gizmobox/landing/operational_data/customers/customers_2024_*.json`;

In [0]:
%sql
-- Query the entire customers folder by pointing at the directory.
SELECT *
FROM JSON.`/Volumes/gizmobox/landing/operational_data/customers`;

In [0]:
%sql

-- Data auditing: expose _metadata.file_path as the first column so each record
-- can be traced to the file it came from.
SELECT
  _metadata.file_path AS file_path,
  *
FROM json.`/Volumes/gizmobox/landing/operational_data/customers`;

#### 6. Register Files in Unity Catalog using Views


In [0]:
%sql
-- Work inside gizmobox.bronze so the view below is created in that schema.
USE CATALOG gizmobox;
USE SCHEMA bronze;

-- View over the customer JSON files; file_path carries the source file of
-- each record.
CREATE OR REPLACE VIEW v_customers AS
SELECT
  *,
  _metadata.file_path AS file_path
FROM json.`/Volumes/gizmobox/landing/operational_data/customers`;
    


In [0]:
%sql
-- Read the customers view through its fully qualified name.
SELECT *
FROM gizmobox.bronze.v_customers;

### Creating Temporary View

A temporary view is scoped to the current session: once the session ends, the temporary view is gone.

### Global Temporary View

A global temporary view is scoped to the cluster: it exists for as long as the cluster is running, is queried through the `global_temp` schema, and is removed when the cluster is stopped.

In [0]:
%sql
-- Temporary view over the customer JSON files, including the source file path
-- of each record.
CREATE OR REPLACE TEMPORARY VIEW tv_customers AS
SELECT
  *,
  _metadata.file_path AS file_path
FROM json.`/Volumes/gizmobox/landing/operational_data/customers`;
    


In [0]:
%sql
-- Read the temporary view; it is referenced without a catalog or schema prefix.
SELECT *
FROM tv_customers;

In [0]:
%sql
-- Global temporary view over the customer JSON files, including the source
-- file path of each record.
CREATE OR REPLACE GLOBAL TEMPORARY VIEW gtv_customers AS
SELECT
  *,
  _metadata.file_path AS file_path
FROM json.`/Volumes/gizmobox/landing/operational_data/customers`;

In [0]:
%sql
-- Read the global temporary view through the global_temp schema.
SELECT *
FROM global_temp.gtv_customers;

Handling data quality issues in the orders files: query them as JSON first, then as raw text so that custom JSON parsing can be applied.

In [0]:
%sql
-- 1. Query the orders folder using the JSON format.
SELECT *
FROM json.`/Volumes/gizmobox/landing/operational_data/orders`;

In [0]:
%sql

-- Query the same orders folder using the TEXT format instead of JSON.
SELECT *
FROM text.`/Volumes/gizmobox/landing/operational_data/orders`;
    


In [0]:
%sql

-- Create the orders view in the bronze schema on top of the TEXT-format read
-- of the orders folder.
CREATE OR REPLACE VIEW gizmobox.bronze.v_orders AS
SELECT *
FROM text.`/Volumes/gizmobox/landing/operational_data/orders`;

In [0]:
%sql
-- Read the orders view through its fully qualified name.
SELECT *
FROM gizmobox.bronze.v_orders;

Working with unstructured data: use the `binaryFile` format


In [0]:
%fs ls '/Volumes/gizmobox/landing/operational_data/memberships'

In [0]:
%sql

-- Read the membership PNG files (one sub-folder level below memberships) with
-- the binaryFile format.
SELECT *
FROM binaryFile.`/Volumes/gizmobox/landing/operational_data/memberships/*/*.png`;

In [0]:
%sql
-- Bronze view over the membership PNG files read with the binaryFile format.
CREATE OR REPLACE VIEW gizmobox.bronze.v_memberships AS
SELECT *
FROM binaryFile.`/Volumes/gizmobox/landing/operational_data/memberships/*/*.png`;

In [0]:
%sql
-- Read the memberships view through its fully qualified name, so the query
-- does not depend on whichever catalog/schema an earlier cell left as the
-- session's current one (other cells switch between landing and bronze).
SELECT *
FROM gizmobox.bronze.v_memberships;

Querying delimited text files (CSV/TSV)

In [0]:
%sql

-- Query the addresses folder with the plain csv format specifier.
SELECT *
FROM csv.`/Volumes/gizmobox/landing/operational_data/addresses`;

As you can see, because the file is tab-separated (TSV), using `csv` as the format specifier returns each whole line in a single column. To avoid this, use the `read_files` function, which lets you specify the delimiter.

In [0]:
%sql
-- Bronze view over the addresses files. read_files is used so that the tab
-- delimiter and the header row can be specified explicitly.
CREATE OR REPLACE VIEW gizmobox.bronze.v_addresses AS
SELECT *
FROM read_files(
  '/Volumes/gizmobox/landing/operational_data/addresses',
  format    => 'csv',
  delimiter => '\t',
  header    => true
);

In [0]:
%sql
-- Read the addresses view through its fully qualified name.
SELECT *
FROM gizmobox.bronze.v_addresses;

### Extracting Data From the Payments Files

##### Listing the files in the payments folder

In [0]:

%fs ls 'abfss://gizmobox@deacourseextdld.dfs.core.windows.net/Landing/external_data/payments'

In [0]:
%sql

-- 2. Create External Table
--
-- Defines gizmobox.bronze.payments over the CSV files in the payments folder.
-- The LOCATION clause is what marks the table as external; without it the
-- table would be created as a managed table.
--
-- Fix: the CSV option key was misspelled "delimeter". The recognised key is
-- "delimiter" (alias "sep"). The value is still a comma, so the way the files
-- are parsed does not change.
CREATE TABLE IF NOT EXISTS gizmobox.bronze.payments (
  payment_id        INTEGER,
  order_id          INTEGER,
  payment_timestamp TIMESTAMP,
  payment_status    INTEGER,
  payment_method    STRING
)
USING CSV
OPTIONS (header = "true", delimiter = ",")
LOCATION 'abfss://gizmobox@deacourseextdld.dfs.core.windows.net/Landing/external_data/payments';




In [0]:
%sql
-- Read the payments external table.
SELECT *
FROM gizmobox.bronze.payments;

In [0]:
%sql
-- Show the extended description of the payments table.
DESCRIBE EXTENDED gizmobox.bronze.payments;

### Demonstrate the effect of adding/updating/deleting files

In [0]:
%sql
-- Re-query after a file was deleted directly in Azure. To avoid problems with
-- the removed file, the table needs to be refreshed.
SELECT *
FROM gizmobox.bronze.payments;

In [0]:
%sql
-- Always run this whenever the underlying files have been changed externally.
REFRESH TABLE gizmobox.bronze.payments;

#### 4. Demonstrate the effect of Dropping the Table

In [0]:
%sql
-- Because payments is an external table, dropping it removes only the table
-- metadata in Unity Catalog; the files in Azure are not deleted.
DROP TABLE IF EXISTS gizmobox.bronze.payments;