### 1. Profile Data Using the UI

In [0]:
%sql
-- Return every column of the bronze customers view so the notebook result
-- pane can be used to profile the data.
SELECT *
FROM gizmobox.bronze.v_customers;

Databricks data profile. Run in Databricks to view.

### 2. Profile Data using DBUTILS Package

In [0]:
%python

# Load the bronze customers view as a DataFrame, then hand it to the
# dbutils data summary helper.
df = spark.table("gizmobox.bronze.v_customers")
dbutils.data.summarize(df)

### 3. Profile Data Manually

In [0]:
%sql
-- Total row count next to the non-NULL count of each column of interest;
-- count(col) skips NULLs, so any gap versus count(*) is the NULL count.
SELECT
    count(*),
    count(customer_id),
    count(email),
    count(telephone)
FROM gizmobox.bronze.v_customers;

In [0]:
%sql
-- Total row count next to the number of NULLs in each column of interest.
SELECT
    count(*),
    count_if(customer_id IS NULL),
    count_if(email IS NULL),
    count_if(telephone IS NULL)
FROM gizmobox.bronze.v_customers;

In [0]:
%sql
-- Number of rows that carry no customer_id.
SELECT count(*)
FROM gizmobox.bronze.v_customers
WHERE customer_id IS NULL;

In [0]:
%sql

-- Duplicate check: among rows that have a customer_id, compare the row count
-- with the number of distinct ids. A difference means some ids repeat.
SELECT
    count(*) AS total_no_of_records,
    count(DISTINCT customer_id) AS unique_customer_id
FROM gizmobox.bronze.v_customers
WHERE customer_id IS NOT NULL;

### Transform Customer Data


In [0]:
%sql
-- All customer rows that have a customer_id.
SELECT *
FROM gizmobox.bronze.v_customers
WHERE customer_id IS NOT NULL;

In [0]:
%sql
-- Same rows as above, sorted by customer_id so repeated ids sit together.
SELECT *
FROM gizmobox.bronze.v_customers
WHERE customer_id IS NOT NULL
ORDER BY customer_id;

In [0]:
%sql
-- Collapse rows that are identical in every column, keep only rows with a
-- customer_id, and sort by customer_id.
SELECT DISTINCT *
FROM gizmobox.bronze.v_customers
WHERE customer_id IS NOT NULL
ORDER BY customer_id;

In [0]:
%sql
-- One row per customer_id, taking the MAX of each remaining column.
-- The original aggregated max(customer_id), which is always equal to the
-- grouping key; customer_name is aggregated instead.
-- NOTE(review): each MAX is computed independently, so the values in a single
-- output row may originate from different source rows of that customer.
SELECT
    customer_id,
    max(created_timestamp) AS max_created_timestamp,
    max(customer_name) AS max_customer_name,
    max(date_of_birth) AS max_date_of_birth,
    max(email) AS max_email
FROM gizmobox.bronze.v_customers
WHERE customer_id IS NOT NULL
GROUP BY customer_id
ORDER BY customer_id;

In [0]:
%sql
-- Temporary view over the bronze customers with NULL customer_ids removed and
-- fully identical rows collapsed.
-- No ORDER BY in the definition: a view does not promise row order to the
-- queries built on top of it, so a sort here is only extra work. Any query
-- that needs ordered output should apply its own ORDER BY.
CREATE OR REPLACE TEMPORARY VIEW v_customers_distinct AS
SELECT DISTINCT *
FROM gizmobox.bronze.v_customers
WHERE customer_id IS NOT NULL;

In [0]:
%sql
-- Latest created_timestamp recorded for each customer.
SELECT
    customer_id,
    max(created_timestamp) AS max_created_timestamp
FROM v_customers_distinct
GROUP BY customer_id;

In [0]:
%sql
-- For each customer keep only the row(s) whose created_timestamp equals that
-- customer's latest created_timestamp.
WITH latest_per_customer AS (
    -- Latest created_timestamp per customer_id.
    SELECT
        customer_id,
        max(created_timestamp) AS max_created_timestamp
    FROM v_customers_distinct
    GROUP BY customer_id
)
SELECT c.*
FROM v_customers_distinct AS c
INNER JOIN latest_per_customer AS l
    ON c.customer_id = l.customer_id
    AND c.created_timestamp = l.max_created_timestamp;



### 4. Cast the Column Values to the Correct Data Types

In [0]:
%sql
-- Latest row(s) per customer, with created_timestamp cast to TIMESTAMP and
-- date_of_birth / member_since cast to DATE.
WITH latest_per_customer AS (
    -- Latest created_timestamp per customer_id.
    SELECT
        customer_id,
        max(created_timestamp) AS max_created_timestamp
    FROM v_customers_distinct
    GROUP BY customer_id
)
SELECT
    cast(c.created_timestamp AS TIMESTAMP) AS created_timestamp,
    c.customer_id,
    c.customer_name,
    cast(c.date_of_birth AS DATE) AS date_of_birth,
    c.email,
    cast(c.member_since AS DATE) AS member_since,
    c.telephone
FROM v_customers_distinct AS c
INNER JOIN latest_per_customer AS l
    ON c.customer_id = l.customer_id
    AND c.created_timestamp = l.max_created_timestamp;

### 5. Write data to Delta Table

In [0]:
%sql
-- Materialize the latest row(s) per customer, with typed columns, as the
-- silver customers table.
-- OR REPLACE lets this cell be re-run: a plain CREATE TABLE fails once the
-- table already exists.
CREATE OR REPLACE TABLE gizmobox.silver.customers
AS
WITH latest_per_customer AS (
    -- Latest created_timestamp per customer_id.
    SELECT
        customer_id,
        max(created_timestamp) AS max_created_timestamp
    FROM v_customers_distinct
    GROUP BY customer_id
)
SELECT
    cast(c.created_timestamp AS TIMESTAMP) AS created_timestamp,
    c.customer_id,
    c.customer_name,
    cast(c.date_of_birth AS DATE) AS date_of_birth,
    c.email,
    cast(c.member_since AS DATE) AS member_since,
    c.telephone
FROM v_customers_distinct AS c
INNER JOIN latest_per_customer AS l
    ON c.customer_id = l.customer_id
    AND c.created_timestamp = l.max_created_timestamp;

In [0]:
%sql
-- Inspect the contents of the silver customers table.
SELECT *
FROM gizmobox.silver.customers;

In [0]:
%sql
-- Show the detailed description of the silver customers table.
-- The %sql magic is required for this cell to run as SQL when the notebook's
-- default language is not SQL; every other SQL cell already carries it.
DESCRIBE EXTENDED gizmobox.silver.customers;

### Transform Payments Data

In [0]:
%sql

-- Extract the date and the time of day from payment_timestamp:
--   payment_date: the 'yyyy-MM-dd' rendering cast to DATE
--   payment_time: the 'HH:mm:ss' rendering, left as a string
SELECT
    payment_id,
    order_id,
    payment_timestamp,
    cast(date_format(payment_timestamp, 'yyyy-MM-dd') AS DATE) AS payment_date,
    date_format(payment_timestamp, 'HH:mm:ss') AS payment_time,
    payment_status,
    payment_method
FROM gizmobox.bronze.payments;

In [0]:
%sql
-- Payments with the timestamp split into date and time, and the numeric
-- payment_status code translated to a label. Codes other than 1-4 yield NULL
-- because the CASE has no ELSE branch.
SELECT
    payment_id,
    order_id,
    cast(date_format(payment_timestamp, 'yyyy-MM-dd') AS DATE) AS payment_date,
    date_format(payment_timestamp, 'HH:mm:ss') AS payment_time,
    CASE
        WHEN payment_status = 1 THEN 'Success'
        WHEN payment_status = 2 THEN 'Pending'
        WHEN payment_status = 3 THEN 'Cancelled'
        WHEN payment_status = 4 THEN 'Failed'
    END AS payment_status,
    payment_method
FROM gizmobox.bronze.payments;

In [0]:
%sql
-- Build (or rebuild) the silver payments table: timestamp split into date and
-- time, numeric payment_status code translated to a label. Codes other than
-- 1-4 are stored as NULL because the CASE has no ELSE branch.
CREATE OR REPLACE TABLE gizmobox.silver.payments
AS
SELECT
    payment_id,
    order_id,
    cast(date_format(payment_timestamp, 'yyyy-MM-dd') AS DATE) AS payment_date,
    date_format(payment_timestamp, 'HH:mm:ss') AS payment_time,
    CASE
        WHEN payment_status = 1 THEN 'Success'
        WHEN payment_status = 2 THEN 'Pending'
        WHEN payment_status = 3 THEN 'Cancelled'
        WHEN payment_status = 4 THEN 'Failed'
    END AS payment_status,
    payment_method
FROM gizmobox.bronze.payments;

In [0]:
%sql
-- Inspect the contents of the silver payments table.
SELECT *
FROM gizmobox.silver.payments;

### Transform Refunds Data