In [0]:
%sql
-- Preview every column of the raw CRM customer table in the bronze layer.
SELECT *
FROM bronze.crm_cust_info

In [0]:
-- Check for NULLs or duplicates in the primary key.
-- Expected: no rows returned.

In [0]:
%sql
-- Primary-key quality check on cst_id: returns every cst_id that occurs more
-- than once, plus the NULL group if any row has no cst_id.
-- A clean key returns no rows.
SELECT
    cst_id,
    COUNT(*)
FROM bronze.crm_cust_info
GROUP BY cst_id
HAVING COUNT(*) > 1
    OR cst_id IS NULL

In [0]:
-- Data transformation and cleansing query:
-- remove duplicates on the primary key
-- trim unwanted spaces
-- data standardization & consistency

In [0]:
%sql
-- Keep one row per cst_id: the row with the latest cst_create_date.
-- flag_last numbers each customer's rows from newest (1) to oldest; rows that
-- share the same cst_create_date are numbered in no guaranteed order.
WITH ranked_customers AS (
    SELECT
        *,
        ROW_NUMBER() OVER (
            PARTITION BY cst_id
            ORDER BY cst_create_date DESC
        ) AS flag_last
    FROM bronze.crm_cust_info
)
SELECT *
FROM ranked_customers
WHERE flag_last = 1

In [0]:
%sql
-- Load cleansed customers from bronze.crm_cust_info into silver.crm_cust_info.
--   * keeps one row per cst_id: the row with the latest cst_create_date
--   * drops rows whose cst_id is NULL
--   * trims leading/trailing spaces from first and last names
--   * expands marital-status and gender codes into readable labels; the codes
--     are trimmed as well as upper-cased so padded values such as 'M ' or ' f'
--     are recognised instead of falling through to 'n/a'
-- NOTE(review): this is a plain INSERT, so re-running the cell appends the
-- same rows again unless the target is emptied first -- confirm the intended
-- load pattern.
INSERT INTO silver.crm_cust_info (
    cst_id,
    cst_key,
    cst_firstname,
    cst_lastname,
    cst_marital_status,
    cst_gndr,
    cst_create_date
)
SELECT
    cst_id,
    cst_key,
    TRIM(cst_firstname) AS cst_firstname,
    TRIM(cst_lastname) AS cst_lastname,
    -- Normalize marital status codes to a readable format.
    CASE UPPER(TRIM(cst_marital_status))
        WHEN 'S' THEN 'Single'
        WHEN 'M' THEN 'Married'
        ELSE 'n/a' -- missing or unrecognised code
    END AS cst_marital_status,
    -- Normalize gender codes to a readable format.
    CASE UPPER(TRIM(cst_gndr))
        WHEN 'F' THEN 'Female'
        WHEN 'M' THEN 'Male'
        ELSE 'n/a' -- missing or unrecognised code
    END AS cst_gndr,
    cst_create_date
FROM (
    SELECT
        *,
        -- 1 = most recent record for this customer
        ROW_NUMBER() OVER (
            PARTITION BY cst_id
            ORDER BY cst_create_date DESC
        ) AS flag_last
    FROM bronze.crm_cust_info
) AS ranked_customers
WHERE flag_last = 1        -- most recent record per customer (removes duplicates)
  AND cst_id IS NOT NULL   -- discard rows without a primary key

In [0]:
%sql
-- Inspect the cleansed customer rows now stored in the silver layer.
SELECT *
FROM silver.crm_cust_info

In [0]:
%sql
-- Preview every column of the raw CRM product table in the bronze layer.
SELECT *
FROM bronze.crm_prd_info

In [0]:
%sql
-- prd_id is the primary key of the product table.
-- Quality check: returns every prd_id that occurs more than once, plus the
-- NULL group if any row has no prd_id. A clean key returns no rows.
SELECT
    prd_id,
    COUNT(*)
FROM bronze.crm_prd_info
GROUP BY prd_id
HAVING COUNT(*) > 1
    OR prd_id IS NULL

In [0]:
%sql
-- Check for unwanted spaces: lists products whose name has leading or
-- trailing whitespace (prd_nm differs from its trimmed form).
-- A clean column returns no rows.
-- The language magic has to be the first line of the cell, so this comment
-- sits below it.
--   cat_id  : first 5 characters of the raw key, with '-' replaced by '_'
--   prd_key : raw key from character 7 onward, with '-' replaced by '_'
--             (the raw prd_key is selected as well, for comparison)
SELECT
    prd_id,
    prd_key,
    REPLACE(SUBSTRING(prd_key, 1, 5), '-', '_') AS cat_id,
    REPLACE(SUBSTRING(prd_key, 7, LENGTH(prd_key)), '-', '_') AS prd_key,
    prd_nm,
    prd_cost,
    prd_line,
    prd_start_dt,
    prd_end_dt
FROM bronze.crm_prd_info
WHERE prd_nm <> TRIM(prd_nm)

In [0]:
%sql
-- Check product cost for NULL, zero, or negative values.
-- Expected: no result.
--   cat_id  : first 5 characters of the raw key, with '-' replaced by '_'
--   prd_key : raw key from character 7 onward, with '-' replaced by '_'
SELECT
    prd_id,
    prd_key,
    REPLACE(SUBSTRING(prd_key, 1, 5), '-', '_') AS cat_id,
    REPLACE(SUBSTRING(prd_key, 7, LENGTH(prd_key)), '-', '_') AS prd_key,
    prd_nm,
    prd_cost,
    prd_line,
    prd_start_dt,
    prd_end_dt
FROM bronze.crm_prd_info
WHERE prd_cost IS NULL
   OR prd_cost <= 0
-- NULL costs are present; per the business rule they are replaced with 0.
    

In [0]:
-- Data standardization & consistency: preview the cleansed columns for the
-- rows whose cost is NULL, zero, or negative.
--   * prd_cost : NULL replaced by 0
--   * prd_line : single-letter code (trimmed, upper-cased) expanded to a
--                readable label; anything else becomes 'n/a'
-- The 'S' label is written 'Other Sales' so its capitalisation matches the
-- other labels.
SELECT
    prd_id,
    prd_key,
    REPLACE(SUBSTRING(prd_key, 1, 5), '-', '_') AS cat_id,
    REPLACE(SUBSTRING(prd_key, 7, LENGTH(prd_key)), '-', '_') AS prd_key,
    prd_nm,
    COALESCE(prd_cost, 0) AS prd_cost,
    CASE UPPER(TRIM(prd_line))
        WHEN 'M' THEN 'Mountain'
        WHEN 'R' THEN 'Road'
        WHEN 'S' THEN 'Other Sales'
        WHEN 'T' THEN 'Touring'
        ELSE 'n/a'
    END AS prd_line,
    prd_start_dt,
    prd_end_dt
FROM bronze.crm_prd_info
WHERE prd_cost <= 0
   OR prd_cost IS NULL


In [0]:
-- Check for invalid date order on two sample product keys.
--   * the start date should come before the end date
--   * the end date should come before the start date of the next record for
--     the same key
-- prd_end_dt_test is the candidate replacement: the next record's start date
-- (same prd_key, ordered by prd_start_dt) minus 1; it is NULL for the last
-- record of a key.
-- NOTE(review): "- 1" is meant as "one day earlier", which assumes
-- prd_start_dt is a DATE -- TODO confirm.
SELECT
    prd_id,
    prd_key,
    prd_nm,
    prd_start_dt,
    prd_end_dt,
    LEAD(prd_start_dt) OVER (
        PARTITION BY prd_key
        ORDER BY prd_start_dt
    ) - 1 AS prd_end_dt_test
FROM bronze.crm_prd_info
WHERE prd_key IN ('AC-HE-HL-U509-R', 'AC-HE-HL-U509')


In [0]:
-- Preview the complete product transformation, including the corrected end
-- date, before loading it into silver.
--   * cat_id     : first 5 characters of the raw key, '-' replaced by '_'
--   * prd_key    : raw key from character 7 onward, '-' replaced by '_'
--                  (the raw prd_key is selected as well, for comparison)
--   * prd_cost   : NULL replaced by 0
--   * prd_line   : single-letter code expanded to a readable label
--                  ('Other Sales' is capitalised to match the other labels)
--   * prd_end_dt : next record's start date for the same raw prd_key minus 1;
--                  NULL for the last record of a key
-- NOTE(review): "- 1" is meant as "one day earlier", which assumes
-- prd_start_dt is a DATE -- TODO confirm.
SELECT
    prd_id,
    prd_key,
    REPLACE(SUBSTRING(prd_key, 1, 5), '-', '_') AS cat_id,
    REPLACE(SUBSTRING(prd_key, 7, LENGTH(prd_key)), '-', '_') AS prd_key,
    prd_nm,
    COALESCE(prd_cost, 0) AS prd_cost,
    CASE UPPER(TRIM(prd_line))
        WHEN 'M' THEN 'Mountain'
        WHEN 'R' THEN 'Road'
        WHEN 'S' THEN 'Other Sales'
        WHEN 'T' THEN 'Touring'
        ELSE 'n/a'
    END AS prd_line,
    prd_start_dt,
    LEAD(prd_start_dt) OVER (
        PARTITION BY prd_key
        ORDER BY prd_start_dt
    ) - 1 AS prd_end_dt
FROM bronze.crm_prd_info



In [0]:
-- Modify the table definition of silver.crm_prd_info so it can hold the
-- transformed product data: cat_id is added as its own column next to prd_key.
-- CREATE OR REPLACE replaces any existing definition and contents of the table.
-- NOTE(review): prd_cost is INT, so fractional costs cannot be stored, and the
-- start/end columns are TIMESTAMP although the load derives the end value with
-- "- 1" arithmetic -- confirm these types are intended.
-- NOTE(review): dwh_creat_date looks like a typo for dwh_create_date -- confirm
-- before other objects depend on the name.
CREATE OR REPLACE TABLE silver.crm_prd_info (
  prd_id INT,
  cat_id VARCHAR(50),
  prd_key VARCHAR(50),
  prd_nm VARCHAR(50),
  prd_cost INT,
  prd_line VARCHAR(50),
  prd_start_dt TIMESTAMP,
  prd_end_dt TIMESTAMP,
  -- Filled with the insert time when the column is omitted from an INSERT;
  -- the DEFAULT clause relies on the allowColumnDefaults feature set below.
  dwh_creat_date TIMESTAMP DEFAULT current_timestamp())
USING delta
-- Delta table features and protocol versions for this table.
TBLPROPERTIES (
  'delta.columnMapping.mode' = 'name',
  'delta.enableDeletionVectors' = 'true',
  'delta.feature.allowColumnDefaults' = 'supported',
  'delta.feature.appendOnly' = 'supported',
  'delta.feature.changeDataFeed' = 'supported',
  'delta.feature.checkConstraints' = 'supported',
  'delta.feature.columnMapping' = 'supported',
  'delta.feature.deletionVectors' = 'supported',
  'delta.feature.generatedColumns' = 'supported',
  'delta.feature.invariants' = 'supported',
  'delta.minReaderVersion' = '3',
  'delta.minWriterVersion' = '7')


In [0]:
-- Load cleansed products from bronze.crm_prd_info into silver.crm_prd_info.
--   * cat_id     : first 5 characters of the raw key, '-' replaced by '_'
--   * prd_key    : raw key from character 7 onward, '-' replaced by '_'
--   * prd_cost   : NULL replaced by 0
--   * prd_line   : single-letter code (trimmed, upper-cased) expanded to a
--                  readable label, 'n/a' otherwise; the 'S' label is written
--                  'Other Sales' so its capitalisation matches the other labels
--   * prd_end_dt : next record's start date for the same raw prd_key minus 1;
--                  NULL for the last record of a key
-- dwh_creat_date is not listed, so it takes the column default.
-- NOTE(review): "- 1" is meant as "one day earlier", which assumes
-- prd_start_dt is a DATE -- TODO confirm.
-- NOTE(review): hyphens inside the product key are converted to underscores
-- too; any table joined on this key must use the same format -- confirm.
INSERT INTO silver.crm_prd_info (
    prd_id,
    cat_id,
    prd_key,
    prd_nm,
    prd_cost,
    prd_line,
    prd_start_dt,
    prd_end_dt
)
SELECT
    prd_id,
    REPLACE(SUBSTRING(prd_key, 1, 5), '-', '_') AS cat_id,
    REPLACE(SUBSTRING(prd_key, 7, LENGTH(prd_key)), '-', '_') AS prd_key,
    prd_nm,
    COALESCE(prd_cost, 0) AS prd_cost,
    CASE UPPER(TRIM(prd_line))
        WHEN 'M' THEN 'Mountain'
        WHEN 'R' THEN 'Road'
        WHEN 'S' THEN 'Other Sales'
        WHEN 'T' THEN 'Touring'
        ELSE 'n/a'
    END AS prd_line,
    prd_start_dt,
    LEAD(prd_start_dt) OVER (
        PARTITION BY prd_key
        ORDER BY prd_start_dt
    ) - 1 AS prd_end_dt
FROM bronze.crm_prd_info



In [0]:
-- Inspect the cleansed product rows now stored in the silver layer.
SELECT *
FROM silver.crm_prd_info