In [0]:
%load_ext autoreload
%autoreload 2

###### Adding the current working directory to `sys.path` so the `utils` helper functions can be imported in the notebook

In [0]:
import os
import sys

In [0]:
# Put the notebook's working directory on sys.path so that modules resolved
# relative to it can be imported (presumably the `utils` package imported in a
# later cell lives here — confirm against the repo layout).
curr_dir = os.getcwd()

# Only add the directory when it is missing: an unconditional append would
# push a duplicate entry onto sys.path every time this cell is re-run.
if curr_dir not in sys.path:
    sys.path.append(curr_dir)

In [0]:
from typing import List
from pyspark.sql import DataFrame
from pyspark.sql.window import Window

#### **CUSTOMERS**

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Read the bronze-layer customers table; later cells keep reassigning df_cust.
df_cust = (
    spark.read
    .table('pyspark_dbt.bronze.customers')
)

In [0]:
# Render a 10-row sample rather than the whole table.
cust_preview = df_cust.limit(10)
display(cust_preview)

###### Getting the customer `email` domain

In [0]:
# Split the address on '@' and keep element 1 of the resulting array,
# i.e. the segment that follows the first '@'.
email_parts = split('email', '@')
df_cust = df_cust.withColumn('email_domain', email_parts[1])

display(df_cust.limit(10))

###### Cleaning the `phone_number` column

In [0]:
# Keep digits only: every character outside 0-9 is replaced with an empty string.
digits_only = regexp_replace('phone_number', '[^0-9]', '')
df_cust = df_cust.withColumn('phone_number', digits_only)

display(df_cust.limit(10))

###### Concatenating `first_name` and `last_name` into `full_name`, then dropping the two source columns

In [0]:
# Build full_name by joining first_name and last_name with a single space,
# then remove the two source columns in the same chain.
df_cust = (
    df_cust
    .withColumn('full_name', concat_ws(' ', 'first_name', 'last_name'))
    .drop('first_name', 'last_name')
)
display(df_cust.limit(10))

In [0]:
from utils.custom_utils import Transformations

In [0]:
# Shared helper object from utils.custom_utils; the cells below reuse it.
transformer = Transformations()

# De-duplicate on customer_id, passing last_updated_timestamp as the ordering column.
# NOTE(review): which row is kept per key is decided inside
# Transformations.de_duplication — confirm against that helper.
df_cust = transformer.de_duplication(df=df_cust, columns=["customer_id"], order_by="last_updated_timestamp")


In [0]:
# Preview only the first 10 rows of the de-duplicated frame, matching the other
# preview cells in this notebook, instead of rendering the full DataFrame.
display(df_cust.limit(10))

In [0]:
# Hand the frame to the shared timestamp helper.
# NOTE(review): what process_timestamp adds or changes is not visible in this
# notebook — see utils.custom_utils.
df_cust = transformer.process_timestamp(df_cust)

timestamped_preview = df_cust.limit(10)
display(timestamped_preview)

###### UPSERT logic

In [0]:
from pyspark.sql import SparkSession

# Build a session only when the runtime has not already provided `spark`.
if 'spark' not in globals():
    spark = SparkSession.builder.getOrCreate()

silver_table_exists = spark.catalog.tableExists('pyspark_dbt.silver.customers')

if silver_table_exists:
    # Table already present: delegate to the helper's upsert, passing
    # customer_id as the key list and last_updated_timestamp as the last argument.
    # NOTE(review): merge semantics live in Transformations.upsert — confirm there.
    transformer.upsert(
        spark,
        df_cust,
        ["customer_id"],
        "customers",
        "last_updated_timestamp",
    )
else:
    # Table missing: write the frame out as a Delta table under that name.
    (
        df_cust.write
        .format('delta')
        .mode('append')
        .saveAsTable('pyspark_dbt.silver.customers')
    )



In [0]:
%sql
-- Row count of the silver customers table, as a quick check after the write/upsert cell.
select count(*) from pyspark_dbt.silver.customers