In [0]:
# Date parameter
# Supplied by the caller (e.g. an Azure Data Factory pipeline) through the
# `p_file_date` widget; falls back to '2022-09-10' when no value is passed.
from datetime import datetime

dbutils.widgets.text('p_file_date', '2022-09-10')
v_file_date = dbutils.widgets.get('p_file_date')

# Fail fast on a malformed value: v_file_date is interpolated into the bronze
# file path when the raw customer file is read, so a bad value would otherwise
# only surface later as a hard-to-diagnose path error (or point at an
# unintended path).
try:
    datetime.strptime(v_file_date, '%Y-%m-%d')
except ValueError as exc:
    raise ValueError(
        f"p_file_date must be a date in YYYY-MM-DD format, got {v_file_date!r}"
    ) from exc

In [0]:
# The two %run cells below execute the shared "configurations" and "common_functions" notebooks.
# Running "configurations" makes its variables available in the current notebook:
# bronze_folder_path
# silver_folder_path
# gold_folder_path
# Running "common_functions" makes its functions available in the current notebook.
# Each %run magic command must be in its own isolated code cell.

In [0]:
%run "../includes/configurations"

In [0]:
%run "../includes/common_functions"

In [0]:
# Import Libraries
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, BooleanType
from pyspark.sql.functions import substring

In [0]:
# Schema applied when reading the raw customer CSV.
# customerId, firstName and lastName are declared non-nullable; every other
# column is declared nullable.
# NOTE(review): Spark may relax nullability for file-based sources — confirm
# whether the non-nullable flags are actually enforced on read.
schema = (
    StructType()
    .add("customerId", StringType(), False)
    .add("firstName", StringType(), False)
    .add("lastName", StringType(), False)
    .add("phone", StringType(), True)
    .add("email", StringType(), True)
    .add("gender", StringType(), True)
    .add("address", StringType(), True)
    .add("is_active", BooleanType(), True)
)

In [0]:
# Load the raw customer file for the requested date from the bronze folder,
# treating the first row as a header and applying the explicit schema.
customer_df = spark.read.csv(
    f"{bronze_folder_path}/customer/customer_{v_file_date}.csv",
    header=True,
    schema=schema,
)

In [0]:
# Switch the camelCase source column names to snake_case
tmp_customer_df = (
    customer_df
    .withColumnRenamed("customerId", "customer_id")
    .withColumnRenamed("firstName", "first_name")
    .withColumnRenamed("lastName", "last_name")
)

# Render the renamed dataframe for inspection
display(tmp_customer_df)

customer_id,first_name,last_name,phone,email,gender,address,is_active
CUS50595231748,Steven,Meyer,(968) 497-6188,ewezimo@nap.am,Female,1605 Mivem Trail,False
CUS41095949824,Vernon,Bowman,(679) 452-2816,azohu@ki.mg,Male,1117 Hela Terrace,True
CUS77289220724,Louisa,Curtis,(281) 228-4979,zac@sih.gi,Male,929 Asuha Point,False
CUS55697703960,Mary,Graham,(321) 707-6736,biz@login.kz,Female,474 Bikav Square,True
CUS91382780948,Katherine,Sherman,(978) 216-6291,je@som.uz,Female,168 Ibimi Road,False
CUS36947218124,Nathan,Lyons,(787) 974-4062,ap@ru.si,Female,1769 Nadpus Manor,True
CUS15964882412,Mayme,Campbell,(562) 876-4720,ciwu@imfubu.mr,Male,131 Zioze Plaza,False
CUS26768799060,Loretta,Vargas,(226) 607-7677,ju@viguhibe.bi,Female,1662 Ebpu Square,False
CUS41482828460,Mildred,Lynch,(453) 244-5871,iheconba@vijuw.st,Female,217 Oteeb Manor,False
CUS86644713624,Evan,Rice,(680) 599-5547,rif@awiliz.sn,Male,381 Luucu Manor,True


In [0]:
# Reduce gender to its first character (e.g. "Female" -> "F", "Male" -> "M")
tmp_customer_df = tmp_customer_df.withColumn(
    'gender',
    substring('gender', pos=1, len=1),
)

# Render the transformed dataframe for inspection
display(tmp_customer_df)

customer_id,first_name,last_name,phone,email,gender,address,is_active
CUS50595231748,Steven,Meyer,(968) 497-6188,ewezimo@nap.am,F,1605 Mivem Trail,False
CUS41095949824,Vernon,Bowman,(679) 452-2816,azohu@ki.mg,M,1117 Hela Terrace,True
CUS77289220724,Louisa,Curtis,(281) 228-4979,zac@sih.gi,M,929 Asuha Point,False
CUS55697703960,Mary,Graham,(321) 707-6736,biz@login.kz,F,474 Bikav Square,True
CUS91382780948,Katherine,Sherman,(978) 216-6291,je@som.uz,F,168 Ibimi Road,False
CUS36947218124,Nathan,Lyons,(787) 974-4062,ap@ru.si,F,1769 Nadpus Manor,True
CUS15964882412,Mayme,Campbell,(562) 876-4720,ciwu@imfubu.mr,M,131 Zioze Plaza,False
CUS26768799060,Loretta,Vargas,(226) 607-7677,ju@viguhibe.bi,F,1662 Ebpu Square,False
CUS41482828460,Mildred,Lynch,(453) 244-5871,iheconba@vijuw.st,F,217 Oteeb Manor,False
CUS86644713624,Evan,Rice,(680) 599-5547,rif@awiliz.sn,M,381 Luucu Manor,True


In [0]:
# Add an insert-timestamp column to the dataframe.
# add_insert_timestamp is not defined in this notebook; presumably it comes from
# the "../includes/common_functions" notebook executed via %run — check there
# for the exact column name and timestamp semantics.
final_df = add_insert_timestamp(tmp_customer_df)

In [0]:
# Persist the result in Delta format under the silver folder.
# Overwrite mode replaces whatever was previously written at this path.
# NOTE(review): if each dated input file is an incremental batch rather than a
# full snapshot, overwriting would discard earlier loads — confirm intent.
(
    final_df.write
    .format("delta")
    .mode("overwrite")
    .save(f"{silver_folder_path}/customer")
)

In [0]:
# End the notebook run and hand "Success" back as its exit value to whatever
# invoked it (e.g. a calling notebook or pipeline). This is only reached if
# every previous cell completed without raising.
dbutils.notebook.exit("Success")