In [None]:
# Notebook parameter: the date of the file to ingest, expected as yyyy-MM-dd.
# The value is later interpolated into the source file name and into the
# Delta `replaceWhere` predicate, so it is validated here before anything runs.
from datetime import datetime

dbutils.widgets.text('p_file_date', '2022-09-10')
v_file_date = dbutils.widgets.get('p_file_date')

# Round-trip through strptime/strftime so that only a real calendar date written
# in the canonical zero-padded form is accepted (rejects e.g. '2022-9-1',
# '2022-13-40', or arbitrary text that would otherwise end up inside a SQL predicate).
try:
    _file_date_is_valid = (
        datetime.strptime(v_file_date, '%Y-%m-%d').strftime('%Y-%m-%d') == v_file_date
    )
except ValueError:
    _file_date_is_valid = False

if not _file_date_is_valid:
    raise ValueError(
        f"p_file_date must be a valid date in yyyy-MM-dd format, got {v_file_date!r}"
    )

In [None]:
%run "../includes/configurations"

In [None]:
%run "../includes/common_functions"

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, BooleanType, DateType

In [None]:
# Explicit schema for the loan transaction CSV: (column name, Spark type) in file order.
# Every column is declared non-nullable.
_loan_trx_columns = [
    ("date", DateType()),
    ("customerId", StringType()),
    ("paymentPeriod", IntegerType()),
    ("loanAmount", DoubleType()),
    ("currencyType", StringType()),
    ("evaluationChannel", StringType()),
    ("interest_rate", DoubleType()),
]

schema = StructType(
    [StructField(col_name, col_type, False) for col_name, col_type in _loan_trx_columns]
)

In [None]:
# The date format must be given explicitly when it is not the default "yyyy-MM-dd";
# here the reader is configured to parse dates as "dd/MM/yyyy".
source_csv_path = f"{bronze_folder_path}/transactions/loanTrx_{v_file_date}.csv"

loanTrx_df = (
    spark.read
    .option("header", True)
    .option("dateFormat", "dd/MM/yyyy")
    .schema(schema)
    .csv(source_csv_path)
)

In [None]:
# Render the freshly read DataFrame so the parsed columns and values can be inspected.
display(loanTrx_df)

date,customerId,paymentPeriod,loanAmount,currencyType,evaluationChannel,interest_rate
2022-09-10,CUS50595231748,48,26900.0,USD,WEB PAGE,0.335
2022-09-10,CUS41095949824,24,6000.0,USD,AGENCY,0.4
2022-09-10,CUS77289220724,24,67000.0,USD,AGENCY,0.395
2022-09-10,CUS55697703960,60,8500.0,USD,WEB PAGE,0.395
2022-09-10,CUS91382780948,48,9589.94,USD,WEB PAGE,0.395
2022-09-10,CUS36947218124,24,24261.0,USD,WEB PAGE,0.395
2022-09-10,CUS15964882412,12,8000.0,USD,AGENCY,0.85
2022-09-10,CUS26768799060,60,31900.0,USD,WEB PAGE,0.395
2022-09-10,CUS41482828460,60,7600.0,USD,WEB PAGE,0.395
2022-09-10,CUS86644713624,24,8424.0,USD,WEB PAGE,0.275


In [None]:
# This data is supposed to come from a core application, so the only
# transformation applied is renaming the camelCase columns to snake_case.
column_renames = {
    "customerId": "customer_id",
    "paymentPeriod": "payment_period",
    "loanAmount": "loan_amount",
    "currencyType": "currency_type",
    "evaluationChannel": "evaluation_channel",
}

final_df = loanTrx_df
for source_name, target_name in column_renames.items():
    final_df = final_df.withColumnRenamed(source_name, target_name)

In [None]:
# Render the renamed DataFrame to confirm the snake_case column names.
display(final_df)

date,customer_id,payment_period,loan_amount,currency_type,evaluation_channel,interest_rate
2022-09-10,CUS50595231748,48,26900.0,USD,WEB PAGE,0.335
2022-09-10,CUS41095949824,24,6000.0,USD,AGENCY,0.4
2022-09-10,CUS77289220724,24,67000.0,USD,AGENCY,0.395
2022-09-10,CUS55697703960,60,8500.0,USD,WEB PAGE,0.395
2022-09-10,CUS91382780948,48,9589.94,USD,WEB PAGE,0.395
2022-09-10,CUS36947218124,24,24261.0,USD,WEB PAGE,0.395
2022-09-10,CUS15964882412,12,8000.0,USD,AGENCY,0.85
2022-09-10,CUS26768799060,60,31900.0,USD,WEB PAGE,0.395
2022-09-10,CUS41482828460,60,7600.0,USD,WEB PAGE,0.395
2022-09-10,CUS86644713624,24,8424.0,USD,WEB PAGE,0.275


In [None]:
# Persist the result as a Delta table in the silver container, partitioned by date.
# The replaceWhere option restricts the overwrite to the processed date, so the
# same file date can be re-processed when needed.
silver_loan_trx_path = f"{silver_folder_path}/loanTrx"
file_date_predicate = f"date == '{v_file_date}'"

(
    final_df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy('date')
    .option("replaceWhere", file_date_predicate)
    .save(silver_loan_trx_path)
)

In [None]:
# End the notebook run, passing the string "Success" as its exit value.
dbutils.notebook.exit("Success")