**Accounts**

In [0]:
# Accounts pipeline locations: source parquet file (Silver) and target Delta path (Gold).
accounts_src_path = '/mnt/input/Silver/accounts.parquet'
accounts_tgt_path = '/mnt/input/Gold/accounts'

In [0]:
%sql
-- Registers the `accounts` Delta table over /mnt/input/Gold/accounts if it is not already defined.
-- hashkey holds the per-row change-detection hash; createdby/createdDate/updatedby/updatedDate
-- are audit columns populated by the merge cell of this section.
-- NOTE(review): the other CREATE TABLE cells in this notebook run `use catalog hive_metastore;`
-- first; this one relies on whatever catalog is current — confirm that is intended.
create table if not exists accounts
(
  account_id int,
   customer_id int,
   account_type string,
   balance double,
   hashkey bigint,
   createdby string,
   createdDate timestamp,
   updatedby string,
   updatedDate timestamp
) 
using delta
location '/mnt/input/Gold/accounts'

In [0]:
# Load the accounts source parquet into a DataFrame.
# NOTE(review): "header" is a CSV reader option and appears to have no effect on
# parquet input — confirm before removing it.
df_accounts = (
    spark.read
    .format("parquet")
    .option("header", "true")
    .load(accounts_src_path)
)


In [0]:
%python
from pyspark.sql.functions import coalesce, col, concat_ws, crc32, lit

# Change-detection hash over every source column, stored as `hashkey`.
# - Each column is cast to string and NULLs are replaced by a sentinel: concat()
#   returns NULL as soon as one input is NULL, and a NULL hashkey can never equal
#   the stored one, so such rows would be re-merged on every run.
# - concat_ws() puts a separator between values so adjacent columns cannot run
#   together ("1","23" and "12","3" would otherwise hash identically).
# NOTE: the resulting values differ from the previous concat()-based formula, so
# the first run after this change re-updates rows already present in the target.
df_accounts_hash = df_accounts.withColumn(
    "hashkey",
    crc32(
        concat_ws(
            "||",
            *[coalesce(col(c).cast("string"), lit("<null>")) for c in df_accounts.columns]
        )
    ),
)

In [0]:
# Explicit import instead of `import *`, so it is clear where DeltaTable comes from
# and no unrelated names leak into the notebook namespace.
from delta.tables import DeltaTable

# Handle on the accounts Delta table at the target path; the anti-join and merge
# cells below both use it.
dtable_accounts = DeltaTable.forPath(spark, accounts_tgt_path)
# Preview the current target contents.
dtable_accounts.toDF().show()

In [0]:
%python
from pyspark.sql.functions import col

# Keep only the source rows that have no target row with the same account_id AND
# the same hashkey, i.e. rows that are new or whose content changed.
df_src_accounts = (
    df_accounts_hash.alias("src")
    .join(
        dtable_accounts.toDF().alias("tgt"),
        on=(col("src.account_id") == col("tgt.account_id"))
        & (col("src.hashkey") == col("tgt.hashkey")),
        how="anti",
    )
    .select("src.*")
)

In [0]:
# Explicit imports replace `from pyspark.sql.functions import *`, which shadows
# Python builtins such as sum/min/max/round for every later cell.
from pyspark.sql.functions import col, current_timestamp, lit

# Upsert the new/changed source rows into the accounts Delta table, keyed on account_id.
#   matched     -> overwrite the data columns and hashkey, stamp updatedDate/updatedby
#   not matched -> insert the row and stamp both created* and updated* audit columns
# The insert now uses `updatedby`, matching the table schema and the update clause.
(
    dtable_accounts.alias("tgt")
    .merge(
        df_src_accounts.alias("src"),
        col("src.account_id") == col("tgt.account_id"),
    )
    .whenMatchedUpdate(set={
        "tgt.account_id": "src.account_id",
        "tgt.customer_id": "src.customer_id",
        "tgt.account_type": "src.account_type",
        "tgt.balance": "src.balance",
        "tgt.hashkey": "src.hashkey",
        "tgt.updatedDate": current_timestamp(),
        "tgt.updatedby": lit("databricks-update"),
    })
    .whenNotMatchedInsert(values={
        "tgt.account_id": "src.account_id",
        "tgt.customer_id": "src.customer_id",
        "tgt.account_type": "src.account_type",
        "tgt.balance": "src.balance",
        "tgt.hashkey": "src.hashkey",
        "tgt.createdDate": current_timestamp(),
        "tgt.createdby": lit("databricks"),
        "tgt.updatedDate": current_timestamp(),
        "tgt.updatedby": lit("databricks"),
    })
    .execute()
)


**Customers**

In [0]:
%sql
-- Switches to the hive_metastore catalog, then registers the `customers` Delta table
-- over /mnt/input/Gold/customers if it is not already defined.
-- hashkey holds the per-row change-detection hash; createdby/createdDate/updatedby/updatedDate
-- are audit columns populated by the merge cell of this section.
use catalog hive_metastore;
create table if not exists customers
(
   customer_id int,
   first_name string,
   last_name string,
   address string,
   state string,
   city string,
   zip string,
   hashkey bigint,
   createdby string,
   createdDate timestamp,
   updatedby string,
   updatedDate timestamp
) 
using delta
location '/mnt/input/Gold/customers'

In [0]:
# Customers pipeline locations: source parquet file (Silver) and target Delta path (Gold).
customers_src_path = '/mnt/input/Silver/customers.parquet'
customers_tgt_path = '/mnt/input/Gold/customers'

In [0]:
# Load the customers source parquet into a DataFrame.
# Fix: this previously loaded accounts_src_path (copy/paste slip), so the customers
# pipeline was fed the accounts data.
# NOTE(review): "header" is a CSV reader option and appears to have no effect on
# parquet input — confirm before removing it.
df_cust = (
    spark.read
    .format("parquet")
    .option("header", "true")
    .load(customers_src_path)
)

In [0]:
from pyspark.sql.functions import coalesce, col, concat_ws, crc32, lit

# Change-detection hash over every customers source column, stored as `hashkey`.
# Fix: the original referenced `df`, which no cell in this notebook defines; it now
# uses df_cust, the DataFrame loaded in the previous cell.
# - Each column is cast to string and NULLs are replaced by a sentinel: concat()
#   returns NULL as soon as one input is NULL, and a NULL hashkey can never equal
#   the stored one, so such rows would be re-merged on every run.
# - concat_ws() puts a separator between values so adjacent columns cannot run
#   together ("1","23" and "12","3" would otherwise hash identically).
# NOTE: the resulting values differ from the previous concat()-based formula, so
# the first run after this change re-updates rows already present in the target.
df_customers_hash = df_cust.withColumn(
    "hashkey",
    crc32(
        concat_ws(
            "||",
            *[coalesce(col(c).cast("string"), lit("<null>")) for c in df_cust.columns]
        )
    ),
)

In [0]:
# Explicit import instead of `import *`, so it is clear where DeltaTable comes from
# and no unrelated names leak into the notebook namespace.
from delta.tables import DeltaTable

# Handle on the customers Delta table; the anti-join and merge cells below both use it.
# Fix: this previously opened accounts_tgt_path, so the customers merge would have
# targeted the accounts table.
dtable_customers = DeltaTable.forPath(spark, customers_tgt_path)
# Preview the current target contents.
dtable_customers.toDF().show()

+-----------+----------+---------+-------+-----+----+---+-------+---------+-----------+---------+-----------+
|customer_id|first_name|last_name|address|state|city|zip|hashkey|createdby|createdDate|updatedby|updatedDate|
+-----------+----------+---------+-------+-----+----+---+-------+---------+-----------+---------+-----------+
+-----------+----------+---------+-------+-----+----+---+-------+---------+-----------+---------+-----------+



In [0]:
%python
from pyspark.sql.functions import col

# Keep only the source rows that have no target row with the same customer_id AND
# the same hashkey, i.e. rows that are new or whose content changed.
df_src_customers = (
    df_customers_hash.alias("src")
    .join(
        dtable_customers.toDF().alias("tgt"),
        on=(col("src.customer_id") == col("tgt.customer_id"))
        & (col("src.hashkey") == col("tgt.hashkey")),
        how="anti",
    )
    .select("src.*")
)



In [0]:
# Explicit imports replace `from pyspark.sql.functions import *`, which shadows
# Python builtins such as sum/min/max/round for every later cell.
from pyspark.sql.functions import col, current_timestamp, lit

# Upsert the new/changed source rows into the customers Delta table, keyed on customer_id.
#   matched     -> overwrite the data columns and hashkey, stamp updatedDate/updatedby
#   not matched -> insert the row and stamp both created* and updated* audit columns
# The insert now uses `updatedby`, matching the table schema and the update clause.
(
    dtable_customers.alias("tgt")
    .merge(
        df_src_customers.alias("src"),
        col("src.customer_id") == col("tgt.customer_id"),
    )
    .whenMatchedUpdate(set={
        "tgt.customer_id": "src.customer_id",
        "tgt.first_name": "src.first_name",
        "tgt.last_name": "src.last_name",
        "tgt.address": "src.address",
        "tgt.city": "src.city",
        "tgt.state": "src.state",
        "tgt.zip": "src.zip",
        "tgt.hashkey": "src.hashkey",
        "tgt.updatedDate": current_timestamp(),
        "tgt.updatedby": lit("databricks-update"),
    })
    .whenNotMatchedInsert(values={
        "tgt.customer_id": "src.customer_id",
        "tgt.first_name": "src.first_name",
        "tgt.last_name": "src.last_name",
        "tgt.address": "src.address",
        "tgt.city": "src.city",
        "tgt.state": "src.state",
        "tgt.zip": "src.zip",
        "tgt.hashkey": "src.hashkey",
        "tgt.createdDate": current_timestamp(),
        "tgt.createdby": lit("databricks"),
        "tgt.updatedDate": current_timestamp(),
        "tgt.updatedby": lit("databricks"),
    })
    .execute()
)


**Loan_Payments**

In [0]:
# Loan-payments pipeline locations: source parquet file (Silver) and target Delta path (Gold).
loan_payments_src_path = '/mnt/input/Silver/loan_payments.parquet'
loan_payments_tgt_path = '/mnt/input/Gold/loan_payments'

In [0]:
%sql
-- Switches to the hive_metastore catalog, then registers the `loan_payments` Delta table
-- over /mnt/input/Gold/loan_payments if it is not already defined.
-- hashkey holds the per-row change-detection hash; createdby/createdDate/updatedby/updatedDate
-- are audit columns populated by the merge cell of this section.
-- NOTE(review): `decimal` without precision/scale is DECIMAL(10,0) in Spark SQL, i.e. no
-- fractional digits — confirm payment_amount should not carry a scale (e.g. decimal(18,2)).
use catalog hive_metastore;
create table if not exists loan_payments
(
   payment_id int,
   loan_id int,
   payment_date timestamp,
   payment_amount decimal,
   hashkey bigint,
   createdby string,
   createdDate timestamp,
   updatedby string,
   updatedDate timestamp
) 
using delta
location '/mnt/input/Gold/loan_payments'

In [0]:
# Load the loan-payments source parquet into a DataFrame.
# NOTE(review): "header" is a CSV reader option and appears to have no effect on
# parquet input — confirm before removing it.
df_pay = (
    spark.read
    .format("parquet")
    .option("header", "true")
    .load(loan_payments_src_path)
)

In [0]:
from pyspark.sql.functions import coalesce, col, concat_ws, crc32, lit

# Change-detection hash over every loan-payments source column, stored as `hashkey`.
# - Each column is cast to string and NULLs are replaced by a sentinel: concat()
#   returns NULL as soon as one input is NULL, and a NULL hashkey can never equal
#   the stored one, so such rows would be re-merged on every run.
# - concat_ws() puts a separator between values so adjacent columns cannot run
#   together ("1","23" and "12","3" would otherwise hash identically).
# NOTE: the resulting values differ from the previous concat()-based formula, so
# the first run after this change re-updates rows already present in the target.
df_loan_payments_hash = df_pay.withColumn(
    "hashkey",
    crc32(
        concat_ws(
            "||",
            *[coalesce(col(c).cast("string"), lit("<null>")) for c in df_pay.columns]
        )
    ),
)

In [0]:
# Explicit import instead of `import *`, so it is clear where DeltaTable comes from
# and no unrelated names leak into the notebook namespace.
from delta.tables import DeltaTable

# Handle on the loan_payments Delta table at the target path; the anti-join and
# merge cells below both use it.
dtable_loan_payments = DeltaTable.forPath(spark, loan_payments_tgt_path)
# Preview the current target contents.
dtable_loan_payments.toDF().show()

+----------+-------+------------+--------------+-------+---------+-----------+---------+-----------+
|payment_id|loan_id|payment_date|payment_amount|hashkey|createdby|createdDate|updatedby|updatedDate|
+----------+-------+------------+--------------+-------+---------+-----------+---------+-----------+
+----------+-------+------------+--------------+-------+---------+-----------+---------+-----------+



In [0]:
%python
from pyspark.sql.functions import col

# Keep only the source rows that have no target row with the same payment_id AND
# the same hashkey, i.e. rows that are new or whose content changed.
df_src_loan_payments = (
    df_loan_payments_hash.alias("src")
    .join(
        dtable_loan_payments.toDF().alias("tgt"),
        on=(col("src.payment_id") == col("tgt.payment_id"))
        & (col("src.hashkey") == col("tgt.hashkey")),
        how="anti",
    )
    .select("src.*")
)

In [0]:
# Explicit imports replace `from pyspark.sql.functions import *`, which shadows
# Python builtins such as sum/min/max/round for every later cell.
from pyspark.sql.functions import col, current_timestamp, lit

# Upsert the new/changed source rows into the loan_payments Delta table, keyed on payment_id.
#   matched     -> overwrite the data columns and hashkey, stamp updatedDate/updatedby
#   not matched -> insert the row and stamp both created* and updated* audit columns
# The insert now uses `updatedby`, matching the table schema and the update clause.
(
    dtable_loan_payments.alias("tgt")
    .merge(
        df_src_loan_payments.alias("src"),
        col("src.payment_id") == col("tgt.payment_id"),
    )
    .whenMatchedUpdate(set={
        "tgt.payment_id": "src.payment_id",
        "tgt.loan_id": "src.loan_id",
        "tgt.payment_date": "src.payment_date",
        "tgt.payment_amount": "src.payment_amount",
        "tgt.hashkey": "src.hashkey",
        "tgt.updatedDate": current_timestamp(),
        "tgt.updatedby": lit("databricks-update"),
    })
    .whenNotMatchedInsert(values={
        "tgt.payment_id": "src.payment_id",
        "tgt.loan_id": "src.loan_id",
        "tgt.payment_date": "src.payment_date",
        "tgt.payment_amount": "src.payment_amount",
        "tgt.hashkey": "src.hashkey",
        "tgt.createdDate": current_timestamp(),
        "tgt.createdby": lit("databricks"),
        "tgt.updatedDate": current_timestamp(),
        "tgt.updatedby": lit("databricks"),
    })
    .execute()
)


**Loans**

In [0]:
# Loans pipeline locations: source parquet file (Silver) and target Delta path (Gold).
loans_src_path = '/mnt/input/Silver/loans.parquet'
loans_tgt_path = '/mnt/input/Gold/loans'

In [0]:
%sql
-- Switches to the hive_metastore catalog, then registers the `loans` Delta table
-- over /mnt/input/Gold/loans if it is not already defined.
-- hashkey holds the per-row change-detection hash; createdby/createdDate/updatedby/updatedDate
-- are audit columns populated by the merge cell of this section.
-- NOTE(review): `decimal` without precision/scale is DECIMAL(10,0) in Spark SQL, i.e. no
-- fractional digits — confirm loan_amount and especially interest_rate should not carry a scale.
use catalog hive_metastore;
create table if not exists loans
(
   loan_id int,
   customer_id int,
   loan_amount decimal,
   interest_rate decimal,
   loan_term int,
   hashkey bigint,
   createdby string,
   createdDate timestamp,
   updatedby string,
   updatedDate timestamp
) 
using delta
location '/mnt/input/Gold/loans'

In [0]:
# Load the loans source parquet into a DataFrame.
# NOTE(review): "header" is a CSV reader option and appears to have no effect on
# parquet input — confirm before removing it.
df_loans = (
    spark.read
    .format("parquet")
    .option("header", "true")
    .load(loans_src_path)
)

In [0]:
from pyspark.sql.functions import coalesce, col, concat_ws, crc32, lit

# Change-detection hash over every loans source column, stored as `hashkey`.
# - Each column is cast to string and NULLs are replaced by a sentinel: concat()
#   returns NULL as soon as one input is NULL, and a NULL hashkey can never equal
#   the stored one, so such rows would be re-merged on every run.
# - concat_ws() puts a separator between values so adjacent columns cannot run
#   together ("1","23" and "12","3" would otherwise hash identically).
# NOTE: the resulting values differ from the previous concat()-based formula, so
# the first run after this change re-updates rows already present in the target.
df_loans_hash = df_loans.withColumn(
    "hashkey",
    crc32(
        concat_ws(
            "||",
            *[coalesce(col(c).cast("string"), lit("<null>")) for c in df_loans.columns]
        )
    ),
)

In [0]:
# Explicit import instead of `import *`, so it is clear where DeltaTable comes from
# and no unrelated names leak into the notebook namespace.
from delta.tables import DeltaTable

# Handle on the loans Delta table at the target path; the anti-join and merge
# cells below both use it.
dtable_loans = DeltaTable.forPath(spark, loans_tgt_path)
# Preview the current target contents.
dtable_loans.toDF().show()

+-------+-----------+-----------+-------------+---------+-------+---------+-----------+---------+-----------+
|loan_id|customer_id|loan_amount|interest_rate|loan_term|hashkey|createdby|createdDate|updatedby|updatedDate|
+-------+-----------+-----------+-------------+---------+-------+---------+-----------+---------+-----------+
+-------+-----------+-----------+-------------+---------+-------+---------+-----------+---------+-----------+



In [0]:
%python
from pyspark.sql.functions import col

# Keep only the source rows that have no target row with the same loan_id AND
# the same hashkey, i.e. rows that are new or whose content changed.
df_src_loans = (
    df_loans_hash.alias("src")
    .join(
        dtable_loans.toDF().alias("tgt"),
        on=(col("src.loan_id") == col("tgt.loan_id"))
        & (col("src.hashkey") == col("tgt.hashkey")),
        how="anti",
    )
    .select("src.*")
)

In [0]:
# Explicit imports replace `from pyspark.sql.functions import *`, which shadows
# Python builtins such as sum/min/max/round for every later cell.
from pyspark.sql.functions import col, current_timestamp, lit

# Upsert the new/changed source rows into the loans Delta table, keyed on loan_id.
#   matched     -> overwrite the data columns and hashkey, stamp updatedDate/updatedby
#   not matched -> insert the row and stamp both created* and updated* audit columns
# The insert now uses `updatedby`, matching the table schema and the update clause.
(
    dtable_loans.alias("tgt")
    .merge(
        df_src_loans.alias("src"),
        col("src.loan_id") == col("tgt.loan_id"),
    )
    .whenMatchedUpdate(set={
        "tgt.loan_id": "src.loan_id",
        "tgt.customer_id": "src.customer_id",
        "tgt.loan_amount": "src.loan_amount",
        "tgt.interest_rate": "src.interest_rate",
        "tgt.loan_term": "src.loan_term",
        "tgt.hashkey": "src.hashkey",
        "tgt.updatedDate": current_timestamp(),
        "tgt.updatedby": lit("databricks-update"),
    })
    .whenNotMatchedInsert(values={
        "tgt.loan_id": "src.loan_id",
        "tgt.customer_id": "src.customer_id",
        "tgt.loan_amount": "src.loan_amount",
        "tgt.interest_rate": "src.interest_rate",
        "tgt.loan_term": "src.loan_term",
        "tgt.hashkey": "src.hashkey",
        "tgt.createdDate": current_timestamp(),
        "tgt.createdby": lit("databricks"),
        "tgt.updatedDate": current_timestamp(),
        "tgt.updatedby": lit("databricks"),
    })
    .execute()
)


**Transactions**

In [0]:
# Transactions pipeline locations: source parquet file (Silver) and target Delta path (Gold).
transactions_src_path = '/mnt/input/Silver/transactions.parquet'
transactions_tgt_path = '/mnt/input/Gold/transactions'

In [0]:
%sql
-- Switches to the hive_metastore catalog, then registers the `transactions` Delta table
-- over /mnt/input/Gold/transactions if it is not already defined.
-- hashkey holds the per-row change-detection hash; createdby/createdDate/updatedby/updatedDate
-- are audit columns populated by the merge cell of this section.
-- NOTE(review): `decimal` without precision/scale is DECIMAL(10,0) in Spark SQL, i.e. no
-- fractional digits — confirm transaction_amount should not carry a scale (e.g. decimal(18,2)).
use catalog hive_metastore;
create table if not exists transactions
(
   transaction_id int,
   account_id int,
   transaction_date timestamp,
   transaction_amount decimal,
   transaction_type string,
   hashkey bigint,
   createdby string,
   createdDate timestamp,
   updatedby string,
   updatedDate timestamp
) 
using delta
location '/mnt/input/Gold/transactions'

In [0]:
# Load the transactions source parquet into a DataFrame.
# NOTE(review): "header" is a CSV reader option and appears to have no effect on
# parquet input — confirm before removing it.
df_transactions = (
    spark.read
    .format("parquet")
    .option("header", "true")
    .load(transactions_src_path)
)

In [0]:
from pyspark.sql.functions import coalesce, col, concat_ws, crc32, lit

# Change-detection hash over every transactions source column, stored as `hashkey`.
# - Each column is cast to string and NULLs are replaced by a sentinel: concat()
#   returns NULL as soon as one input is NULL, and a NULL hashkey can never equal
#   the stored one, so such rows would be re-merged on every run.
# - concat_ws() puts a separator between values so adjacent columns cannot run
#   together ("1","23" and "12","3" would otherwise hash identically).
# NOTE: the resulting values differ from the previous concat()-based formula, so
# the first run after this change re-updates rows already present in the target.
df_transactions_hash = df_transactions.withColumn(
    "hashkey",
    crc32(
        concat_ws(
            "||",
            *[coalesce(col(c).cast("string"), lit("<null>")) for c in df_transactions.columns]
        )
    ),
)

In [0]:
# Explicit import instead of `import *`, so it is clear where DeltaTable comes from
# and no unrelated names leak into the notebook namespace.
from delta.tables import DeltaTable

# Handle on the transactions Delta table at the target path; the anti-join and
# merge cells below both use it.
dtable_transactions = DeltaTable.forPath(spark, transactions_tgt_path)
# Preview the current target contents.
dtable_transactions.toDF().show()

+--------------+----------+----------------+------------------+----------------+-------+---------+-----------+---------+-----------+
|transaction_id|account_id|transaction_date|transaction_amount|transaction_type|hashkey|createdby|createdDate|updatedby|updatedDate|
+--------------+----------+----------------+------------------+----------------+-------+---------+-----------+---------+-----------+
+--------------+----------+----------------+------------------+----------------+-------+---------+-----------+---------+-----------+



In [0]:
%python
from pyspark.sql.functions import col

# Keep only the source rows that have no target row with the same transaction_id AND
# the same hashkey, i.e. rows that are new or whose content changed.
df_src_transactions = (
    df_transactions_hash.alias("src")
    .join(
        dtable_transactions.toDF().alias("tgt"),
        on=(col("src.transaction_id") == col("tgt.transaction_id"))
        & (col("src.hashkey") == col("tgt.hashkey")),
        how="anti",
    )
    .select("src.*")
)

In [0]:
# Explicit imports replace `from pyspark.sql.functions import *`, which shadows
# Python builtins such as sum/min/max/round for every later cell.
from pyspark.sql.functions import col, current_timestamp, lit

# Upsert the new/changed source rows into the transactions Delta table, keyed on transaction_id.
#   matched     -> overwrite the data columns and hashkey, stamp updatedDate/updatedby
#   not matched -> insert the row and stamp both created* and updated* audit columns
# The insert now uses `updatedby`, matching the table schema and the update clause.
(
    dtable_transactions.alias("tgt")
    .merge(
        df_src_transactions.alias("src"),
        col("src.transaction_id") == col("tgt.transaction_id"),
    )
    .whenMatchedUpdate(set={
        "tgt.transaction_id": "src.transaction_id",
        "tgt.account_id": "src.account_id",
        "tgt.transaction_date": "src.transaction_date",
        "tgt.transaction_amount": "src.transaction_amount",
        "tgt.transaction_type": "src.transaction_type",
        "tgt.hashkey": "src.hashkey",
        "tgt.updatedDate": current_timestamp(),
        "tgt.updatedby": lit("databricks-update"),
    })
    .whenNotMatchedInsert(values={
        "tgt.transaction_id": "src.transaction_id",
        "tgt.account_id": "src.account_id",
        "tgt.transaction_date": "src.transaction_date",
        "tgt.transaction_amount": "src.transaction_amount",
        "tgt.transaction_type": "src.transaction_type",
        "tgt.hashkey": "src.hashkey",
        "tgt.createdDate": current_timestamp(),
        "tgt.createdby": lit("databricks"),
        "tgt.updatedDate": current_timestamp(),
        "tgt.updatedby": lit("databricks"),
    })
    .execute()
)
