In [0]:
%sql
-- Register the accounts Delta table over the GoldLayer/Accounts path.
-- hashkey stores the change-detection hash; created*/updated* are audit columns.
CREATE TABLE IF NOT EXISTS hive_metastore.default.accounts (
    account_id   INT,
    customer_id  INT,
    account_type STRING,
    balance      DOUBLE,
    hashkey      BIGINT,
    createdby    STRING,
    createddate  TIMESTAMP,
    updatedby    STRING,
    updateddate  TIMESTAMP
)
USING DELTA
LOCATION '/mnt/adlsgen2project2container/GoldLayer/Accounts';

In [0]:
%sql
-- Register the customers Delta table over the GoldLayer/Customers path.
-- hashkey stores the change-detection hash; created*/updated* are audit columns.
CREATE TABLE IF NOT EXISTS hive_metastore.default.customers (
    customer_id  INT,
    first_name   STRING,
    last_name    STRING,
    address      STRING,
    city         STRING,
    state        STRING,
    zip          STRING,
    hashkey      BIGINT,
    createdby    STRING,
    createddate  TIMESTAMP,
    updatedby    STRING,
    updateddate  TIMESTAMP
)
USING DELTA
LOCATION '/mnt/adlsgen2project2container/GoldLayer/Customers';

In [0]:
%sql
-- Register the loanpayments Delta table over the GoldLayer/LoanPayments path.
-- hashkey stores the change-detection hash; created*/updated* are audit columns.
CREATE TABLE IF NOT EXISTS hive_metastore.default.loanpayments (
    payment_id     INT,
    loan_id        INT,
    payment_date   TIMESTAMP,
    payment_amount DOUBLE,
    hashkey        BIGINT,
    createdby      STRING,
    createddate    TIMESTAMP,
    updatedby      STRING,
    updateddate    TIMESTAMP
)
USING DELTA
LOCATION '/mnt/adlsgen2project2container/GoldLayer/LoanPayments';

In [0]:
%sql
-- Register the loans Delta table over the GoldLayer/Loans path.
-- hashkey stores the change-detection hash; created*/updated* are audit columns.
CREATE TABLE IF NOT EXISTS hive_metastore.default.loans (
    loan_id       INT,
    customer_id   INT,
    loan_amount   DOUBLE,
    interest_rate DOUBLE,
    loan_term     INT,
    hashkey       BIGINT,
    createdby     STRING,
    createddate   TIMESTAMP,
    updatedby     STRING,
    updateddate   TIMESTAMP
)
USING DELTA
LOCATION '/mnt/adlsgen2project2container/GoldLayer/Loans';

In [0]:
%sql
-- Register the transactions Delta table over the GoldLayer/Transactions path.
-- hashkey stores the change-detection hash; created*/updated* are audit columns.
CREATE TABLE IF NOT EXISTS hive_metastore.default.transactions (
    transaction_id     INT,
    account_id         INT,
    transaction_date   TIMESTAMP,
    transaction_amount DOUBLE,
    transaction_type   STRING,
    hashkey            BIGINT,
    createdby          STRING,
    createddate        TIMESTAMP,
    updatedby          STRING,
    updateddate        TIMESTAMP
)
USING DELTA
LOCATION '/mnt/adlsgen2project2container/GoldLayer/Transactions';

In [0]:
# Load the accounts parquet files from the SilverLayer path.
df_accounts = spark.read.load(
    "/mnt/adlsgen2project2container/SilverLayer/ParquetFiles/Accounts",
    format="parquet",
)

In [0]:
# Print the column names and types of the loaded accounts data, then render its rows.
df_accounts.printSchema()
display(df_accounts)

In [0]:
from pyspark.sql.functions import col

In [0]:
# Prefix every source column with "src_" so it cannot collide with the target
# table's column names in the join and merge further down.
# NOTE: this overwrites df_accounts, so re-running this cell without first
# re-running the load cell fails (the un-prefixed columns no longer exist).
df_accounts = df_accounts.select(
    *[
        col(name).alias(f"src_{name}")
        for name in ("account_id", "customer_id", "account_type", "balance")
    ]
)
display(df_accounts)

In [0]:
from pyspark.sql.functions import *

In [0]:
# Change-detection hash over the account columns.
# - concat() yields NULL as soon as any input is NULL, which made src_hash NULL;
#   a NULL hash never equals tgt.hashkey, so such rows were re-updated on every run.
#   coalesce() to a sentinel keeps the hash defined while still detecting NULL <-> value changes.
# - concat_ws('|', ...) adds a delimiter so ("1", "23") and ("12", "3") no longer
#   produce the same input string.
# NOTE: the hash formula differs from the previous one, so the first run after this
# change will see every existing row as changed and refresh its hashkey once.
df_accounts_hash = df_accounts.withColumn(
    'src_hash',
    crc32(
        concat_ws(
            '|',
            *[
                coalesce(col(c).cast('string'), lit('<NULL>'))
                for c in ('src_account_id', 'src_customer_id', 'src_account_type', 'src_balance')
            ],
        )
    ),
)
display(df_accounts_hash)

In [0]:
# Storage path of the accounts Delta table (same path as the LOCATION in the accounts DDL).
tgt_path_Accounts='/mnt/adlsgen2project2container/GoldLayer/Accounts'

In [0]:
from delta.tables import DeltaTable
# Open the accounts Delta table by path; this handle is the merge target below.
dtableaccounts = DeltaTable.forPath(spark, tgt_path_Accounts)
# Print the current contents of the target table before merging.
dtableaccounts.toDF().show()

In [0]:
# Keep only source rows that have no target row with the same account_id AND the
# same hash, i.e. rows that are new or whose hashed columns changed.
df_src = (
    df_accounts_hash.alias("src")
    .join(
        dtableaccounts.toDF().alias("tgt"),
        on=(col("src.src_account_id") == col("tgt.account_id"))
        & (col("src.src_hash") == col("tgt.hashkey")),
        how="anti",
    )
    .select("src.*")
)

In [0]:
# Render the new/changed account rows that will be fed into the merge.
display(df_src)

In [0]:
# Upsert the new/changed account rows into the target Delta table.
# Data columns (and the hash) are written identically by both branches, so they
# are declared once; only the audit columns differ between update and insert.
accounts_column_map = {
    "tgt.account_id": "src.src_account_id",
    "tgt.customer_id": "src.src_customer_id",
    "tgt.account_type": "src.src_account_type",
    "tgt.balance": "src.src_balance",
    "tgt.hashkey": "src.src_hash",
}
(
    dtableaccounts.alias("tgt")
    .merge(df_src.alias("src"), "tgt.account_id = src.src_account_id")
    # Existing account: overwrite the data columns and stamp the update audit fields.
    .whenMatchedUpdate(
        set={
            **accounts_column_map,
            "tgt.updateddate": current_timestamp(),
            "tgt.updatedby": lit("databricks-updated"),
        }
    )
    # New account: insert the data columns and stamp both created and updated audit fields.
    .whenNotMatchedInsert(
        values={
            **accounts_column_map,
            "tgt.createddate": current_timestamp(),
            "tgt.createdby": lit("databricks"),
            "tgt.updateddate": current_timestamp(),
            "tgt.updatedby": lit("databricks"),
        }
    )
    .execute()
)

In [0]:
%sql
-- Inspect the accounts table contents.
SELECT * FROM hive_metastore.default.accounts

In [0]:
# Load the customers parquet files from the SilverLayer path.
df_customers = spark.read.load(
    "/mnt/adlsgen2project2container/SilverLayer/ParquetFiles/Customers",
    format="parquet",
)

In [0]:
# Render the loaded customer rows.
display(df_customers)

In [0]:
# Change-detection hash over the customer columns.
# - concat() yields NULL as soon as any input is NULL (e.g. a missing address),
#   which made src_hash NULL; a NULL hash never equals tgt.hashkey, so such rows
#   were re-updated on every run. coalesce() to a sentinel keeps the hash defined
#   while still detecting NULL <-> value changes.
# - concat_ws('|', ...) adds a delimiter so adjacent fields cannot bleed into each
#   other (("ab", "c") vs ("a", "bc")).
# NOTE: the hash formula differs from the previous one, so the first run after this
# change will see every existing row as changed and refresh its hashkey once.
df_customers_hash = df_customers.withColumn(
    'src_hash',
    crc32(
        concat_ws(
            '|',
            *[
                coalesce(col(c).cast('string'), lit('<NULL>'))
                for c in ('customer_id', 'first_name', 'last_name', 'address', 'city', 'state', 'zip')
            ],
        )
    ),
)
display(df_customers_hash)

In [0]:
from delta.tables import DeltaTable

# Storage path of the customers Delta table (same path as the LOCATION in the customers DDL).
tgt_path_Customers = '/mnt/adlsgen2project2container/GoldLayer/Customers'

# Open the table by path; this handle is the merge target below.
dtablecustomers = DeltaTable.forPath(spark, tgt_path_Customers)

# Print the current contents of the target table before merging.
dtablecustomers.toDF().show()

In [0]:
# Keep only source rows that have no target row with the same customer_id AND the
# same hash, i.e. rows that are new or whose hashed columns changed.
df_src_customers = (
    df_customers_hash.alias("src")
    .join(
        dtablecustomers.toDF().alias("tgt"),
        on=(col("src.customer_id") == col("tgt.customer_id"))
        & (col("src.src_hash") == col("tgt.hashkey")),
        how="anti",
    )
    .select("src.*")
)

In [0]:
# Render the new/changed customer rows that will be fed into the merge.
display(df_src_customers)

In [0]:
# Upsert the new/changed customer rows into the target Delta table.
# Data columns (and the hash) are written identically by both branches, so they
# are declared once; only the audit columns differ between update and insert.
customers_column_map = {
    "tgt.customer_id": "src.customer_id",
    "tgt.first_name": "src.first_name",
    "tgt.last_name": "src.last_name",
    "tgt.address": "src.address",
    "tgt.city": "src.city",
    "tgt.state": "src.state",
    "tgt.zip": "src.zip",
    "tgt.hashkey": "src.src_hash",
}
(
    dtablecustomers.alias("tgt")
    .merge(df_src_customers.alias("src"), "tgt.customer_id = src.customer_id")
    # Existing customer: overwrite the data columns and stamp the update audit fields.
    .whenMatchedUpdate(
        set={
            **customers_column_map,
            "tgt.updateddate": current_timestamp(),
            "tgt.updatedby": lit("databricks-updated"),
        }
    )
    # New customer: insert the data columns and stamp both created and updated audit fields.
    .whenNotMatchedInsert(
        values={
            **customers_column_map,
            "tgt.createddate": current_timestamp(),
            "tgt.createdby": lit("databricks"),
            "tgt.updateddate": current_timestamp(),
            "tgt.updatedby": lit("databricks"),
        }
    )
    .execute()
)

In [0]:
%sql
-- Inspect the customers table contents.
SELECT * FROM hive_metastore.default.customers

In [0]:
# Load the loans parquet files from the SilverLayer path and render them.
df_loans = spark.read.load(
    "/mnt/adlsgen2project2container/SilverLayer/ParquetFiles/Loans",
    format="parquet",
)
display(df_loans)

In [0]:
# Change-detection hash over the loan columns.
# - concat() yields NULL as soon as any input is NULL, which made src_hash NULL;
#   a NULL hash never equals tgt.hashkey, so such rows were re-updated on every run.
#   coalesce() to a sentinel keeps the hash defined while still detecting NULL <-> value changes.
# - concat_ws('|', ...) adds a delimiter so e.g. loan_id=1, customer_id=23 and
#   loan_id=12, customer_id=3 no longer produce the same input string.
# NOTE: the hash formula differs from the previous one, so the first run after this
# change will see every existing row as changed and refresh its hashkey once.
df_hashloans = df_loans.withColumn(
    'src_hash',
    crc32(
        concat_ws(
            '|',
            *[
                coalesce(col(c).cast('string'), lit('<NULL>'))
                for c in ('loan_id', 'customer_id', 'loan_amount', 'interest_rate', 'loan_term')
            ],
        )
    ),
)
display(df_hashloans)

In [0]:
from delta.tables import DeltaTable

# Storage path of the loans Delta table (same path as the LOCATION in the loans DDL).
tgt_path_Loans = '/mnt/adlsgen2project2container/GoldLayer/Loans'

# Open the table by path; this handle is the merge target below.
dtableloans = DeltaTable.forPath(spark, tgt_path_Loans)

# Print the current contents of the target table before merging.
dtableloans.toDF().show()

In [0]:
# Keep only source rows that have no target row with the same loan_id AND the
# same hash, i.e. rows that are new or whose hashed columns changed.
df_src_loans = (
    df_hashloans.alias("src")
    .join(
        dtableloans.toDF().alias("tgt"),
        on=(col("src.loan_id") == col("tgt.loan_id"))
        & (col("src.src_hash") == col("tgt.hashkey")),
        how="anti",
    )
    .select("src.*")
)
display(df_src_loans)


In [0]:
# Upsert the new/changed loan rows into the target Delta table.
# Data columns (and the hash) are written identically by both branches, so they
# are declared once; only the audit columns differ between update and insert.
loans_column_map = {
    "tgt.loan_id": "src.loan_id",
    "tgt.customer_id": "src.customer_id",
    "tgt.loan_amount": "src.loan_amount",
    "tgt.interest_rate": "src.interest_rate",
    "tgt.loan_term": "src.loan_term",
    "tgt.hashkey": "src.src_hash",
}
(
    dtableloans.alias("tgt")
    .merge(df_src_loans.alias("src"), "tgt.loan_id = src.loan_id")
    # Existing loan: overwrite the data columns and stamp the update audit fields.
    .whenMatchedUpdate(
        set={
            **loans_column_map,
            "tgt.updateddate": current_timestamp(),
            "tgt.updatedby": lit("databricks-updated"),
        }
    )
    # New loan: insert the data columns and stamp both created and updated audit fields.
    .whenNotMatchedInsert(
        values={
            **loans_column_map,
            "tgt.createddate": current_timestamp(),
            "tgt.createdby": lit("databricks"),
            "tgt.updateddate": current_timestamp(),
            "tgt.updatedby": lit("databricks"),
        }
    )
    .execute()
)

In [0]:
%sql
-- Inspect the loans table contents.
SELECT * FROM hive_metastore.default.loans

In [0]:
# Load the loan payments parquet files from the SilverLayer path and render them.
df_loanPayments = spark.read.load(
    "/mnt/adlsgen2project2container/SilverLayer/ParquetFiles/LoanPayments",
    format="parquet",
)
display(df_loanPayments)


In [0]:
# Change-detection hash over the loan payment columns.
# - concat() yields NULL as soon as any input is NULL, which made src_hash NULL;
#   a NULL hash never equals tgt.hashkey, so such rows were re-updated on every run.
#   coalesce() to a sentinel keeps the hash defined while still detecting NULL <-> value changes.
# - concat_ws('|', ...) adds a delimiter so e.g. payment_id=1, loan_id=23 and
#   payment_id=12, loan_id=3 no longer produce the same input string.
# NOTE: the hash formula differs from the previous one, so the first run after this
# change will see every existing row as changed and refresh its hashkey once.
df_hashloanPayments = df_loanPayments.withColumn(
    'src_hash',
    crc32(
        concat_ws(
            '|',
            *[
                coalesce(col(c).cast('string'), lit('<NULL>'))
                for c in ('payment_id', 'loan_id', 'payment_date', 'payment_amount')
            ],
        )
    ),
)
display(df_hashloanPayments)


In [0]:
from delta.tables import DeltaTable

# Storage path of the loanpayments Delta table (same path as the LOCATION in the loanpayments DDL).
tgt_path_LoanPayments = '/mnt/adlsgen2project2container/GoldLayer/LoanPayments'

# Open the table by path; this handle is the merge target below.
dtableloanPayments = DeltaTable.forPath(spark, tgt_path_LoanPayments)

# Print the current contents of the target table before merging.
dtableloanPayments.toDF().show()

In [0]:
# Keep only source rows that have no target row with the same payment_id AND the
# same hash, i.e. rows that are new or whose hashed columns changed.
df_src_loanPayments = (
    df_hashloanPayments.alias("src")
    .join(
        dtableloanPayments.toDF().alias("tgt"),
        on=(col("src.payment_id") == col("tgt.payment_id"))
        & (col("src.src_hash") == col("tgt.hashkey")),
        how="anti",
    )
    .select("src.*")
)
display(df_src_loanPayments)


In [0]:
# Upsert the new/changed loan payment rows into the target Delta table.
# Data columns (and the hash) are written identically by both branches, so they
# are declared once; only the audit columns differ between update and insert.
loan_payments_column_map = {
    "tgt.payment_id": "src.payment_id",
    "tgt.loan_id": "src.loan_id",
    "tgt.payment_date": "src.payment_date",
    "tgt.payment_amount": "src.payment_amount",
    "tgt.hashkey": "src.src_hash",
}
(
    dtableloanPayments.alias("tgt")
    .merge(df_src_loanPayments.alias("src"), "tgt.payment_id = src.payment_id")
    # Existing payment: overwrite the data columns and stamp the update audit fields.
    .whenMatchedUpdate(
        set={
            **loan_payments_column_map,
            "tgt.updateddate": current_timestamp(),
            "tgt.updatedby": lit("databricks-updated"),
        }
    )
    # New payment: insert the data columns and stamp both created and updated audit fields.
    .whenNotMatchedInsert(
        values={
            **loan_payments_column_map,
            "tgt.createddate": current_timestamp(),
            "tgt.createdby": lit("databricks"),
            "tgt.updateddate": current_timestamp(),
            "tgt.updatedby": lit("databricks"),
        }
    )
    .execute()
)

In [0]:
%sql
-- Inspect the loanpayments table contents.
SELECT * FROM hive_metastore.default.loanpayments

In [0]:
# Load the transactions parquet files from the SilverLayer path and render them.
df_transactions = spark.read.load(
    "/mnt/adlsgen2project2container/SilverLayer/ParquetFiles/Transactions",
    format="parquet",
)
display(df_transactions)

In [0]:
# Change-detection hash over the transaction columns.
# - concat() yields NULL as soon as any input is NULL, which made src_hash NULL;
#   a NULL hash never equals tgt.hashkey, so such rows were re-updated on every run.
#   coalesce() to a sentinel keeps the hash defined while still detecting NULL <-> value changes.
# - concat_ws('|', ...) adds a delimiter so e.g. transaction_id=1, account_id=23 and
#   transaction_id=12, account_id=3 no longer produce the same input string.
# NOTE: the hash formula differs from the previous one, so the first run after this
# change will see every existing row as changed and refresh its hashkey once.
df_transactions_hash = df_transactions.withColumn(
    'src_hash',
    crc32(
        concat_ws(
            '|',
            *[
                coalesce(col(c).cast('string'), lit('<NULL>'))
                for c in (
                    'transaction_id',
                    'account_id',
                    'transaction_date',
                    'transaction_amount',
                    'transaction_type',
                )
            ],
        )
    ),
)
display(df_transactions_hash)

In [0]:
from delta.tables import DeltaTable

# Storage path of the transactions Delta table (same path as the LOCATION in the transactions DDL).
tgt_path_transactions = '/mnt/adlsgen2project2container/GoldLayer/Transactions'

# Open the table by path; this handle is the merge target below.
dtabletransactions = DeltaTable.forPath(spark, tgt_path_transactions)

# Print the current contents of the target table before merging.
dtabletransactions.toDF().show()

In [0]:
# Keep only source rows that have no target row with the same transaction_id AND
# the same hash, i.e. rows that are new or whose hashed columns changed.
df_src_transactions = (
    df_transactions_hash.alias("src")
    .join(
        dtabletransactions.toDF().alias("tgt"),
        on=(col("src.transaction_id") == col("tgt.transaction_id"))
        & (col("src.src_hash") == col("tgt.hashkey")),
        how="anti",
    )
    .select("src.*")
)
display(df_src_transactions)

In [0]:
# Upsert the new/changed transaction rows into the target Delta table.
# Data columns (and the hash) are written identically by both branches, so they
# are declared once; only the audit columns differ between update and insert.
transactions_column_map = {
    "tgt.transaction_id": "src.transaction_id",
    "tgt.account_id": "src.account_id",
    "tgt.transaction_date": "src.transaction_date",
    "tgt.transaction_amount": "src.transaction_amount",
    "tgt.transaction_type": "src.transaction_type",
    "tgt.hashkey": "src.src_hash",
}
(
    dtabletransactions.alias("tgt")
    .merge(df_src_transactions.alias("src"), "tgt.transaction_id = src.transaction_id")
    # Existing transaction: overwrite the data columns and stamp the update audit fields.
    .whenMatchedUpdate(
        set={
            **transactions_column_map,
            "tgt.updateddate": current_timestamp(),
            "tgt.updatedby": lit("databricks-updated"),
        }
    )
    # New transaction: insert the data columns and stamp both created and updated audit fields.
    .whenNotMatchedInsert(
        values={
            **transactions_column_map,
            "tgt.createddate": current_timestamp(),
            "tgt.createdby": lit("databricks"),
            "tgt.updateddate": current_timestamp(),
            "tgt.updatedby": lit("databricks"),
        }
    )
    .execute()
)

In [0]:
%sql
-- Inspect the transactions table contents.
SELECT * FROM hive_metastore.default.transactions