In [1]:
from pyspark.sql import SparkSession
from delta import *

# Session settings that switch on Delta Lake's SQL extension and make the
# default catalog Delta-aware.
delta_conf = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

builder = SparkSession.builder.appName("MyApp")
for conf_key, conf_value in delta_conf.items():
    builder = builder.config(conf_key, conf_value)

# configure_spark_with_delta_pip decorates the builder before the session is
# created (or an already running one is reused).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/home/nil/anaconda3/envs/spark/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/nil/.ivy2/cache
The jars for the packages stored in: /home/nil/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-5f45d069-5c00-4cc8-b4f3-0c855ce6b0f6;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.1.0 in central
	found io.delta#delta-storage;3.1.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 348ms :: artifacts dl 11ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.1.0 from central in [default]
	io.delta#delta-storage;3.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |  

In [2]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [4]:
# Column layout used for every source DataFrame in this notebook.
# Every column is declared nullable.
# NOTE(review): contact_no is a 32-bit IntegerType; a contact number above
# 2,147,483,647 would not fit. Changing it also requires changing the
# dim_employee table DDL, so it is left as-is here.
schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in [
            ("emp_id", IntegerType()),
            ("name", StringType()),
            ("city", StringType()),
            ("country", StringType()),
            ("contact_no", IntegerType()),
        ]
    ]
)

In [5]:
# First source batch: a single employee row.
data = [
    (1000, "Michael", "Columbus", "USA", 1234567890),
]

df = spark.createDataFrame(data, schema)
df.show()

                                                                                

+------+-------+--------+-------+----------+
|emp_id|   name|    city|country|contact_no|
+------+-------+--------+-------+----------+
|  1000|Michael|Columbus|    USA|1234567890|
+------+-------+--------+-------+----------+



In [6]:
# DDL for the Delta target table; CREATE OR REPLACE makes re-running this cell
# start again from an empty table definition at the given location.
create_dim_employee_sql = """CREATE OR REPLACE TABLE dim_employee
(
    emp_id INT,
    name STRING,
    city STRING,
    country STRING,
    contact_no INT
)
USING DELTA
LOCATION '/mnt/sda3/iNeuron/Data Engineering/pySpark/Delta_Lake/delta_lake_04'"""

spark.sql(create_dim_employee_sql)

24/02/17 11:12:35 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

DataFrame[]

In [8]:
# Inspect the target table right after creation.
dim_employee_rows = spark.sql("SELECT * FROM dim_employee")
dim_employee_rows.show()

                                                                                

+------+----+----+-------+----------+
|emp_id|name|city|country|contact_no|
+------+----+----+-------+----------+
+------+----+----+-------+----------+



### Method 1: Spark SQL

In [9]:
# Expose the source DataFrame to Spark SQL under the name used by the MERGE.
df.createOrReplaceTempView(name="source_view")

In [11]:
# Confirm the temp view returns the source rows.
source_rows = spark.sql("select * from source_view")
source_rows.show()

+------+-------+--------+-------+----------+
|emp_id|   name|    city|country|contact_no|
+------+-------+--------+-------+----------+
|  1000|Michael|Columbus|    USA|1234567890|
+------+-------+--------+-------+----------+



In [12]:
# Upsert keyed on emp_id: matching rows get every non-key column overwritten
# from the source, unmatched source rows are inserted as new rows.
merge_sql = """MERGE INTO dim_employee as target
USING source_view as source
    ON target.emp_id=source.emp_id
    WHEN MATCHED
THEN UPDATE SET
    target.name=source.name,
    target.city=source.city,
    target.country=source.country,
    target.contact_no=source.contact_no
WHEN NOT MATCHED THEN
INSERT(emp_id,name,city,country,contact_no) VALUES (emp_id,name,city,country,contact_no)"""

spark.sql(merge_sql)

                                                                                

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [14]:
# Show the target table after the merge.
dim_employee_rows = spark.sql("select * from dim_employee")
dim_employee_rows.show()

+------+-------+--------+-------+----------+
|emp_id|   name|    city|country|contact_no|
+------+-------+--------+-------+----------+
|  1000|Michael|Columbus|    USA|1234567890|
+------+-------+--------+-------+----------+



In [22]:
# Second source batch: emp_id 1000 arrives with a new city, emp_id 2000 is a
# new employee.
data = [
    (1000, "Michael", "Chicago", "USA", 1234567890),
    (2000, "Michael", "New York", "USA", 1234567890),
]

df = spark.createDataFrame(data, schema)
df.show()

+------+-------+--------+-------+----------+
|emp_id|   name|    city|country|contact_no|
+------+-------+--------+-------+----------+
|  1000|Michael| Chicago|    USA|1234567890|
|  2000|Michael|New York|    USA|1234567890|
+------+-------+--------+-------+----------+



In [23]:
# Re-point source_view at the current source DataFrame.
df.createOrReplaceTempView(name="source_view")

In [24]:
# Source side of the merge.
source_rows = spark.sql("select * from source_view")
source_rows.show()

+------+-------+--------+-------+----------+
|emp_id|   name|    city|country|contact_no|
+------+-------+--------+-------+----------+
|  1000|Michael| Chicago|    USA|1234567890|
|  2000|Michael|New York|    USA|1234567890|
+------+-------+--------+-------+----------+



In [25]:
# Target side of the merge, before it runs.
target_rows = spark.sql("select * from dim_employee")
target_rows.show()

+------+-------+--------+-------+----------+
|emp_id|   name|    city|country|contact_no|
+------+-------+--------+-------+----------+
|  1000|Michael|Columbus|    USA|1234567890|
+------+-------+--------+-------+----------+



In [26]:
# Upsert keyed on emp_id: matching rows get every non-key column overwritten
# from the source, unmatched source rows are inserted as new rows.
merge_sql = """MERGE INTO dim_employee as target
USING source_view as source
    ON target.emp_id=source.emp_id
    WHEN MATCHED
THEN UPDATE SET
    target.name=source.name,
    target.city=source.city,
    target.country=source.country,
    target.contact_no=source.contact_no
WHEN NOT MATCHED THEN
INSERT (emp_id,name,city,country,contact_no) VALUES (emp_id,name,city,country,contact_no)"""

spark.sql(merge_sql)

                                                                                

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [27]:
# Target table after the merge.
target_rows = spark.sql("select * from dim_employee")
target_rows.show()

+------+-------+--------+-------+----------+
|emp_id|   name|    city|country|contact_no|
+------+-------+--------+-------+----------+
|  1000|Michael| Chicago|    USA|1234567890|
|  2000|Michael|New York|    USA|1234567890|
+------+-------+--------+-------+----------+



### Method 2: PySpark

In [28]:
# Source batch for the DeltaTable API merge: emp_id 2000 arrives with a new
# name, emp_id 3000 is a new employee.
data = [
    (2000, "Sarah", "New York", "USA", 1234567890),
    (3000, "David", "Atlanta", "USA", 1234567890),
]

df = spark.createDataFrame(data, schema)
df.show()

+------+-----+--------+-------+----------+
|emp_id| name|    city|country|contact_no|
+------+-----+--------+-------+----------+
|  2000|Sarah|New York|    USA|1234567890|
|  3000|David| Atlanta|    USA|1234567890|
+------+-----+--------+-------+----------+



In [29]:
# Bind to the Delta table through its catalog name instead of a relative
# filesystem path. The table was created with an absolute LOCATION, whereas a
# relative path is resolved against the kernel's working directory, so it only
# reaches that table when the notebook is launched from one particular folder.
# Using the name also matches how every SQL cell refers to the table.
delta_df = DeltaTable.forName(spark, "dim_employee")

In [30]:
# View the Delta table's current contents as a regular DataFrame.
current_target_df = delta_df.toDF()
current_target_df.show()

+------+-------+--------+-------+----------+
|emp_id|   name|    city|country|contact_no|
+------+-------+--------+-------+----------+
|  1000|Michael| Chicago|    USA|1234567890|
|  2000|Michael|New York|    USA|1234567890|
+------+-------+--------+-------+----------+



In [32]:
# Upsert the source DataFrame into the Delta table, matching rows on emp_id.
# Matched rows have their non-key columns overwritten from the source;
# unmatched source rows are inserted with all five columns.
non_key_columns = ["name", "city", "country", "contact_no"]
update_assignments = {column: f"source.{column}" for column in non_key_columns}
insert_assignments = {"emp_id": "source.emp_id", **update_assignments}

merge_builder = delta_df.alias("target").merge(
    source=df.alias("source"), condition="target.emp_id=source.emp_id"
)
merge_builder = merge_builder.whenMatchedUpdate(set=update_assignments)
merge_builder = merge_builder.whenNotMatchedInsert(values=insert_assignments)
merge_builder.execute()

                                                                                

In [33]:
# Target table after the DeltaTable API merge, read back through the catalog.
target_rows = spark.sql("select * from dim_employee")
target_rows.show()

+------+-------+--------+-------+----------+
|emp_id|   name|    city|country|contact_no|
+------+-------+--------+-------+----------+
|  1000|Michael| Chicago|    USA|1234567890|
|  2000|  Sarah|New York|    USA|1234567890|
|  3000|  David| Atlanta|    USA|1234567890|
+------+-------+--------+-------+----------+

