In [1]:
from functools import partial
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, regexp_replace, flatten, explode, struct, create_map, array
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, ArrayType, TimestampType


In [2]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType, StringType

In [113]:
spark = SparkSession.builder.appName('chap_4').master("local[*]").getOrCreate()

In [6]:
from faker import Faker
import pandas as pd
fake = Faker()
def generate_data(num):
    row = [{"name":fake.name(),
           "address":fake.address(),
           "city":fake.city(),
           "state":fake.state(),
           "purchase_date":fake.date_time(),
            "purchase_id":fake.pyfloat(),
             "sales":fake.pyfloat()
           }]
    return row
panda = pd.DataFrame(generate_data(2))
fake_data = spark.createDataFrame(panda)
fake_data.write.format("parquet").mode("append").save("/Users/saisundarmasetty/Documents/data_architect_ws/chap3_lab_data/")

                                                                                

In [7]:
# Create a bronze table from the dataset. The data should be streaming but set up to trigger once.
location = "/Users/saisundarmasetty/Documents/data_architect_ws/chap3_lab_data/"
format = "parquet"
schema = spark.read.format(format).load(location).schema
users = spark.readStream.schema(schema).format(format).load(location)

In [8]:
users.printSchema()

root
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- purchase_date: timestamp (nullable = true)
 |-- purchase_id: double (nullable = true)
 |-- sales: double (nullable = true)



In [9]:
bronze_schema = users.schema
bronze_location = "/Users/saisundarmasetty/Documents/data_architect_ws/chap4_lab_bronze/"
bronze_format = "parquet"
checkpoint_location = f"{bronze_location}/_checkpoint"
output_mode = "append"
bronze_query = users.writeStream.format(bronze_format).trigger(once=True).option("checkpointLocation", checkpoint_location).option("path", bronze_location).outputMode(output_mode).start()


25/07/02 19:28:09 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [10]:
spark.read.format(bronze_format).schema(users.schema).load(bronze_location).show()


+----------------+--------------------+---------------+----------+--------------------+-------------------+-------------------+
|            name|             address|           city|     state|       purchase_date|        purchase_id|              sales|
+----------------+--------------------+---------------+----------+--------------------+-------------------+-------------------+
|   Sylvia Obrien|88043 William Mou...|  New Jaimestad|      Utah|1998-10-23 06:14:...| 3.4507044394328E13|2.53867269249356E11|
|Katherine Fuller|5694 Reilly Mount...|Christopherside|Washington|1994-03-17 10:37:...|-8.38804796858555E9|   480.211623671276|
|    David Nelson|4837 Wright Stati...| Christineburgh|   Vermont|1998-01-23 01:24:...|  -8152.74306742652| 8.2259285471756E12|
|    Corey Torres|3054 Christian Vi...|      Port Ryan|   Alabama|1988-11-07 03:16:...|   613647.515475998|  -34000.7887971917|
+----------------+--------------------+---------------+----------+--------------------+-----------------

our silver table, we are going add the following columns:

full_address: A combination of the address, city, and state
Id: Round up/down purchase_id and find the absolute value
Create and use a UDF to create a first_name column.

In [11]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf, lit, struct, concat, col, abs, floor

In [12]:
bronze_location = "/Users/saisundarmasetty/Documents/data_architect_ws/chap4_lab_bronze/"
schema = spark.read.format("parquet").load(bronze_location).schema
user_bronze = spark.readStream.format("parquet").schema(schema).load(bronze_location)

In [13]:
#creating the UDF function which deals with the datamanipulation
@udf(returnType = StringType())
def strip_name(x):
    return x.split()[0]

In [14]:
#Full address column , rounding up
address_columns = ["address","city","state"]
clean = user_bronze.select(col("*"),concat(*address_columns).alias("full_address"),floor(abs("purchase_id")).alias("id"),strip_name("name").alias("first_name"))

In [15]:
silver_location = "/Users/saisundarmasetty/Documents/data_architect_ws/chap4_lab_silver/"
silver_checkpoint = f"{silver_location}/_checkpoint"
format = "parquet"
output_mode = "append"
clean.writeStream.format(format).option("checkpointLocation","silver_checkpoint").option("path","silver_location").trigger(once=True).outputMode(output_mode).start()

25/07/02 19:28:20 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.query.StreamingQuery at 0x10c966300>

In [16]:
spark.read.format("parquet").load("silver_location").show(truncate = False)

+----------------+---------------------------------------------------------+---------------+----------+--------------------------+-------------------+-------------------+--------------------------------------------------------------------------+--------------+----------+
|name            |address                                                  |city           |state     |purchase_date             |purchase_id        |sales              |full_address                                                              |id            |first_name|
+----------------+---------------------------------------------------------+---------------+----------+--------------------------+-------------------+-------------------+--------------------------------------------------------------------------+--------------+----------+
|Sylvia Obrien   |88043 William Mountain Suite 486\nEast Lynnland, LA 91295|New Jaimestad  |Utah      |1998-10-23 06:14:58.553893|3.4507044394328E13 |2.53867269249356E11|88043 William 

Gold Table : MAX, MIN sales and states


In [17]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import lit, struct, sum,avg,max, min
silver_location = "/Users/saisundarmasetty/Documents/data_architect_ws/chap4_lab_bronze/"
schema = spark.read.format("parquet").load(silver_location).schema
users_silver = spark.readStream.format("parquet").schema(schema).load(silver_location)


In [4]:
pip install delta-spark

Note: you may need to restart the kernel to use updated packages.


In [5]:
from pyspark.sql import SparkSession
from delta import *

# Create a SparkSession with Delta Lake support
builder = SparkSession.builder.appName("DeltaApp") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()

print("Spark and Delta Lake are configured.")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/07/02 19:27:48 WARN Utils: Your hostname, Sai-Sundar-MacBook-Pro.local, resolves to a loopback address: 127.0.0.1; using 10.0.0.78 instead (on interface en0)
25/07/02 19:27:48 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/Users/saisundarmasetty/Documents/data_architect_ws/chapter2/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/saisundarmasetty/.ivy2.5.2/cache
The jars for the packages stored in: /Users/saisundarmasetty/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ff7b50de-9061-4bc1-bddb-bbde6bebfd25;1.0
	confs: [default]
	found io.delta#delta-spark_2.13;4.0.0 in central
	found io.delta#delta-storage;4.0.0 in central
	found org.antlr#antlr4-runtime;4.13.1 in central
downloa

Spark and Delta Lake are configured.


In [19]:

gold_agg = users_silver.groupBy("state").agg(min("sales").alias("minimum_sales"),max("sales").alias("maximum_sales"),avg("sales").alias("avg_sales"))
gold_location = "/Users/saisundarmasetty/Documents/data_architect_ws/chap4_lab_gold/"
gold_checkpoint_location = f"{gold_location}/_checkpoint"
format = "delta"
gold_agg.writeStream.format(format).option("checkpointLocation",gold_checkpoint_location).trigger(once=True).option("path",gold_location).outputMode("complete").start()


25/07/02 19:28:45 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.query.StreamingQuery at 0x10f831490>

25/07/02 19:28:48 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [20]:
spark.read.format(format).load(gold_location).show()

+----------+-------------------+-------------------+-------------------+
|     state|      minimum_sales|      maximum_sales|          avg_sales|
+----------+-------------------+-------------------+-------------------+
|Washington|   480.211623671276|   480.211623671276|   480.211623671276|
|   Alabama|  -34000.7887971917|  -34000.7887971917|  -34000.7887971917|
|   Vermont| 8.2259285471756E12| 8.2259285471756E12| 8.2259285471756E12|
|      Utah|2.53867269249356E11|2.53867269249356E11|2.53867269249356E11|
+----------+-------------------+-------------------+-------------------+

