In [None]:
import os
import shutil
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# --- CONFIG ---
# Point Spark at the loopback interface; this is set here, before the
# SparkSession is built in a later cell.
os.environ['SPARK_LOCAL_IP'] = '127.0.0.1'

# Console stream: the folder watched for incoming JSON files and the
# checkpoint directory used by the console sink.
input_path = "/tmp/input_json"
checkpoint_path = "/tmp/checkpoints/json_stream"

# Delta stream: the output table location and its own checkpoint directory.
delta_path = "/tmp/delta/json_to_delta"
chk_delta = "/tmp/delta/checkpoints/delta_stream_json"

# Clean up for a fresh start (destructive, so left as an opt-in: uncomment to
# wipe the watched folder and the console-stream checkpoint).
# if os.path.exists(input_path): shutil.rmtree(input_path)
# if os.path.exists(checkpoint_path): shutil.rmtree(checkpoint_path)

# Create the folder to watch if it is missing, so the file stream reader has
# an existing directory to monitor. exist_ok=True keeps this idempotent and
# never touches files that are already there.
os.makedirs(input_path, exist_ok=True)


# spark = SparkSession.builder \
#     .appName("FileFolderStream") \
#     .master("local[*]") \
#     .config("spark.driver.bindAddress", "127.0.0.1") \
#     .config("spark.driver.host", "127.0.0.1") \
#     .getOrCreate()




In [None]:
# Capture and display the installed PySpark version.
# NOTE(review): presumably used to check compatibility with the Delta Lake
# package version configured on the Spark session — confirm.
pyspark_version: str = pyspark.__version__
pyspark_version

In [None]:
# --- INIT SPARK ---
# Build (or reuse) a local session that uses every available core, is pinned
# to the loopback address, and has the Delta Lake package, SQL extension and
# catalog registered.
spark = (
    SparkSession.builder
    .appName("FileFolderStream")
    .master("local[*]")
    # Networking: keep the driver on 127.0.0.1.
    .config("spark.driver.bindAddress", "127.0.0.1")
    .config("spark.driver.host", "127.0.0.1")
    # Disable process-tree metrics collection.
    .config("spark.executor.processTreeMetrics.enabled", "false")
    # Delta Lake: package coordinates, SQL extension and catalog implementation.
    .config("spark.jars.packages", "io.delta:delta-spark_2.13:4.0.1")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Only surface errors in the driver log output.
spark.sparkContext.setLogLevel("ERROR")

The Spark dashboard (web UI) is available at http://localhost:4040/ while the session is running.

In [None]:
# --- DEFINE SCHEMA ---
# The schema is passed to the file stream reader explicitly rather than
# relying on Spark to infer it from the incoming files.
json_schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("city", StringType(), True)
])

# --- READ STREAM ---
# Monitor the input folder and pick up NEW JSON files as they arrive.
# Parenthesised chains are used instead of backslash continuations so that a
# stray trailing character cannot silently break the statement.
df = (
    spark.readStream
    .format("json")
    .schema(json_schema)
    .load(input_path)  # <--- The folder to watch
)

# --- WRITE STREAM ---
# Sink 1: print each micro-batch to the console.
query = (
    df.writeStream
    .format("console")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_path)
    .queryName('Console stream')
    .start()
)

# Sink 2: append the same rows to a Delta table. It uses its own checkpoint
# directory, separate from the console query's.
query_delta = (
    df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", chk_delta)
    .queryName('Delta stream')
    .start(delta_path)
)


print(f"Monitoring {input_path} for new JSON files...")
# Blocks this cell on the console query only; query_delta was started above
# and is not awaited here.
query.awaitTermination()