In [0]:
# --- Configuration ---
# Update these three names to match your Unity Catalog catalog, schema and
# volume. The paths below are derived from them, so this is the only place
# that needs editing when the project is moved.
catalog = "main"
schema = "db_project"
volume = "raw_data"

# Root of the volume that holds the raw input file (trailing slash included,
# because later paths are built by plain string concatenation).
base_path = f"/Volumes/{catalog}/{schema}/{volume}/"
# Folder under the volume root where the generated chunks are written.
output_path = f"{base_path}chunks/"

# --- 1. Load Data & Profiling ---
# Read the primary dataset from the volume. The first row is treated as the
# header and column types are inferred from the data.
main_csv_path = f"{base_path}1_main.csv"
df_main = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(main_csv_path)
)

# Requirement Day 1: Dataset Profiling
# Print the summary statistics produced by DataFrame.describe().
print("Dataset Profile:")
profile_df = df_main.describe()
profile_df.show()
# Tip: Use display(df_main) in a separate cell for a visual profile

# --- 2. Data Chunking (4 Parts) ---
# Split the data into four parts with equal weights. randomSplit assigns rows
# randomly according to the weights, so each part holds roughly (not exactly)
# a quarter of the rows; the fixed seed is passed so the split is repeatable.
split_weights = [0.25] * 4
chunk1, chunk2, chunk3, chunk4 = df_main.randomSplit(split_weights, seed=123)

# Chunks 1 and 2 are written as CSV with a header row:
#   chunk1 -> first-time (initial) load
#   chunk2 -> incremental load
csv_targets = {
    "chunk1_initial": chunk1,
    "chunk2_incremental": chunk2,
}
for folder_name, part_df in csv_targets.items():
    csv_writer = part_df.write.mode("overwrite").option("header", True)
    csv_writer.csv(f"{output_path}{folder_name}")

# Chunk 3: written in JSON format.
chunk3.write.mode("overwrite").format("json").save(f"{output_path}chunk3_json")

# Chunk 4: intended to be XML. For the Day 1 setup it is written as JSON
# into a temporary "_xml_temp" folder instead; the XML handling is to be
# refined on Day 3.
chunk4.write.mode("overwrite").json(f"{output_path}chunk4_xml_temp")

print("Chunking complete and saved to Volumes.")