In [0]:
from pyspark.sql.functions import lit
from datetime import datetime

# Configuration
# One timestamp for the whole run: it is stamped on every ingested row, on
# every run-history record and on the archived file name.
current_time = datetime.now().isoformat()
ingest_type = "batch"
# Landing folder that holds the incoming files for this ingest type.
path = f"/Volumes/capstone/schema_bronze/00_data/{ingest_type}/"
# Files are moved here once they have been loaded.
archive_path = path + "archive/"
# Directory entries are listed with a trailing "/" (that is how "archive/"
# shows up). Skip every directory, not only the archive: a sub-folder is not a
# batch file, and its name would produce an invalid table name downstream.
file_list = [
    entry.name
    for entry in dbutils.fs.ls(path)
    if not entry.name.endswith("/")
]
catalog_schema = "capstone.schema_bronze"

# Show what this run is about to ingest.
file_list

In [0]:
def _build_log_entry(file_name, status, entry_type, error=None):
    """Build the run-history record for one file.

    Args:
        file_name: Name of the file as listed in the landing folder.
        status: Outcome code: "SUCCESS", "NO_FILE" or "ERROR".
        entry_type: Human-readable description of the outcome.
        error: The caught exception, or None when nothing failed.

    Returns:
        dict with the keys file_name, timestamp, status, type, error_message.
    """
    return {
        "file_name": file_name,
        "timestamp": current_time,
        "status": status,
        "type": entry_type,
        "error_message": None if error is None else str(error),
    }


# One record per file handled in this run, in processing order. Previously only
# the record of the last file survived the loop.
run_history = []

# Record of the most recently handled file. Reset up front so that an empty
# file_list leaves None here instead of an undefined name or a stale record
# from an earlier run of this cell.
log_run_history = None

# Hadoop-style paths do not accept ":" inside a path element, and an ISO
# timestamp contains several, so use a colon-free variant for the archive name.
archive_stamp = str(current_time).replace(":", "-")

for file in file_list:

    file_path = path + file
    # Table name is everything before the first "." of the file name.
    table_full = catalog_schema + "." + file.split(".")[0]

    # The listing may be out of date by now, so confirm the file is still there.
    try:
        dbutils.fs.ls(file_path)
    except Exception as file_check_error:
        error_text = str(file_check_error)
        if "java.io.FileNotFoundException" in error_text or "Path does not exist" in error_text:
            log_run_history = _build_log_entry(
                file, "NO_FILE", "Batch file or directory not found", file_check_error
            )
        else:
            # Any other file system failure (permissions, connectivity, ...).
            log_run_history = _build_log_entry(
                file, "ERROR", "Error occurred during file system check", file_check_error
            )
        run_history.append(log_run_history)
        print(log_run_history["type"])
        continue

    print(f"{file} exists")
    print(f"processing {file}........")
    try:
        # Read the CSV from the volume and tag each row with its origin and
        # the run timestamp.
        claims_batch_df = (
            spark.read
                .option("header", "true")
                .csv(file_path)
                .withColumn("_source", lit(ingest_type))
                .withColumn("_ingestion_timestamp", lit(current_time).cast("timestamp"))
        )

        # The count was computed but never reported; surface it in the output.
        record_count = claims_batch_df.count()
        print(f"{record_count} records read from {file}")

        # Replace the target table with the contents of this file.
        claims_batch_df.write.mode("overwrite").saveAsTable(table_full)

        # Archive the source file only after the table write has succeeded.
        dbutils.fs.mv(file_path, archive_path + archive_stamp + "_" + file)

        log_run_history = _build_log_entry(
            file, "SUCCESS", f"{ingest_type} file processed successfully"
        )
    except Exception as processing_error:
        # Keep going with the remaining files; the failure is recorded below.
        log_run_history = _build_log_entry(
            file, "ERROR", "Error occurred during file processing", processing_error
        )
        print(processing_error)

    run_history.append(log_run_history)
    print(log_run_history["type"])
    


In [0]:
# Show the run-history record left behind by the ingestion loop. The loop
# reassigns this name for each file, so this is the record of the last file
# it handled.
log_run_history