Events Decomposition and Data Extraction


In [99]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType

def process_nested_fields(df, field_name, fields, extra_directory=None, output_root="/data/events/data"):
    """Recursively split a nested DataFrame into flat parquet tables.

    At each level the scalar (non-struct, non-array) fields are written as one
    parquet table. Struct fields are then descended into on the same DataFrame
    using dotted column paths, and array fields are exploded into a new
    DataFrame (one row per element) that is processed from its own root.

    Args:
        df: DataFrame that contains the columns described by ``fields``.
        field_name: Dotted path of the struct currently being processed,
            relative to ``df``; ``""`` means the top level of ``df``.
        fields: The ``StructField`` objects found at ``field_name``.
        extra_directory: Directory name to use for this level's table instead
            of ``field_name``. Used for exploded arrays, whose DataFrame is
            processed with an empty ``field_name``.
        output_root: Directory under which every table is written. Defaults
            to the location the function originally hard-coded.

    Returns:
        None. The function only writes parquet output (mode ``overwrite``)
        and prints each destination path.

    Notes:
        * ``explode`` produces no rows for null or empty arrays, and no key
          columns are carried over from the parent, so child tables cannot be
          joined back to their parent rows.
        * ``extra_directory`` is applied only to the level it is passed to.
          Structs and arrays nested inside an exploded array are named
          relative to the exploded DataFrame (e.g. ``jobDetailUSA``), so two
          branches with the same relative name overwrite each other.
    """
    # Alias for the exploded column. It must not contain dots: "a.b.*" is
    # parsed as "expand field b of column a", not as a column literally named
    # "a.b", so reusing the dotted field path as the alias breaks resolution.
    exploded_alias = "_exploded"

    def qualified(name):
        # Dotted path of ``name`` below the current struct (bare at the root).
        return ".".join([field_name, name]).strip(".")

    # Scalar fields at this level form one flat table.
    output_table_fields = [
        qualified(field.name)
        for field in fields
        if not isinstance(field.dataType, (ArrayType, StructType))
    ]

    if output_table_fields:
        parquet_path = extra_directory or field_name or "root"
        target = f"{output_root}/{parquet_path}"
        print(f"Writing parquet file to: {target}")
        df.select(output_table_fields).write.mode("overwrite").parquet(target)

    # Structs live in the same rows as their parent, so keep the DataFrame and
    # just extend the dotted path.
    for field in fields:
        if isinstance(field.dataType, StructType):
            process_nested_fields(
                df,
                qualified(field.name),
                field.dataType.fields,
                output_root=output_root,
            )

    # Arrays change the row cardinality, so each becomes its own DataFrame.
    for field in fields:
        if not isinstance(field.dataType, ArrayType):
            continue

        path_name = qualified(field.name)
        element_type = field.dataType.elementType
        nested_df = df.select(explode(col(path_name)).alias(exploded_alias))

        # Arrays of arrays: keep exploding until the elements are not arrays.
        while isinstance(element_type, ArrayType):
            nested_df = nested_df.select(
                explode(col(exploded_alias)).alias(exploded_alias)
            )
            element_type = element_type.elementType

        if isinstance(element_type, StructType):
            # Promote the element's fields to top-level columns.
            nested_df = nested_df.select(exploded_alias + ".*")
        else:
            # Star expansion is only valid for structs; scalar elements are
            # kept as a single column named after the array field.
            nested_df = nested_df.select(col(exploded_alias).alias(field.name))

        process_nested_fields(
            nested_df,
            "",
            nested_df.schema.fields,
            path_name,
            output_root=output_root,
        )
        
        


# Obtain the Spark session for this job (an existing one is reused if present).
spark = (
    SparkSession.builder
    .appName("DataJsonProcessing")
    .getOrCreate()
)

# Input file with the events to decompose (JSON Lines, judging by the extension).
data_path = "/data/events/sap.sf.workforce.assignment.updated.v1_20250708130214.jsonl"

# No explicit schema is supplied; the reader's resulting schema drives the
# decomposition below.
events_dataframe = spark.read.json(data_path)

# Start at the top level of the DataFrame (empty field path).
process_nested_fields(
    df=events_dataframe,
    field_name="",
    fields=events_dataframe.schema.fields,
)




    
    







Writing parquet file to: /data/events/data/root
Writing parquet file to: /data/events/data/data
Writing parquet file to: /data/events/data/data.assignmentDetail.standardAssignmentDetail
Writing parquet file to: /data/events/data/data.jobDetails
Writing parquet file to: /data/events/data/jobDetailUSA
