# Step 1
### Analyze and extract the JSON information into three Parquet files

![](/Volumes/de_demo/default/ev_data/1. Analyze and Extract.png)

In [0]:
import json
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, explode, explode_outer, split
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, StructType, StructField
from pyspark.sql.types import ArrayType,StructType

# Create the Spark session used by every cell below (getOrCreate reuses an existing session if there is one)
spark = (
    SparkSession.builder
    .appName("Read JSON Data")
    .master("local[*]")
    .getOrCreate()
)

In [0]:
# Load the raw JSON document from the S3 bucket.
# The "multiline" option is enabled so the reader accepts a JSON record that spans several lines.
file = "s3a://evdata-test/raw/ElectricVehiclePopulationData.json"
multiline_df = (
    spark.read
    .option("multiline", "true")
    .json(file)
)

# Inspect the inferred schema and a sample of the loaded rows
multiline_df.printSchema()
multiline_df.show()

root
 |-- data: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- meta: struct (nullable = true)
 |    |-- view: struct (nullable = true)
 |    |    |-- approvals: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- reviewedAt: long (nullable = true)
 |    |    |    |    |-- reviewedAutomatically: boolean (nullable = true)
 |    |    |    |    |-- state: string (nullable = true)
 |    |    |    |    |-- submissionDetails: struct (nullable = true)
 |    |    |    |    |    |-- permissionType: string (nullable = true)
 |    |    |    |    |-- submissionId: long (nullable = true)
 |    |    |    |    |-- submissionObject: string (nullable = true)
 |    |    |    |    |-- submissionOutcome: string (nullable = true)
 |    |    |    |    |-- submissionOutcomeApplication: struct (nullable = true)
 |    |    |    |    |    |-- failureCount: long (nullable = true

In [0]:
# Split the raw document into three DataFrames: table metadata, column metadata and vehicle data.

# Table metadata: every field under meta.view except the per-column definitions
table_metadata = multiline_df.select("meta.view.*").drop("columns")

# Column metadata: one row per entry of meta.view.columns, with that entry's fields promoted to columns
columns_metadata = (
    multiline_df
    .select(explode(col("meta.view.columns")).alias("columns"))
    .select("columns.*")
)

# Vehicle data: one row per entry of the top-level data array, kept as an array in row_data
vehicle_data = multiline_df.select(explode(col("data")).alias("row_data"))

In [0]:
# Function to flatten the json by iterating through fields - both arrays and structs


def _complex_fields(df):
    """Return a ``{column name: data type}`` dict of the array and struct columns in ``df``'s schema."""
    return {
        field.name: field.dataType
        for field in df.schema.fields
        if isinstance(field.dataType, (ArrayType, StructType))
    }


def flatten_json(df, verbose=True):
    """
    Flattens a DataFrame with complex nested fields (Arrays and Structs) by converting them into individual columns.

    Columns are processed one at a time until no array or struct column is left:

    - a struct column ``parent`` is replaced by one column per sub-field, named ``parent_child``;
    - an array column is exploded with ``explode_outer``, i.e. it is replaced by its elements,
      one element per row.

    Because every array column is exploded in turn, a DataFrame with several array columns
    ends up with one row per combination of their elements.

    Parameters:
    - df: The input DataFrame with complex nested fields
    - verbose: When True (the default) print the input schema and a progress line for every
      column that gets processed. Pass False to suppress this output.

    Returns:
    - The flattened DataFrame with all complex fields expanded into separate columns.
    """
    # Complex fields (arrays and structs) still present in the schema
    pending = _complex_fields(df)
    if verbose:
        print(df.schema)
        print("")

    while pending:
        # Always work on the first remaining complex column; the schema is rescanned after each step
        col_name, col_type = next(iter(pending.items()))
        if verbose:
            print("Processing :" + col_name + " Type : " + str(type(col_type)))

        if isinstance(col_type, StructType):
            # Flatten the struct: promote every sub-field to a top-level column, then drop the struct.
            # NOTE(review): a generated "parent_child" name could clash with an existing column - confirm
            # this cannot happen for the schemas passed in.
            expanded = [
                col(col_name + '.' + sub_field.name).alias(col_name + '_' + sub_field.name)
                for sub_field in col_type
            ]
            df = df.select("*", *expanded).drop(col_name)
        elif isinstance(col_type, ArrayType):
            # Explode the array: the column is replaced by its elements, one per row
            df = df.withColumn(col_name, explode_outer(col_name))

        # Expanding a struct or exploding an array can surface new nested fields, so rescan the schema
        pending = _complex_fields(df)

    return df

In [0]:
# Flatten the table metadata.
# The nested "metadata" field is left out because its fields have special characters.
table_metadata = multiline_df.select("meta.view.*").drop("columns")
table_meta = table_metadata.drop("metadata")

flatten_table_meta_df = flatten_json(table_meta)
flatten_table_meta_df.display()

StructType([StructField('approvals', ArrayType(StructType([StructField('reviewedAt', LongType(), True), StructField('reviewedAutomatically', BooleanType(), True), StructField('state', StringType(), True), StructField('submissionDetails', StructType([StructField('permissionType', StringType(), True)]), True), StructField('submissionId', LongType(), True), StructField('submissionObject', StringType(), True), StructField('submissionOutcome', StringType(), True), StructField('submissionOutcomeApplication', StructType([StructField('failureCount', LongType(), True), StructField('status', StringType(), True)]), True), StructField('submittedAt', LongType(), True), StructField('submitter', StructType([StructField('displayName', StringType(), True), StructField('id', StringType(), True)]), True), StructField('workflowId', LongType(), True)]), True), True), StructField('assetType', StringType(), True), StructField('attribution', StringType(), True), StructField('averageRating', LongType(), True),

assetType,attribution,averageRating,category,createdAt,description,displayType,downloadCount,flags,hideFromCatalog,hideFromDataJson,id,name,newBackend,numberOfComments,oid,provenance,publicationAppendEnabled,publicationDate,publicationGroup,publicationStage,rights,rowsUpdatedAt,rowsUpdatedBy,tableId,tags,totalTimesRated,viewCount,viewLastModified,viewType,approvals_reviewedAt,approvals_reviewedAutomatically,approvals_state,approvals_submissionId,approvals_submissionObject,approvals_submissionOutcome,approvals_submittedAt,approvals_workflowId,clientContext_clientContextVariables,grants_flags,grants_inherited,grants_type,owner_displayName,owner_flags,owner_id,owner_profileImageUrlLarge,owner_profileImageUrlMedium,owner_profileImageUrlSmall,owner_screenName,owner_type,tableAuthor_displayName,tableAuthor_flags,tableAuthor_id,tableAuthor_profileImageUrlLarge,tableAuthor_profileImageUrlMedium,tableAuthor_profileImageUrlSmall,tableAuthor_screenName,tableAuthor_type,approvals_submissionDetails_permissionType,approvals_submissionOutcomeApplication_failureCount,approvals_submissionOutcomeApplication_status,approvals_submitter_displayName,approvals_submitter_id
dataset,Washington State Department of Licensing,0,Transportation,1555435581,This dataset shows the Battery Electric Vehicles (BEVs) and Plug-in Hybrid Electric Vehicles (PHEVs) that are currently registered through Washington State Department of Licensing (DOL).,table,52628,default,False,False,f6w7-q2d2,Electric Vehicle Population Data,True,0,38591322,official,False,1655411270,16109923,published,read,1676414279,nzip-b272,18748445,tesla,0,23105,1676414230,tabular,1559931329,True,approved,4774840,public_audience_request,change_audience,1559931329,2106,,public,False,viewer,Department of Licensing,acceptedEula,eagg-6py7,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,Department of Licensing,acceptedEula,eagg-6py7,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,READ,0,success,Department of Licensing,eagg-6py7
dataset,Washington State Department of Licensing,0,Transportation,1555435581,This dataset shows the Battery Electric Vehicles (BEVs) and Plug-in Hybrid Electric Vehicles (PHEVs) that are currently registered through Washington State Department of Licensing (DOL).,table,52628,default,False,False,f6w7-q2d2,Electric Vehicle Population Data,True,0,38591322,official,False,1655411270,16109923,published,read,1676414279,nzip-b272,18748445,tesla,0,23105,1676414230,tabular,1559931329,True,approved,4774840,public_audience_request,change_audience,1559931329,2106,,public,False,viewer,Department of Licensing,acceptedEula,eagg-6py7,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,Department of Licensing,mayBeStoriesCoOwner,eagg-6py7,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,READ,0,success,Department of Licensing,eagg-6py7
dataset,Washington State Department of Licensing,0,Transportation,1555435581,This dataset shows the Battery Electric Vehicles (BEVs) and Plug-in Hybrid Electric Vehicles (PHEVs) that are currently registered through Washington State Department of Licensing (DOL).,table,52628,default,False,False,f6w7-q2d2,Electric Vehicle Population Data,True,0,38591322,official,False,1655411270,16109923,published,read,1676414279,nzip-b272,18748445,tesla,0,23105,1676414230,tabular,1559931329,True,approved,4774840,public_audience_request,change_audience,1559931329,2106,,public,False,viewer,Department of Licensing,mayBeStoriesCoOwner,eagg-6py7,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,Department of Licensing,acceptedEula,eagg-6py7,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,READ,0,success,Department of Licensing,eagg-6py7
dataset,Washington State Department of Licensing,0,Transportation,1555435581,This dataset shows the Battery Electric Vehicles (BEVs) and Plug-in Hybrid Electric Vehicles (PHEVs) that are currently registered through Washington State Department of Licensing (DOL).,table,52628,default,False,False,f6w7-q2d2,Electric Vehicle Population Data,True,0,38591322,official,False,1655411270,16109923,published,read,1676414279,nzip-b272,18748445,tesla,0,23105,1676414230,tabular,1559931329,True,approved,4774840,public_audience_request,change_audience,1559931329,2106,,public,False,viewer,Department of Licensing,mayBeStoriesCoOwner,eagg-6py7,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,Department of Licensing,mayBeStoriesCoOwner,eagg-6py7,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,READ,0,success,Department of Licensing,eagg-6py7
dataset,Washington State Department of Licensing,0,Transportation,1555435581,This dataset shows the Battery Electric Vehicles (BEVs) and Plug-in Hybrid Electric Vehicles (PHEVs) that are currently registered through Washington State Department of Licensing (DOL).,table,52628,default,False,False,f6w7-q2d2,Electric Vehicle Population Data,True,0,38591322,official,False,1655411270,16109923,published,read,1676414279,nzip-b272,18748445,leaf,0,23105,1676414230,tabular,1559931329,True,approved,4774840,public_audience_request,change_audience,1559931329,2106,,public,False,viewer,Department of Licensing,acceptedEula,eagg-6py7,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,Department of Licensing,acceptedEula,eagg-6py7,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,READ,0,success,Department of Licensing,eagg-6py7
dataset,Washington State Department of Licensing,0,Transportation,1555435581,This dataset shows the Battery Electric Vehicles (BEVs) and Plug-in Hybrid Electric Vehicles (PHEVs) that are currently registered through Washington State Department of Licensing (DOL).,table,52628,default,False,False,f6w7-q2d2,Electric Vehicle Population Data,True,0,38591322,official,False,1655411270,16109923,published,read,1676414279,nzip-b272,18748445,leaf,0,23105,1676414230,tabular,1559931329,True,approved,4774840,public_audience_request,change_audience,1559931329,2106,,public,False,viewer,Department of Licensing,acceptedEula,eagg-6py7,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,Department of Licensing,mayBeStoriesCoOwner,eagg-6py7,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,READ,0,success,Department of Licensing,eagg-6py7
dataset,Washington State Department of Licensing,0,Transportation,1555435581,This dataset shows the Battery Electric Vehicles (BEVs) and Plug-in Hybrid Electric Vehicles (PHEVs) that are currently registered through Washington State Department of Licensing (DOL).,table,52628,default,False,False,f6w7-q2d2,Electric Vehicle Population Data,True,0,38591322,official,False,1655411270,16109923,published,read,1676414279,nzip-b272,18748445,leaf,0,23105,1676414230,tabular,1559931329,True,approved,4774840,public_audience_request,change_audience,1559931329,2106,,public,False,viewer,Department of Licensing,mayBeStoriesCoOwner,eagg-6py7,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,Department of Licensing,acceptedEula,eagg-6py7,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,READ,0,success,Department of Licensing,eagg-6py7
dataset,Washington State Department of Licensing,0,Transportation,1555435581,This dataset shows the Battery Electric Vehicles (BEVs) and Plug-in Hybrid Electric Vehicles (PHEVs) that are currently registered through Washington State Department of Licensing (DOL).,table,52628,default,False,False,f6w7-q2d2,Electric Vehicle Population Data,True,0,38591322,official,False,1655411270,16109923,published,read,1676414279,nzip-b272,18748445,leaf,0,23105,1676414230,tabular,1559931329,True,approved,4774840,public_audience_request,change_audience,1559931329,2106,,public,False,viewer,Department of Licensing,mayBeStoriesCoOwner,eagg-6py7,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,Department of Licensing,mayBeStoriesCoOwner,eagg-6py7,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,READ,0,success,Department of Licensing,eagg-6py7
dataset,Washington State Department of Licensing,0,Transportation,1555435581,This dataset shows the Battery Electric Vehicles (BEVs) and Plug-in Hybrid Electric Vehicles (PHEVs) that are currently registered through Washington State Department of Licensing (DOL).,table,52628,default,False,False,f6w7-q2d2,Electric Vehicle Population Data,True,0,38591322,official,False,1655411270,16109923,published,read,1676414279,nzip-b272,18748445,nissan,0,23105,1676414230,tabular,1559931329,True,approved,4774840,public_audience_request,change_audience,1559931329,2106,,public,False,viewer,Department of Licensing,acceptedEula,eagg-6py7,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,Department of Licensing,acceptedEula,eagg-6py7,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,READ,0,success,Department of Licensing,eagg-6py7
dataset,Washington State Department of Licensing,0,Transportation,1555435581,This dataset shows the Battery Electric Vehicles (BEVs) and Plug-in Hybrid Electric Vehicles (PHEVs) that are currently registered through Washington State Department of Licensing (DOL).,table,52628,default,False,False,f6w7-q2d2,Electric Vehicle Population Data,True,0,38591322,official,False,1655411270,16109923,published,read,1676414279,nzip-b272,18748445,nissan,0,23105,1676414230,tabular,1559931329,True,approved,4774840,public_audience_request,change_audience,1559931329,2106,,public,False,viewer,Department of Licensing,acceptedEula,eagg-6py7,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,Department of Licensing,mayBeStoriesCoOwner,eagg-6py7,/api/users/eagg-6py7/profile_images/LARGE,/api/users/eagg-6py7/profile_images/THUMB,/api/users/eagg-6py7/profile_images/TINY,Department of Licensing,interactive,READ,0,success,Department of Licensing,eagg-6py7


In [0]:
# Preview the column metadata: one row per column definition taken from meta.view.columns
columns_metadata.display()

computationStrategy,dataTypeName,description,fieldName,flags,format,id,name,position,renderTypeName,tableColumnId
,meta_data,,:sid,List(hidden),List(null),-1,sid,0,meta_data,
,meta_data,,:id,List(hidden),List(null),-1,id,0,meta_data,
,meta_data,,:position,List(hidden),List(null),-1,position,0,meta_data,
,meta_data,,:created_at,List(hidden),List(null),-1,created_at,0,meta_data,
,meta_data,,:created_meta,List(hidden),List(null),-1,created_meta,0,meta_data,
,meta_data,,:updated_at,List(hidden),List(null),-1,updated_at,0,meta_data,
,meta_data,,:updated_meta,List(hidden),List(null),-1,updated_meta,0,meta_data,
,meta_data,,:meta,List(hidden),List(null),-1,meta,0,meta_data,
,text,The 1st 10 characters of each vehicle's Vehicle Identification Number (VIN).,vin_1_10,,List(null),561974342,VIN (1-10),1,text,79309982.0
,text,The county in which the registered owner resides.,county,,List(null),561974343,County,2,text,79309981.0


In [0]:
# Preview the vehicle data: one row per entry of the top-level data array, held as an array in row_data
vehicle_data.display()

row_data
"List(row-zt4k~iszy.uhv6, 00000000-0000-0000-62B4-C1BC527B773A, 0, 1676414233, null, 1676414284, null, { }, 5YJ3E1EA8J, San Diego, Oceanside, CA, 92051, 2018, TESLA, MODEL 3, Battery Electric Vehicle (BEV), Clean Alternative Fuel Vehicle Eligible, 215, 0, null, 153998050, null, null, 06073018509, null, null, null)"
"List(row-5r58~kb8y.789r, 00000000-0000-0000-B54E-F27AFFF902F6, 0, 1676414233, null, 1676414284, null, { }, 3FA6P0PU7H, Sedgwick, Derby, KS, 67037, 2017, FORD, FUSION, Plug-in Hybrid Electric Vehicle (PHEV), Not eligible due to low battery range, 21, 0, null, 138214331, POINT (-97.27013 37.54531), null, 20173009801, 1291, null, null)"
"List(row-84ix~3wif_u9ju, 00000000-0000-0000-F67B-BBFF22B88E48, 0, 1676414233, null, 1676414298, null, { }, 1N4AZ0CP8D, Snohomish, Marysville, WA, 98271, 2013, NISSAN, LEAF, Battery Electric Vehicle (BEV), Clean Alternative Fuel Vehicle Eligible, 75, 0, 38, 3129059, POINT (-122.19388 48.15353), PUGET SOUND ENERGY INC, 53061052805, 3213, 2, 40)"
"List(row-wiar-siae_sed9, 00000000-0000-0000-0360-775CFE2EDAFF, 0, 1676414233, null, 1676414298, null, { }, WBY8P8C58K, Kitsap, Bremerton, WA, 98337, 2019, BMW, I3, Plug-in Hybrid Electric Vehicle (PHEV), Clean Alternative Fuel Vehicle Eligible, 126, 0, 26, 166525635, POINT (-122.62749 47.565), PUGET SOUND ENERGY INC, 53035080500, 848, 6, 33)"
"List(row-abd5~finn.nzkg, 00000000-0000-0000-3182-A2040CC92549, 0, 1676414233, null, 1676414298, null, { }, 5YJ3E1EA7K, Snohomish, Edmonds, WA, 98026, 2019, TESLA, MODEL 3, Battery Electric Vehicle (BEV), Clean Alternative Fuel Vehicle Eligible, 220, 0, 32, 475248315, POINT (-122.31768 47.87166), PUGET SOUND ENERGY INC, 53061050800, 3213, 2, 7)"
"List(row-9qmp.7c6z.3ncd, 00000000-0000-0000-2B0A-D5C4FBDB5076, 0, 1676414233, null, 1676414298, null, { }, 1G1FZ6S07L, Walla Walla, Walla Walla, WA, 99362, 2020, CHEVROLET, BOLT EV, Battery Electric Vehicle (BEV), Clean Alternative Fuel Vehicle Eligible, 259, 0, 16, 150312991, POINT (-118.34261 46.07068), PACIFICORP, 53071920702, 2743, 5, 23)"
"List(row-qnct~snmi~bfh6, 00000000-0000-0000-64A6-BA447C35E0FA, 0, 1676414233, null, 1676414298, null, { }, KNDCC3LG1L, Snohomish, Everett, WA, 98204, 2020, KIA, NIRO, Battery Electric Vehicle (BEV), Clean Alternative Fuel Vehicle Eligible, 239, 0, 21, 152471728, POINT (-122.25527 47.90456), PUGET SOUND ENERGY INC, 53061041905, 3213, 2, 7)"
"List(row-isc5~dgtb~rt8i, 00000000-0000-0000-B95A-E0F2BAF47895, 0, 1676414233, null, 1676414298, null, { }, 1N4AZ0CP5D, Island, Oak Harbor, WA, 98277, 2013, NISSAN, LEAF, Battery Electric Vehicle (BEV), Clean Alternative Fuel Vehicle Eligible, 75, 0, 10, 234850367, POINT (-122.64682 48.29077), PUGET SOUND ENERGY INC, 53029970300, 3007, 2, 16)"
"List(row-un7y_4v74~he2e, 00000000-0000-0000-5F1F-0A80D3988C08, 0, 1676414233, null, 1676414298, null, { }, 5YJ3E1EA1L, Snohomish, Bothell, WA, 98021, 2020, TESLA, MODEL 3, Battery Electric Vehicle (BEV), Clean Alternative Fuel Vehicle Eligible, 266, 0, 1, 110771972, POINT (-122.18384 47.8031), PUGET SOUND ENERGY INC, 53061051937, 3213, 1, 1)"
"List(row-ynhd~9n9n_nrh6, 00000000-0000-0000-D1B9-DB78A4838B73, 0, 1676414233, null, 1676414298, null, { }, 5YJ3E1EB8K, Thurston, Olympia, WA, 98501, 2019, TESLA, MODEL 3, Battery Electric Vehicle (BEV), Clean Alternative Fuel Vehicle Eligible, 220, 0, 22, 477769939, POINT (-122.89166 47.03956), PUGET SOUND ENERGY INC, 53067010700, 2742, 10, 28)"


In [0]:
# Configure S3A access in the Spark configuration.
# The AWS credentials are read from the environment so that no secret is stored in the
# notebook source or its version history. A KeyError is raised if AWS_ACCESS_KEY_ID or
# AWS_SECRET_ACCESS_KEY is not set.
# SECURITY: an access key pair used to be hardcoded in this cell; treat that key pair as
# compromised and rotate it in AWS IAM.
# NOTE(review): this cell runs after the cell that already reads from s3a://evdata-test -
# confirm the credentials are available before that read on a fresh "Run All".
spark.conf.set("fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
spark.conf.set("fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])
spark.conf.set("fs.s3a.endpoint", "s3.amazonaws.com")


In [0]:
# Write both metadata DataFrames to the derived area of the bucket as Parquet.
# coalesce(1) brings each DataFrame down to a single partition before the write;
# mode "overwrite" replaces whatever an earlier run left at the target path.
(
    flatten_table_meta_df
    .coalesce(1)
    .write
    .mode("overwrite")
    .parquet("s3a://evdata-test/derived/table_metadata")
)
(
    columns_metadata
    .coalesce(1)
    .write
    .mode("overwrite")
    .parquet("s3a://evdata-test/derived/columns_metadata")
)



In [0]:
# Column names for the vehicle data, taken from the "name" field of the column metadata
data_headers = [row["name"] for row in columns_metadata.select("name").collect()]

# Pull element i of every row_data array into its own column, named after the i-th header.
# The number of columns follows the header list instead of a hard-coded 28, so this cell
# keeps working if the dataset gains or loses columns.
vehicle_data_exploded = vehicle_data.select(
    *[col("row_data").getItem(i).alias(name) for i, name in enumerate(data_headers)]
)

# Write the vehicle data, now with named columns, to the derived area of the bucket as Parquet
vehicle_data_exploded.write.mode("overwrite").parquet("s3a://evdata-test/derived/vehicle_data")



# Findings 

The JSON document has two top-level elements:
1. meta
2. data 

`meta` holds two kinds of metadata: dataset-level metadata (referred to as *table metadata* from here on) and column metadata.

Table metadata consists of various array and struct fields, including the approvals and submission information. It is flattened before being written to the S3 bucket.

Column metadata consists of each column's name, data type, description, position and other details.

The names from the column metadata are extracted and applied as the column headers of the vehicle data, which is then written as a Parquet file to the derived folder.

![](/Volumes/de_demo/default/ev_data/S3 File processing.png)