In [1]:
# import necessary packages and libraries
import json, datetime, time

import pandas as pd

from azure.storage.blob import BlobServiceClient, BlobClient

from pyspark.sql.types import *

from pyspark.sql.functions import *

from delta import *

In [2]:
# connection settings for the storage account that holds both the source
# and the destination layers
# abfss is used because the storage account has HNS enabled
adlsAcct = "<storage account url>"
adlsSas = "<sas token>"
adlsCont = "insiders"

# blob store connection variables
storage_acct = "sparkmtndatalake"
container_name = "insiders"
linked_svc = "SparkMtnLake"

# the SAS token is fetched from the linked service definition
sas_token = mssparkutils.credentials.getConnectionStringOrCreds( linked_svc )

# hand the SAS token to Spark for this container / account pair
spark.conf.set(
    "fs.az.sas.%s.%s.x.y.z.abc" % ( container_name, storage_acct),
    sas_token,
)

# https endpoint for the blob SDK, abfss endpoint for Spark reads and writes
httpsUrl = "https://%s.x.y.z.abc/" % ( storage_acct )
abfssUrl = "abfss://%s@%s.x.y.z.abc/" % ( container_name, storage_acct )

# folder names of the lake layers
rawFldr = "/raw/"
bronzeFldr =  "/bronze/"
silverFldr = "/silver/"

# blob SDK clients: account level first, then scoped to the container
blobSvcConn = BlobServiceClient( account_url = httpsUrl, credential = sas_token )
contClient = blobSvcConn.get_container_client( container_name )

In [None]:
# silver layer
# part 1 - patient data

# create the schema first so the bulk load can reuse it
# grab first file manually - keep it simple
firstFilePath = abfssUrl + "bronze/reference_data/000047ca-00c7-492b-bf65-740805144cd2/"

# infer the schema from the first Patient file
# NOTE(review): with "multiline" enabled Spark parses each file as a single JSON
# document; if a Patient.ndjson file can hold several newline-delimited
# records, only part of it may be read - TODO confirm one record per file
pathPatientSchema = firstFilePath + "Patient.ndjson"
patientSchema = spark.read.option( "multiline", "true" ).json( pathPatientSchema ).schema

# columnNameOfCorruptRecord only has an effect when the supplied schema holds a
# string field with that name; without it Spark drops malformed records while
# parsing, so add the field explicitly (guarded in case it is already there)
if "corruptRecord" not in patientSchema.fieldNames():
    patientSchema = patientSchema.add( "corruptRecord", StringType(), True )

abfssFileList = abfssUrl + "bronze/reference_data/*/Patient.ndjson"

silverExportPath = abfssUrl + "silver/"

# recursively load all Patient files into a new DF with the schema defined above
patientDf = (
    spark.read
    .option( "multiline", "true" )
    .option( "columnNameOfCorruptRecord", "corruptRecord" )
    .option( "recursiveFileLookup", "true" )
    .schema( patientSchema )
    .json( abfssFileList )
)
# patientDf.show( 10, False )
# patientDf.printSchema()

##################
# patientAddress #
##################
# one row per entry of the patient's address array; explode_outer also keeps
# patients whose address array is null or empty
# "deceasedDateTime", - between gender and maritalStatus
patientAddressDf = patientDf.select(
    "id",
    "name",
    "birthDate",
    "gender",
    explode_outer( "address" ).alias( "addressExpand" ),
)

# debug helpers
# patientAddressDf.printSchema()
# patientAddressDf.show( 10, False )

# export to delta (the write itself is currently commented out)
exportPath = silverExportPath + "patientAddress/"
# patientAddressDf.select( "id", "birthDate", "gender", "addressExpand.city", "addressExpand.country", "addressExpand.postalCode", "addressExpand.state" ).select( "id", "birthDate", "gender", "city", "country", "postalCode", "state" ).write.format( "delta" ).save( exportPath )

#####################
# patientIdentifier #
#####################
# one row per entry of the patient's name array; explode_outer also keeps
# patients whose name array is null or empty
patientIdDf = patientDf.select(
    "id",
    "birthDate",
    "gender",
    "maritalStatus",
    explode_outer( "name" ).alias( "nameExpand" ),
)

# debug helpers
# patientIdDf.printSchema()
# patientIdDf.show( 10, False )

# export to delta (the write itself is currently commented out)
exportPath = silverExportPath + "patientIdentification/"
# patientIdDf.select( "id", "birthDate", "gender", "maritalStatus", "nameExpand.family", "nameExpand.given" ).select( "id", "birthDate", "gender", "maritalStatus", "family", "given" ).write.format( "delta" ).save( exportPath )

####################
# patientExtension #
####################
# one row per entry of the patient's extension array; explode_outer also keeps
# patients whose extension array is null or empty
patientExtensionDf = patientDf.select(
    "id",
    "birthDate",
    "gender",
    "maritalStatus",
    explode_outer( "extension" ).alias( "extensionExpand" ),
)

# debug helpers
# patientExtensionDf.printSchema()
# patientExtensionDf.show( 10, False )
# patientExtensionDf.show( 10 )

# export to delta (the write itself is currently commented out)
exportPath = silverExportPath + "patientExtension/"
# patientExtensionDf.select( "id", "birthDate", "gender", "maritalStatus", "extensionExpand.url", "extensionExpand.valueAddress", "extensionExpand.valueDecimal", "extensionExpand.valueString" ).select( "id", "birthDate", "gender", "maritalStatus", "url", "valueAddress", "valueDecimal", "valueString" ).write.format( "delta" ).save( exportPath )

In [3]:
# silver layer
# part 2 - Claims data

# https://sparkmtndatalake.blob.core.usgovcloudapi.net/insiders/bronze/historic_data/Claim/year=2016/part-00000-af12a187-6fd9-4b3f-9d52-afe965b9f9dd.c000.json

silverExportPath = abfssUrl + "silver/"
historicalFilePath = abfssUrl + "bronze/historic_data/"
incrementalFilePath = abfssUrl + "bronze/incremental_data/"

# one hand-picked Claim file per feed; each is only used to infer a schema
histFirstFilePath = historicalFilePath + "Claim/year=2016/part-00000-af12a187-6fd9-4b3f-9d52-afe965b9f9dd.c000.json"
incrFirstFilePath = incrementalFilePath + "Claim/year=2021/month=02/day=08/part-00000-efc1c61f-2cdd-43ee-b7f7-2c96f6d5d7a3.c000.json"

## historical ##
# create schema first to do initial file load
pathClaimSchema = histFirstFilePath
claimSchema = spark.read.option( "multiline", "true" ).json( pathClaimSchema ).schema

# columnNameOfCorruptRecord only has an effect when the supplied schema holds a
# string field with that name; without it Spark drops malformed records while
# parsing, so add the field explicitly (guarded in case it is already there)
if "corruptRecord" not in claimSchema.fieldNames():
    claimSchema = claimSchema.add( "corruptRecord", StringType(), True )

abfssFileList = historicalFilePath + "Claim/*/"

# recursively load historic Claims files
historyClaimsDf = (
    spark.read
    .option( "multiline", "true" )
    .option( "columnNameOfCorruptRecord", "corruptRecord" )
    .option( "recursiveFileLookup", "true" )
    .schema( claimSchema )
    .json( abfssFileList )
)
# historyClaimsDf.printSchema()
# historyClaimsDf.show( 10, False )

# to_date already yields a date column, so no extra cast is needed
histClaimsDf = historyClaimsDf.withColumn( "createdDate", to_date( col( "created" ) ) )
# histClaimsDf.printSchema()
# histClaimsDf.show( 10, False )

## incremental ##
# create schema (the names pathClaimSchema / claimSchema / abfssFileList are
# reused, so after this cell they describe the incremental feed)
pathClaimSchema = incrFirstFilePath
claimSchema = spark.read.option( "multiline", "true" ).json( pathClaimSchema ).schema

# same corrupt-record handling as for the historical feed
if "corruptRecord" not in claimSchema.fieldNames():
    claimSchema = claimSchema.add( "corruptRecord", StringType(), True )

abfssFileList = incrementalFilePath + "Claim/"

# recursively load incremental Claims files
incrementClaimsDf = (
    spark.read
    .option( "multiline", "true" )
    .option( "columnNameOfCorruptRecord", "corruptRecord" )
    .option( "recursiveFileLookup", "true" )
    .schema( claimSchema )
    .json( abfssFileList )
)
# incrementClaimsDf.printSchema()
# incrementClaimsDf.show( 10, False )

incrClaimsDf = incrementClaimsDf.withColumn( "createdDate", to_date( col( "created" ) ) )
# incrClaimsDf.printSchema()
# incrClaimsDf.show( 10, False )

####################
# claimInsurance #
####################
# one row per entry of a claim's insurance array, for both feeds
exportPath = silverExportPath + "claimInsurance/"

# historical - partition key is the year of createdDate
claimInsuranceDf = histClaimsDf.select(
    "billablePeriod",
    "created",
    "id",
    col( "patient.reference" ).alias( "patientId" ),
    explode_outer( "insurance" ).alias( "insuranceExpand" ),
    "createdDate",
)
exportClaimsInsDf = (
    claimInsuranceDf
    .withColumn( "year", year( col( "createdDate" ) ) )
    .repartition( "year" )
)
# exportClaimsInsDf.printSchema()

# export historical claimInsuranceDf (write currently commented out)
localExportPath = exportPath + "historical/"
# exportClaimsInsDf.select( "billablePeriod", "created", "year", "id", "patientId", "insuranceExpand.coverage" ).select( "billablePeriod", "created", "id", "patientId", "coverage", "year" ).write.format( "delta" ).partitionBy( "year" ).save( localExportPath )

# incremental - partition keys are year / month / day of createdDate
# (claimInsuranceDf and exportClaimsInsDf are reused and now hold the
# incremental feed)
claimInsuranceDf = incrClaimsDf.select(
    "billablePeriod",
    "created",
    "id",
    col( "patient.reference" ).alias( "patientId" ),
    explode_outer( "insurance" ).alias( "insuranceExpand" ),
    "createdDate",
)
# claimInsuranceDf.printSchema()
# "day" comes from date_format( ..., "d" ), while year / month come from the
# year() / month() functions
exportClaimsInsDf = (
    claimInsuranceDf
    .withColumn( "year", year( col( "createdDate" ) ) )
    .withColumn( "month", month( col( "createdDate" ) ) )
    .withColumn( "day", date_format( col( "createdDate" ), "d" ) )
    .repartition( "year", "month", "day" )
)

# export incremental claims (write currently commented out)
localExportPath = exportPath + "incremental/"
# exportClaimsInsDf.select( "billablePeriod", "created", "year", "month", "day", "id", "patientId", "insuranceExpand.coverage" ).select( "billablePeriod", "created", "id", "patientId", "coverage", "year", "month", "day" ).write.format( "delta" ).partitionBy( "year", "month", "day" ).save( localExportPath )

##################
# claimDiagnosis #
##################
# NOTE(review): the section is named claimDiagnosis, but the projection below
# flattens item[].productOrService.coding - confirm this is the intended source
exportPath = silverExportPath + "claimDiagnosis/"

# first pass (same for both feeds): pull the nested patient / priority /
# provider / total fields up to the top level, one row per claim item
diagItemCols = [
    "created",
    "id",
    col( "patient.reference" ).alias( "patientId" ),
    col( "patient.display" ).alias( "patientFirstAndLast" ),
    col( "priority.coding" ).alias( "priorityCoding" ),
    col( "provider.display" ).alias( "providerDisplay" ),
    "resourceType",
    "status",
    col( "total.value" ).alias( "totalValue" ),
    explode_outer( "item").alias( "itemExpand" ),
    "createdDate",
]

# second pass (same for both feeds): one row per productOrService coding of
# each item; the partition columns are appended per feed
diagCodingCols = [
    "created",
    "id",
    "patientId",
    "patientFirstAndLast",
    col( "priorityCoding.code").alias( "priorityCode" ),
    "providerDisplay",
    "resourceType",
    "status",
    "totalValue",
    explode_outer( "itemExpand.productOrService.coding" ).alias( "patientProductOrService" ),
]

# historical
claimDiagDf = histClaimsDf.select( *diagItemCols )

# claimDiagDf.printSchema()
# claimDiagDf.show( 10, False )

exportClaimDiagDf = (
    claimDiagDf
    .withColumn( "year", year( col( "createdDate" ) ) )
    .select( *diagCodingCols, "year" )
    .repartition( "year" )
)

localExportPath = exportPath + "historical/"
# exportClaimDiagDf.select( "created", "id", "patientId", "patientFirstAndLast", "priorityCode", "providerDisplay", "resourceType", "status", "totalValue", "patientProductOrService", "year" ).write.format( "delta" ).partitionBy( "year" ).save( localExportPath )

# incremental (claimDiagDf and exportClaimDiagDf are reused and now hold the
# incremental feed)
claimDiagDf = incrClaimsDf.select( *diagItemCols )

exportClaimDiagDf = (
    claimDiagDf
    .withColumn( "year", year( col( "createdDate" ) ) )
    .withColumn( "month", month( col( "createdDate" ) ) )
    .withColumn( "day", date_format( col( "createdDate" ), "d" ) )
    .select( *diagCodingCols, "year", "month", "day" )
    .repartition( "year", "month", "day" )
)

localExportPath = exportPath + "incremental/"
# exportClaimDiagDf.select( "created", "id", "patientId", "patientFirstAndLast", "priorityCode", "providerDisplay", "resourceType", "status", "totalValue", "patientProductOrService", "year", "month", "day" ).write.format( "delta" ).partitionBy( "year", "month", "day" ).save( localExportPath )

##################
# claimProcedure #
##################
# NOTE(review): this projection is identical to the one in the claimDiagnosis
# section (item[].productOrService.coding) - confirm whether a
# procedure-specific field was intended here

exportPath = silverExportPath + "claimProcedure/"

# first pass (same for both feeds): pull the nested patient / priority /
# provider / total fields up to the top level, one row per claim item
procItemCols = [
    "created",
    "id",
    col( "patient.reference" ).alias( "patientId" ),
    col( "patient.display" ).alias( "patientFirstAndLast" ),
    col( "priority.coding" ).alias( "priorityCoding" ),
    col( "provider.display" ).alias( "providerDisplay" ),
    "resourceType",
    "status",
    col( "total.value" ).alias( "totalValue" ),
    explode_outer( "item").alias( "itemExpand" ),
    "createdDate",
]

# second pass (same for both feeds): one row per productOrService coding of
# each item; the partition columns are appended per feed
procCodingCols = [
    "created",
    "id",
    "patientId",
    "patientFirstAndLast",
    col( "priorityCoding.code").alias( "priorityCode" ),
    "providerDisplay",
    "resourceType",
    "status",
    "totalValue",
    explode_outer( "itemExpand.productOrService.coding" ).alias( "patientProductOrService" ),
]

# historical
claimProcDf = histClaimsDf.select( *procItemCols )

# claimProcDf.printSchema()
# claimProcDf.show( 10, False )

exportClaimProcDf = (
    claimProcDf
    .withColumn( "year", year( col( "createdDate" ) ) )
    .select( *procCodingCols, "year" )
    .repartition( "year" )
)

localExportPath = exportPath + "historical/"
# exportClaimProcDf.select( "created", "id", "patientId", "patientFirstAndLast", "priorityCode", "providerDisplay", "resourceType", "status", "totalValue", "patientProductOrService", "year" ).write.format( "delta" ).partitionBy( "year" ).save( localExportPath )

# incremental (claimProcDf and exportClaimProcDf are reused and now hold the
# incremental feed)
claimProcDf = incrClaimsDf.select( *procItemCols )

exportClaimProcDf = (
    claimProcDf
    .withColumn( "year", year( col( "createdDate" ) ) )
    .withColumn( "month", month( col( "createdDate" ) ) )
    .withColumn( "day", date_format( col( "createdDate" ), "d" ) )
    .select( *procCodingCols, "year", "month", "day" )
    .repartition( "year", "month", "day" )
)

localExportPath = exportPath + "incremental/"
# exportClaimProcDf.select( "created", "id", "patientId", "patientFirstAndLast", "priorityCode", "providerDisplay", "resourceType", "status", "totalValue", "patientProductOrService", "year", "month", "day" ).write.format( "delta" ).partitionBy( "year", "month", "day" ).save( localExportPath )

In [6]:
# silver layer
# part 3 - Observations

silverExportPath = abfssUrl + "silver/"
historicalFilePath = abfssUrl + "bronze/historic_data/"
incrementalFilePath = abfssUrl + "bronze/incremental_data/"

# one hand-picked Observation file per feed; each is only used to infer a schema
histFirstFilePath = historicalFilePath + "Observation/year=2016/part-00000-4ede0f25-40a2-488b-8796-917f45de2ea7.c000.json"
incrFirstFilePath = incrementalFilePath + "Observation/year=2021/month=02/day=22/part-00000-3420bf8a-7e08-4a6e-84ee-00d1625d2995.c000.json"

## historical ##
# create schema first to do initial file load
pathObsSchema = histFirstFilePath
obsSchema = spark.read.option( "multiline", "true" ).json( pathObsSchema ).schema

# columnNameOfCorruptRecord only has an effect when the supplied schema holds a
# string field with that name; without it Spark drops malformed records while
# parsing, so add the field explicitly (guarded in case it is already there)
if "corruptRecord" not in obsSchema.fieldNames():
    obsSchema = obsSchema.add( "corruptRecord", StringType(), True )

abfssFileList = historicalFilePath + "Observation/"

# recursively load historic Observation files
historyObsDf = (
    spark.read
    .option( "multiline", "true" )
    .option( "columnNameOfCorruptRecord", "corruptRecord" )
    .option( "recursiveFileLookup", "true" )
    .schema( obsSchema )
    .json( abfssFileList )
)
# historyObsDf.printSchema()
# historyObsDf.show( 10, False )

# to_date already yields a date column, so no extra cast is needed
histObsDf = historyObsDf.withColumn( "issuedDate", to_date( col( "issued" ) ) )
# histObsDf.printSchema()
# histObsDf.show( 10, False )

## incremental ##
# create schema (the names pathObsSchema / obsSchema / abfssFileList are
# reused, so after this cell they describe the incremental feed)
pathObsSchema = incrFirstFilePath
obsSchema = spark.read.option( "multiline", "true" ).json( pathObsSchema ).schema

# same corrupt-record handling as for the historical feed
if "corruptRecord" not in obsSchema.fieldNames():
    obsSchema = obsSchema.add( "corruptRecord", StringType(), True )

abfssFileList = incrementalFilePath + "Observation/"

# recursively load incremental Observation files
incrementObsDf = (
    spark.read
    .option( "multiline", "true" )
    .option( "columnNameOfCorruptRecord", "corruptRecord" )
    .option( "recursiveFileLookup", "true" )
    .schema( obsSchema )
    .json( abfssFileList )
)
# incrementObsDf.printSchema()
# incrementObsDf.show( 10, False )

incrObsDf = incrementObsDf.withColumn( "issuedDate", to_date( col( "issued" ) ) )
# incrObsDf.printSchema()
# incrObsDf.show( 10, False )

########################
# Observation Category #
########################
# flattens category[].coding[].display: first one row per category entry,
# then one row per coding entry of that category

exportPath = silverExportPath + "observationCategory/"

# historical - partition key is the year of issuedDate
# selecting "subject.reference" yields a column that is simply named "reference"
obsCategoryDf = (
    histObsDf
    .withColumn( "year", year( col( "issuedDate" ) ) )
    .select(
        "year",
        "effectiveDateTime", "id", "issued", "resourceType", "status",
        "subject.reference",
        explode_outer( "category" ).alias( "categoryExpand" ),
    )
    .repartition( "year" )
)
# obsCategoryDf.printSchema()
# obsCategoryDf.show( 10, False )

exportObsCatDf = (
    obsCategoryDf
    .select(
        "year",
        "effectiveDateTime", "id", "issued", "resourceType", "status", "reference",
        explode_outer( "categoryExpand.coding" ).alias( "categoryCoding" ),
    )
    .select(
        "year",
        "effectiveDateTime", "id", "issued", "resourceType", "status", "reference",
        col( "categoryCoding.display" ).alias( "categoryCodeDisplay" ),
    )
)
# exportObsCatDf.printSchema()
# exportObsCatDf.show( 10, False )

localExportPath = exportPath + "historical/"
# exportObsCatDf.select( "year", "effectiveDateTime", "id", "issued", "resourceType", "status", "reference", "categoryCodeDisplay" ).write.format( "delta" ).partitionBy( "year" ).save( localExportPath )

# incremental - partition keys are year / month / day of issuedDate
# (obsCategoryDf and exportObsCatDf are reused and now hold the incremental feed)
obsCategoryDf = (
    incrObsDf
    .withColumn( "year", year( col( "issuedDate" ) ) )
    .withColumn( "month", month( col( "issuedDate" ) ) )
    .withColumn( "day", date_format( col( "issuedDate" ), "d" ) )
    .select(
        "year", "month", "day",
        "effectiveDateTime", "id", "issued", "resourceType", "status",
        "subject.reference",
        explode_outer( "category" ).alias( "categoryExpand" ),
    )
    .repartition( "year", "month", "day" )
)
# obsCategoryDf.printSchema()
# obsCategoryDf.show( 10, False )

exportObsCatDf = (
    obsCategoryDf
    .select(
        "year", "month", "day",
        "effectiveDateTime", "id", "issued", "resourceType", "status", "reference",
        explode_outer( "categoryExpand.coding" ).alias( "categoryCoding" ),
    )
    .select(
        "year", "month", "day",
        "effectiveDateTime", "id", "issued", "resourceType", "status", "reference",
        col( "categoryCoding.display" ).alias( "categoryCodeDisplay" ),
    )
)
# exportObsCatDf.printSchema()
# exportObsCatDf.show( 10, False )

localExportPath = exportPath + "incremental/"
# exportObsCatDf.select( "year", "month", "day", "effectiveDateTime", "id", "issued", "resourceType", "status", "reference", "categoryCodeDisplay" ).write.format( "delta" ).partitionBy( "year", "month", "day" ).save( localExportPath )

######################
# Observation Coding #
######################
# flattens code.coding[]: one row per coding entry, keeping its code and display

exportPath = silverExportPath + "observationCoding/"

# historical - partition key is the year of issuedDate
# selecting "subject.reference" yields a column that is simply named "reference"
obsCodingDf = (
    histObsDf
    .withColumn( "year", year( col( "issuedDate" ) ) )
    .select(
        "year",
        "effectiveDateTime", "id", "issued", "resourceType", "status",
        "subject.reference",
        explode_outer( "code.coding" ).alias( "codingExpand" ),
    )
    .repartition( "year" )
)
# obsCodingDf.printSchema()
# obsCodingDf.show( 10, False )

exportObsCodingDf = obsCodingDf.select(
    "year",
    "effectiveDateTime", "id", "issued", "resourceType", "status", "reference",
    col( "codingExpand.code" ).alias( "code"),
    col( "codingExpand.display" ).alias( "codeDisplay" ),
)
# exportObsCodingDf.printSchema()
# exportObsCodingDf.show( 10, False )

localExportPath = exportPath + "historical/"
# exportObsCodingDf.select( "year", "effectiveDateTime", "id", "issued", "resourceType", "status", "reference", "code", "codeDisplay" ).write.format( "delta" ).partitionBy( "year" ).save( localExportPath )

# incremental - partition keys are year / month / day of issuedDate
# (obsCodingDf and exportObsCodingDf are reused and now hold the incremental feed)
obsCodingDf = (
    incrObsDf
    .withColumn( "year", year( col( "issuedDate" ) ) )
    .withColumn( "month", month( col( "issuedDate" ) ) )
    .withColumn( "day", date_format( col( "issuedDate" ), "d" ) )
    .select(
        "year", "month", "day",
        "effectiveDateTime", "id", "issued", "resourceType", "status",
        "subject.reference",
        explode_outer( "code.coding" ).alias( "codingExpand" ),
    )
    .repartition( "year", "month", "day" )
)
# obsCodingDf.printSchema()
# obsCodingDf.show( 10, False )

exportObsCodingDf = obsCodingDf.select(
    "year", "month", "day",
    "effectiveDateTime", "id", "issued", "resourceType", "status", "reference",
    col( "codingExpand.code" ).alias( "code"),
    col( "codingExpand.display" ).alias( "codeDisplay" ),
)
# exportObsCodingDf.printSchema()
# exportObsCodingDf.show( 10, False )

localExportPath = exportPath + "incremental/"
# exportObsCodingDf.select( "year", "month", "day", "effectiveDateTime", "id", "issued", "resourceType", "status", "reference", "code", "codeDisplay" ).write.format( "delta" ).partitionBy( "year", "month", "day" ).save( localExportPath )

#############################
# Observation valueQuantity #
#############################
# keeps valueQuantity.unit / valueQuantity.value as measureUnit / measureValue;
# nothing is exploded here, so the row count matches the source frame

exportPath = silverExportPath + "observationValueQuantity/"

# historical - partition key is the year of issuedDate
# selecting "subject.reference" yields a column that is simply named "reference"
obsVqDf = (
    histObsDf
    .withColumn( "year", year( col( "issuedDate" ) ) )
    .select(
        "year",
        "effectiveDateTime", "id", "issued", "resourceType", "status",
        "subject.reference",
        col( "valueQuantity.unit" ).alias( "measureUnit"),
        col( "valueQuantity.value" ).alias( "measureValue" ),
    )
    .repartition( "year" )
)
# obsVqDf.printSchema()
# obsVqDf.show( 10, False )

localExportPath = exportPath + "historical/"
# obsVqDf.select( "year", "effectiveDateTime", "id", "issued", "resourceType", "status", "reference", "measureUnit", "measureValue" ).write.format( "delta" ).partitionBy( "year" ).save( localExportPath )

# incremental - partition keys are year / month / day of issuedDate
# (obsVqDf is reused and now holds the incremental feed)
obsVqDf = (
    incrObsDf
    .withColumn( "year", year( col( "issuedDate" ) ) )
    .withColumn( "month", month( col( "issuedDate" ) ) )
    .withColumn( "day", date_format( col( "issuedDate" ), "d" ) )
    .select(
        "year", "month", "day",
        "effectiveDateTime", "id", "issued", "resourceType", "status",
        "subject.reference",
        col( "valueQuantity.unit" ).alias( "measureUnit"),
        col( "valueQuantity.value" ).alias( "measureValue" ),
    )
    .repartition( "year", "month", "day" )
)
# obsVqDf.printSchema()
# obsVqDf.show( 10, False )

localExportPath = exportPath + "incremental/"
# obsVqDf.select( "year", "month", "day", "effectiveDateTime", "id", "issued", "resourceType", "status", "reference", "measureUnit", "measureValue" ).write.format( "delta" ).partitionBy( "year", "month", "day" ).save( localExportPath )

In [8]:
# inspect whichever frame obsVqDf currently refers to (the incremental
# valueQuantity frame after a top-to-bottom run): schema first, then the
# first 10 rows without truncating column values
obsVqDf.printSchema()
obsVqDf.show( n = 10, truncate = False )