In [0]:
from pyspark.sql.functions import explode, mean, stddev, col, trim


### Population data from JSON file to dataframe

In [0]:

# Load the population records from the newest JSON file in the bronze volume.
population_dir = "/Volumes/thequest/bronze/datausa_io"

# Keep only the JSON files from the directory listing.
files = dbutils.fs.ls(population_dir)
json_files = [f for f in files if f.name.endswith('.json')]

# Fail with a clear message instead of an IndexError when nothing has landed yet.
if not json_files:
    raise FileNotFoundError(f"No .json files found in {population_dir}")

# "Most recent" means the lexicographically greatest file name.
# NOTE(review): assumes file names embed a sortable timestamp/date - confirm.
latest_file = max(json_files, key=lambda x: x.name)

# Read as multi-line JSON (the whole file is a single JSON document)
df_raw = spark.read.option("multiLine", "true").json(latest_file.path)

# Extract the nested data.data array: one row per element, then flatten the
# struct fields of each element into top-level columns.
df_population = (df_raw
  .select(explode(col("data.data")).alias("record"))
  .select("record.*")
)

display(df_population)

Nation,Nation ID,Population,Year
United States,01000US,316128839.0,2013
United States,01000US,318857056.0,2014
United States,01000US,321418821.0,2015
United States,01000US,323127515.0,2016
United States,01000US,325719178.0,2017
United States,01000US,327167439.0,2018
United States,01000US,328239523.0,2019
United States,01000US,331893745.0,2021
United States,01000US,333287562.0,2022
United States,01000US,334914896.0,2023


### BLS Data as a dataframe

In [0]:
# Load the bronze BLS "pr" current-data table as a DataFrame.
df_bls_current = spark.table("thequest.bronze.bls_pr_data_current")

# Preview only the first 100 rows rather than rendering the whole table.
bls_preview = df_bls_current.limit(100)
display(bls_preview)

series_id,year,period,value,footnote_codes,_rescued_data,_source_file,_ingestion_time
PRS30006011,1995,Q01,2.6,,,s3://steve-m-bls-demo-bucket/bls/pr/pr.data.0.Current,2026-02-05T04:57:40.815Z
PRS30006011,1995,Q02,2.1,,,s3://steve-m-bls-demo-bucket/bls/pr/pr.data.0.Current,2026-02-05T04:57:40.815Z
PRS30006011,1995,Q03,0.9,,,s3://steve-m-bls-demo-bucket/bls/pr/pr.data.0.Current,2026-02-05T04:57:40.815Z
PRS30006011,1995,Q04,0.1,,,s3://steve-m-bls-demo-bucket/bls/pr/pr.data.0.Current,2026-02-05T04:57:40.815Z
PRS30006011,1995,Q05,1.4,,,s3://steve-m-bls-demo-bucket/bls/pr/pr.data.0.Current,2026-02-05T04:57:40.815Z
PRS30006011,1996,Q01,-0.2,,,s3://steve-m-bls-demo-bucket/bls/pr/pr.data.0.Current,2026-02-05T04:57:40.815Z
PRS30006011,1996,Q02,-0.3,,,s3://steve-m-bls-demo-bucket/bls/pr/pr.data.0.Current,2026-02-05T04:57:40.815Z
PRS30006011,1996,Q03,-0.1,,,s3://steve-m-bls-demo-bucket/bls/pr/pr.data.0.Current,2026-02-05T04:57:40.815Z
PRS30006011,1996,Q04,0.2,,,s3://steve-m-bls-demo-bucket/bls/pr/pr.data.0.Current,2026-02-05T04:57:40.815Z
PRS30006011,1996,Q05,-0.1,,,s3://steve-m-bls-demo-bucket/bls/pr/pr.data.0.Current,2026-02-05T04:57:40.815Z


### Mean and standard deviation of the annual US population across the years [2013, 2018] inclusive

In [0]:
# Keep only the years 2013 through 2018 (both ends inclusive), then reduce the
# remaining rows to a single row holding the mean and standard deviation of
# the Population column.
population_mean_stddev = (
  df_population
  .where(col("Year").between(2013, 2018))
  .agg(
    mean("Population").alias("mean_population"),
    stddev("Population").alias("stddev_population"),
  )
)

display(population_mean_stddev)

mean_population,stddev_population
322069808.0,4158441.040908095


### Best year per series: the year with the largest sum of quarterly values

In [0]:
%sql
-- For every series_id, return the year(s) whose summed quarterly value is the
-- largest for that series. Years that tie for the maximum are all returned.
-- NOTE(review): the 'Q%' filter also matches period Q05, which is present in
-- the table preview; confirm that Q05 is meant to be included in the sum.
WITH quarterly AS (
  -- Trim and cast the raw columns once so later steps use clean values
  SELECT
    TRIM(series_id)             AS series_id,
    CAST(TRIM(year) AS INT)     AS year,
    CAST(TRIM(value) AS DOUBLE) AS value
  FROM thequest.bronze.bls_pr_data_current
  WHERE TRIM(period) LIKE 'Q%'
),
yearly_totals AS (
  -- One row per (series, year) with the sum of its quarterly values
  SELECT
    series_id,
    year,
    SUM(value) AS total_value
  FROM quarterly
  GROUP BY series_id, year
),
with_series_max AS (
  -- Attach each series' best yearly total to every one of its rows
  SELECT
    series_id,
    year,
    total_value,
    MAX(total_value) OVER (PARTITION BY series_id) AS best_total
  FROM yearly_totals
)
SELECT
  series_id,
  year,
  total_value AS value
FROM with_series_max
WHERE total_value = best_total
ORDER BY series_id, year




series_id,year,value
PRS30006011,2022,20.5
PRS30006012,2022,17.1
PRS30006013,1998,705.895
PRS30006021,2010,17.7
PRS30006022,2010,12.4
PRS30006023,2014,503.21600000000007
PRS30006031,2022,20.5
PRS30006032,2021,17.1
PRS30006033,1998,702.672
PRS30006061,2022,34.5


### Population with series data

In [0]:

# Report the Q01 value of series PRS30006032 alongside the population recorded
# for the same year. The inner join keeps only years present in both datasets.
result = (df_bls_current
  # Normalise the BLS columns once, the same way the best-year SQL cell does
  # (TRIM on the identifiers, TRIM + CAST to INT on year), so the filter, the
  # join key and the reported values all use the same clean data.
  .select(
    trim(col("series_id")).alias("series_id"),
    trim(col("year")).cast("int").alias("year"),
    trim(col("period")).alias("period"),
    col("value"),
  )
  # Narrow to the single series/period of interest before joining.
  .filter((col("series_id") == "PRS30006032") &
          (col("period") == "Q01"))
  .alias("b")
  .join(df_population.alias("p"),
        col("b.year") == col("p.year"),
        "inner")
  .select(
    col("b.series_id"),
    col("b.year"),
    col("b.period"),
    col("b.value"),
    col("p.population")
  )
  # Make the row order deterministic instead of relying on join output order.
  .orderBy("year")
)

display(result)

series_id,year,period,value,population
PRS30006032,2013,Q01,0.5,316128839.0
PRS30006032,2014,Q01,-0.1,318857056.0
PRS30006032,2015,Q01,-1.7,321418821.0
PRS30006032,2016,Q01,-1.4,323127515.0
PRS30006032,2017,Q01,0.9,325719178.0
PRS30006032,2018,Q01,0.5,327167439.0
PRS30006032,2019,Q01,-1.6,328239523.0
PRS30006032,2021,Q01,0.7,331893745.0
PRS30006032,2022,Q01,5.3,333287562.0
PRS30006032,2023,Q01,0.3,334914896.0
