In [0]:
import requests
import pandas as pd
from pyspark.sql import SparkSession

# Ingest a wide-format table (one column per year, one row per "DataSeries"
# value) from the data.gov.sg datastore API, reshape it to long format
# (age_group, year, population) and overwrite a Delta table with the result.

# Dataset ID
dataset_id = "d_3cf667d761b4bdc6d4d3d3aeec37dea5"
url = f"https://data.gov.sg/api/action/datastore_search?resource_id={dataset_id}&limit=5000"

# Fetch data. An explicit timeout is passed because requests otherwise waits
# indefinitely on a stalled connection, which would hang the whole job.
response = requests.get(url, timeout=30)
if response.status_code != 200:
    raise RuntimeError(f"Failed: {response.status_code}")

data_json = response.json()
result = data_json.get("result") or {}
records = result.get("records")
if not records:
    # Fail here with a clear message rather than with an opaque KeyError
    # further down when the expected columns are missing.
    raise RuntimeError(f"No records returned for dataset {dataset_id}")

# The request caps the page size at 5000 rows. If the API reports more
# matching rows than were returned, stop instead of overwriting the target
# table with a truncated snapshot.
# NOTE(review): assumes the response carries a CKAN-style integer "total";
# when it is absent this check is simply skipped.
total = result.get("total")
if isinstance(total, int) and total > len(records):
    raise RuntimeError(
        f"Response truncated: received {len(records)} of {total} records"
    )

df_raw = pd.DataFrame(records)

# Year columns are the ones whose name consists only of digits.
year_cols = [col for col in df_raw.columns if col.isdigit()]
if "DataSeries" not in df_raw.columns or not year_cols:
    raise RuntimeError(
        "Unexpected schema: need a 'DataSeries' column and at least one "
        f"all-digit year column, got {list(df_raw.columns)}"
    )

# Coerce the year columns to numeric; values that cannot be parsed become NaN
# (so the resulting dtype may be float rather than int).
df_raw[year_cols] = df_raw[year_cols].apply(pd.to_numeric, errors='coerce')

# Melt the wide-format dataframe to long-format
df_long = df_raw.melt(
    id_vars=["DataSeries"],
    value_vars=year_cols,
    var_name="year",
    value_name="population",
)

# Rename columns
df_long = df_long.rename(columns={"DataSeries": "age_group"})

# Convert year to int (safe: year_cols only holds all-digit names); the
# population coercion is a cheap safety net after the melt.
df_long["year"] = df_long["year"].astype(int)
df_long["population"] = pd.to_numeric(df_long["population"], errors="coerce")

# Drop rows whose population could not be parsed
df_long = df_long.dropna(subset=["population"])

# Convert to Spark DataFrame
# NOTE(review): `spark` is not defined in this script; presumably it is the
# session object injected by the notebook runtime - confirm.
df_spark = spark.createDataFrame(df_long)

# Display sample
# NOTE(review): `display()` looks like a notebook-runtime helper rather than a
# core PySpark DataFrame method - confirm it exists where this runs.
df_spark.display()

df_spark.write.mode("overwrite").format("delta").saveAsTable("civAI.raw.sdoh_age_population")
