In [0]:
import pandas as pd
from pyspark.sql import SparkSession

# Ingest the Singapore Census of Population 2020 resident-population CSV into
# a Delta table: read the file with pandas, convert it to a Spark DataFrame,
# normalise the column names, cast the `number` column to string, and
# overwrite the target table.

# Initialize Spark session
spark = SparkSession.builder.appName("CivAI_Ingestion").getOrCreate()

# Define the path to the downloaded CSV file
csv_file_path = "/Volumes/civai/raw/singaporegovdata/ResidentPopulationbyPlanningAreaSubzoneofResidenceAgeGroupandSexCensusofPopulation2020.csv"

# Read the CSV file into a pandas DataFrame
df_pd = pd.read_csv(csv_file_path)

# Display the first few rows
print(df_pd.head())

# Convert pandas DataFrame to Spark DataFrame
df_spark = spark.createDataFrame(df_pd)

# Rename columns if necessary.
# NOTE(review): the source names below look like unfilled template
# placeholders. `withColumnRenamed` is a no-op when the column does not exist,
# so these calls only take effect if the CSV really has such headers --
# replace them with the actual header names or drop them once confirmed.
df_spark = df_spark.withColumnRenamed("YourPlanningAreaColumn", "planning_area") \
                   .withColumnRenamed("YourSubzoneColumn", "subzone") \
                   .withColumnRenamed("YourAgeGroupColumn", "age_group") \
                   .withColumnRenamed("YourSexColumn", "sex") \
                   .withColumnRenamed("YourPopulationColumn", "population")

# Normalise every column name: trim surrounding whitespace, replace spaces
# with underscores, and lowercase. Spaces must go because the result is saved
# as a Delta table below.
df_spark = df_spark.toDF(
    *[name.strip().replace(" ", "_").lower() for name in df_spark.columns]
)

# Cast the `number` column to string. The column is addressed by its
# normalised (lowercase) name: using "Number" here would, under Spark's
# default case-insensitive resolution, re-emit the column with a capital N
# and undo the normalisation above, and would fail to resolve at all when
# case-sensitive resolution is enabled.
df_spark = df_spark.withColumn("number", df_spark["number"].cast("string"))

# Show the Spark DataFrame
df_spark.show()

# Overwrite the target Delta table with the ingested data
df_spark.write.mode("overwrite").format("delta").saveAsTable("civAI.raw.age_group_distribution")
