In [1]:
# Glue Interactive Sessions kernel magics: configure the session that this cell creates.
# NOTE(review): idle_timeout (180 min) is larger than timeout (60 min). If %timeout is the
# hard limit on session lifetime, the idle limit can never be reached — confirm intended values.
%idle_timeout 180
%timeout 60
%glue_version 5.0
%worker_type G.1X
%number_of_workers 2

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import pyspark.sql.functions as F
  
# Reuse the active SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# Glue wrapper around the Spark context; later cells use it to read Data Catalog tables.
glueContext = GlueContext(sc)
# SparkSession handle; later cells use it for spark.sql(...) statements.
spark = glueContext.spark_session
# NOTE(review): `job` is not referenced by any visible cell — confirm it is still needed.
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
Current idle_timeout is None minutes.
idle_timeout has been set to 180 minutes.
Current timeout is None minutes.
timeout has been set to 60 minutes.
Setting Glue version to: 5.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 2
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 2
Idle Timeout: 180
Timeout: 60
Session ID: 77e1ee85-3134-46ea-9ad2-d8013860cfc9
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
Waiting for session 77e1ee85-3134-46ea-9ad2-d8013860c

In [2]:
# S3 bucket names used by this notebook.
# Destination bucket for the aggregated mart output.
mart_bucket = 'cloud9-mart'
# Bucket name for the transformed layer (presumably the upstream input; not referenced in the visible cells).
transformed_bucket = 'cloud9-transformed'




In [9]:
# Read the tr_district_seoul table of the cloud9_transformed catalog database
# into a Spark DataFrame.
df_district = glueContext.create_data_frame_from_catalog(
    database="cloud9_transformed",
    table_name="tr_district_seoul",
)




In [7]:
# Print the schema of the loaded DataFrame (the call is commented out; the output below is from an earlier run)
#df_district.printSchema()

root
 |-- district_name: string (nullable = true)
 |-- dong_name: string (nullable = true)
 |-- population: integer (nullable = true)
 |-- area: double (nullable = true)
 |-- density: integer (nullable = true)
 |-- year: integer (nullable = true)


In [10]:
#df_district.show()

+-------------+---------+----------+----+-------+----+
|district_name|dong_name|population|area|density|year|
+-------------+---------+----------+----+-------+----+
|       강남구|  개포1동|     11907|1.27|   9376|2023|
|       강남구|  개포2동|     41804|2.51|  16655|2023|
|       강남구|  개포3동|     16894|1.24|  13624|2023|
|       강남구|  개포4동|     23384|1.49|  15694|2023|
|       강남구|  논현1동|     21759|1.25|  17407|2023|
|       강남구|  논현2동|     20774|1.47|  14132|2023|
|       강남구|  대치1동|     23934|0.79|  30296|2023|
|       강남구|  대치2동|     38417| 2.0|  19209|2023|
|       강남구|  대치4동|     18929|0.73|  25930|2023|
|       강남구|  도곡1동|     20701|1.02|  20295|2023|
|       강남구|  도곡2동|     32738|1.02|  32096|2023|
|       강남구|  삼성1동|     12384|1.94|   6384|2023|
|       강남구|  삼성2동|     30888|1.24|  24910|2023|
|       강남구|   세곡동|     45724|6.36|   7189|2023|
|       강남구|   수서동|     14090|1.43|   9853|2023|
|       강남구|   신사동|     15681|1.89|   8297|2023|
|       강남구| 압구정동|     25755|2.53|  10180|2023|
|  

In [42]:
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, IntegerType, DoubleType

# Collapse the dong-level rows into one row per (district_name, year).
# Only population and area are summed. The previous version also summed density,
# cast it, and then dropped it straight away; that dead aggregation is removed.
# The resulting columns, their order and their types are unchanged:
#   district_name (string), year (string), sum(population) (int), sum(area) (double)
df_district_grouped = (
    df_district
    .groupby("district_name", "year")
    .sum("population", "area")
    # Explicit casts pin the output types (the mart table DDL declares
    # STRING / INT / DOUBLE for these columns and STRING for the year partition).
    .withColumn("district_name", F.col("district_name").cast(StringType()))
    .withColumn("year", F.col("year").cast(StringType()))
    .withColumn("sum(population)", F.col("sum(population)").cast(IntegerType()))
    # Area is rounded to one decimal place before being stored.
    .withColumn("sum(area)", F.round(F.col("sum(area)"), 1).cast(DoubleType()))
)

# Alias under the name the write cell expects.
df_mart = df_district_grouped
#df_mart.printSchema()
#df_mart.show()




In [43]:
# write mart data to S3
# Single source of truth for where the mart lives. The Parquet writer and the
# catalog table LOCATION are both derived from it, so changing `mart_bucket`
# can no longer leave the table pointing at a different bucket than the data
# (LOCATION used to hard-code the bucket name).
mart_prefix = f'{mart_bucket}/district_seoul_grouped_mart'

# NOTE(review): mode("overwrite") replaces the data already stored under the
# target path; with Spark's default (static) partition overwrite mode this is
# presumably every existing year partition, not only the ones being written —
# confirm that a full reload is intended.
(
    df_mart.write
    .mode("overwrite")
    .format('parquet')
    .partitionBy("year")
    .save(f's3a://{mart_prefix}/')
)

# create table if not exists
# Column names intentionally match the DataFrame's aggregate column names.
spark.sql(
    f"""
    CREATE EXTERNAL TABLE IF NOT EXISTS cloud9_mart.mt_district_seoul_grouped (
        `district_name` STRING,
        `sum(population)` INT,
        `sum(area)` DOUBLE
    )
    PARTITIONED BY (`year` STRING)
    ROW FORMAT SERDE
        'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
    STORED AS INPUTFORMAT
        'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
    OUTPUTFORMAT
        'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
    LOCATION
        's3://{mart_prefix}'
    TBLPROPERTIES (
        'classification' = 'parquet'
    )
    """
)

# Load partitions
# Registers the year=... directories written above as partitions of the table.
spark.sql("MSCK REPAIR TABLE cloud9_mart.mt_district_seoul_grouped")

DataFrame[]
