In [1]:
#create glue session
# Glue interactive-session magics; they configure the session before it is
# created. Both timeouts below are expressed in minutes.
%idle_timeout 60
%timeout 60
# Glue runtime version for the session.
%glue_version 5.0
# Worker size and worker count requested for the session.
%worker_type G.1X
%number_of_workers 2

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import pyspark.sql.functions as f
  
# Bootstrap the session-wide handles used by the cells below.
# Reuse the SparkContext if one already exists, otherwise create it.
sc = SparkContext.getOrCreate()
# GlueContext wraps the SparkContext and is used later for catalog reads.
glueContext = GlueContext(sc)
# SparkSession exposed by the GlueContext; used later for spark.sql(...).
spark = glueContext.spark_session
# NOTE(review): `job` is constructed but not otherwise used in the visible
# cells (no init/commit call is shown) — confirm whether it is still needed.
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
Current idle_timeout is None minutes.
idle_timeout has been set to 60 minutes.
Current timeout is None minutes.
timeout has been set to 60 minutes.
Setting Glue version to: 5.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 2
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 2
Idle Timeout: 60
Timeout: 60
Session ID: 6da159ee-476e-4fcf-995b-5ba52c6a65b4
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
Waiting for session 6da159ee-476e-4fcf-995b-5ba52c6a65b

In [2]:
# S3 bucket names (bucket only, without URI scheme or key prefix).
# NOTE(review): raw_bucket is not referenced by any other visible cell.
raw_bucket = "cloud9-batch-raw"
transformed_bucket = "cloud9-transformed"




In [3]:
# Read the raw table through the Glue Data Catalog into a DataFrame.
df = glueContext.create_data_frame_from_catalog(
    database="cloud9_raw",
    table_name="rw_district_seoul",
)



In [6]:
# list loaded schema
#df.printSchema()

#print sample rows
#df.show(3)

In [7]:
#Transform columns, data types
from pyspark.sql.functions import col
from pyspark.sql.types import *  # provides LongType / DoubleType used below

# population, area and density are struct columns in the raw table; pull out
# the non-'string' member of each struct and cast it to the required type.
#
# population and density are cast to LongType (not IntegerType) because:
#   * the value is read from the struct's `long` member, so narrowing to a
#     32-bit integer could turn out-of-range values into NULL, and
#   * the tr_district_seoul table declares both columns as BIGINT, so the
#     Parquet files must store 64-bit integers to match the declared schema.
df_transformed = df \
    .withColumn("population", col("population.long").cast(LongType())) \
    .withColumn("area", col("area.double").cast(DoubleType())) \
    .withColumn("density", col("density.long").cast(LongType())) \
    .select(
        "district_name", 
        "dong_name", 
        "population", 
        "area", 
        "density",
        "year"
    )

# Check the result
#df_transformed.printSchema()
#df_transformed.show(3)




In [12]:
# Write the transformed data to S3 and register it in the Data Catalog.
#
# A single table name / location is used for the write, the table DDL and the
# partition registration so the three can never drift apart (the bucket comes
# from `transformed_bucket` instead of being hard-coded a second time).
table_name = "cloud9_transformed.tr_district_seoul"
table_location = f"s3://{transformed_bucket}/district_seoul_transformed/"

# Overwrite the dataset, laid out as one `year=<value>/` directory per year.
df_transformed.write \
              .mode("overwrite") \
              .format('parquet') \
              .partitionBy("year") \
              .save(table_location)

# Create tr_district_seoul table if not exists
spark.sql(
    f"""
    CREATE EXTERNAL TABLE IF NOT EXISTS {table_name} (
        `district_name` STRING,
        `dong_name` STRING,
        `population` BIGINT,
        `area` DOUBLE,
        `density` BIGINT
    )
    PARTITIONED BY (
        `year` STRING 
    )
    ROW FORMAT SERDE 
        'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
    STORED AS INPUTFORMAT 
        'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' 
    OUTPUTFORMAT 
        'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
    LOCATION 
        '{table_location}'
    TBLPROPERTIES (
        'classification' = 'parquet'
    )
    """
)

# Register every partition that was actually written.
# The data is partitioned by the `year` column of the rows themselves, so the
# partitions to add are the distinct years present in df_transformed — not the
# current calendar year, which may be absent from the data while other years
# are present. Rows with a NULL year are skipped here.
written_years = sorted(
    str(row["year"])
    for row in df_transformed.select("year").distinct().collect()
    if row["year"] is not None
)

for year in written_years:
    # IF NOT EXISTS keeps re-runs of this cell idempotent.
    spark.sql(
        f"""
        ALTER TABLE {table_name}
        ADD IF NOT EXISTS
        PARTITION (year='{year}')
            LOCATION '{table_location}year={year}/'
        """
    )

DataFrame[]
