In [1]:
%idle_timeout 60
%timeout 30
%glue_version 5.0
%worker_type G.1X
%number_of_workers 3

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import pyspark.sql.functions as f
  
# Bootstrap the handles the rest of the notebook relies on.
# Reuse the already-running SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# GlueContext wraps the SparkContext; it is used below to read from the Data Catalog.
glueContext = GlueContext(sc)
# Spark session exposed by the GlueContext; used below for spark.conf and spark.sql.
spark = glueContext.spark_session
# Glue Job handle bound to this context (not referenced again in the cells shown here).
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
Current idle_timeout is None minutes.
idle_timeout has been set to 60 minutes.
Current timeout is None minutes.
timeout has been set to 30 minutes.
Setting Glue version to: 5.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 3
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 3
Idle Timeout: 60
Timeout: 30
Session ID: a585cd45-42e7-4381-8458-610fba9fb974
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
Waiting for session a585cd45-42e7-4381-8458-610fba9fb97

In [2]:
# Name of the S3 bucket that receives the transformed output; it is
# interpolated into the s3a:// path used by the Parquet write further down.
transformed_bucket = 'cloud9-transformed'




In [3]:
# Read the raw rescue table registered in the Data Catalog
# (database cloud9_raw, table rw_seoul_rescue) into a Spark DataFrame.
df = glueContext.create_data_frame_from_catalog(
    database="cloud9_raw",
    table_name="rw_seoul_rescue",
)



In [8]:
#df.printSchema()
#df.show(3)

root
 |-- msfrtn_resc_reprt_no: string (nullable = true)
 |-- acdnt_cause: string (nullable = true)
 |-- prcs_result_se_nm: string (nullable = true)
 |-- dclr_ymd: string (nullable = true)
 |-- dclr_tm: long (nullable = true)
 |-- season_se_nm: string (nullable = true)
 |-- qtr_se: long (nullable = true)
 |-- dclr_hour: long (nullable = true)
 |-- dclr_min: long (nullable = true)
 |-- daywk: string (nullable = true)
 |-- dsp_ymd: string (nullable = true)
 |-- dsp_tm: string (nullable = true)
 |-- dsp_yr: string (nullable = true)
 |-- dsp_mnth: string (nullable = true)
 |-- dsp_day: string (nullable = true)
 |-- dsp_hour: string (nullable = true)
 |-- dsp_min: string (nullable = true)
 |-- spt_arvl_ymd: string (nullable = true)
 |-- spt_arvl_tm: string (nullable = true)
 |-- spt_arvl_yr: string (nullable = true)
 |-- spt_arvl_mnth: string (nullable = true)
 |-- spt_arvl_day: string (nullable = true)
 |-- spt_arvl_hour: string (nullable = true)
 |-- spt_arvl_min: string (nullable = true)

In [4]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Drop rows whose on-scene arrival time (spt_arvl_tm) is NULL.
df_dropped = df.dropna(subset=["spt_arvl_tm"])

# Normalise column types so the Parquet files written later agree with the
# cloud9_transformed.tr_seoul_rescue table definition:
#   *_ymd columns                      -> DATE   (a trailing ".0" is stripped,
#                                                 then parsed as yyyyMMdd)
#   *_tm, qtr_se, dclr_hour, dclr_min  -> INT
#   gis_x_axis, gis_y_axis,
#   spt_frstt_dist                     -> DOUBLE
df_transformed = (
    df_dropped
    # Report (dclr_*) fields.
    .withColumn("dclr_ymd", to_date(regexp_replace(col("dclr_ymd"), "\\.0$", ""), "yyyyMMdd"))
    .withColumn("dclr_tm", col("dclr_tm").cast("int"))
    .withColumn("qtr_se", col("qtr_se").cast("int"))
    .withColumn("dclr_hour", col("dclr_hour").cast("int"))
    .withColumn("dclr_min", col("dclr_min").cast("int"))
    # Dispatch (dsp_*) fields.
    .withColumn("dsp_ymd", to_date(regexp_replace(col("dsp_ymd"), "\\.0$", ""), "yyyyMMdd"))
    .withColumn("dsp_tm", col("dsp_tm").cast("int"))
    # On-scene arrival (spt_arvl_*) fields.
    .withColumn("spt_arvl_ymd", to_date(regexp_replace(col("spt_arvl_ymd"), "\\.0$", ""), "yyyyMMdd"))
    .withColumn("spt_arvl_tm", col("spt_arvl_tm").cast("int"))
    # resc_cmptn_* fields.
    .withColumn("resc_cmptn_ymd", to_date(regexp_replace(col("resc_cmptn_ymd"), "\\.0$", ""), "yyyyMMdd"))
    .withColumn("resc_cmptn_tm", col("resc_cmptn_tm").cast("int"))
    # hmg_* fields.
    .withColumn("hmg_ymd", to_date(regexp_replace(col("hmg_ymd"), "\\.0$", ""), "yyyyMMdd"))
    .withColumn("hmg_tm", col("hmg_tm").cast("int"))
    # The table declares these three as DOUBLE. They used to be re-assigned to
    # themselves, which left whatever type the raw table delivered; cast them
    # explicitly so the files always match the declared schema. The cast is a
    # no-op when the column is already a double.
    .withColumn("gis_x_axis", col("gis_x_axis").cast("double"))
    .withColumn("gis_y_axis", col("gis_y_axis").cast("double"))
    .withColumn("spt_frstt_dist", col("spt_frstt_dist").cast("double"))
)

# Inspect the result
#df_transformed.printSchema()
#df_transformed.show(3)




In [13]:
# Session settings for the write below:
#  - partitionOverwriteMode=dynamic: with mode("overwrite"), replace only the
#    partitions that appear in the DataFrame instead of the whole output path.
#  - datetimeRebaseModeInWrite=CORRECTED: write date values as-is, without
#    rebasing them to the legacy hybrid calendar.
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")
spark.conf.set("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED")

# Persist the transformed rows to S3 as Parquet, laid out in
# dclr_yr=/dclr_mnth=/dclr_day= sub-directories.
(
    df_transformed.write
    .format('parquet')
    .mode("overwrite")
    .partitionBy("dclr_yr", "dclr_mnth", "dclr_day")
    .save(f's3a://{transformed_bucket}/seoul_rescue_transformed/')
)

# Register the transformed dataset in the catalog as
# cloud9_transformed.tr_seoul_rescue (does nothing when the table already exists).
# LOCATION is derived from transformed_bucket so the table always points at the
# same bucket/prefix the Parquet files are written to. The s3:// scheme is kept
# for the catalog entry.
spark.sql(
    f"""
    CREATE EXTERNAL TABLE IF NOT EXISTS cloud9_transformed.tr_seoul_rescue (
        `msfrtn_resc_reprt_no` STRING,
        `acdnt_cause` STRING,
        `prcs_result_se_nm` STRING,
        `dclr_ymd` DATE,
        `dclr_tm` INT,
        `season_se_nm` STRING,
        `qtr_se` INT,
        `dclr_hour` INT,
        `dclr_min` INT,
        `daywk` STRING,
        `dsp_ymd` DATE,
        `dsp_tm` INT,
        `dsp_yr` STRING,
        `dsp_mnth` STRING,
        `dsp_day` STRING,
        `dsp_hour` STRING,
        `dsp_min` STRING,
        `spt_arvl_ymd` DATE,
        `spt_arvl_tm` INT,
        `spt_arvl_yr` STRING,
        `spt_arvl_mnth` STRING,
        `spt_arvl_day` STRING,
        `spt_arvl_hour` STRING,
        `spt_arvl_min` STRING,
        `resc_cmptn_ymd` DATE,
        `resc_cmptn_tm` INT,
        `resc_cmptn_yr` STRING,
        `resc_cmptn_mnth` STRING,
        `resc_cmptn_day` STRING,
        `resc_cmptn_hour` STRING,
        `resc_cmptn_min` STRING,
        `hmg_ymd` DATE,
        `hmg_tm` INT,
        `hmg_yr` STRING,
        `hmg_mnth` STRING,
        `hmg_day` STRING,
        `hmg_hour` STRING,
        `hmg_min` STRING,
        `sido_nm` STRING,
        `sigungu_nm` STRING,
        `emd_nm` STRING,
        `cty_frmvl_se_nm` STRING,
        `emd_se_nm` STRING,
        `gis_x_axis` DOUBLE,
        `gis_y_axis` DOUBLE,
        `spt_frstt_dist` DOUBLE,
        `acdnt_place_nm` STRING,
        `acdnt_place_detail_nm` STRING,
        `acdnt_cause_asort_nm` STRING,
        `frstt_nm` STRING,
        `ward_nm` STRING,
        `lfdau_nm` STRING,
        `time_unit_tmprt` STRING,
        `time_unit_rainqty` STRING,
        `time_unit_ws` STRING,
        `time_unit_wd` STRING,
        `time_unit_humidity` STRING,
        `time_unit_msnf` STRING,
        `time_unit` STRING
    )
    PARTITIONED BY (
        `dclr_yr` STRING,
        `dclr_mnth` STRING,
        `dclr_day` STRING
    )
    STORED AS PARQUET
    LOCATION
        's3://{transformed_bucket}/seoul_rescue_transformed/'
    TBLPROPERTIES (
        'classification' = 'parquet'
    )
    """
)

from datetime import datetime, timedelta, timezone

# Yesterday's calendar date in KST (UTC+9), computed from a timezone-aware
# "now" rather than the naive datetime.utcnow().
kst = timezone(timedelta(hours=9))
yesterday_kst = datetime.now(kst) - timedelta(days=1)

# Partition values as strings. Month and day carry no leading zero
# (e.g. "3", not "03").
dclr_yr = yesterday_kst.strftime('%Y')
dclr_mnth = str(yesterday_kst.month)
dclr_day = str(yesterday_kst.day)

# Use the same bucket variable as the write and the same s3:// scheme as the
# table LOCATION, so the partition entry is consistent with the table it
# belongs to (the previous s3a:// partition URI did not match the table's
# s3:// LOCATION).
partition_location = (
    f"s3://{transformed_bucket}/seoul_rescue_transformed/"
    f"dclr_yr={dclr_yr}/dclr_mnth={dclr_mnth}/dclr_day={dclr_day}/"
)

# NOTE(review): only yesterday's partition is registered here. Any other
# partitions produced by the write are not added by this statement — confirm
# that each run's input only contains yesterday's data.
spark.sql(
    f"""
    ALTER TABLE cloud9_transformed.tr_seoul_rescue
    ADD IF NOT EXISTS
    PARTITION (dclr_yr='{dclr_yr}', dclr_mnth='{dclr_mnth}', dclr_day='{dclr_day}')
        LOCATION '{partition_location}'
    """
)

DataFrame[]
