In [1]:
#create glue session
%idle_timeout 60
%timeout 60
%glue_version 5.0
%worker_type G.1X
%number_of_workers 2

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import pyspark.sql.functions as f
  
# Obtain the SparkContext for this session (reuses an existing one if present).
sc = SparkContext.getOrCreate()
# GlueContext wraps the SparkContext; later cells use it for Data Catalog reads.
glueContext = GlueContext(sc)
# SparkSession handle; later cells use it for spark.conf.set(...) and spark.sql(...).
spark = glueContext.spark_session
# Glue Job handle bound to this context (not referenced again in the visible cells).
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
Current idle_timeout is None minutes.
idle_timeout has been set to 60 minutes.
Current timeout is None minutes.
timeout has been set to 60 minutes.
Setting Glue version to: 5.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 2
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 2
Idle Timeout: 60
Timeout: 60
Session ID: f760cbd3-e5f8-4a1d-aa5d-b92008b19748
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
Waiting for session f760cbd3-e5f8-4a1d-aa5d-b92008b1974

In [4]:
# Name of the S3 bucket for the mart layer; it is interpolated into the
# Parquet output path when the joined frame is written.
mart_bucket = 'cloud9-mart'




In [5]:
# Read both inputs from the "cloud9_transformed" Glue Data Catalog database:
#   df          <- tr_seoul_rescue    (rescue incident records)
#   df_district <- tr_district_seoul  (district reference table)
# The generator is unpacked in order, so the two catalog reads happen in the
# same sequence as two separate statements would.
df, df_district = (
    glueContext.create_data_frame_from_catalog(
        database="cloud9_transformed",
        table_name=catalog_table,
    )
    for catalog_table in ("tr_seoul_rescue", "tr_district_seoul")
)



In [7]:
# list loaded schema
#df.printSchema()

#print sample rows
#df.show(3)

In [61]:
from pyspark.sql.functions import *

# Convert a compact numeric clock value into an "HH:MM:SS" string column.
def format_time_column(column):
    """Render a compact (H)HMMSS clock value as an ``HH:MM:SS`` string column.

    The value is interpreted right-to-left as seconds, minutes, hours: it is
    cast to string, left-padded with zeros to six characters, and sliced into
    three two-character fields. Examples: ``47 -> "00:00:47"``,
    ``200 -> "00:02:00"``, ``12345 -> "01:23:45"``.

    The previous length-based ``when`` chain only handled 2 to 6 digits, so a
    one-digit value (``0`` for midnight, ``1``..``9`` seconds) matched no
    branch and produced NULL. Padding first handles 1 to 6 digits uniformly
    and yields the same strings as before for 2 to 6 digits.

    Args:
        column: A Spark ``Column`` holding the time as an integer (or a string
            of digits).

    Returns:
        A string ``Column``; NULL when the input is NULL or is not made of
        1 to 6 decimal digits (e.g. negative or over-long values).
    """
    digits = column.cast("string")
    padded = lpad(digits, 6, "0")
    # Column.substr is 1-based: (start position, length).
    hh_mm_ss = concat(
        padded.substr(1, 2), lit(":"),
        padded.substr(3, 2), lit(":"),
        padded.substr(5, 2),
    )
    # No otherwise(): anything that is not 1-6 digits stays NULL, as before.
    return when(digits.rlike("^[0-9]{1,6}$"), hh_mm_ss)

# Build the mart-shaped frame: rename source columns, cast the numeric fields,
# derive report/arrival timestamps and the response time in minutes.

# source column -> mart column (plain renames)
renamed_columns = {
    "msfrtn_resc_reprt_no": "incident_report_id",
    "dclr_ymd": "report_date",
    "dsp_ymd": "dispatch_date",
    "spt_arvl_ymd": "arrival_date",
    "resc_cmptn_ymd": "completion_date",
    "hmg_ymd": "return_date",
    "sigungu_nm": "district_name",
}

# mart column -> source column that is cast to INT
int_cast_columns = {
    "report_time": "dclr_tm",
    "dispatch_time": "dsp_tm",
    "arrival_time": "spt_arvl_tm",
    "completion_time": "resc_cmptn_tm",
    "return_time": "hmg_tm",
    "report_yr": "dclr_yr",
    "report_mnth": "dclr_mnth",
    "report_day": "dclr_day",
}

# Final column order of the transformed frame.
output_columns = [
    "incident_report_id",
    "report_date",
    "report_time",
    "dispatch_date",
    "dispatch_time",
    "arrival_date",
    "arrival_time",
    "completion_date",
    "completion_time",
    "return_date",
    "return_time",
    "report_yr",
    "report_mnth",
    "report_day",
    "district_name",
    "report_timestamp",
    "arrival_timestamp",
    "time_difference",
]

df_renamed = df
for source_name, mart_name in renamed_columns.items():
    df_renamed = df_renamed.withColumnRenamed(source_name, mart_name)

df_typed = df_renamed
for mart_name, source_name in int_cast_columns.items():
    df_typed = df_typed.withColumn(mart_name, col(source_name).cast("int"))

# "<date> <HH:MM:SS>" parsed into a timestamp, for report and arrival.
report_timestamp_expr = to_timestamp(
    concat(col("report_date"), lit(" "), format_time_column(col("report_time")))
)
arrival_timestamp_expr = to_timestamp(
    concat(col("arrival_date"), lit(" "), format_time_column(col("arrival_time")))
)

# Minutes between report and arrival, to two decimals. `round` here is the
# Spark SQL function brought in by the star import, not the Python builtin.
elapsed_minutes_expr = round(
    (unix_timestamp("arrival_timestamp") - unix_timestamp("report_timestamp")) / 60,
    2,
)

df_transformed = (
    df_typed
    .withColumn("report_timestamp", report_timestamp_expr)
    .withColumn("arrival_timestamp", arrival_timestamp_expr)
    .withColumn("time_difference", elapsed_minutes_expr)
    .select(*output_columns)
)

# 결과 확인
#df_transformed.printSchema()
#df_transformed.show(5, truncate=False)




In [62]:
# Distinct district names from the reference table; used only to filter incidents.
df_district_distinct = df_district.select("district_name").distinct()

# Columns carried over from df_transformed (district_name itself is dropped).
kept_columns = [
    "incident_report_id",
    "report_date",
    "report_time",
    "dispatch_date",
    "dispatch_time",
    "arrival_date",
    "arrival_time",
    "completion_date",
    "completion_time",
    "return_date",
    "return_time",
    "report_yr",
    "report_mnth",
    "report_day",
    "report_timestamp",
    "arrival_timestamp",
    "time_difference",
]

# Inner join keeps only incidents whose district_name exists in the reference table.
matched_incidents = df_transformed.join(
    df_district_distinct,
    df_transformed["district_name"] == df_district_distinct["district_name"],
    how="inner",
)
df_joined = matched_incidents.select(
    *[df_transformed[column_name] for column_name in kept_columns]
)

# Replace dates earlier than 1900-01-01 with NULL.
# NOTE(review): completion_date and return_date are not sanitised here, and the
# timestamps derived earlier still carry the original dates - confirm intended.
for date_column in ("report_date", "dispatch_date", "arrival_date"):
    df_joined = df_joined.withColumn(
        date_column,
        when(col(date_column) < "1900-01-01", None).otherwise(col(date_column)),
    )


# 결과 확인
#df_joined.show(5, truncate=False)

+------------------+-----------+-----------+-------------+-------------+------------+------------+---------------+---------------+-----------+-----------+---------+-----------+----------+-------------------+-------------------+---------------+
|incident_report_id|report_date|report_time|dispatch_date|dispatch_time|arrival_date|arrival_time|completion_date|completion_time|return_date|return_time|report_yr|report_mnth|report_day|report_timestamp   |arrival_timestamp  |time_difference|
+------------------+-----------+-----------+-------------+-------------+------------+------------+---------------+---------------+-----------+-----------+---------+-----------+----------+-------------------+-------------------+---------------+
|20181117507S01524 |2018-05-17 |47         |2018-05-17   |200          |2018-05-17  |400         |2018-05-17     |2000           |2018-05-17 |3000       |2018     |5          |17        |2018-05-17 00:00:47|2018-05-17 00:04:00|3.22           |
|20181113509S01275 |2018

In [63]:
from datetime import datetime, timedelta, timezone

# --- Writer configuration ---------------------------------------------------
# Dynamic partition overwrite: only partitions present in df_joined are replaced.
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")
# Rebase mode for DATE / TIMESTAMP_MICROS / TIMESTAMP_MILLIS values.
spark.conf.set("spark.sql.parquet.datetimeRebaseModeInWrite", "LEGACY")
# Spark 3.x writes TIMESTAMP columns as INT96 by default, and INT96 has its own
# rebase setting; without it the setting above does not apply to
# report_timestamp / arrival_timestamp.
# NOTE(review): the save() traceback recorded in this notebook is truncated, so
# it is not confirmed that pre-1900 timestamps were the cause - verify on re-run.
spark.conf.set("spark.sql.parquet.int96RebaseModeInWrite", "LEGACY")

# --- Locations --------------------------------------------------------------
# Derived from mart_bucket so the write path, the table LOCATION and the
# partition LOCATION cannot drift apart.
# NOTE(review): the original mixed s3a:// (write, partition) and s3:// (table);
# the schemes are kept as they were - confirm whether they should be unified.
mart_prefix = "incident_time_mart"
write_path = f"s3a://{mart_bucket}/{mart_prefix}/"
table_location = f"s3://{mart_bucket}/{mart_prefix}/"

# --- Write the joined frame as partitioned Parquet --------------------------
(
    df_joined.write
    .mode("overwrite")
    .format("parquet")
    .partitionBy("report_yr", "report_mnth", "report_day")
    .save(write_path)
)

# --- Create the mt_incident_time table if it does not exist -----------------
spark.sql(
    f"""
    CREATE EXTERNAL TABLE IF NOT EXISTS cloud9_mart.mt_incident_time (
        `incident_report_id` STRING,
        `report_date` DATE,
        `report_time` INT,
        `dispatch_date` DATE,
        `dispatch_time` INT,
        `arrival_date` DATE,
        `arrival_time` INT,
        `completion_date` DATE,
        `completion_time` INT,
        `return_date` DATE,
        `return_time` INT,
        `report_timestamp` TIMESTAMP,
        `arrival_timestamp` TIMESTAMP,
        `time_difference` DOUBLE
    )
    PARTITIONED BY (
        `report_yr` STRING,
        `report_mnth` STRING,
        `report_day` STRING
    )
    STORED AS PARQUET
    LOCATION
        '{table_location}'
    TBLPROPERTIES (
        'classification' = 'parquet'
    )
    """
)

# --- Register yesterday's partition (KST) -----------------------------------
# Timezone-aware "now" in KST (UTC+9) replaces the deprecated datetime.utcnow().
kst = timezone(timedelta(hours=9))
yesterday_kst = datetime.now(kst) - timedelta(days=1)

# Partition values carry no leading zero (e.g. report_mnth=5), so use the
# integer month/day directly instead of stripping zeros from strftime output.
report_yr = yesterday_kst.strftime('%Y')
report_mnth = str(yesterday_kst.month)
report_day = str(yesterday_kst.day)

partition_location = (
    f"{write_path}report_yr={report_yr}/report_mnth={report_mnth}/report_day={report_day}/"
)

# NOTE(review): only yesterday's partition is registered, while the write above
# may touch other partitions of df_joined - confirm that is sufficient.
spark.sql(
    f"""
    ALTER TABLE cloud9_mart.mt_incident_time
    ADD IF NOT EXISTS
    PARTITION (report_yr='{report_yr}', report_mnth='{report_mnth}', report_day='{report_day}')
        LOCATION '{partition_location}'
    """
)

Py4JJavaError: An error occurred while calling o4147.save.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.jobAbortedError(QueryExecutionErrors.scala:638)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:279)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:193)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:103)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:107)
	at org.apa