In [1]:
# Create the Glue interactive session.
# The magics below request: idle timeout 30 and overall timeout 30 (minutes,
# per the kernel's own messages), Glue version 5.0, two G.1X workers, and the
# extra Python module pyproj, which the coordinate-conversion cell imports.
# NOTE(review): the recorded output of this cell reports both timeouts as
# 60 minutes, so it appears to predate these values - re-run to confirm.
%idle_timeout 30
%timeout 30
%glue_version 5.0
%worker_type G.1X
%number_of_workers 2
%additional_python_modules pyproj

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import pyspark.sql.functions as f
  
# Get the running SparkContext, or create one if none exists yet.
sc = SparkContext.getOrCreate()
# GlueContext wraps the SparkContext; later cells use it to read catalog tables.
glueContext = GlueContext(sc)
# SparkSession used by later cells for Spark SQL and session-level settings.
spark = glueContext.spark_session
# Glue Job handle; it is not referenced again in the cells visible here.
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
Current idle_timeout is None minutes.
idle_timeout has been set to 60 minutes.
Current timeout is None minutes.
timeout has been set to 60 minutes.
Setting Glue version to: 5.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 2
Additional python modules to be included:
pyproj
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 2
Idle Timeout: 60
Timeout: 60
Session ID: 53ede21f-0757-4483-a4ec-1110a4fa05d4
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
--addi

In [2]:
# S3 bucket names for the transformed layer and the mart layer.
# mart_bucket is the write target of the final cell.
transformed_bucket, mart_bucket = 'cloud9-transformed', 'cloud9-mart'




In [3]:
# Read the rescue incident table from the cloud9_transformed catalog database
df = glueContext.create_data_frame_from_catalog(
    database="cloud9_transformed",
    table_name="tr_seoul_rescue",
)

# Read the Seoul district table from the same catalog database
df_district = glueContext.create_data_frame_from_catalog(
    database="cloud9_transformed",
    table_name="tr_district_seoul",
)



In [8]:
# list loaded schema
#df.printSchema()

#print sample rows
#df.show(3)

In [5]:
# Rename the source columns to the mart's names and cast the coordinates
from pyspark.sql.functions import *
from pyspark.sql.types import *

# One expression per output column, listed in output order.
# gis_y_axis / gis_x_axis are cast to double and exposed as latitude / longitude;
# every other column is a plain rename.
mart_column_exprs = [
    col("msfrtn_resc_reprt_no").alias("incident_report_id"),
    col("acdnt_cause").alias("cause"),
    col("prcs_result_se_nm").alias("result"),
    col("sigungu_nm").alias("district_name"),
    col("emd_nm").alias("dong_name"),
    col("cty_frmvl_se_nm").alias("city_rural_category"),
    col("gis_y_axis").cast(DoubleType()).alias("latitude"),
    col("gis_x_axis").cast(DoubleType()).alias("longitude"),
    col("acdnt_place_nm").alias("place_name"),
    col("acdnt_place_detail_nm").alias("place_detail"),
    col("acdnt_cause_asort_nm").alias("cause_subcategory"),
    col("frstt_nm").alias("fire_station_name"),
    col("ward_nm").alias("fire_station_center_name"),
    col("dclr_yr").alias("report_yr"),
    col("dclr_mnth").alias("report_mnth"),
    col("dclr_day").alias("report_day"),
]

df_transformed = df.select(*mart_column_exprs)

# Check the result
#df_transformed.printSchema()
#df_transformed.show(3)

root
 |-- incident_report_id: string (nullable = true)
 |-- cause: string (nullable = true)
 |-- result: string (nullable = true)
 |-- district_name: string (nullable = true)
 |-- dong_name: string (nullable = true)
 |-- city_rural_category: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- place_name: string (nullable = true)
 |-- place_detail: string (nullable = true)
 |-- cause_subcategory: string (nullable = true)
 |-- fire_station_name: string (nullable = true)
 |-- fire_station_center_name: string (nullable = true)
 |-- report_yr: integer (nullable = true)
 |-- report_mnth: integer (nullable = true)
 |-- report_day: integer (nullable = true)


In [6]:
# Keep only the incidents whose district_name also appears in the district table
query = """
SELECT 
    i.*
FROM 
    incident_tmp i
INNER JOIN 
    (SELECT DISTINCT district_name FROM district_tmp) d
ON 
    i.district_name = d.district_name
"""

# Expose both DataFrames to Spark SQL under the view names the query refers to
temp_views = {"incident_tmp": df_transformed, "district_tmp": df_district}
for view_name, frame in temp_views.items():
    frame.createOrReplaceTempView(view_name)

# Run the query; the result is a new DataFrame
df_joined = spark.sql(query)

# Check the result
#df_joined.show()

+------------------+------------+--------+-------------+---------+-------------------+-----------+-----------+-------------+---------------+-----------------+-----------------+------------------------+---------+-----------+----------+
|incident_report_id|       cause|  result|district_name|dong_name|city_rural_category|   latitude|  longitude|   place_name|   place_detail|cause_subcategory|fire_station_name|fire_station_center_name|report_yr|report_mnth|report_day|
+------------------+------------+--------+-------------+---------+-------------------+-----------+-----------+-------------+---------------+-----------------+-----------------+------------------------+---------+-----------+----------+
| 20181117507S01524|        기타|안전조치|       강서구|   등촌동|               도시|550845.0514|187745.1455|     공동주택| 주상복합아파트|             기타|       강서소방서|              현장대응단|     2018|          5|        17|
| 20181113509S01275|    안전조치|안전조치|       마포구|   대흥동|               도시|  550592.21|  195244.81|   

In [7]:
import math

from pyproj import Transformer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType, StructField, StructType

# Coordinate converter (EPSG:5186 -> EPSG:4326).
# No always_xy flag is passed: transform() is fed the "latitude" column first
# and the "longitude" column second, and its result is unpacked as (lat, lon).
transformer = Transformer.from_crs("EPSG:5186", "EPSG:4326")


def transform_coordinates(x, y):
    """Convert one EPSG:5186 coordinate pair to EPSG:4326.

    Args:
        x: first value handed to ``transformer.transform``; the UDF below
            passes the ``latitude`` column here. May be None.
        y: second value handed to ``transformer.transform``; the UDF below
            passes the ``longitude`` column here. May be None.

    Returns:
        ``(lat, lon)``, or ``(None, None)`` when either input is None or the
        conversion does not yield finite numbers.
    """
    if x is None or y is None:
        return None, None
    lat, lon = transformer.transform(x, y)
    # pyproj signals a failed conversion with inf instead of raising
    # (errcheck defaults to False). Store those rows as NULL rather than
    # writing inf/nan coordinates into the mart.
    if not (math.isfinite(lat) and math.isfinite(lon)):
        return None, None
    return lat, lon


# Register as a UDF that returns a (latitude, longitude) struct
coordinate_schema = StructType([
    StructField("latitude", DoubleType()),
    StructField("longitude", DoubleType())
])
transform_coordinates_udf = udf(transform_coordinates, coordinate_schema)

# Apply the UDF, then replace latitude/longitude with the converted values.
# NOTE: df_joined is overwritten in place, so running this cell a second time
# without re-running the join cell would convert the coordinates twice.
df_joined = (
    df_joined
    .withColumn(
        "coordinates",
        transform_coordinates_udf(col("latitude"), col("longitude"))
    )
    .withColumn("latitude", col("coordinates").getField("latitude"))
    .withColumn("longitude", col("coordinates").getField("longitude"))
    .drop("coordinates")
)

# Check the result
#df_joined.show(3)

+------------------+------------+--------+-------------+---------+-------------------+------------------+------------------+----------+--------------+-----------------+-----------------+------------------------+---------+-----------+----------+
|incident_report_id|       cause|  result|district_name|dong_name|city_rural_category|          latitude|         longitude|place_name|  place_detail|cause_subcategory|fire_station_name|fire_station_center_name|report_yr|report_mnth|report_day|
+------------------+------------+--------+-------------+---------+-------------------+------------------+------------------+----------+--------------+-----------------+-----------------+------------------------+---------+-----------+----------+
| 20181117507S01524|        기타|안전조치|       강서구|   등촌동|               도시|  37.5570504106502|126.86130475937877|  공동주택|주상복합아파트|             기타|       강서소방서|              현장대응단|     2018|          5|        17|
| 20181113509S01275|    안전조치|안전조치|       마포구|   대흥동|     

In [8]:
from datetime import datetime, timedelta, timezone

# Dynamic mode: an overwrite replaces only the partitions present in the
# DataFrame being written, not everything under the target path.
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

# Every path below is derived from mart_bucket, so the write path, the table
# location and the partition location cannot drift apart. The catalog entries
# (table and partition) share the s3:// scheme.
mart_prefix = 'incident_report_mart'
mart_table_location = f's3://{mart_bucket}/{mart_prefix}/'

# Write the joined data to S3 as parquet, partitioned by report date
df_joined.write \
    .mode("overwrite") \
    .format('parquet') \
    .partitionBy("report_yr", "report_mnth", "report_day") \
    .save(f's3a://{mart_bucket}/{mart_prefix}/')

# Create the mt_incident_report table if it does not exist yet.
# NOTE(review): the partition columns are declared STRING here, while the
# DataFrame schema shows them as integers - confirm this is intended.
spark.sql(
    f"""
    CREATE EXTERNAL TABLE IF NOT EXISTS cloud9_mart.mt_incident_report (
        `incident_report_id` STRING,
        `cause` STRING,
        `result` STRING,
        `district_name` STRING,
        `dong_name` STRING,
        `city_rural_category` STRING,
        `latitude` DOUBLE,
        `longitude` DOUBLE,
        `place_name` STRING,
        `place_detail` STRING,
        `cause_subcategory` STRING,
        `fire_station_name` STRING,
        `fire_station_center_name` STRING
    )
    PARTITIONED BY (
        `report_yr` STRING,
        `report_mnth` STRING,
        `report_day` STRING
    )
    STORED AS PARQUET
    LOCATION
        '{mart_table_location}'
    TBLPROPERTIES (
        'classification' = 'parquet'
    )
    """
)

# Add yesterday's partition.
# Yesterday is computed in KST (UTC+9). datetime.utcnow() is deprecated since
# Python 3.12, so the current time is taken from a fixed-offset timezone and
# then made naive again, matching the naive value this variable held before.
kst = timezone(timedelta(hours=9))
yesterday_kst = datetime.now(kst).replace(tzinfo=None) - timedelta(days=1)

# Year, month and day without zero padding, matching the directory names
# produced by partitionBy on the integer columns (e.g. report_mnth=5).
report_yr = str(yesterday_kst.year)
report_mnth = str(yesterday_kst.month)
report_day = str(yesterday_kst.day)

# Register the partition through Spark SQL.
# NOTE(review): only yesterday's partition is registered, but the write above
# covers every partition present in df_joined (the sample output shows
# report_yr=2018 rows) - confirm the other partitions are registered elsewhere.
spark.sql(
    f"""
    ALTER TABLE cloud9_mart.mt_incident_report
    ADD IF NOT EXISTS
    PARTITION (report_yr='{report_yr}', report_mnth='{report_mnth}', report_day='{report_day}')
        LOCATION '{mart_table_location}report_yr={report_yr}/report_mnth={report_mnth}/report_day={report_day}/'
    """
)

DataFrame[]
