In [84]:
# Configure the AWS Glue interactive session.
# These magics only apply to newly created sessions; when a session is already
# running, the kernel reports that no change is made to it.
# idle_timeout and timeout are expressed in minutes.
# NOTE(review): timeout (30) is shorter than idle_timeout (180) — confirm both values are intended.
%idle_timeout 180
%timeout 30
%glue_version 5.0
%worker_type G.1X
%number_of_workers 2

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
  
# Reuse the running SparkContext if there is one, otherwise start a new one.
sc = SparkContext.getOrCreate()

# Wrap the SparkContext in a GlueContext; the Job handle and the SparkSession
# used by the rest of the notebook are both derived from it.
glueContext = GlueContext(sc)
job = Job(glueContext)
spark = glueContext.spark_session

You are already connected to a glueetl session 1a838cac-a610-4e71-9683-f328010f3f2d.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Current idle_timeout is 180 minutes.
idle_timeout has been set to 180 minutes.


You are already connected to a glueetl session 1a838cac-a610-4e71-9683-f328010f3f2d.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Current timeout is 30 minutes.
timeout has been set to 30 minutes.


You are already connected to a glueetl session 1a838cac-a610-4e71-9683-f328010f3f2d.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Setting Glue version to: 5.0


You are already connected to a glueetl session 1a838cac-a610-4e71-9683-f328010f3f2d.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Previous worker type: G.1X
Setting new worker type to: G.1X


You are already connected to a glueetl session 1a838cac-a610-4e71-9683-f328010f3f2d.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Previous number of workers: 2
Setting new number of workers to: 2



In [101]:
# S3 bucket names: the transformed-layer bucket and the mart-layer bucket.
transformed_bucket, mart_bucket = "cloud9-transformed", "cloud9-mart"




In [102]:
# Read the tr_fire_station table of the cloud9_transformed catalog database
# into a DataFrame.
df = glueContext.create_data_frame_from_catalog(
    database="cloud9_transformed",
    table_name="tr_fire_station",
)




In [103]:
# Show the column names and types of the loaded DataFrame.
df.printSchema()

# Preview the first five rows.
df.show(n=5)

root
 |-- fire_station_name: string (nullable = true)
 |-- fire_station_center_name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- fax: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- employee_count: long (nullable = true)
 |-- year: integer (nullable = true)

+-----------------+------------------------+--------------------------------+------------+-----------+-----------+-----------+--------------+----+
|fire_station_name|fire_station_center_name|                         address|       phone|        fax|   latitude|  longitude|employee_count|year|
+-----------------+------------------------+--------------------------------+------------+-----------+-----------+-----------+--------------+----+
| 서울소방재난본부|         개포119안전센터|서울특별시 강남구 논현로 10길...| 02-577-2540|02-529-1119|37.47470048| 127.049895|          null|null|
| 서울소방재난본부|         삼성119안전센터|서울특별시 강남구 테헤란로 62...|02-

In [104]:
from pyspark.sql.functions import col

# Remove the fire_station_name column.
df_dropped = df.drop("fire_station_name")

# Keep only the rows whose employee_count is not null.
has_employee_count = col("employee_count").isNotNull()
df_dropped_filtered = df_dropped.where(has_employee_count)

# Check the result.
df_dropped_filtered.show()

+------------------------+---------------------------------+------------+------------+-----------+-----------+--------------+----+
|fire_station_center_name|                          address|       phone|         fax|   latitude|  longitude|employee_count|year|
+------------------------+---------------------------------+------------+------------+-----------+-----------+--------------+----+
|              강남소방서| 서울특별시 강남구 테헤란로 62...|02-6981-7408| 02-556-2119|37.51027919|127.0668302|           372|2021|
|              강동소방서|  서울특별시 강동구 성내로 39 ...|02-6981-7600|02-6981-7717|37.52945788|127.1253721|           297|2021|
|              강북소방서|  서울특별시 강북구 한천로 911...|02-6946-0100|02-6946-0128|37.63305134|127.0381809|           238|2021|
|              강서소방서|  서울특별시 강서구 양천로 550...|02-6981-5000|02-2187-8243|37.55824607| 126.860047|           316|2021|
|              관악소방서|  서울특별시 관악구 관악로 97 ...|02-6981-8200| 02-877-4119|37.47412481|126.9526855|           274|2021|
|              광진소방서| 서울특별시 광진구 광

In [105]:
from pyspark.sql.functions import col, regexp_extract

# Extract the district name from address — the first run of Hangul syllables
# ending in "구" — and store it in the district_name column.
district_name_expr = regexp_extract(col("address"), r"([가-힣]+구)", 1)
df_gu_nm = df_dropped_filtered.withColumn("district_name", district_name_expr)

# Check the result.
df_gu_nm.show()

+------------------------+---------------------------------+------------+------------+-----------+-----------+--------------+----+-------------+
|fire_station_center_name|                          address|       phone|         fax|   latitude|  longitude|employee_count|year|district_name|
+------------------------+---------------------------------+------------+------------+-----------+-----------+--------------+----+-------------+
|              강남소방서| 서울특별시 강남구 테헤란로 62...|02-6981-7408| 02-556-2119|37.51027919|127.0668302|           372|2021|       강남구|
|              강동소방서|  서울특별시 강동구 성내로 39 ...|02-6981-7600|02-6981-7717|37.52945788|127.1253721|           297|2021|       강동구|
|              강북소방서|  서울특별시 강북구 한천로 911...|02-6946-0100|02-6946-0128|37.63305134|127.0381809|           238|2021|       강북구|
|              강서소방서|  서울특별시 강서구 양천로 550...|02-6981-5000|02-2187-8243|37.55824607| 126.860047|           316|2021|       강서구|
|              관악소방서|  서울특별시 관악구 관악로 97 ...|02-6981-8200| 02-8

In [106]:
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, IntegerType, DoubleType

# Target type of every mart column, listed in the output column order.
# year is cast to a string; employee_count is cast to an integer.
mart_column_types = {
    "fire_station_center_name": StringType(),
    "address": StringType(),
    "phone": StringType(),
    "fax": StringType(),
    "latitude": DoubleType(),
    "longitude": DoubleType(),
    "employee_count": IntegerType(),
    "year": StringType(),
    "district_name": StringType(),
}

# Cast each column in place, starting from df_gu_nm so that re-running the
# cell always produces the same result.
df_mart = df_gu_nm
for column_name, target_type in mart_column_types.items():
    df_mart = df_mart.withColumn(column_name, F.col(column_name).cast(target_type))

# Keep exactly the mart columns, in the order declared above.
df_mart = df_mart.select(*mart_column_types)

# Check the result.
df_mart.printSchema()

root
 |-- fire_station_center_name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- fax: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- employee_count: integer (nullable = true)
 |-- year: string (nullable = true)
 |-- district_name: string (nullable = true)


In [107]:
# Write the mart DataFrame to S3 as Parquet partitioned by year, then register
# it in the catalog as cloud9_mart.mt_firefighter.
#
# The key prefix is defined once and the bucket always comes from mart_bucket,
# so the files written below and the table LOCATION cannot drift apart when the
# bucket name is changed.
mart_prefix = "firefighter_mart"

# NOTE(review): the write uses the s3a:// scheme while the table LOCATION uses
# s3://; both schemes are kept as they were — confirm they resolve to the same
# objects in this environment.
(
    df_mart.write
    .mode("overwrite")
    .format("parquet")
    .partitionBy("year")
    .save(f"s3a://{mart_bucket}/{mart_prefix}/")
)

# Create the mt_firefighter table if it does not exist yet.
# year is declared only as the partition column, matching partitionBy("year").
spark.sql(
    f"""
    CREATE EXTERNAL TABLE IF NOT EXISTS `cloud9_mart`.mt_firefighter (
        `fire_station_center_name` STRING,
        `address` STRING,
        `phone` STRING,
        `fax` STRING,
        `latitude` DOUBLE,
        `longitude` DOUBLE,
        `employee_count` INT,
        `district_name` STRING
    )
    PARTITIONED BY (`year` STRING
    )
    ROW FORMAT SERDE
        'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
    STORED AS INPUTFORMAT
        'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
    OUTPUTFORMAT
        'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
    LOCATION
        's3://{mart_bucket}/{mart_prefix}'
    TBLPROPERTIES (
        'classification' = 'parquet'
    )
    """
)

# Load the partitions found under the table location into the catalog.
spark.sql("MSCK REPAIR TABLE cloud9_mart.mt_firefighter")

DataFrame[]
