In [8]:
# Configure the AWS Glue interactive session.
# NOTE: these magics only apply to newly created sessions. If a session is
# already running, the kernel reports that no change is made to it, so run this
# cell before any code statement on a fresh kernel.
# Idle timeout (the kernel reports this value in minutes).
%idle_timeout 60
# Session timeout (the kernel reports this value in minutes).
%timeout 60
# Glue version requested for the session.
%glue_version 5.0
# Worker type requested for the session.
%worker_type G.1X
# Number of workers requested for the session.
%number_of_workers 2

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import pyspark.sql.functions as F
  
# Obtain the SparkContext (getOrCreate returns the existing one if present).
sc = SparkContext.getOrCreate()
# GlueContext built on that SparkContext; used below to read catalog tables.
glueContext = GlueContext(sc)
# SparkSession exposed by the GlueContext; used below for Spark SQL statements.
spark = glueContext.spark_session
# Job handle bound to the GlueContext (not referenced again in the visible cells).
job = Job(glueContext)

You are already connected to a glueetl session 2463911c-0b38-49a2-9534-b7d5bc87a9a6.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Current idle_timeout is None minutes.
idle_timeout has been set to 60 minutes.


You are already connected to a glueetl session 2463911c-0b38-49a2-9534-b7d5bc87a9a6.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Current timeout is None minutes.
timeout has been set to 60 minutes.


You are already connected to a glueetl session 2463911c-0b38-49a2-9534-b7d5bc87a9a6.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Setting Glue version to: 5.0


You are already connected to a glueetl session 2463911c-0b38-49a2-9534-b7d5bc87a9a6.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Previous worker type: None
Setting new worker type to: G.1X


You are already connected to a glueetl session 2463911c-0b38-49a2-9534-b7d5bc87a9a6.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Previous number of workers: None
Setting new number of workers to: 2



In [9]:
# S3 bucket names used by this notebook.
# NOTE(review): transformed_bucket is not referenced in the visible cells.
transformed_bucket = 'cloud9-transformed'
# Bucket that receives the mart output written in the final cell.
mart_bucket = 'cloud9-mart'




In [10]:
# Load the two source tables from the Glue Data Catalog as Spark DataFrames.
# The district table is read from the "cloud9_transformed" database ...
df_district = glueContext.create_data_frame_from_catalog(
    database="cloud9_transformed",
    table_name="tr_district_seoul",
)
# ... and the fire-station table is read from the "cloud9_mart" database.
df_station = glueContext.create_data_frame_from_catalog(
    database="cloud9_mart",
    table_name="mt_fire_station",
)



In [11]:
# Print the schema of each loaded DataFrame: district first, then station.
for _loaded_frame in (df_district, df_station):
    _loaded_frame.printSchema()

root
 |-- district_name: string (nullable = true)
 |-- dong_name: string (nullable = true)
 |-- population: integer (nullable = true)
 |-- area: double (nullable = true)
 |-- density: integer (nullable = true)
 |-- year: integer (nullable = true)

root
 |-- fire_station_name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- fax: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- employee_count: long (nullable = true)
 |-- year: integer (nullable = true)
 |-- fire_station_center_name: string (nullable = true)


In [32]:
# Extract the district ("gu") name from the address column into a new gu_nm column.
from pyspark.sql.functions import col, regexp_extract

# Group index 0 selects the whole match: a run of Hangul syllables ending in "구".
gu_name_pattern = r"([가-힣]+구)"
df_station_gu_nm = df_station.withColumn(
    "gu_nm",
    regexp_extract(col("address"), gu_name_pattern, 0),
)

# One output row per gu_nm:
#   subdistrict_count    - number of station rows with a non-null gu_nm
#   total_employee_count - sum of employee_count over those rows
station_aggregates = [
    F.count("gu_nm").alias("subdistrict_count"),
    F.sum("employee_count").alias("total_employee_count"),
]
df_station_gu_nm_grouped = df_station_gu_nm.groupby("gu_nm").agg(*station_aggregates)




In [25]:
# Preview the per-district (gu_nm) fire-station aggregates.
df_station_gu_nm_grouped.show()

+--------+-----------------+--------------------+
|   gu_nm|subdistrict_count|total_employee_count|
+--------+-----------------+--------------------+
|영등포구|                6|                 284|
|  성북구|                5|                 248|
|  노원구|                6|                 275|
|서대문구|                5|                 244|
|  용산구|                6|                 252|
|  광진구|                4|                 223|
|  송파구|                7|                 324|
|  동작구|                5|                 237|
|  마포구|                7|                 314|
|  관악구|                5|                 274|
|  은평구|                5|                 257|
|  중랑구|                5|                 245|
|  강동구|                7|                 300|
|  도봉구|                5|                 231|
|  강북구|                5|                 241|
|  금천구|                3|                 187|
|  구로구|                7|                 303|
|  서초구|                7|                 321|
|  강서구

In [14]:
#df_district.show()

+-------------+---------+----------+----+-------+----+
|district_name|dong_name|population|area|density|year|
+-------------+---------+----------+----+-------+----+
|       강남구|  개포1동|     11907|1.27|   9376|2023|
|       강남구|  개포2동|     41804|2.51|  16655|2023|
|       강남구|  개포3동|     16894|1.24|  13624|2023|
|       강남구|  개포4동|     23384|1.49|  15694|2023|
|       강남구|  논현1동|     21759|1.25|  17407|2023|
|       강남구|  논현2동|     20774|1.47|  14132|2023|
|       강남구|  대치1동|     23934|0.79|  30296|2023|
|       강남구|  대치2동|     38417| 2.0|  19209|2023|
|       강남구|  대치4동|     18929|0.73|  25930|2023|
|       강남구|  도곡1동|     20701|1.02|  20295|2023|
|       강남구|  도곡2동|     32738|1.02|  32096|2023|
|       강남구|  삼성1동|     12384|1.94|   6384|2023|
|       강남구|  삼성2동|     30888|1.24|  24910|2023|
|       강남구|   세곡동|     45724|6.36|   7189|2023|
|       강남구|   수서동|     14090|1.43|   9853|2023|
|       강남구|   신사동|     15681|1.89|   8297|2023|
|       강남구| 압구정동|     25755|2.53|  10180|2023|
|  

In [33]:
# Aggregate the district_seoul data to one row per district (gu).
from pyspark.sql.functions import col
# Star import kept on purpose: the later cast cell uses a type name that only
# this import brings into scope.
from pyspark.sql.types import *

# Most recent value of the 'year' column. F.max is used instead of importing
# `max` by name, which would shadow Python's builtin max() in every later cell.
recent_year = df_district.agg(F.max('year')).collect()[0][0]

# Keep only rows from the most recent year; rows whose year is NULL are kept too.
filtered_df_district = df_district.filter(
    (col('year') == recent_year) | col('year').isNull()
)

# Group by district.
# NOTE(review): total_density is the plain sum of the per-dong density values,
# not total_population / total_area -- confirm this is the intended metric.
filtered_df_district_grouped = (
    filtered_df_district.groupby("district_name")
    .agg(
        F.count("district_name").alias("gu_dong_count"),
        F.sum("population").alias("total_population"),
        # Round to two decimal places so floating-point summation noise
        # (e.g. 5.0200000000000005) does not leak into the mart.
        F.round(F.sum("area"), 2).alias("total_area"),
        F.sum("density").alias("total_density"),
    )
)

# Uncomment to inspect the filtered rows.
#filtered_df_district.show()




In [37]:
# Print the schema of the per-district aggregate DataFrame.
filtered_df_district_grouped.printSchema()

root
 |-- district_name: string (nullable = true)
 |-- gu_dong_count: integer (nullable = false)
 |-- total_population: double (nullable = true)
 |-- total_area: double (nullable = true)
 |-- total_density: integer (nullable = true)


In [35]:
from pyspark.sql.functions import col
# StringType is imported explicitly: the cell previously relied on a star import
# from a different cell and failed with NameError when run without it.
from pyspark.sql.types import DoubleType, IntegerType, StringType

# Pin every aggregate column to an explicit type and fix the column order.
# A single select replaces the chain of withColumn calls; each cast is aliased
# back to its own name so the output column names are unchanged.
district_column_types = [
    ("district_name", StringType()),
    ("gu_dong_count", IntegerType()),
    ("total_population", DoubleType()),
    ("total_area", DoubleType()),
    ("total_density", IntegerType()),
]
filtered_df_district_grouped = filtered_df_district_grouped.select(
    *[
        col(column_name).cast(column_type).alias(column_name)
        for column_name, column_type in district_column_types
    ]
)




In [30]:
# Full outer join of the district aggregates with the station aggregates,
# matching district_name against the gu_nm extracted from the station address.
join_condition = (
    filtered_df_district_grouped.district_name == df_station_gu_nm_grouped.gu_nm
)
df_mart = filtered_df_district_grouped.join(
    df_station_gu_nm_grouped,
    on=join_condition,
    how="outer",
)




In [36]:
# Write the mart data to S3 and register it in the Glue Data Catalog.
mart_table = "cloud9_mart.mt_join_recent_district_seoul_fire_station"
mart_prefix = "join_recent_district_seoul_fire_station_mart"

# Project the joined frame onto the column names and types the table declares.
# Parquet columns are resolved by name, so without this the table's
# `fire_station` / `firefighter` columns would read as NULL and
# `subdistrict_count` would silently contain the station count.
#   gu_dong_count        -> subdistrict_count (dongs per district)
#   subdistrict_count    -> fire_station      (station rows per district)
#   total_employee_count -> firefighter       (summed employee_count)
df_mart_out = df_mart.select(
    F.col("district_name").cast("string").alias("district_name"),
    F.col("gu_dong_count").cast("int").alias("subdistrict_count"),
    F.col("total_population").cast("double").alias("total_population"),
    F.col("total_area").cast("double").alias("total_area"),
    F.col("total_density").cast("int").alias("total_density"),
    F.col("subdistrict_count").cast("int").alias("fire_station"),
    F.col("total_employee_count").cast("int").alias("firefighter"),
    # Partition column. Because the join is a full outer join, gu_nm is NULL
    # for any district without a matching station row.
    F.col("gu_nm"),
)

df_mart_out.write \
    .mode("overwrite") \
    .format('parquet') \
    .partitionBy("gu_nm") \
    .save(f's3a://{mart_bucket}/{mart_prefix}/')

# Create the mart table if it does not exist yet. The data is written under
# gu_nm=<value>/ directories, so gu_nm is declared as the partition column.
# NOTE(review): IF NOT EXISTS leaves an already-registered table definition
# untouched; drop or alter the old table if it was created without partitions.
spark.sql(
    f"""
    CREATE EXTERNAL TABLE IF NOT EXISTS {mart_table} (
        `district_name` STRING,
        `subdistrict_count` int,
        `total_population` double,
        `total_area` double,
        `total_density` int,
        `fire_station` int,
        `firefighter` int
    )
    PARTITIONED BY (`gu_nm` STRING)
    ROW FORMAT SERDE
        'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
    STORED AS INPUTFORMAT
        'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
    OUTPUTFORMAT
        'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
    LOCATION
        's3://{mart_bucket}/{mart_prefix}'
    TBLPROPERTIES (
        'classification' = 'parquet'
    )
    """
)

# Register the partitions of the table that was just written. The repair
# previously targeted cloud9_mart.mt_fire_station, a different table.
spark.sql(f"MSCK REPAIR TABLE {mart_table}")

DataFrame[]
