In [1]:
%idle_timeout 60
%timeout 30
%glue_version 5.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import pyspark.sql.functions as f
  
# Reuse the already-running SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext.
glueContext = GlueContext(sc)
# SparkSession handle; the later cells use it for spark.read and spark.sql.
spark = glueContext.spark_session
# Glue Job handle built on the same context. It is not referenced again in the
# visible cells (no job.init / job.commit call appears here).
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
Current idle_timeout is None minutes.
idle_timeout has been set to 60 minutes.
Current timeout is None minutes.
timeout has been set to 30 minutes.
Setting Glue version to: 5.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 5
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 5
Idle Timeout: 60
Timeout: 30
Session ID: 85042cf0-7ae7-4e34-bd4f-aadb337d4d04
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
Waiting for session 85042cf0-7ae7-4e34-bd4f-aadb337d4d0

In [2]:
# Name of the S3 bucket; interpolated into the S3 paths built by the read and write cells.
raw_bucket = 'cloud9-batch-raw'




In [3]:
# Load the source CSV files into a DataFrame.
# The first line of each file is treated as the header and the files are
# decoded as UTF-8. No schema is supplied or inferred here, so every column
# is read as a string (see the printed schema).
source_path = f's3://{raw_bucket}/district_seoul/'
df = (
    spark.read
    .options(header="true", encoding="utf-8")
    .csv(source_path)
)

# Inspect the schema and the first 5 rows without truncating cell values.
df.printSchema()
df.show(5, truncate=False)

root
 |-- 지역구: string (nullable = true)
 |-- 읍면동: string (nullable = true)
 |-- 인구 (명): string (nullable = true)
 |-- 면적 (㎢): string (nullable = true)
 |-- 인구밀도 (명/㎢): string (nullable = true)
 |-- 연도: string (nullable = true)

+------+-------+---------+---------+----------------+----+
|지역구|읍면동 |인구 (명)|면적 (㎢)|인구밀도 (명/㎢)|연도|
+------+-------+---------+---------+----------------+----+
|강남구|개포1동|20111    |1.27     |15835           |2017|
|강남구|개포2동|14678    |2.51     |5848            |2017|
|강남구|개포3동|-        |-        |-               |2017|
|강남구|개포4동|18525    |1.49     |12433           |2017|
|강남구|논현1동|24186    |1.25     |19349           |2017|
+------+-------+---------+---------+----------------+----+
only showing top 5 rows


In [5]:
# Target column names, listed in the same positional order as the columns of
# the loaded DataFrame (the rename below is by position, not by source name).
new_cols = ["district_name", "dong_name", "population", "area", "density", "year"]

# Rename every column in a single step. Unlike a per-index withColumnRenamed
# loop, toDF raises when the number of names does not match the number of
# columns, instead of silently renaming only a prefix or failing with an
# IndexError halfway through.
df = df.toDF(*new_cols)

# Check the result.
df.printSchema()
df.show(5, False)

root
 |-- district_name: string (nullable = true)
 |-- dong_name: string (nullable = true)
 |-- population: string (nullable = true)
 |-- area: string (nullable = true)
 |-- density: string (nullable = true)
 |-- year: string (nullable = true)

+-------------+---------+----------+----+-------+----+
|district_name|dong_name|population|area|density|year|
+-------------+---------+----------+----+-------+----+
|강남구       |개포1동  |20111     |1.27|15835  |2017|
|강남구       |개포2동  |14678     |2.51|5848   |2017|
|강남구       |개포3동  |-         |-   |-      |2017|
|강남구       |개포4동  |18525     |1.49|12433  |2017|
|강남구       |논현1동  |24186     |1.25|19349  |2017|
+-------------+---------+----------+----+-------+----+
only showing top 5 rows


In [9]:
# Single definition of where the partitioned data lives. Both the writer and
# the table LOCATION below are derived from it, so the URI scheme and bucket
# cannot drift apart (previously the write used s3a:// with raw_bucket while
# the DDL hardcoded an s3:// path).
partitioned_path = f's3://{raw_bucket}/district_seoul_raw_partitioned/'

# Write the DataFrame as CSV, replacing any existing output under the path.
# - header is disabled because the table definition below declares no
#   header-skipping property, so a header line would be read as a data row.
# - partitionBy("year") moves the year column out of the files and into the
#   directory layout, matching the table's PARTITIONED BY clause.
df.write \
  .mode("overwrite") \
  .option("header", "false") \
  .partitionBy("year") \
  .csv(partitioned_path)

# Register an external, comma-delimited text table over the written files.
# The DataFrame columns are all strings while the table declares bigint/double
# columns. NOTE(review): placeholder values such as "-" presumably surface as
# NULL when queried through this table - confirm that is acceptable.
spark.sql(
    f"""
    CREATE EXTERNAL TABLE if not exists cloud9_raw.rw_district_seoul (
        `district_name` string,
        `dong_name` string, 
        `population` bigint, 
        `area` double, 
        `density` bigint
    )
    PARTITIONED BY ( 
        `year` bigint
    )
    ROW FORMAT DELIMITED 
        FIELDS TERMINATED BY ',' 
    STORED AS INPUTFORMAT 
        'org.apache.hadoop.mapred.TextInputFormat' 
    OUTPUTFORMAT 
        'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
    LOCATION
        '{partitioned_path}'
    TBLPROPERTIES (
        'classification'='csv'
    )
    """
)

# Scan the table location and add the partition directories written above to
# the catalog so they become queryable.
spark.sql("MSCK REPAIR TABLE cloud9_raw.rw_district_seoul")

DataFrame[]
