In [1]:
# Glue interactive-session settings. They are applied when the session is
# created, so this cell has to run before any Spark code.
# NOTE(review): %timeout (30 min) is shorter than %idle_timeout (60 min);
# presumably the session is stopped by the overall timeout before the idle
# timeout can ever apply - confirm that this is intended.
%idle_timeout 60
%timeout 30
# Glue runtime version and cluster size: 3 workers of type G.1X.
%glue_version 5.0
%worker_type G.1X
%number_of_workers 3

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import pyspark.sql.functions as f
  
# Bootstrap the Spark / Glue entry points used by the rest of the notebook.
# Reuse the SparkContext if one is already running, otherwise create it.
sc = SparkContext.getOrCreate()
# Glue wrapper around the SparkContext.
glueContext = GlueContext(sc)
# SparkSession that later cells use for spark.read, spark.conf and spark.sql.
spark = glueContext.spark_session
# Glue Job handle; it is not referenced again in the visible cells.
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
Current idle_timeout is None minutes.
idle_timeout has been set to 60 minutes.
Current timeout is None minutes.
timeout has been set to 30 minutes.
Setting Glue version to: 5.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 3
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 3
Idle Timeout: 60
Timeout: 30
Session ID: 37007a84-29d8-4685-9126-2e6e478c061f
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
Waiting for session 37007a84-29d8-4685-9126-2e6e478c061

In [23]:
# S3 bucket that holds the raw batch data; the cells below build their S3
# paths from this name.
raw_bucket: str = "cloud9-batch-raw"




In [24]:
# Load every CSV object under the seoul_rescue/ prefix of the raw bucket.
# The first line of each file is taken as the header; no schema is given, so
# all columns are read as strings.
df = (
    spark.read
    .option("header", "true")
    .csv(f's3://{raw_bucket}/seoul_rescue/')
)




In [25]:
# Inspect the schema (uncomment the next line to run)
#df.printSchema()

root
 |-- msfrtn_resc_reprt_no: string (nullable = true)
 |-- acdnt_cause: string (nullable = true)
 |-- prcs_result_se_nm: string (nullable = true)
 |-- dclr_ymd: string (nullable = true)
 |-- dclr_tm: string (nullable = true)
 |-- dclr_yr: string (nullable = true)
 |-- season_se_nm: string (nullable = true)
 |-- qtr_se: string (nullable = true)
 |-- dclr_mnth: string (nullable = true)
 |-- dclr_day: string (nullable = true)
 |-- dclr_hour: string (nullable = true)
 |-- dclr_min: string (nullable = true)
 |-- daywk: string (nullable = true)
 |-- dsp_ymd: string (nullable = true)
 |-- dsp_tm: string (nullable = true)
 |-- dsp_yr: string (nullable = true)
 |-- dsp_mnth: string (nullable = true)
 |-- dsp_day: string (nullable = true)
 |-- dsp_hour: string (nullable = true)
 |-- dsp_min: string (nullable = true)
 |-- spt_arvl_ymd: string (nullable = true)
 |-- spt_arvl_tm: string (nullable = true)
 |-- spt_arvl_yr: string (nullable = true)
 |-- spt_arvl_mnth: string (nullable = true)
 |--

In [8]:
# Preview the first 5 rows without truncating column values (uncomment the next line to run)
#df.show(5, False)

In [16]:
# Write the rescue data back to S3 partitioned by report date, then register
# the result as an external table and load its partitions.

# With "dynamic", an overwrite replaces only the partitions present in the
# DataFrame being written instead of clearing the whole target prefix.
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

# Single source of truth for the output prefix. The write and the table
# LOCATION below are both derived from it, so they cannot drift apart when
# raw_bucket changes, and both use the same s3:// scheme as the read.
partitioned_path = f's3://{raw_bucket}/seoul_rescue_raw_partitioned/'

# Drop rows whose first column holds the column name itself, i.e. header
# lines that ended up in the data.
# NOTE(review): `!=` evaluates to NULL for NULL inputs, so rows with a NULL
# msfrtn_resc_reprt_no are filtered out as well - confirm this is intended.
df = df.filter(df['msfrtn_resc_reprt_no'] != 'msfrtn_resc_reprt_no')

# partitionBy moves dclr_yr / dclr_mnth / dclr_day out of the files and into
# the directory names. The remaining columns are written in DataFrame order,
# without a header line, so the column list in the DDL below must follow
# that same order.
# NOTE(review): Spark's CSV writer wraps values that contain a comma in double
# quotes, while the table below uses plain ROW FORMAT DELIMITED, which splits
# on every comma. Free-text columns containing commas would be misaligned when
# read through the table - verify against the data or switch to a CSV SerDe.
(
    df.write
    .mode("overwrite")
    .option("header", "false")
    .partitionBy("dclr_yr", "dclr_mnth", "dclr_day")
    .csv(partitioned_path)
)

# Register the files as an external table. IF NOT EXISTS makes this a no-op
# when the table is already defined.
spark.sql(
    f"""
    CREATE EXTERNAL TABLE IF NOT EXISTS cloud9_raw.rw_seoul_rescue (
        `msfrtn_resc_reprt_no` string,
        `acdnt_cause` string,
        `prcs_result_se_nm` string,
        `dclr_ymd` string,
        `dclr_tm` string,
        `season_se_nm` string,
        `qtr_se` string,
        `dclr_hour` string,
        `dclr_min` string,
        `daywk` string,
        `dsp_ymd` string,
        `dsp_tm` string,
        `dsp_yr` string,
        `dsp_mnth` string,
        `dsp_day` string,
        `dsp_hour` string,
        `dsp_min` string,
        `spt_arvl_ymd` string,
        `spt_arvl_tm` string,
        `spt_arvl_yr` string,
        `spt_arvl_mnth` string,
        `spt_arvl_day` string,
        `spt_arvl_hour` string,
        `spt_arvl_min` string,
        `resc_cmptn_ymd` string,
        `resc_cmptn_tm` string,
        `resc_cmptn_yr` string,
        `resc_cmptn_mnth` string,
        `resc_cmptn_day` string,
        `resc_cmptn_hour` string,
        `resc_cmptn_min` string,
        `hmg_ymd` string,
        `hmg_tm` string,
        `hmg_yr` string,
        `hmg_mnth` string,
        `hmg_day` string,
        `hmg_hour` string,
        `hmg_min` string,
        `sido_nm` string,
        `sigungu_nm` string,
        `emd_nm` string,
        `cty_frmvl_se_nm` string,
        `emd_se_nm` string,
        `gis_x_axis` double,
        `gis_y_axis` double,
        `spt_frstt_dist` double,
        `acdnt_place_nm` string,
        `acdnt_place_detail_nm` string,
        `acdnt_cause_asort_nm` string,
        `frstt_nm` string,
        `ward_nm` string,
        `lfdau_nm` string,
        `time_unit_tmprt` string,
        `time_unit_rainqty` string,
        `time_unit_ws` string,
        `time_unit_wd` string,
        `time_unit_humidity` string,
        `time_unit_msnf` string,
        `time_unit` string
    )
    PARTITIONED BY (
        `dclr_yr` string,
        `dclr_mnth` string,
        `dclr_day` string
    )
    ROW FORMAT DELIMITED
        FIELDS TERMINATED BY ','
    STORED AS INPUTFORMAT
        'org.apache.hadoop.mapred.TextInputFormat'
    OUTPUTFORMAT
        'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
    LOCATION
        '{partitioned_path}'
    TBLPROPERTIES (
        'classification'='csv'
    )
    """
)

# Scan the table location and add the partition directories written above to
# the table's metadata so they become queryable.
spark.sql("MSCK REPAIR TABLE cloud9_raw.rw_seoul_rescue")

DataFrame[]
