# AWS Glue Studio Notebook
##### You are now running a AWS Glue Studio notebook; To start using your notebook you need to start an AWS Glue Interactive Session.


####  Run this cell to set up and start your interactive session.


In [1]:
%idle_timeout 2880
%glue_version 5.0
%worker_type G.1X
%number_of_workers 2

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
Current idle_timeout is None minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 5.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 2
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 2
Idle Timeout: 2880
Session ID: 9672d515-50dc-4ab3-958e-fb381fc45141
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
Waiting for session 9672d515-50dc-4ab3-958e-fb381fc45141 to get into ready status...
Session 9672d515-50dc-4ab3-958e-fb381fc45141 ha

In [3]:
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import boto3
import json
from awsglue.context import GlueContext
from pyspark.context import SparkContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions




In [72]:
# Spark 세션 초기화

spark = SparkSession.builder \
    .appName("Kinesis to Glue") \
    .getOrCreate()




In [81]:
# Kinesis 스트림에서 데이터 읽기 위한 설정

stream_name = "traffic_volume"
region_name = "ap-northeast-2"
shard_iterator_type = "TRIM_HORIZON"




In [82]:
# Kinesis 클라이언트 초기화

kinesis_client = boto3.client('kinesis', region_name=region_name)




In [89]:
# 첫 번째 샤드 가져오기 
shard_id = kinesis_client.describe_stream(StreamName=stream_name)['StreamDescription']['Shards'][3]['ShardId']

shardId-000000000003


In [92]:
# 샤드 이터레이터 생성 (Stream에서 처음부터 데이터를 읽음)

shard_iterator = kinesis_client.get_shard_iterator(
    StreamName=stream_name,
    ShardId=shard_id,
    ShardIteratorType=shard_iterator_type
)['ShardIterator']

# print(shard_iterator)

AAAAAAAAAAF8FGEiH9C+biyiOq+5CNpdSN9Z6lJ3OdL2O0PYb0XA1D8IykSXy3qDWsRRKScUx90D3eCxbYxaueLmCWdXC/9eejzEYbb30Tf61fHrE8nlyXvzIF0vCiXTHOAgMJfZd0Zl3ytcN18th73cbyntL4iK8Uj8b5CWexn8xxkPjc3kEWbuQL2AQDYQW8vFp2YruuwSZ67xFntHm5iRWnpUKdptq0xhNSoHkYEclyKTDcAnwQ==


In [93]:
# Kinesis 스트림에서 데이터 읽기

records = kinesis_client.get_records(ShardIterator=shard_iterator, Limit=10)

# print(records)

{'Records': [{'SequenceNumber': '49659446533842424901982992592902781809348023648409092146', 'ApproximateArrivalTimestamp': datetime.datetime(2025, 1, 13, 7, 12, 26, 11000, tzinfo=tzlocal()), 'Data': b'{"SPOT_NUM": "A-05", "YMD": "20250113", "HH": "15", "IO_TYPE": "1", "LANE_NUM": "1", "VOL": "277"}', 'PartitionKey': 'A-05'}, {'SequenceNumber': '49659446533842424901982992592903990735167638277583798322', 'ApproximateArrivalTimestamp': datetime.datetime(2025, 1, 13, 7, 12, 26, 24000, tzinfo=tzlocal()), 'Data': b'{"SPOT_NUM": "A-05", "YMD": "20250113", "HH": "15", "IO_TYPE": "1", "LANE_NUM": "2", "VOL": "765"}', 'PartitionKey': 'A-05'}, {'SequenceNumber': '49659446533842424901982992592910035364265711423457329202', 'ApproximateArrivalTimestamp': datetime.datetime(2025, 1, 13, 7, 12, 26, 45000, tzinfo=tzlocal()), 'Data': b'{"SPOT_NUM": "A-05", "YMD": "20250113", "HH": "15", "IO_TYPE": "1", "LANE_NUM": "3", "VOL": "486"}', 'PartitionKey': 'A-05'}, {'SequenceNumber': '4965944653384242490198299

In [86]:
# 데이터를 Spark DataFrame 으로 변환

def process_kinesis_data(records):
    data = []
    for record in records['Records']:
        
        # Kinesis의 각 레코드에서 데이터를 JSON 형식으로 디코딩
        data.append(json.loads(record['Data'].decode('utf-8')))
    return data




In [87]:
# Kinesis에서 읽은 데이터를 DataFrame 으로 변환
kinesis_data = process_kinesis_data(records)
df = spark.read.json(spark.sparkContext.parallelize(kinesis_data))

# Spark DataFrame 에서 데이터 확인
df.show()

+---+-------+--------+--------+---+--------+
| HH|IO_TYPE|LANE_NUM|SPOT_NUM|VOL|     YMD|
+---+-------+--------+--------+---+--------+
| 14|      1|       1|    A-05|250|20250113|
| 14|      1|       2|    A-05|735|20250113|
| 14|      1|       3|    A-05|487|20250113|
| 14|      2|       1|    A-05|473|20250113|
| 14|      2|       2|    A-05|672|20250113|
| 14|      2|       3|    A-05|383|20250113|
| 14|      1|       1|    A-15|  0|20250113|
| 14|      1|       2|    A-15|586|20250113|
| 14|      1|       3|    A-15|614|20250113|
| 14|      1|       4|    A-15|138|20250113|
+---+-------+--------+--------+---+--------+


In [106]:
import boto3
import json
import time
import pandas as pd
import numpy as np
import base64

# 데이터 읽을 시간 설정 (예: 60 초)
start_time = time.time()
end_time = start_time + 10


# 사드에서 데이터를 읽을 수 있는 위치를 가져오기 (GetShardIterator)
#while time.time() < end_time : # 10 초되면 종료

data_list = []

while time.time() < end_time : # 60 초되면 종료

# 스트림에서 샤드 아이디 가져오기

    shard_id = kinesis_client.describe_stream(StreamName=stream_name)['StreamDescription']['Shards'][3]['ShardId']

    shard_iterator = kinesis_client.get_shard_iterator(
        StreamName=stream_name,
        ShardId=shard_id,
        ShardIteratorType=shard_iterator_type
    )['ShardIterator']
    
    
    shard_iterator_response = kinesis_client.get_shard_iterator(
        StreamName=stream_name,
        ShardId=shard_id,
        ShardIteratorType='TRIM_HORIZON'
    )


    #데이터를 가져올 때, 이터레이터가 만료되지 않도록 체크 및 새로 요청
    while time.time() < end_time:
        try:
            # 레코드 가져오기
            records_response = kinesis_client.get_records(
                ShardIterator=shard_iterator,
                Limit=100 # 한번에 읽을 레코드 수 (최대 10 개)
            )

            # 레코드가 있으면 처리
            if records_response['Records'] :
                for record in records_response['Records']:
                    # Kinesis에서 읽은 데이터를 DataFrame 으로 변환
                    kinesis_data = process_kinesis_data(records)
                    df = spark.read.json(spark.sparkContext.parallelize(kinesis_data))

                # Spark DataFrame 에서 데이터 확인
                    df.show()
            else:
                print("No new records, waiting for new data...")


            # 다음 샤드 이터레이터 가져오기
            shard_iterator = records_response['NextShardIterator']

        except kinesis_client.exceptions.ExpiredIteratorException:
            
        # 10 초 대기후 재시도도
            time.sleep(10)


+---+-------+--------+--------+---+--------+
| HH|IO_TYPE|LANE_NUM|SPOT_NUM|VOL|     YMD|
+---+-------+--------+--------+---+--------+
| 15|      1|       1|    A-05|277|20250113|
| 15|      1|       2|    A-05|765|20250113|
| 15|      1|       3|    A-05|486|20250113|
| 15|      2|       1|    A-05|530|20250113|
| 15|      2|       2|    A-05|656|20250113|
| 15|      2|       3|    A-05|342|20250113|
| 15|      1|       1|    A-15|  0|20250113|
| 15|      1|       2|    A-15|556|20250113|
| 15|      1|       3|    A-15|585|20250113|
| 15|      1|       4|    A-15|205|20250113|
+---+-------+--------+--------+---+--------+

+---+-------+--------+--------+---+--------+
| HH|IO_TYPE|LANE_NUM|SPOT_NUM|VOL|     YMD|
+---+-------+--------+--------+---+--------+
| 15|      1|       1|    A-05|277|20250113|
| 15|      1|       2|    A-05|765|20250113|
| 15|      1|       3|    A-05|486|20250113|
| 15|      2|       1|    A-05|530|20250113|
| 15|      2|       2|    A-05|656|20250113|
| 15|    

#### Example: Write the data in the DynamicFrame to a location in Amazon S3 and a table for it in the AWS Glue Data Catalog


In [110]:
#DataFrame에서 DynamicFrame으로 변환
from awsglue.dynamicframe import DynamicFrame

DyF = DynamicFrame.fromDF(df, glueContext, "DyF")




In [111]:
s3output = glueContext.getSink(
  path="s3://sesac-tempo-kinesis",
  connection_type="s3",
  updateBehavior="UPDATE_IN_DATABASE",
  partitionKeys=[],
  compression="snappy",
  enableUpdateCatalog=True,
  transformation_ctx="s3output",
)
s3output.setCatalogInfo(
  catalogDatabase="cloud9_transformed", catalogTableName="tmp_volume"
)
s3output.setFormat("glueparquet")
s3output.writeFrame(DyF)

<awsglue.dynamicframe.DynamicFrame object at 0x7fe273e49a90>
