In [53]:
import json, boto3, logging, sys, datetime
from farmhash import FarmHash32 as fhash
from botocore.exceptions import ClientError
import pandas as pd
import utils

In [7]:
# Load the AWS credentials needed to create the S3 client.
with open("../../.KEYS/FIRST_PREPROCESSING_KEY.json", "r") as f:
    aws_key = json.load(f)

# Load the S3 bucket / table configuration.
with open("../../.KEYS/DATA_SRC_INFO.json", "r") as f:
    storage_info = json.load(f)
    
# Create the boto3 session from the loaded credentials.
session = boto3.Session(
    aws_access_key_id=aws_key['aws_access_key_id'],
    aws_secret_access_key=aws_key['aws_secret_key'],
    region_name=aws_key['region']
)

# Create the S3 client and pull the bucket / table names out of the config.
s3 = session.client('s3')
pull_bucket_name = storage_info['pull_bucket_name']
push_table_name = storage_info['restore_table_name']
data_archive_bucket_name = storage_info['crawl_data_bucket_name']
id_list_bucket_name = storage_info['id_storage_bucket_name']
target_folder_prefix = storage_info['target_folder_prefix']['wanted_path']

In [None]:
def get_id_from_s3(s3_client, buket_name, prefix):
    """Read the ``ids`` entry of a JSON object stored in S3.

    The objects under ``prefix`` are listed with ``utils.get_bucket_metadata``
    and the second listed entry (index 1) is downloaded and parsed as JSON.

    Args:
        s3_client: Client exposing ``get_object(Bucket=..., Key=...)``.
        buket_name: Name of the bucket to read from.
        prefix: Prefix forwarded to ``utils.get_bucket_metadata``.

    Returns:
        The value stored under the ``"ids"`` key of the JSON document
        (``None`` when the key is absent); ``None`` when the listing has
        fewer than two entries; ``False`` when downloading or parsing fails
        (the error is logged).
    """
    metadata_list = utils.get_bucket_metadata(s3_client, buket_name, prefix)
    # NOTE(review): this version reads index 1, while the same-named function
    # defined in a later cell reads index 0 -- confirm which entry is intended.
    if not metadata_list or len(metadata_list) < 2:
        return None

    try:
        _obj = metadata_list[1]
        # Use the client handed in by the caller rather than the notebook-level
        # `s3` global, so the function works with any client.
        response = s3_client.get_object(Bucket=buket_name, Key=_obj['Key'])
        json_context = response['Body'].read().decode('utf-8')
        join_dict = json.loads(json_context)
        return join_dict.get('ids')
    except json.JSONDecodeError as e:
        logging.error(f"JSONDecodeError encountered: {e}")
        return False
    except ClientError as e:
        logging.error(f"ClientError encountered: {e}")
        return False
    except Exception as e:
        logging.error(f"Unknow Error. encountered: {e}")
        return False

In [66]:
def put_id_to_s3(s3_client, buket_name, prefix, upload_id_list):
    """Merge new IDs into the stored ID list and upload the result to S3.

    The currently stored IDs are fetched with ``get_id_from_s3``, combined
    with ``upload_id_list`` (duplicates removed, first-seen order kept) and
    written back as a JSON document under the key ``prefix``.

    Args:
        s3_client: Client exposing ``put_object``; also forwarded to
            ``get_id_from_s3``.
        buket_name: Name of the bucket holding the ID list.
        prefix: Object key the JSON document is uploaded to.
        upload_id_list: IDs to add; ``None`` is treated as an empty list.

    Returns:
        The response returned by ``s3_client.put_object``.

    Raises:
        RuntimeError: If ``get_id_from_s3`` reports a read failure (returns
            ``False``). Nothing is uploaded in that case, so a stored list is
            never overwritten with a partial one.
    """
    # NOTE(review): the stored list is always read from "obj_ids.json" even
    # though the upload key is `prefix` -- confirm whether this should be `prefix`.
    stored_ids = get_id_from_s3(s3_client, buket_name, "obj_ids.json")
    if stored_ids is False:
        raise RuntimeError(
            "Could not read the stored ID list; aborting upload to avoid overwriting it."
        )
    if stored_ids is None:
        # No ID object (or no "ids" entry) yet: start from an empty list
        # instead of failing on `None + list`.
        stored_ids = []

    # dict.fromkeys de-duplicates while keeping a stable, first-seen order.
    new_id_list = list(dict.fromkeys(list(stored_ids) + list(upload_id_list or [])))
    update_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
    data = {
        "version": "2024-09-04",
        "statement": "id_list_of_precessed_data_objects",
        "ids": new_id_list,
        "last_update": update_date
    }
    json_string = json.dumps(data)
    response = s3_client.put_object(
        Bucket=buket_name,              # bucket name
        Key=prefix,                     # key (path and name) of the uploaded object
        Body=json_string,               # content of the uploaded object
        ContentType='application/json'  # MIME type of the uploaded object
    )

    return response
    
def get_id_from_s3(s3_client, buket_name, prefix):
    """Read the ``ids`` entry of a JSON object stored in S3.

    The objects under ``prefix`` are listed with ``utils.get_bucket_metadata``
    and the first listed entry is downloaded and parsed as JSON.

    Args:
        s3_client: Client exposing ``get_object(Bucket=..., Key=...)``.
        buket_name: Name of the bucket to read from.
        prefix: Prefix forwarded to ``utils.get_bucket_metadata``.

    Returns:
        The value stored under the ``"ids"`` key of the JSON document
        (``None`` when the key is absent); ``None`` when the listing is
        empty; ``False`` when downloading or parsing fails (the error is
        logged).
    """
    metadata_list = utils.get_bucket_metadata(s3_client, buket_name, prefix)
    if not metadata_list:
        return None

    try:
        _obj = metadata_list[0]
        # Use the client handed in by the caller rather than the notebook-level
        # `s3` global, so the function works with any client.
        response = s3_client.get_object(Bucket=buket_name, Key=_obj['Key'])
        json_context = response['Body'].read().decode('utf-8')
        join_dict = json.loads(json_context)
        return join_dict.get('ids')
    except json.JSONDecodeError as e:
        logging.error(f"JSONDecodeError encountered: {e}")
        return False
    except ClientError as e:
        logging.error(f"ClientError encountered: {e}")
        return False
    except Exception as e:
        logging.error(f"Unknow Error. encountered: {e}")
        return False
        
def remove_duplicate_id(s3_client, buket_name, _df, prefix="obj_ids.json"):
    """Return the unique IDs of ``_df['id']`` that are not yet stored in S3.

    Args:
        s3_client: Client forwarded to ``get_id_from_s3``.
        buket_name: Name of the bucket holding the stored ID list.
        _df: DataFrame with an ``id`` column.
        prefix: Prefix of the stored ID object, forwarded to
            ``get_id_from_s3``. Defaults to ``"obj_ids.json"``.

    Returns:
        A list of the unique values of ``_df['id']``, in order of first
        appearance, excluding those already in the stored list. When
        ``get_id_from_s3`` returns a falsy value (nothing stored, or the read
        failed), every unique ID of the frame is returned.
    """
    stored_ids = get_id_from_s3(s3_client, buket_name, prefix)
    df_id_list = _df['id'].unique().tolist()
    if not stored_ids:
        return df_id_list

    # Set membership keeps the filter linear instead of rescanning the stored
    # list for every frame ID.
    known_ids = set(stored_ids)
    return [candidate for candidate in df_id_list if candidate not in known_ids]

In [56]:
# NOTE(review): pandas is already imported in the first cell; this re-import is redundant.
import pandas as pd

# Build an example DataFrame with repeated ids.
data = {
    'id': [1, 2, 2, 3, 4, 4, 4, 5],
    'name': ['Alice', 'Bob', 'Bob', 'Charlie', 'David', 'David', 'David', 'Eve']
}
df = pd.DataFrame(data)

# Extract only the unique values of the 'id' column.
unique_ids = df['id'].unique()

print(unique_ids)


[1 2 3 4 5]


In [57]:
# Unique ids of the example frame that are not yet in the stored ID list.
result = remove_duplicate_id(s3, id_list_bucket_name, df)

In [58]:
# Display the ids returned by remove_duplicate_id.
result

[1, 2, 3, 4, 5]

In [67]:
# Merge those ids into the stored list and upload it under "obj_ids.json".
result1 = put_id_to_s3(s3, id_list_bucket_name, "obj_ids.json", result)

In [68]:
# Display the put_object response.
result1

{'ResponseMetadata': {'RequestId': 'FYQ7ECYMGTX39D5Z',
  'HostId': '0nbyY78kkWhsZCaPK7vRwGcnwNtcnECZ1KKtKnIsa1SwC07sqZrLgJCgFB7POT7NHdkFFu/88HJW8f7GxPxg8Q==',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '0nbyY78kkWhsZCaPK7vRwGcnwNtcnECZ1KKtKnIsa1SwC07sqZrLgJCgFB7POT7NHdkFFu/88HJW8f7GxPxg8Q==',
   'x-amz-request-id': 'FYQ7ECYMGTX39D5Z',
   'date': 'Wed, 04 Sep 2024 03:42:07 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"19dd81bac54fc989ab08af0e1cfdb14d"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"19dd81bac54fc989ab08af0e1cfdb14d"',
 'ServerSideEncryption': 'AES256'}