In [53]:
from farmhash import FarmHash32 as fh
import pandas as pd
import pytz
import boto3
import json
import os
import datetime
from json.decoder import JSONDecodeError
from botocore.exceptions import ClientError
import logging
import re

In [32]:
file_path = '/home/team3/repository/keys/'

In [None]:
os.listdir(file_path)
with open(f'{file_path}/API_KEYS.json', 'r') as f:
    key = json.load(f)
with open(f'{file_path}/DATA_SRC_INFO.json', 'r') as f:
    bucket_info = json.load(f)

In [45]:
pull_bucket_name = bucket_info['pull_bucket_name']
push_bucket_name = bucket_info['push_bucket_name']
target_folder_prefix = bucket_info['target_folder_prefix']['rocketpunch_path']

In [46]:
print(target_folder_prefix)

rocketpunch/data/


In [48]:
# S3 섹션 및 client 생성
session = boto3.Session(
    aws_access_key_id=key['aws_access_key_id'],
    aws_secret_access_key=key['aws_secret_key'],
    region_name=key['region']
)

s3 = session.client('s3')

In [51]:
# 특정 폴더 내 파일 목록 가져오기
# TODO: 
# - 마지막 실행일(년,월,일)을 datetime으로 저장한 파일을 읽어들여 curr_date에 적용하기; 당담: 유정연
response = s3.list_objects_v2(Bucket=pull_bucket_name, Prefix=target_folder_prefix, Delimiter='/')
curr_date = datetime.datetime.now(pytz.timezone('Asia/Seoul')).date()  # 로컬 시간대(UTC+9)로 현재 날짜 설정
kst_tz = pytz.timezone('Asia/Seoul') # kst timezone 설정
#curr_date = datetime.date(2024, 8, 21)

# curr_date 보다 날짜가 늦은 data josn 파일 metadata 객체 분류
if 'Contents' in response:
    target_file_list = [obj for obj in response['Contents'] if curr_date <= obj['LastModified'].astimezone(kst_tz).date()]
    print(target_file_list)
else:
    print("No objects found in the folder.")

[{'Key': 'rocketpunch/data/2024-08-21_1702.json', 'LastModified': datetime.datetime(2024, 8, 21, 17, 6, 2, tzinfo=tzutc()), 'ETag': '"c0d72cb0580d227fba38776a7f3f8782"', 'Size': 931963, 'StorageClass': 'STANDARD'}]


In [54]:
for obj in target_file_list:
    try:
        response = s3.get_object(Bucket=pull_bucket_name, Key=obj['Key'])
        json_context = response['Body'].read().decode('utf-8')
        cleaned_text = re.sub(r'[\r\u2028\u2029]+', ' ', json_context) # 파싱을 위해 unuseal line terminators 제거
        json_list = [json.loads(line) for line in cleaned_text.strip().splitlines()] # pandas format으로 맞추기
        df = pd.DataFrame(json_list)
    except JSONDecodeError as e:
        logging.error(f"JSONDecodeError encountered: {e}")
        continue
    except ClientError as e:
        logging.error(f"ClientError encountered: {e}")
        continue
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
        continue
    
    break

In [59]:
df.head(2)

Unnamed: 0,company_id,company_name,description,job_id,job_title,job_career,timestamp,crawl_domain,job_url,date_end,date_start,job_task,job_specialties,job_detail,job_industry,site_symbol
0,29987,런치팩,개발자 없이 웹/앱 플랫폼 서비스를 만들 수 있는 솔루션,151140,"풀스택 개발자 (테마, 서비스 부분수정 개발 / 리드급)",[경력],2024-08-21_17:02:32,www.rocketpunch.com,https://www.rocketpunch.com/jobs/151140,2024.09.05,2024.08.21,"- 플랫폼 서비스 프론트엔드, 백엔드 개발- 서비스 Back-office 개발100...","Django, Vue.js",자격요건- Django Rest Framework 구조를 통해 플랫폼 서비스 개발 ...,"IT서비스, 소프트웨어개발, 솔루션, SaaS, 앱개발, UX, 모바일앱개발, 웹빌...",RP
1,29987,런치팩,개발자 없이 웹/앱 플랫폼 서비스를 만들 수 있는 솔루션,151141,프론트엔드 개발자 (테마 프론트엔드 개발 / 시니어),[경력],2024-08-21_17:02:32,www.rocketpunch.com,https://www.rocketpunch.com/jobs/151141,2024.09.05,2024.08.21,- AWS 기반 솔루션 인프라 관리- 솔루션 백오피스 개발100% 원격근무 가능합니...,Vue.js,"자격요건- 관련 경력 5년 이상- VueJS를 버전2의 Options API부터, ...","IT서비스, 소프트웨어개발, 솔루션, SaaS, 앱개발, UX, 모바일앱개발, 웹빌...",RP


In [86]:
processing_df = pd.DataFrame()

In [96]:
for i, data in df.iterrows() :
    # rocketpunch key > merged key
    # job_task > job_tasks
    if pd.notnull(data['job_task']) :
        processing_df.at[i, 'job_tasks'] = ' '.join(
            [item for item in re.sub(r'[^.,/\-+()\s\w]', ' ', \
                re.sub(r'\\/', '/', data['job_task'])).split() if item not in ['-', '+']]
        )
    # job_specialites > stacks
    processing_df['stacks'] = re.sub(r'\\/', '/', data['job_specialties'])
    # job_detail > job_requirements
    processing_df['job_requirements'] = ' '.join(
        [item for item in re.sub(r'[^.,/\-+()\s\w]', ' ', \
            re.sub(r'\\/', '/', data['job_detail'])).split() if item not in ['-', '+']]
        )
    # job_industry > indurstry_type
    processing_df['indurstry_type'] = re.sub(r'\\/', '/', data['job_industry'])
    # date_start > start_date
    processing_df['start_date'] = datetime.datetime.strftime(data['date_start'])
    # date_end > end_date
    
    #job_career > required_career
    processing_df['required_career'] = any(career in "신입" for career in data['job_career'])
    
    # symbol 추가
    processing_df['symbol'] = "RP"
    
    print(processing_df)
    
    
    break

TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'str' object