In [64]:
import json, boto3, logging
from botocore.exceptions import ClientError
import google.generativeai as genai
from farmhash import FarmHash32 as fhash

In [54]:
def _read_json(path):
    """Open `path` for reading and return the parsed JSON content."""
    with open(path, "r") as handle:
        return json.load(handle)


# AWS credentials / region used for the boto3 connections.
key = _read_json("../.KEYS/API_KEYS.json")

# Data-source settings (originally labelled as S3 bucket info).
bucket_info = _read_json("../.KEYS/DATA_SRC_INFO.json")

# Gemini API key.
gemini_api_key = _read_json("../.KEYS/GEMINI_API_KEY.json")

# Prompt metadata.
prompt_metadata = _read_json("../.DATA/PROMPT_INFO.json")

In [3]:
# Accumulators for reading the source table.
source_data = []
scan_kwargs = {}

# Connect to DynamoDB with the credentials loaded into `key`.
dynamo_table_name = bucket_info["restore_table_name"]
dynamodb = boto3.resource(
    "dynamodb",
    region_name=key["region"],
    aws_access_key_id=key["aws_access_key_id"],
    aws_secret_access_key=key["aws_secret_key"],
)
table = dynamodb.Table(dynamo_table_name)

In [4]:
# Fetch every item from the table, following scan pagination.
# Both accumulators are reset here so that re-running this cell neither
# duplicates items in `source_data` nor reuses a stale start key.
scan_kwargs = {}
source_data = []
while True:
    response = table.scan(**scan_kwargs)
    source_data.extend(response.get('Items', []))
    last_evaluated_key = response.get('LastEvaluatedKey')
    if not last_evaluated_key:
        break  # no further pages
    # Only store a real key: passing ExclusiveStartKey=None to scan() is rejected.
    scan_kwargs['ExclusiveStartKey'] = last_evaluated_key

In [14]:
def _items_for_site(items, symbol):
    """Return the items whose upper-cased 'site_symbol' equals `symbol`."""
    return [item for item in items if item.get('site_symbol', '').upper() == symbol]


# One list per source site, selected by its site symbol.
wanted_data = _items_for_site(source_data, "WAN")
jobkorea_data = _items_for_site(source_data, "JK")
rocketpunch_data = _items_for_site(source_data, "RP")
programmers_data = _items_for_site(source_data, "PRO")

In [20]:
# Register the Gemini API key with the client library.
genai.configure(api_key=gemini_api_key["GEMINI_API"])

# Generation settings handed to the model.
generation_config = dict(
    temperature=0.7,
    top_p=0.95,
    top_k=64,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)

# Create the model.
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    generation_config=generation_config,
    # safety_settings is not passed here; see
    # https://ai.google.dev/gemini-api/docs/safety-settings
)

# Open a chat session that starts with an empty history.
chat_session = model.start_chat(history=[])

In [75]:
def return_object_prompt(data, symbol_key):
    """Look up the "prompt" value stored under `symbol_key` in `data`.

    Returns None when `data` has no entry for `symbol_key` or when that entry
    has no "prompt" key.
    """
    site_entry = data.get(symbol_key, {})
    return site_entry.get("prompt")

def return_object_source_keys(data, symbol_key):
    """Look up the "source_key" value stored under `symbol_key` in `data`.

    Returns None when `data` has no entry for `symbol_key` or when that entry
    has no "source_key" key.
    """
    site_entry = data.get(symbol_key, {})
    return site_entry.get("source_key")

def return_concat_data_record(obj, data_dict):
    """Merge a source item and the parsed model output into one flat record.

    Args:
        obj: Source item (dict). Supplies the identifying and posting fields,
            and the three values hashed into "cid".
        data_dict: Parsed model output (dict). Supplies the list-valued fields;
            each defaults to an empty list when absent.

    Returns:
        dict with the output keys in a fixed order. "cid" is
        fhash(site_symbol + company_name + str(int(company_id))).

    Raises:
        TypeError: If "site_symbol", "company_name" or "company_id" is missing
            from `obj` (None cannot be concatenated / converted to int).
    """
    record = {
        "pid": obj.get("id"),
        "get_date": obj.get("get_date"),
        "site_symbol": obj.get("site_symbol"),
        "job_title": obj.get("job_title", None),
    }

    # List-valued fields come from the model output.
    for field in ("dev_stack", "job_requirements", "job_prefer",
                  "job_category", "indurstry_type"):
        record[field] = data_dict.get(field, [])

    # Scalar fields are copied straight from the source item.
    for field in ("required_career", "resume_required", "post_status",
                  "company_name"):
        record[field] = obj.get(field, None)

    # Company id: hash of site symbol + company name + integer company id.
    company_id_text = str(int(obj.get("company_id")))
    record["cid"] = fhash(
        obj.get("site_symbol") + obj.get("company_name") + company_id_text
    )

    for field in ("start_date", "end_date", "crawl_domain", "crawl_url"):
        record[field] = obj.get(field, None)

    return record

In [80]:
# Re-create the DynamoDB resource handle from the credentials in `key`.
dynamodb = boto3.resource(
    "dynamodb",
    region_name=key["region"],
    aws_access_key_id=key["aws_access_key_id"],
    aws_secret_access_key=key["aws_secret_key"],
)

In [78]:
def _extract_json_object(text):
    """Return the outermost "{...}" span of `text`, or `text` unchanged if none.

    Model replies may wrap the JSON in a Markdown code fence, with or without
    a language tag or surrounding prose. Slicing from the first "{" to the last
    "}" drops all of that without depending on the exact fence spelling.
    """
    start, end = text.find("{"), text.rfind("}")
    if start == -1 or end < start:
        return text
    return text[start:end + 1]


# Number of source items to process. The original cell stopped after the first
# item, so 1 keeps that behaviour; set to None to process every item.
# NOTE(review): chat_session is reused for every item; if it keeps the
# conversation history, later prompts carry earlier items as context. Confirm
# before raising the limit.
MAX_SOURCE_ITEMS = 1

prompt_data = prompt_metadata.get("data", {})
data_records = []  # every successfully built record, in source order
for _obj in source_data[:MAX_SOURCE_ITEMS]:
    _symbol = _obj.get('site_symbol', "").upper()
    if _symbol not in prompt_data:
        continue  # no prompt configured for this site symbol
    _data_source_keys = return_object_source_keys(prompt_data, _symbol)
    _prompt = return_object_prompt(prompt_data, _symbol).format(
        data_source_keys=_data_source_keys,
        input_data=str(_obj)
    )
    try:
        response = chat_session.send_message(_prompt)
        json_data = _extract_json_object(response.text)
        dict_data = json.loads(json_data)
        data_record = return_concat_data_record(obj=_obj, data_dict=dict_data)
        data_records.append(data_record)
    except Exception:
        # Best effort: one bad reply or record must not abort the run, but keep
        # the traceback and the item id so the failure can be investigated.
        logging.exception("Failed to build a record for item id=%s", _obj.get("id"))
        

In [79]:
def upload_data(records, table_name=None):
    """Write each record to a DynamoDB table, one ``put_item`` call per record.

    Args:
        records: Iterable of dicts; each one is passed as ``Item`` to
            ``Table.put_item``.
        table_name: Name of the destination table. When omitted, the
            module-level ``push_table_name`` is used, as before.

    Raises:
        NameError: If `table_name` is omitted and ``push_table_name`` is not
            defined.
    """
    # Resolve the destination first so a missing table name fails before any
    # AWS resource is created.
    target_table_name = push_table_name if table_name is None else table_name

    # Create the DynamoDB resource handle.
    dynamodb = boto3.resource(
        'dynamodb',
        aws_access_key_id=key['aws_access_key_id'],
        aws_secret_access_key=key['aws_secret_key'],
        region_name=key['region']
    )
    table = dynamodb.Table(target_table_name)
    for item in records:
        table.put_item(Item=item)

{'pid': Decimal('2902426399'),
 'get_date': Decimal('20240828'),
 'site_symbol': 'PRO',
 'job_title': '프로그램 개발 엔지니어(PM/PL)',
 'dev_stack': ['Java',
  'JavaScript',
  'Spring',
  'Oracle',
  'MSSQL',
  'AWS',
  'Azure',
  'Google Cloud',
  'MySQL',
  'PostgreSQL'],
 'job_requirements': ['IT 프로젝트 경력 6년 이상',
  'PL 경험 2년 이상',
  '웹 프로그래밍 능숙',
  'RDBMS 활용 가능',
  '고객 요구사항 분석',
  '시스템 설계 가능'],
 'job_prefer': ['구매 관련 프로젝트 경력',
  '엠로 개발 프레임워크 경험',
  'SCM 솔루션 개발 경험',
  '클라우드 플랫폼 경험',
  '프로젝트 관리 자격증'],
 'job_category': ['Software Development',
  'Backend Development',
  'Frontend Development',
  'Project Management'],
 'indurstry_type': ['Software',
  'SCM',
  'Supply Chain Management',
  'E-commerce'],
 'required_career': True,
 'resume_required': True,
 'post_status': True,
 'company_name': '엠로',
 'cid': 3388543428,
 'start_date': '2024-07-03',
 'end_date': None,
 'crawl_domain': 'https://career.programmers.co.kr/',
 'crawl_url': 'https://career.programmers.co.kr/api/job_positions/8394'}