In [2]:
import json, boto3, logging, time
from botocore.exceptions import ClientError
import google.generativeai as genai
from farmhash import FarmHash32 as fhash
import asyncio

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
with open("../.KEYS/API_KEYS.json", "r") as f:
    key = json.load(f)

# S3 버킷 정보 get
with open("../.KEYS/DATA_SRC_INFO.json", "r") as f:
    bucket_info = json.load(f)

with open("../.KEYS/GEMINI_API_KEY.json", "r") as f:
    gemini_api_key = json.load(f)

with open("../.DATA/PROMPT_INFO.json") as f:
    prompt_metadata = json.load(f)

In [6]:
dynamo_table_name = bucket_info['restore_table_name']
dynamodb = boto3.resource(
        'dynamodb',
        aws_access_key_id=key['aws_access_key_id'],
        aws_secret_access_key=key['aws_secret_key'],
        region_name=key['region']
    )
table = dynamodb.Table(dynamo_table_name)

scan_kwargs = {}
source_data = []

In [13]:
table

dynamodb.Table(name='merged-data-table')

In [14]:
import time
import botocore.exceptions

# 지수 백오프를 포함한 스캔 작업
def scan_with_backoff(table, scan_kwargs):
    retry_attempts = 0
    max_retries = 10
    backoff_factor = 0.5
    source_data = []

    while True:
        try:
            response = table.scan(**scan_kwargs)
            source_data.extend(response.get('Items', []))
            last_evaluated_key = response.get('LastEvaluatedKey', None)
            if last_evaluated_key:
                scan_kwargs['ExclusiveStartKey'] = last_evaluated_key
            else:
                break
        except botocore.exceptions.ClientError as error:
            if error.response['Error']['Code'] == 'ProvisionedThroughputExceededException':
                if retry_attempts < max_retries:
                    retry_attempts += 1
                    time.sleep(backoff_factor * (2 ** retry_attempts))  # 지수 백오프
                else:
                    raise
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            break

    return source_data


In [15]:
source_data = scan_with_backoff(table, scan_kwargs)

In [16]:
len(source_data)

2298

In [18]:
# Pagination을 고려하여 모든 데이터를 가져오기
while True:
    response = table.scan(**scan_kwargs)
    source_data.extend(response.get('Items', []))
    scan_kwargs['ExclusiveStartKey'] = response.get('LastEvaluatedKey', None)
    if not scan_kwargs['ExclusiveStartKey']:
        break

In [19]:
wanted_data = [obj for obj in source_data if obj.get('site_symbol', '').upper() == "WAN"]
jobkorea_data = [obj for obj in source_data if obj.get('site_symbol', '').upper() == "JK"]
rocketpunch_data = [obj for obj in source_data if obj.get('site_symbol', '').upper() == "RP"]
programmers_data = [obj for obj in source_data if obj.get('site_symbol', '').upper() == "PRO"]

In [20]:
print(len(wanted_data))
print(len(jobkorea_data))
print(len(rocketpunch_data))
print(len(programmers_data))

243
392
357
1518


In [29]:
import pandas as pd
tmp = pd.DataFrame(programmers_data)
tmp2 = pd.DataFrame(jobkorea_data)

In [33]:
jobkrea_df = pd.DataFrame(jobkorea_data)

In [64]:
# 중복 제거
jobkrea_df = jobkrea_df.groupby(by=['id']).apply(lambda x : x[x.index == min(x.index)]).reset_index(drop=True)

  jobkrea_df = jobkrea_df.groupby(by=['id']).apply(lambda x : x[x.index == min(x.index)]).reset_index(drop=True)


In [65]:
genai.configure(api_key=gemini_api_key['GEMINI_API'])

# Create the model
generation_config = {
  "temperature": 0.7,
  "top_p": 0.95,
  "top_k": 64,
  "max_output_tokens": 8192,
  "response_mime_type": "text/plain",
}

model = genai.GenerativeModel(
  model_name="gemini-1.5-flash",
  generation_config=generation_config,
  # safety_settings = Adjust safety settings
  # See https://ai.google.dev/gemini-api/docs/safety-settings
)

chat_session = model.start_chat(
  history=[
  ]
)

In [66]:
def return_object_prompt(data, symbol_key):
    return data.get(symbol_key, {}).get("prompt")

def return_object_source_keys(data, symbol_key):
    return data.get(symbol_key, {}).get("source_key")

def return_concat_data_record(obj, data_dict):
    data_record = {
        "pid": obj.get("id"),
        "get_date": obj.get("get_date"),
        "site_symbol": obj.get("site_symbol"),
        "job_title": obj.get("job_title", None),
        "dev_stack": data_dict.get("dev_stack", []),
        "job_requirements": data_dict.get("job_requirements", []),
        "job_prefer": data_dict.get("job_prefer", []),
        "job_category": data_dict.get("job_category", []),
        "indurstry_type": data_dict.get("indurstry_type", []),
        "required_career": obj.get("required_career", None),
        "resume_required": obj.get("resume_required", None),
        "post_status": obj.get("post_status", None),
        "company_name": obj.get("company_name", None),
        "cid": fhash(obj.get("site_symbol")+obj.get("company_name")+str(int(obj.get("company_id")))),
        "start_date": obj.get("start_date", None),
        "end_date": obj.get("end_date", None),
        "crawl_domain": obj.get("crawl_domain", None),
        "crawl_url": obj.get("crawl_url", None)
    }
    
    return data_record

def upload_data(_item):
    # DynamoDB 클라이언트 생성
    dynamodb = boto3.resource(
        'dynamodb',
        aws_access_key_id=key['aws_access_key_id'],
        aws_secret_access_key=key['aws_secret_key'],
        region_name=key['region']
    )
    table = dynamodb.Table("precessed-data-table")
    table.put_item(Item=_item)

In [67]:
dynamodb = boto3.resource(
        'dynamodb',
        aws_access_key_id=key['aws_access_key_id'],
        aws_secret_access_key=key['aws_secret_key'],
        region_name=key['region']
    )

In [95]:
from datetime import datetime
async def test(source_data,_response):
    prompt_data = prompt_metadata.get("data", {})
    try:
        for idx, _obj in enumerate(source_data):
            _symbol = _obj.get('site_symbol', "").upper()
            if _symbol in prompt_data.keys():
                _data_source_keys = return_object_source_keys(prompt_data, _symbol)
                _prompt = return_object_prompt(prompt_data, _symbol).format(
                    data_source_keys=_data_source_keys, 
                    input_data=str(_obj)
                )
            task = asyncio.create_task(chat_session.send_message_async(_prompt))
            _response[idx] = await task
        print(datetime.now())
        await asyncio.sleep(60)
        print(datetime.now())
        for idx in range(len(source_data)):
            json_data = _response[idx].text.replace("```json\n", "").replace("\n```", "")
            dict_data = json.loads(json_data)
            data_item = return_concat_data_record(obj=_obj, data_dict=dict_data)
            upload_data(data_item)
    except Exception as e:
        print(e)
    

In [96]:
# count: 15 
count = 10
for i in range(len(source_data) // count):
    source_data[i:i+count]
    _response = [None for _ in range(count)]
    # 비동기 코드 실행
    await test(source_data[i:i+count],_response)