In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
import os
import yaml
from dotenv import  load_dotenv

import numpy as np
sys.path.append('../../system/')
# from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore

from configs import JD_PATH, COLLECTION, DB_PATH
from get_similarity.utils.preprocess import preprocess
from insert_chunks import *

from tqdm import tqdm

# 데이터 로드 및 전처리

In [7]:
# jd_folder = "/home/yhkim/code/JobPT/data/jd_origin"
jd_folder = "../../data/jd_origin"


all_data = pd.DataFrame()
for jd_path in os.listdir(jd_folder):
    full_path = os.path.join(jd_folder, jd_path)
    df = preprocess(full_path)
    print(f"Path: {full_path}, Length: {len(df)}")
    all_data = pd.concat([all_data, df], ignore_index=True)
print(f"Total Length: {len(all_data)}")

Path: ../../data/jd_origin/USA_jobs_total_front_end.csv, Length: 20
Path: ../../data/jd_origin/USA_jobs_total_back_end.csv, Length: 20
Path: ../../data/jd_origin/USA_jobs_total.csv, Length: 20
Total Length: 60


In [8]:
print(df.columns)

Index(['job_url', 'title', 'company', 'company_url', 'location', 'job_type',
       'date_posted', 'is_remote', 'description'],
      dtype='object')


In [None]:
example_cv = pd.read_csv("./UpdatedResumeDataSet.csv", encoding='utf=8')
print(example_cv.iloc[3])

Category                                         Data Science
Resume      Skills • R • Python • SAP HANA • Tableau • SAP...
Name: 3, dtype: object


In [None]:
# #초기 데이터 인코딩 오류 수정
# example_cv = pd.read_csv("./UpdatedResumeDataSet.csv", encoding='latin-1')

# col_name = "Resume"        # 예: 'description', 'Skills', …
# # ③ 역‑디코딩:  문자열 → 바이트(CP‑1252) → UTF‑8 → 문자열
# example_cv[col_name] = (
#     example_cv[col_name]
#       .str.encode("latin1")      # ⇢   원래 바이트값 E2 80 A2 복원
#       .str.decode("utf-8")       # ⇢   • 로 올바르게 변환
# )

# #중복 제거
# example_cv.drop_duplicates(subset=["Resume"], inplace=True)
# # example_cv.to_csv("./UpdatedResumeDataSet.csv", encoding='utf=8', index=False)

In [20]:
len((example_cv["Resume"]).unique())

166

# Prompt Engineering

In [21]:
from openai import OpenAI
client = OpenAI()

In [22]:
def make_cv(system, user, model="gpt-4.1-mini"):
    completion = client.chat.completions.create(
      model=model,
      messages=[
    {"role": "developer", "content": f"{system}"},
    {"role": "user", "content": f"{user}"}
  ]
    )
    return completion.choices[0].message.content

In [23]:
with open("./data/cv_prompt.yaml", "r") as f:
    prompt = yaml.safe_load(f)

In [24]:
prompt

{'system': {'base': 'You are a professional CV generator.\n Your task is to create a tailored curriculum vitae (CV) based on the provided job description (JD).\n The CV must be professional, realistic, and meet the key requirements and preferred qualifications in the JD.\n Include sections for personal information, education, work experience, skills, certifications, and project experience.\n The CV should be of practical quality and proper formatting for real-world use.\n',
  'version1(format)': 'You are a professional CV generator.\nYou must output a CV that **exactly matches** the example format \n(same section order, headings, bold text, indentation, bullet style, and line breaks).\nThe CV should be of practical quality and proper formatting for real-world use.\n'},
 'user': {'base': '"Generate a professional CV tailored to the following job description:\n\nExample CV format : {cv}\n\nJD: {jd}\n\nPlease create a complete CV that matches the requirements in the job description while 

## 데이터 생성 예제

In [None]:
###cv 생성 예제

# cv_result = make_cv(
#     system=prompt["system"]["base"],
#     user=prompt["user"]["base"].format(cv=example_cv["Resume"].iloc[0], jd=df.iloc[0]),
#     model="gpt-4o-mini"
# )

In [25]:
results = []
for i in tqdm(range(10)):
    # cv와 jd 예제를 하나씩 가져옴
    cv_example = example_cv["Resume"].iloc[i]
    jd_example = df.iloc[i]["description"]

    system = prompt["system"]["base"]
    user=prompt["user"]["base"].format(cv=cv_example, jd=jd_example)

    generated_cv = make_cv(system, user, model="gpt-4.1-mini")

    #cv, jd, generated_cv를 dict로 저장
    results.append({
        "cv_example": cv_example,
        "jd": jd_example,
        "generated_cv": generated_cv
    })

# dataframe으로 변환 및 id 지정
df_result = pd.DataFrame(results)
df_result.index.name = "id"

100%|██████████| 10/10 [02:00<00:00, 12.09s/it]


In [28]:
df_result.head()

Unnamed: 0_level_0,cv_example,jd,generated_cv
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Skills * Programming Languages: Python (pandas...,##### **Job Type: Contract**\n\n##### **Job C...,**John Doe** \nAzure GenAI Engineer \nPhone:...
1,Education Details \r\nMay 2013 to May 2017 B.E...,"Location\n\nRemote, USA\n\nType\n\nFull time\n...","John A. Mitchell \nAustin, TX, USA \nPhone: ..."
2,"Areas of Interest Deep Learning, Control Syste...",EvenUp is on a mission to support injury law f...,"**John A. Smith** \nSan Francisco Bay Area, C..."
3,Skills • R • Python • SAP HANA • Tableau • SAP...,"Riverbed. Empower the Experience:\n\nRiverbed,...",John A. Mitchell \nPhone: +1 (312) 555-7890 ...
4,"Education Details \r\n MCA YMCAUST, Faridab...",## **Teamwork makes the stream work.**\n\n###...,Education Details \nMaster of Science (M.S.) ...


In [29]:
df_result.to_csv("./data/base_cv.csv", index=True)

In [30]:
len(all_data)

60

## jd별로 10개씩 생성 및 저장

In [None]:
for model in ["gpt-4.1-mini","o4-mini", "o3-mini", "gpt-4.1"]:
    results = []
    for i in tqdm(range(10)):
        cv_example = example_cv["Resume"].iloc[i]
        jd_example = df.iloc[i]["description"]

        system = prompt["system"]["version1(format)"]
        user=prompt["user"]["version1(format)"].format(cv=cv_example, jd=jd_example)

        generated_cv = make_cv(system, user, model=model)
        results.append({
            "cv_example": cv_example,
            "jd": jd_example,
            "generated_cv": generated_cv
        })


    df_result = pd.DataFrame(results)
    df_result.index.name = "id"
    df_result.to_csv(f"./data/{model}_format.csv", index=True)

100%|██████████| 10/10 [01:02<00:00,  6.22s/it]
100%|██████████| 10/10 [06:56<00:00, 41.65s/it]
100%|██████████| 10/10 [03:57<00:00, 23.77s/it]
100%|██████████| 10/10 [02:52<00:00, 17.30s/it]


## 모든 jd별로 cv생성하여 저장

In [29]:
for model in ["gpt-4.1-mini","o4-mini", "o3-mini", "gpt-4.1"]:
    results = []
    for i in tqdm(range(len(all_data))):
        cv_example = example_cv["Resume"].iloc[i]
        jd_example = all_data.iloc[i]["description"]

        system = prompt["system"]["version1(format)"]
        user=prompt["user"]["version1(format)"].format(cv=cv_example, jd=jd_example)

        generated_cv = make_cv(system, user, model=model)
        results.append({
            "cv_example": cv_example,
            "jd": jd_example,
            "generated_cv": generated_cv
        })


    df_result = pd.DataFrame(results)
    df_result.index.name = "id"
    df_result.to_csv(f"./data/{model}_resume.csv", index=True)

100%|██████████| 60/60 [07:58<00:00,  7.97s/it]
100%|██████████| 60/60 [42:41<00:00, 42.69s/it]
100%|██████████| 60/60 [23:50<00:00, 23.84s/it]
100%|██████████| 60/60 [10:30<00:00, 10.51s/it]
