In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
import os
import yaml
from dotenv import  load_dotenv

import numpy as np
sys.path.append('../../system/')
# from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore

from configs import JD_PATH, COLLECTION, DB_PATH
from get_similarity.utils.preprocess import preprocess
from insert_chunks import *

from tqdm import tqdm

# 데이터 로드 및 전처리

In [3]:
jd_folder = "updated_jd"

full_paths = []
for jd_path in os.listdir(jd_folder):
    full_paths.append(os.path.join(jd_folder, jd_path))

all_dfs = []
for path in full_paths:
    df = pd.read_csv(path)
    df["location"] = path.split("/")[1].split("_")[0]
    all_dfs.append(df)
# 하나의 DataFrame으로 병합
merged_df = pd.concat(all_dfs, ignore_index=True)

#description 기준 중복값 제거
merged_df_dedup = merged_df.drop_duplicates(subset="description")
print(f"중복 제거 전 description 개수: {len(merged_df)}")
print(f"중복 제거 후 description 개수: {len(merged_df_dedup)}")

중복 제거 전 description 개수: 1184
중복 제거 후 description 개수: 704


In [4]:
print(df.columns)

Index(['job_url', 'title', 'company', 'location', 'date_posted', 'job_type',
       'is_remote', 'description'],
      dtype='object')


In [5]:
example_cv = pd.read_csv("./UpdatedResumeDataSet.csv", encoding='utf=8')
print(example_cv.iloc[3])

Category                                         Data Science
Resume      Skills • R • Python • SAP HANA • Tableau • SAP...
Name: 3, dtype: object


In [7]:
# #초기 데이터 인코딩 오류 수정
# example_cv = pd.read_csv("./UpdatedResumeDataSet.csv", encoding='latin-1')

# col_name = "Resume"        # 예: 'description', 'Skills', …
# # ③ 역‑디코딩:  문자열 → 바이트(CP‑1252) → UTF‑8 → 문자열
# example_cv[col_name] = (
#     example_cv[col_name]
#       .str.encode("latin1")      # ⇢   원래 바이트값 E2 80 A2 복원
#       .str.decode("utf-8")       # ⇢   • 로 올바르게 변환
# )

In [9]:
# 과하게 길이가 길거나 짧은 이력서 제거
example_cv = example_cv[(example_cv["Resume"].str.len() > 1500) & (example_cv["Resume"].str.len() < 5000)]

In [10]:
len(example_cv)

86

# Prompt Engineering

In [12]:
from openai import OpenAI
client = OpenAI()

In [13]:
def make_cv(system, user, model="gpt-4.1-mini"):
    completion = client.chat.completions.create(
      model=model,
      messages=[
    {"role": "developer", "content": f"{system}"},
    {"role": "user", "content": f"{user}"}
  ]
    )
    return completion.choices[0].message.content

In [14]:
with open("./data/cv_prompt.yaml", "r") as f:
    prompt = yaml.safe_load(f)

In [15]:
prompt

{'system': {'base': 'You are a professional CV generator.\n Your task is to create a tailored curriculum vitae (CV) based on the provided job description (JD).\n The CV must be professional, realistic, and meet the key requirements and preferred qualifications in the JD.\n Include sections for personal information, education, work experience, skills, certifications, and project experience.\n The CV should be of practical quality and proper formatting for real-world use.\n',
  'version1(format)': 'You are a professional CV generator.\nYou must output a CV that **exactly matches** the example format \n(same section order, headings, bold text, indentation, bullet style, and line breaks).\nThe CV should be of practical quality and proper formatting for real-world use.\n'},
 'user': {'base': '"Generate a professional CV tailored to the following job description:\n\nExample CV format : {cv}\n\nJD: {jd}\n\nPlease create a complete CV that matches the requirements in the job description while 

## 데이터 생성 예제

In [35]:
###cv 생성 예제

# cv_result = make_cv(
#     system=prompt["system"]["base"],
#     user=prompt["user"]["base"].format(cv=example_cv["Resume"].iloc[0], jd=df.iloc[0]),
#     model="gpt-4o-mini"
# )

In [16]:
results = []
for i in tqdm(range(10)):
    # cv와 jd 예제를 하나씩 가져옴
    cv_example = example_cv["Resume"].iloc[i]
    jd_example = df.iloc[i]["description"]

    system = prompt["system"]["base"]
    user=prompt["user"]["base"].format(cv=cv_example, jd=jd_example)

    generated_cv = make_cv(system, user, model="gpt-4.1-mini")

    #cv, jd, generated_cv를 dict로 저장
    results.append({
        "cv_example": cv_example,
        "jd": jd_example,
        "generated_cv": generated_cv
    })

# dataframe으로 변환 및 id 지정
df_result = pd.DataFrame(results)
df_result.index.name = "id"

100%|██████████| 10/10 [03:26<00:00, 20.68s/it]


In [17]:
df_result.head()

Unnamed: 0_level_0,cv_example,jd,generated_cv
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Skills * Programming Languages: Python (pandas...,**An Exceptional Engineering Opportunity Await...,**John A. Smith** \nSenior Backend Software E...
1,"Areas of Interest Deep Learning, Control Syste...",**An Exceptional Engineering Opportunity Await...,**Johnathan A. Clarke** \nSenior Backend Soft...
2,Skills • Python • Tableau • Data Visualization...,**An Exceptional Engineering Opportunity Await...,**Johnathan Smith** \nSenior Backend Software...
3,Personal Skills ➢ Ability to quickly grasp tec...,**An Exceptional Engineering Opportunity Await...,John A. Smith \nPhone: +44 7700 900123 \nEma...
4,Expertise − Data and Quantitative Analysis − D...,**An Exceptional Engineering Opportunity Await...,**Expertise** \nBackend Software Engineering ...


In [18]:
df_result.to_csv("./data/base_cv.csv", index=True)

## jd별로 10개씩 생성 및 저장

In [None]:
for model in ["gpt-4.1-mini","o4-mini", "o3-mini", "gpt-4.1"]:
    results = []
    for i in tqdm(range(10)):
        cv_example = example_cv["Resume"].iloc[i]
        jd_example = df.iloc[i]["description"]

        system = prompt["system"]["version1(format)"]
        user=prompt["user"]["version1(format)"].format(cv=cv_example, jd=jd_example)

        generated_cv = make_cv(system, user, model=model)
        results.append({
            "cv_example": cv_example,
            "jd": jd_example,
            "generated_cv": generated_cv
        })


    df_result = pd.DataFrame(results)
    df_result.index.name = "id"
    df_result.to_csv(f"./data/{model}_format.csv", index=True)

100%|██████████| 10/10 [01:02<00:00,  6.22s/it]
100%|██████████| 10/10 [06:56<00:00, 41.65s/it]
100%|██████████| 10/10 [03:57<00:00, 23.77s/it]
100%|██████████| 10/10 [02:52<00:00, 17.30s/it]


# 추가 수집된 데이터로 벤치마크 생성

In [19]:
print(len(merged_df_dedup))     #JD 크기
print(len(example_cv))          #CV 크기    

704
86


In [21]:
#700개의 인덱스에서 300개를 stratified하게 뽑아냄
indices = np.linspace(0, len(merged_df_dedup) - 1, 300, dtype=int)
# cv랜덤 생성 seed
import numpy as np
np.random.seed(42)

In [25]:
for model in ["o3-mini"]:
    results = []
    for i in tqdm(indices):
        cv_index = np.random.randint(0, len(example_cv))
        cv_example = example_cv["Resume"].iloc[cv_index]
        jd_example = merged_df_dedup.iloc[i]["description"]

        system = prompt["system"]["version1(format)"]
        user=prompt["user"]["version1(format)"].format(cv=cv_example, jd=jd_example)

        generated_cv = make_cv(system, user, model=model)
        results.append({
            "cv_example": cv_example,
            "jd": jd_example,
            "generated_cv": generated_cv
        })


    df_result = pd.DataFrame(results)
    df_result.index.name = "id"
    df_result.to_csv(f"./data/{model}_resume.csv", index=True)

100%|██████████| 300/300 [2:40:45<00:00, 32.15s/it]  


In [23]:
pd.read_csv("./data/o3-mini_resume.csv").head()

Unnamed: 0,id,cv_example,jd,generated_cv
0,0,"TECHNICAL SKILLS • HP ALM, RTC and JIRA • AS40...","Full Stack Developer, Senior**The Opportunity:...","TECHNICAL SKILLS • Languages: Java, Python, Ja..."
1,1,KEY SKILLS: • Planning & Strategizing • Presen...,Full\\-Stack Web Developer**The Opportunity:**...,KEY SKILLS: • Full‐Stack Development • UI/UX C...
2,2,"Technical Skills Programming Languages: C, C++...",Posted Date\n\n\n5/08/2025\n\n\nDescription\n\...,Technical Skills \nProgramming Languages: Java...
3,3,Education Details \r\nJanuary 2016 BSc. Mumba...,Posted Date\n\n\n5/08/2025\n\n\nDescription\n\...,Education Details \nJanuary 2014 BSc. Computer...
4,4,SKILLS • 1.Autocad • 2.Pro v • 3.Catia • 4.wor...,Posted Date\n\n\n5/07/2025\n\n\nDescription\n\...,SKILLS • 1.Java • 2.Python • 3.Node.js • 4.SQL...
