In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
import os
import yaml
from dotenv import  load_dotenv

import numpy as np
sys.path.append('../../system/')
from get_similarity.utils.preprocess import preprocess
# from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
# from langchain_chroma import Chroma
from configs import JD_PATH, COLLECTION, DB_PATH

from insert_chunks import *
from tqdm import tqdm
from collections import defaultdict


In [2]:
paths = [i for i in os.listdir("data") if i.endswith(".csv")]

In [3]:
paths

['gpt-4.1_resume.csv',
 'o4-mini.csv',
 'gpt-4.1-mini_resume.csv',
 'o4-mini_format.csv',
 'gpt-4.1-mini.csv',
 'o4-mini_resume.csv',
 'o3-mini_format.csv',
 'o3-mini.csv',
 'gpt-4.1.csv',
 'gpt-4.1_format.csv',
 'o3-mini_resume.csv',
 'gpt-4.1-mini_format.csv']

In [4]:
one_example = pd.read_csv("data/" + paths[1])

In [5]:
one_example.head()

Unnamed: 0,id,cv_example,jd,generated_cv
0,0,Skills * Programming Languages: Python (pandas...,##### **Job Type: Contract**\n\n##### **Job C...,Created CV:\n\nPersonal Information \nName: J...
1,1,Education Details \r\nMay 2013 to May 2017 B.E...,"Location\n\nRemote, USA\n\nType\n\nFull time\n...",Created CV:\n\nPersonal Information \nName: J...
2,2,"Areas of Interest Deep Learning, Control Syste...",EvenUp is on a mission to support injury law f...,Created CV:\n\nPersonal Information \nName: A...
3,3,Skills • R • Python • SAP HANA • Tableau • SAP...,"Riverbed. Empower the Experience:\n\nRiverbed,...","Created CV:\n\nAlexandra Johnson \nChicago, I..."
4,4,"Education Details \r\n MCA YMCAUST, Faridab...",## **Teamwork makes the stream work.**\n\n###...,Created CV:\n\nPersonal Information \nName: J...


<!-- # Prompting(llm-as-a-judge) -->

# LLM-judge structured-output, prompt 정의

In [6]:
from openai import OpenAI
from pydantic import BaseModel, Field
client = OpenAI()

In [7]:
# 1) OpenAI 클라이언트 초기화
client = OpenAI()

# 2) 출력 스키마 정의
class ResumeEvaluation(BaseModel):
    skills_score: int = Field(description="0-100: How well the applicant's skills match the JD requirements")
    experience_score: int = Field(description="0-100: Fit of years of experience and project background")
    culture_fit_score: int = Field(description="0-100: Cultural fit based on key values (e.g., collaboration, innovation)")
    readability_score: int = Field(description="0-100: Sentence clarity and overall readability")
    generation_score: int = Field(description="0-100: Overall quality of the generated resume")
    reasoning: str = Field(description="Reason for each score. Have to spperate by new line")

# 3) ChatCompletion 호출 & 파싱
completion = client.beta.chat.completions.parse(
    model="gpt-4.1-mini",
    messages=[
        {"role": "system",
         "content": "You are an AI judge. Evaluate the resume against the job description and return JSON matching the ResumeEvaluation schema."},
        {"role": "user", "content": """
Job Description:
- Experience with Python, machine learning, and cloud services
- Bachelor's degree in Computer Science
- 2+ years of AI project experience

Resume:
- Name: Jane Doe
- Skills: Python, TensorFlow, AWS
- Experience: 1.5 years in AI projects at XYZ Inc.
- Education: BSc in Computer Science from ABC University
"""}
    ],
    response_format=ResumeEvaluation,  # Pydantic 모델 지정
    temperature=0
)

# 4) 파싱된 결과 사용
evaluation: ResumeEvaluation = completion.choices[0].message.parsed
print("skills_score", evaluation.skills_score)
print("experience_score:", evaluation.experience_score)
print("culture_fit_score:", evaluation.culture_fit_score)
print("readability_score:", evaluation.readability_score)
print("generation_score:", evaluation.generation_score)
print("Reasoning:\n", evaluation.reasoning)


skills_score 80
experience_score: 70
culture_fit_score: 60
readability_score: 90
generation_score: 75
Reasoning:
 Skills Score: The candidate has Python, TensorFlow (a machine learning framework), and AWS (cloud service), which aligns well with the JD requirements, but TensorFlow is not explicitly mentioned and AWS is a broad cloud service; hence 80.
Experience Score: The candidate has 1.5 years of AI project experience, slightly below the 2+ years required, so score is 70.
Culture Fit Score: No explicit cultural values mentioned in JD or resume, but the candidate's background suggests some alignment with innovation and technical skills, so moderate score of 60.
Readability Score: The resume is concise and clear with no grammatical issues, so 90.
Generation Score: Overall, the resume is good but lacks some detail on projects and achievements, so 75.


In [11]:
print(one_example["generated_cv"].iloc[0][:1000])

Created CV:

Personal Information  
Name: John A. Doe  
Address: 1234 Elm Street, Austin, TX 78701  
Phone: (512) 555‑1234  
Email: john.a.doe@example.com  
LinkedIn: linkedin.com/in/johna-doe  

Education  
Master of Science in Data Science, May 2018  
University of Texas at Austin, Austin, TX  
Bachelor of Science in Computer Science, May 2016  
University of Texas at Austin, Austin, TX  

Certifications  
• Microsoft Certified: Azure AI Engineer Associate  
• Microsoft Certified: Azure Data Engineer Associate  
• Databricks Certified Associate Developer for Apache Spark 3.0  

Technical Skills  
Programming Languages:  
• Python (pandas, numpy, scikit‑learn, PyTorch, TensorFlow)  
• R (tidyverse, caret, mlr)  
• C# (.NET Core), SQL (T‑SQL), JavaScript (Node.js)  

Azure Services & Cloud Technologies:  
• Azure Machine Learning (automated ML, MLOps)  
• Azure Databricks (notebooks, Delta Lake, Spark)  
• Azure Cognitive Services (Language, Vision, Speech SDKs)  
• Azure Synapse Analy

In [12]:
def evaluate_gen_cv(jd, gen_cv, model="gpt-4.1", temperature=0):
    completion = client.beta.chat.completions.parse(
    model=model,
    messages=[
        {"role": "system",
         "content": "You are an impartial AI judge that evaluates a generated resume against a job description and outputs valid JSON"},
        {"role": "user", "content": f"""
Job Description:
{jd}
Generated Resume:
{gen_cv}
    
###Guidelines###
1. Generated Resume Do **NOT** copy or closely paraphrase sentences from the JD.\
      If such leakage is detected, set `generation_score` ≤ 30.
2. Reward resumes that demonstrably cover the JD's required skills, experience, \
and education with higher `skills_score` and `experience_score`.
3. Answer length alone does not guarantee a higher rank—concise, task-focused content should be rewarded with a higher readability_score.

    """}
        ],
        response_format=ResumeEvaluation,  # Pydantic 모델 지정
        # temperature=temperature           #reasoning 모델은 temperature 지정 붋가
    )
    return completion.choices[0].message.parsed

In [15]:
one_sample = one_example.iloc[0]
result = evaluate_gen_cv(one_sample["jd"], one_sample["generated_cv"], model="o3-mini")

In [16]:
result.dict()

{'skills_score': 95,
 'experience_score': 90,
 'culture_fit_score': 90,
 'readability_score': 90,
 'generation_score': 95,
 'reasoning': 'The resume demonstrates a strong match with the technical skills required by the JD, including extensive use of Azure services for generative AI, machine learning, and data pipelines, resulting in a high skills score.\nThe candidate has relevant professional experience (current contract role as Azure GenAI Engineer and experience at multiple firms) that aligns well with the 3+ years requirement and the responsibilities outlined, leading to a high experience score.\nThe resume highlights collaboration with cross-functional teams, stakeholder engagement, and contributions to documentation and training, showcasing good cultural fit.\nThe content is well-organized, clearly written and logically structured, thus meriting a high readability score.\nThe overall quality of the generated resume is strong with comprehensive details and appropriate context, and

<!-- # Evaluate full result -->

In [20]:
resume_dataset = pd.read_csv("./data/gpt-4.1-mini_resume.csv")
resume_dataset = resume_dataset.head()

In [21]:
results = []
for i in tqdm(range(len(resume_dataset))):
    one_sample = resume_dataset.iloc[i]
    # one_sample = resume_dataset.iloc[np.random.randint(0, len(resume_dataset))]

    jd = one_sample["jd"]
    gen_cv = one_sample["generated_cv"]
    results.append(evaluate_gen_cv(jd, gen_cv=gen_cv, model="gpt-4.1"))

100%|██████████| 5/5 [00:28<00:00,  5.74s/it]


In [22]:
results

[ResumeEvaluation(skills_score=98, experience_score=97, culture_fit_score=96, readability_score=93, generation_score=92, reasoning="Skills Score: The candidate lists direct experience and knowledge in all key manufacturing processes and relevant technical/soft skills described in the JD, earning certifications in required areas.\nExperience Score: States 12 months hands-on experience in manufacturing processes and clean room operations, specifically supporting wafer fabrication, directly matching the JD for an entry-level technician. Also documents participation in compressed shift schedules as required.\nCulture Fit Score: The resume repeatedly references participation in development opportunities, the collaborative work environment, and continuous professional growth, aligning closely with the company's stated values of innovation, learning, and inclusivity.\nReadability Score: The resume is well-structured, clear, and concise, using bullet points for clarity and logical section head

In [23]:
#점수가 예상보다 너무 높아서 random index로 일부로 negative sample을 넣어 결과확인
results = []
for i in tqdm(range(len(resume_dataset))):
    one_sample = resume_dataset.iloc[i]
    cv_one_sample = resume_dataset.iloc[np.random.randint(0, len(resume_dataset))]

    jd = one_sample["jd"]
    gen_cv = cv_one_sample["generated_cv"]
    results.append(evaluate_gen_cv(jd, gen_cv=gen_cv, model="gpt-4.1"))

100%|██████████| 5/5 [00:21<00:00,  4.36s/it]


In [24]:
results

[ResumeEvaluation(skills_score=40, experience_score=35, culture_fit_score=80, readability_score=85, generation_score=70, reasoning="Skills Score: The resume shows basic customer service, compliance, shift work, and transaction processing but lacks direct technical manufacturing, photolithography, etch, or wafer fab experience. No cleanroom, semiconductor, or relevant technical process certification is listed. \nExperience Score: Experience is primarily in retail (Walmart front end), not in manufacturing or wafer fab. No relevant manufacturing or technical projects/background, but strong shift work adaptability. \nCulture Fit Score: Shows adaptability, compliance, and customer focus, which aligns with Fujifilm's values around collaboration, growth, and accountability. Resume indicates openness to learning, fitting Fujifilm's learning/development culture. \nReadability Score: Resume is clear, logically organized, and easy to read. Education and skills are clearly indicated. \nGeneration 

# 전체 데이터셋 평가

In [27]:
# 평가하길 원하는 데이터셋만 로딩
path = "./data"
paths = [os.path.join(path, p) for p in os.listdir(path) if "resume" in p]

In [28]:
# 전체 JD활용하여 생성한 CV
paths

['./data/gpt-4.1_resume.csv',
 './data/gpt-4.1-mini_resume.csv',
 './data/o4-mini_resume.csv',
 './data/o3-mini_resume.csv']

In [29]:
# 데이터 로딩, 저장이름 지정
datasets = [pd.read_csv(p) for p in paths]
save_name = [i.split("/")[-1].split("_resume.csv")[0] for i in paths]

In [None]:
results = defaultdict(list)
for index, dataset in enumerate(datasets):
    for i in tqdm(range(len(dataset))):
        one_sample = dataset.iloc[i]

        jd = one_sample["jd"]
        gen_cv = one_sample["generated_cv"]
        results[save_name[index]].append(evaluate_gen_cv(jd, gen_cv=gen_cv, model="o3-mini", temperature=None))

100%|██████████| 60/60 [13:31<00:00, 13.53s/it]
100%|██████████| 60/60 [13:46<00:00, 13.78s/it]
100%|██████████| 60/60 [13:14<00:00, 13.23s/it]
100%|██████████| 60/60 [14:02<00:00, 14.04s/it]


In [None]:
results.keys()

dict_keys(['gpt-4.1', 'gpt-4.1-mini', 'o4-mini', 'o3-mini'])

In [None]:
# 각 데이터프레임에 평가 점수 추가

for i, dataset in enumerate(datasets):
    skill_score = [result.skills_score for result in results[save_name[i]]]
    experience_score = [result.experience_score for result in results[save_name[i]]]
    culture_fit_score = [result.culture_fit_score for result in results[save_name[i]]]
    readability_score = [result.readability_score for result in results[save_name[i]]]
    generation_score = [result.generation_score for result in results[save_name[i]]]
    reasoning = [result.reasoning for result in results[save_name[i]]]

    datasets[i]["skill_score"] = skill_score
    datasets[i]["experience_score"] = experience_score
    datasets[i]["culture_fit_score"] = culture_fit_score
    datasets[i]["readability_score"] = readability_score
    datasets[i]["generation_score"] = generation_score
    datasets[i]["reasoning"] = reasoning

<!-- ### 원본jd, resume+평가 점수 csv 저장 -->

In [None]:
for i, dataset in enumerate(datasets):
    dataset.to_csv(f"./data/evaluation/{save_name[i]}_evaluation2.csv", index=False)

<!-- # 점수 총합 -->

# 평가 후 데이터셋 로딩 후 결과확인

In [30]:
eval_path = "./data/evaluation"
eval_paths = [os.path.join(eval_path, p) for p in os.listdir(eval_path) if p.endswith("evaluation2.csv")]
model_name = [os.path.basename(p).split("_evaluation.csv")[0] for p in eval_paths]

In [31]:
eval_paths

['./data/evaluation/o3-mini_evaluation2.csv',
 './data/evaluation/gpt-4.1-mini_evaluation2.csv',
 './data/evaluation/o4-mini_evaluation2.csv',
 './data/evaluation/gpt-4.1_evaluation2.csv']

In [32]:
eval_datasets = [pd.read_csv(p) for p in eval_paths]
score_dicts = defaultdict(dict)

In [33]:
for i, dataset in enumerate(eval_datasets):
    score_dicts[model_name[i]]["skill_score"] = sum(dataset["skill_score"])
    score_dicts[model_name[i]]["experience_score"] = sum(dataset["experience_score"])
    score_dicts[model_name[i]]["culture_fit_score"] = sum(dataset["culture_fit_score"])
    score_dicts[model_name[i]]["readability_score"] = sum(dataset["readability_score"])
    score_dicts[model_name[i]]["generation_score"] = sum(dataset["generation_score"])

In [34]:
score_dicts

defaultdict(dict,
            {'o3-mini_evaluation2.csv': {'skill_score': 5030,
              'experience_score': 4735,
              'culture_fit_score': 4635,
              'readability_score': 4955,
              'generation_score': 3830},
             'gpt-4.1-mini_evaluation2.csv': {'skill_score': 5360,
              'experience_score': 5080,
              'culture_fit_score': 5080,
              'readability_score': 5225,
              'generation_score': 2540},
             'o4-mini_evaluation2.csv': {'skill_score': 5325,
              'experience_score': 5060,
              'culture_fit_score': 4920,
              'readability_score': 5095,
              'generation_score': 3145},
             'gpt-4.1_evaluation2.csv': {'skill_score': 5235,
              'experience_score': 4975,
              'culture_fit_score': 5045,
              'readability_score': 5045,
              'generation_score': 2665}})

In [35]:
for key, value in score_dicts.items():
    #generation_score는 추후 필터링용
    print(f"[{key}] Total Score: ", sum(list(value.values())))
    print(f"[{key}] Without generation_score: ", sum(list(value.values())[:-1]))

    print()

[o3-mini_evaluation2.csv] Total Score:  23185
[o3-mini_evaluation2.csv] Without generation_score:  19355

[gpt-4.1-mini_evaluation2.csv] Total Score:  23285
[gpt-4.1-mini_evaluation2.csv] Without generation_score:  20745

[o4-mini_evaluation2.csv] Total Score:  23545
[o4-mini_evaluation2.csv] Without generation_score:  20400

[gpt-4.1_evaluation2.csv] Total Score:  22965
[gpt-4.1_evaluation2.csv] Without generation_score:  20300



In [37]:
# # culture가 ATS에서 크게 중요한 요소는 아닌데 점수 편차가 커보여서 가중을 낮게 줘서 다시 계산
# total_score = defaultdict(dict)
# for key, value in score_dicts.items():
#     model = key
#     skill, experience, culture, readibe = value.values()
#     total_score[model]=skill*0.50 + experience*0.30 + culture*0.10 + readibe*0.10