In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
import os
import yaml
from dotenv import  load_dotenv

import numpy as np
sys.path.append('../../system/')
from get_similarity.utils.preprocess import preprocess
# from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain_chroma import Chroma
from configs import JD_PATH, COLLECTION, DB_PATH

from insert_chunks import *
from tqdm import tqdm
from collections import defaultdict
import re
from uuid import uuid4

## 현재 vectorDB document

In [2]:
JD_PATH = "../../data/jd_origin"
jd_folder = JD_PATH
for jd_path in os.listdir(jd_folder):
    full_path = os.path.join(jd_folder, jd_path)

In [3]:
emb_model = load_emb_model()
preprocessed_doc = preprocess(full_path)    #dataframe
total_chunks = get_chunks(preprocessed_doc, set_splitter(emb_model))

In [4]:
total_chunks[0].page_content

'#####  **Job Type: Contract**\n\n##### **Job Category: IT**\n\n#### **Job Description**\n\nJob Title: Azure GenAI Engineer\n\nJob Summary:\n\nWe are seeking a highly skilled Azure GenAI Engineer to join our team and help\ndrive the development and implementation of cutting-edge artificial\nintelligence solutions on the Azure platform. The ideal candidate will have a\ndeep understanding of Azure services, machine learning models, and data\nanalytics to create robust and scalable AI solutions for our clients.'

## 현재 vectorDB query process

In [5]:
# 생성된 CV전문(`generated_cv`column)을 Omniparser로 처리된 데이터로 가정
# o3-mini의 generation_score가 제일높아 사용
dataset = pd.read_csv("./data/evaluation/o3-mini_evaluation2.csv")
print(f"벤치마크 데이터셋 크기: {len(dataset)}")

벤치마크 데이터셋 크기: 60


In [6]:
dataset = dataset[dataset["generation_score"]>=40]
print(f"필터링된 데이터셋 크기: {len(dataset)}")

필터링된 데이터셋 크기: 38


In [7]:
jd = dataset['jd']
generated_cv = dataset['generated_cv']

In [8]:
# 현재 JobPT의 쿼리 전처리

def preprocess_query(parsed_content):

    lines = parsed_content.split("\n")

    texts = []
    for line in lines:
        if ":" in line:
            text = line.split(":", 1)[1].strip()
            texts.append(text)

    processed_texts = []
    i = 0
    while i < len(texts):
        text = texts[i]
        if text.endswith("-"):
            text = text[:-1]
            if i + 1 < len(texts):
                text += texts[i + 1]
                i += 1
        processed_texts.append(text)
        i += 1

    combined_text = " ".join(processed_texts)

    sentence_endings = re.compile(r"(?<=[.!?]) +")
    sentences = sentence_endings.split(combined_text)

    paragraphs = []
    current_paragraph = ""
    for sentence in sentences:
        headings = ["Education", "Publications", "Honors", "Relevant Courses", "GPA", "Advisor", "Minor", "Email", "Research Advisor"]
        if any(heading in sentence for heading in headings):
            print(f"sentence: {sentence}")
            if current_paragraph:
                paragraphs.append(current_paragraph.strip())
            current_paragraph = sentence.strip() + " "
        else:
            current_paragraph += sentence.strip() + " "

    if current_paragraph:
        paragraphs.append(current_paragraph.strip())

    # for para in paragraphs:
        # print(para)
        # print("\n")

    return "\n".join(paragraphs)

In [9]:
print(preprocess_query(generated_cv[0]))

Photolithography, Thin Film Deposition, Etching, Diffusion, Inspection, Bonding, Sawing & Grinding. Clean Room Equipment, Semiconductor Process Tools, Optical Inspection Systems. Statistical Process Control, Defect Analysis, Calibration, Process Optimization. Safety Protocols, Lean Manufacturing, Six Sigma basics, Team Collaboration, Technical Documentation. Photolithography equipment, Thin Film Deposition systems, Etching apparatus, Digital Process Monitoring, MATLAB, Statistical Process Control tools. PLC Systems, Clean Room Equipment, MES, SCADA, Tableau, Python (for process analytics). MES Systems, PLC, SAP, Custom Database Reporting Tools. PLC, SCADA, SQL, Python, Data Visualization Tools, Custom Analytics Software.


# Benchmark

In [10]:
queries = generated_cv
answers = jd

In [11]:
def get_chunks_bench(df, text_splitter):
    ## 데이터 프레임을 넣으면 chunking된 결과를 리턴
    total_chunks = []
    # for i, desciption in enumerate(df["description"]):
        # meta_data = [df.loc[i, df.columns != 'description'].to_dict()]
        # meta_data = [df.loc[i].to_dict()]
    for i, desciption in enumerate(tqdm(df)):


        ### 메타데이터는 각 jd별 index로 지정
        chunk = text_splitter.create_documents([desciption], [{"index": i}])
        total_chunks.extend(chunk)
    return total_chunks

In [12]:
total_chunks = get_chunks_bench(jd, set_splitter(emb_model))

100%|██████████| 38/38 [00:18<00:00,  2.03it/s]


In [13]:
total_chunks[:20]

[Document(metadata={'index': 0}, page_content='Overview:\n\n  \nFujifilm Dimatix, Inc. strives to offer a healthy work environment that\npromotes individual responsibility and growth, a collaborative spirit, and an\natmosphere that encourages learning, professional development and achievement. We search for the most talented and qualified people for both external and\ninternal opportunities. At Fujifilm, performance, development and\naccountability are the standards to which the company and its people strive\ntoward. And to support its employees, we offer programs that motivate, educate\nand promote a healthy work-life balance, which increase employee satisfaction\nand overall personal well-being. Fujifilm is committed to offering a comprehensive and flexible benefit program\nto meet the needs of employees and their families. This flexible approach\nallows employees to create a program that fits their lives. At Fujifilm, INNOVATION _and_ PEOPLE matter. FUJIFILM Dimatix, Inc. is an E-Ve

In [14]:
try:
    db.reset_collection()
except NameError:
    pass
db = Chroma.from_documents(total_chunks, emb_model, collection_name=str(uuid4()))
print(f"생성된 DB의 크기: {len(db.get()['ids'])}")

생성된 DB의 크기: 170


In [15]:
# 쿼리 테스트
db.similarity_search("I want to be a Data Scientist.")

[Document(id='9a0b07ea-7acd-49fd-bb9f-d8bd835fd51d', metadata={'index': 35}, page_content="In this position you should have the following; excellent interpersonal skills\nwith the ability to communicate at all levels. Strong problem solving and\ncreative skills and the ability to exercise sound judgment. Most important,\ndemonstrate a high level of integrity and dependability with a strong sense of\nurgency and results-orientation. How you'll _create opportunities_ in this Senior Data Scientist role:\n\n  * Service Specialization: Develop service specific knowledge through greater exposure to peers, internal experts, clients, regular self-study, and formal training opportunities. Gain exposure to a variety of client situations to develop business skills."),
 Document(id='d6f3216b-ad4d-420d-bc06-0b1958ad7fbf', metadata={'index': 35}, page_content="Retain knowledge gained and performance feedback provided to transfer into future work. Approach all problems and projects with a high level 

In [17]:
scores = np.zeros([len(queries), 10])
for i in tqdm(range(len(queries))):
    query = queries.iloc[i]
    preprocessed_query = preprocess_query(query)
    results = db.similarity_search(preprocessed_query, k=10)
    for j in range(len(results)):
        if results[j].metadata['index'] ==  i:
            scores[i][j] = 1

 24%|██▎       | 9/38 [00:04<00:18,  1.53it/s]

sentence: Look-ups; Pivot table; other basic functions ; Power Point) • POS System & Inventory Management Software • Internet ApplicationsEducation Details    


 66%|██████▌   | 25/38 [00:09<00:05,  2.58it/s]

sentence: Education Details 


 79%|███████▉  | 30/38 [00:11<00:02,  3.21it/s]

sentence: ML Content Creation.Education Details  


100%|██████████| 38/38 [00:14<00:00,  2.67it/s]


In [18]:
# one-shot이상은 n개 이상의 정답 중 하나라도 matching됐을 경우
one_shot = scores[:,0]
ten_shot = np.sum(scores[:,:10], axis=1)

one_shot = np.mean(one_shot)
ten_shot = (ten_shot > 0).mean()

In [19]:
# 1-shot, 10-shot accuracy
print(one_shot)
print(ten_shot)

0.21052631578947367
0.3684210526315789
