In [1]:
import pandas as pd
import sys
import numpy as np
import os

sys.path.append('../../system/')
# from parser import run_parser, convert_pdf_to_jpg     #for image preprocess
# from get_similarity.utils.preprocess import preprocess
from configs import JD_PATH, COLLECTION, DB_PATH

from insert_chunks import *
from tqdm import tqdm
from collections import Counter, defaultdict
from uuid import uuid4
from dotenv import load_dotenv

from nltk.tokenize import word_tokenize
from langchain_community.retrievers import BM25Retriever
import string

In [2]:
def preprocess(df):
    ## 기존과 데이터가 달라져서 대부분의 전처리 과정이 필요없음
    df.dropna(subset=['description', 'is_remote'], inplace=True)
    df = df.reset_index(drop=True)
    return df

In [3]:
def classify_jobpype(df):
    remove_index = []
    for idx, data in df.iterrows():
        job_type = data["job_type"]
        if "fulltime" in job_type:
            label="fulltime"
        elif "parttime" in job_type:
            label="parttime"
        elif "contract" in job_type:
            label="fulltime"
        elif "internship" in job_type:
            label="fulltime"
        #대부분 위 4개의 카테고리에 속함, 그렇지 않은경우 제거
        else:
            remove_index.append(idx)

        df.at[idx, "job_type"] = label

    for index in remove_index:
        print(f"Removing index: {index}")
        df.drop(index, inplace=True)
        
    return df

In [4]:
JD_PATH = "updated_jd"
jd_folder = JD_PATH
full_paths = []
for jd_path in os.listdir(jd_folder):
    full_paths.append(os.path.join(jd_folder, jd_path))

### vectorDB에 들어갈 파일들
for path in full_paths:
    print(path)

updated_jd/USA_Back-End_jobs_total_filtered.csv
updated_jd/Germany_marketing_jobs_total_filtered.csv
updated_jd/Germany_Back-End_jobs_total_filtered.csv
updated_jd/UK_Machine Learning_jobs_total_filtered.csv
updated_jd/UK_mechanical engineer_jobs_total_filtered.csv
updated_jd/USA_mechanical engineer_jobs_total_filtered.csv
updated_jd/Germany_Machine Learning_jobs_total_filtered.csv
updated_jd/Germany_Front-End_jobs_total_filtered.csv
updated_jd/Germany_mechanical engineer_jobs_total_filtered.csv
updated_jd/UK_marketing_jobs_total_filtered.csv
updated_jd/UK_Front-End_jobs_total_filtered.csv
updated_jd/USA_marketing_jobs_total_filtered.csv
updated_jd/USA_Machine Learning_jobs_total_filtered.csv
updated_jd/USA_Front-End_jobs_total_filtered.csv
updated_jd/UK_Back-End_jobs_total_filtered.csv


In [5]:
all_dfs = []

for path in full_paths:
    df = pd.read_csv(path)
    df["location"] = path.split("/")[-1].split("_")[0]
    # print(df.shape)
    all_dfs.append(df)
# 하나의 DataFrame으로 병합
merged_df = pd.concat(all_dfs, ignore_index=True)
# 확인
print(merged_df.shape)

#description 기준 중복값 제거
print(len(merged_df["description"].unique()))
merged_df_dedup = merged_df.drop_duplicates(subset="description")
# 중복 제거 후 행 수 출력
print(f"중복 제거 후 description 개수: {len(merged_df_dedup)}")


(1184, 8)
704
중복 제거 후 description 개수: 704


In [6]:
merged_df_dedup = classify_jobpype(merged_df_dedup)
print(len(merged_df_dedup))

Removing index: 566
703


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(index, inplace=True)


In [7]:
## 길이가 과하게 긴 문서 제거
max_len = 10000
merged_df_dedup = merged_df_dedup[merged_df_dedup['description'].apply(lambda x: len(x) <= max_len if isinstance(x, str) else True)]
len(merged_df_dedup)

691

In [8]:
final_df = preprocess(merged_df_dedup)

In [9]:
### chunk로 나누어주어야 BM25가 더 잘 작동함

emb_model = load_emb_model()
#### 지금은 semantic chunking을 사용했는데 파이썬 파일 제작시에는 recursive chunking 사용할 것
total_chunks = get_chunks(final_df, set_splitter(emb_model))   # semantic chunking, langchain document list 반환
print(len(total_chunks))

2741


# Make lexical DB

In [10]:
def clean_tokens(text: str):
    """공백 기준 토큰화 후 특수문자 제거·소문자 변환"""
    tokens = text.split()                       # ① 공백 기준 분리
    cleaned = []
    for tok in tokens:
        # ② 토큰 앞뒤 특수문자 제거  ( ###Job**  →  Job )
        tok = tok.strip(string.punctuation)
        # ③ 소문자 변환
        tok = tok.lower()
        # ④ 빈 토큰·순수 특수문자 토큰은 건너뛰기
        if tok and not all(ch in string.punctuation for ch in tok):
            cleaned.append(tok)
    return cleaned

#### 예제 출력
# sample = """##### **Job Type: Contract** ##### **Job Category: IT** #### **Job Description**
# Job Title: Azure GenAI Engineer ..."""
# print(clean_tokens(sample)[:40])   # 앞 40개 토큰만 확인

In [11]:
word_tokenize(total_chunks[0].page_content)[:5]

['Full', 'Stack', 'Developer', ',', 'Senior']

In [12]:
#기본 tokenizer는 기본 sentence.split()으로 되어있음, markdown 불용어 처리만 더하여 토크나이징
# retriever = BM25Retriever.from_documents(total_chunks, preprocess_func=clean_tokens)
retriever = BM25Retriever.from_documents(total_chunks, preprocess_func=word_tokenize)
                                        #  bm25_params={"k1": 1.2, "b": 0.5})


In [13]:
retriever.get_relevant_documents("azure genai engineer")  # 검색 쿼리

  retriever.get_relevant_documents("azure genai engineer")  # 검색 쿼리


[Document(metadata={'job_url': 'https://www.indeed.com/viewjob?jk=75e354ef2304575e', 'title': 'Data Engineer', 'company': 'Booz Allen Hamilton', 'location': 'USA', 'date_posted': '2025-05-08', 'job_type': 'fulltime', 'is_remote': True, 'description': "Data Engineer**The Opportunity:**\n\n\nEver\\\\-expanding technology like IoT, machine learning, and artificial intelligence means that there’s more structured and unstructured data available today than ever before. As a data engineer, you know that organizing big data can yield pivotal insights when it’s gathered from disparate sources. We need an experienced data engineer like you to help our clients find answers in their big data to impact important missions—from fraud detection to cancer research to national intelligence.\n\n  \n\nAs a data engineer at Booz Allen, you’ll implement data engineering activities on some of the most mission\\\\-driven projects in the industry. You’ll deploy and develop pipelines and platforms that organize

In [14]:
sorted_dict = dict(sorted(retriever.vectorizer.idf.items(), key=lambda item: item[1], reverse=True))

In [23]:
list(sorted_dict.items())[:10]

[('responds', 7.510430556378006),
 ('lockdown', 7.510430556378006),
 ('Kit', 7.510430556378006),
 ('Gin', 7.510430556378006),
 ('Toolkit', 7.510430556378006),
 ('Struts', 7.510430556378006),
 ('antipattern', 7.510430556378006),
 ('diagnosing', 7.510430556378006),
 ('86,800\\\\.00', 7.510430556378006),
 ('198,000\\\\.00', 7.510430556378006)]

In [24]:
len(sorted_dict)

26744

In [26]:
import pickle
from pathlib import Path


SAVE_PATH = Path("temp_bm25_retriever.pkl")
with SAVE_PATH.open("wb") as f:
    pickle.dump(retriever, f)

In [27]:

with open("temp_bm25_retriever.pkl", "rb") as f:
    data = pickle.load(f)

In [28]:
data.get_relevant_documents("azure genai engineer")  # 검색 쿼리

[Document(metadata={'job_url': 'https://www.indeed.com/viewjob?jk=75e354ef2304575e', 'title': 'Data Engineer', 'company': 'Booz Allen Hamilton', 'location': 'USA', 'date_posted': '2025-05-08', 'job_type': 'fulltime', 'is_remote': True, 'description': "Data Engineer**The Opportunity:**\n\n\nEver\\\\-expanding technology like IoT, machine learning, and artificial intelligence means that there’s more structured and unstructured data available today than ever before. As a data engineer, you know that organizing big data can yield pivotal insights when it’s gathered from disparate sources. We need an experienced data engineer like you to help our clients find answers in their big data to impact important missions—from fraud detection to cancer research to national intelligence.\n\n  \n\nAs a data engineer at Booz Allen, you’ll implement data engineering activities on some of the most mission\\\\-driven projects in the industry. You’ll deploy and develop pipelines and platforms that organize