In [1]:
import os, re
import pandas as pd
from glob import glob

def normalize_name(name: str) -> str:
    """Return a canonical, comparison-friendly form of *name*.

    Every character that is neither a word character (``\\w``, which keeps
    letters, digits and the underscore) nor whitespace is replaced by a
    space, runs of whitespace are collapsed to a single space, and the
    result is lowercased and stripped of leading/trailing whitespace.
    """
    # Punctuation becomes a space (not nothing) so "a-b" yields "a b", not "ab".
    without_punctuation = re.sub(r"[^\w\s]", " ", name)
    # Collapse the whitespace runs the previous step may have introduced.
    single_spaced = re.sub(r"\s+", " ", without_punctuation)
    lowered = single_spaced.lower()
    return lowered.strip()

def load_column_values_from_csv(folder_path: str, column_name: str) -> set:
    """Collect the normalized values of one column across every CSV in a folder.

    Each ``*.csv`` file directly inside ``folder_path`` is read with pandas,
    its header names are stripped of surrounding whitespace, and the non-null
    values of ``column_name`` are converted to ``str`` and passed through
    ``normalize_name``.

    Args:
        folder_path: Directory searched (non-recursively) for ``*.csv`` files.
        column_name: Header of the column to collect (matched after the
            headers have been stripped).

    Returns:
        The set of distinct, non-empty normalized values. A file that lacks
        the column or fails to load contributes nothing; a warning is printed
        for it and processing continues with the next file.
    """
    values = set()

    for file_path in glob(os.path.join(folder_path, "*.csv")):
        try:
            df = pd.read_csv(file_path)
            df.columns = df.columns.str.strip()  # tolerate padded header names
            if column_name not in df.columns:
                print(f"[⚠️] 컬럼 '{column_name}' 없음: {file_path}")
                continue
            cleaned = df[column_name].dropna().astype(str).map(normalize_name)
            values.update(cleaned)
        except Exception as e:
            # Deliberately best-effort: one unreadable file must not abort
            # the whole scan, so report it and move on.
            print(f"[⚠️] CSV 로드 실패: {file_path} → {e}")

    # A value made only of punctuation/whitespace normalizes to "", which is
    # not a usable name and must not end up among the known names.
    values.discard("")
    return values

def save_names_to_csv(names: set, output_path: str):
    """Write *names*, sorted, to a single-column CSV with the header ``name``.

    The parent directory of ``output_path`` is created if it does not exist.
    The file is written without an index column, encoded as ``utf-8-sig``
    (UTF-8 with a leading byte-order mark), and a confirmation line with the
    path and the number of names is printed.

    Args:
        names: Names to save; they are sorted before writing.
        output_path: Destination CSV path. May be a bare filename, in which
            case the file is written to the current working directory.
    """
    parent_dir = os.path.dirname(output_path)
    # dirname() is "" for a bare filename and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    df = pd.DataFrame(sorted(names), columns=["name"])
    df.to_csv(output_path, index=False, encoding="utf-8-sig")
    print(f"[✅] 저장 완료: {output_path} ({len(names)}개)")

def load_known_wine_names(data_dir: str = "VectorDB/data/wine") -> set:
    """Collect normalized wine names from every CSV in *data_dir*.

    Each ``*.csv`` file directly inside ``data_dir`` is read with pandas and
    its header names are stripped of surrounding whitespace. The non-null
    values of the ``와인명`` and ``와인 영문명`` columns, whichever are present,
    are converted to ``str``, passed through ``normalize_name`` and merged
    into one set. A file with neither column is skipped silently.

    Args:
        data_dir: Directory searched (non-recursively) for ``*.csv`` files.

    Returns:
        The set of distinct, non-empty normalized names. A file that fails to
        load is reported with a printed warning and otherwise ignored.
    """
    wine_names = set()

    for file_path in glob(os.path.join(data_dir, "*.csv")):
        try:
            df = pd.read_csv(file_path)
            df.columns = df.columns.str.strip()  # tolerate padded header names
            # Both name columns feed the same set; either may be absent.
            for column in ("와인명", "와인 영문명"):
                if column in df.columns:
                    cleaned = df[column].dropna().astype(str).map(normalize_name)
                    wine_names.update(cleaned)
        except Exception as e:
            # Deliberately best-effort: one unreadable file must not abort
            # the whole scan, so report it and move on.
            print(f"[⚠️] CSV 로드 실패: {file_path} → {e}")

    # A value made only of punctuation/whitespace normalizes to "", which is
    # not a usable name and must not end up among the known names.
    wine_names.discard("")
    return wine_names

def load_known_grape_names(data_dir: str = "VectorDB/data/grape") -> set:
    """Return the normalized ``포도품종`` column values from the CSVs in *data_dir*."""
    return load_column_values_from_csv(
        folder_path=data_dir,
        column_name="포도품종",
    )

def load_known_region_names(data_dir: str = "VectorDB/data/region") -> set:
    """Return the normalized ``생산지역`` column values from the CSVs in *data_dir*."""
    return load_column_values_from_csv(
        folder_path=data_dir,
        column_name="생산지역",
    )

def load_known_producer_names(data_dir: str = "VectorDB/data/producer") -> set:
    """Return the normalized ``생산자`` column values from the CSVs in *data_dir*."""
    return load_column_values_from_csv(
        folder_path=data_dir,
        column_name="생산자",
    )


In [3]:
# Build one name set per entity type from the CSVs under data/<entity>/ and
# write each set to entity_names/<entity>/names.csv. All paths are relative
# to the current working directory.

# Wine names
wine_names = load_known_wine_names("data/wine")
save_names_to_csv(wine_names, "entity_names/wine/names.csv")

# Grape varieties
grape_names = load_known_grape_names("data/grape")
save_names_to_csv(grape_names, "entity_names/grape/names.csv")

# Production regions
region_names = load_known_region_names("data/region")
save_names_to_csv(region_names, "entity_names/region/names.csv")

# Producers
producer_names = load_known_producer_names("data/producer")
save_names_to_csv(producer_names, "entity_names/producer/names.csv")


[✅] 저장 완료: entity_names/wine/names.csv (31879개)
[✅] 저장 완료: entity_names/grape/names.csv (529개)
[✅] 저장 완료: entity_names/region/names.csv (639개)
[✅] 저장 완료: entity_names/producer/names.csv (3805개)
