In [2]:
import re
import pandas as pd

# === 1. Load the CSV ===
INPUT_PATH = "hf_perfumes_mapped_ko.csv"
OUTPUT_PATH = "hf_perfumes_mapped_ko_clean_stopwords_saved.csv"

df = pd.read_csv(INPUT_PATH, encoding="utf-8-sig")


# === 2. Collect brand and perfume names ===
def _lowercase_unique(frame, column):
    """Return the distinct lower-cased, non-null values of ``frame[column]``.

    Returns an empty list when the column is absent, so both name sources are
    treated the same way. Cells are cast to ``str`` before lower-casing so a
    non-string cell cannot raise ``AttributeError``, and duplicates are
    dropped after lower-casing so case variants collapse into one entry.
    """
    if column not in frame.columns:
        return []
    lowered = frame[column].dropna().astype(str).str.lower()
    return lowered.unique().tolist()


brand_words = _lowercase_unique(df, "brand")
perfume_words = _lowercase_unique(df, "name")

# === 3. Stop-phrase dictionary ===
# Acclaim / advertising wording.
_ACCLAIM_TERMS = [
    "masterpiece", "bestseller", "iconic", "legendary", "timeless",
    "classic", "signature", "exclusive", "limited edition", "collector",
    "special edition", "best", "top", "famous", "well-known", "popular",
    "beloved", "favorite", "unique", "remarkable", "extraordinary",
]

# Launch / production information.
_PROVENANCE_TERMS = [
    "created by", "designed by", "crafted", "from the house of",
    "house of", "brand", "launched", "introduced",
    "released", "since", "established", "heritage",
]

# Marketing vocabulary.
_MARKETING_TERMS = [
    "luxury", "prestigious", "premium", "elegant",
    "sophisticated", "refined", "high quality", "finest", "award-winning",
]

# Kept as a plain list (it is concatenated with other lists later) and in the
# original group order: acclaim, then provenance, then marketing.
meta_words = [*_ACCLAIM_TERMS, *_PROVENANCE_TERMS, *_MARKETING_TERMS]

# === 4. Preprocessing function ===
# Single-entry cache: (tuple of raw stop words) -> (normalized phrases, max
# phrase length in tokens). Keyed on the word tuple so that rebuilding the
# word lists (e.g. re-running the notebook cell) never serves a stale index.
_STOP_INDEX_CACHE = {}


def _normalize_text(text):
    """Lower-case, turn every character outside a-z/whitespace into a space, collapse whitespace."""
    letters_only = re.sub(r"[^a-z\s]", " ", text.lower())
    return re.sub(r"\s+", " ", letters_only).strip()


def _stop_index(words):
    """Return ``(phrases, longest)`` for *words*: the normalized stop phrases and their max token count."""
    key = tuple(words)
    index = _STOP_INDEX_CACHE.get(key)
    if index is None:
        # Normalize stop phrases exactly like the descriptions; otherwise
        # entries containing hyphens, apostrophes, digits or accents
        # ("well-known", "l'acqua") could never match the cleaned text.
        phrases = {_normalize_text(w) for w in key if isinstance(w, str)}
        phrases.discard("")
        longest = max((p.count(" ") + 1 for p in phrases), default=0)
        index = (frozenset(phrases), longest)
        _STOP_INDEX_CACHE.clear()
        _STOP_INDEX_CACHE[key] = index
    return index


def clean_description(text: str, stop_words=None) -> str:
    """Normalize a description and drop stop phrases from it.

    The text is lower-cased, every character other than a-z and whitespace
    (digits, punctuation, accented letters) is replaced by a space, and
    whitespace is collapsed. Stop phrases are then removed as whole-word
    token runs, longest match first, so "best" no longer mangles "bestow"
    and a multi-word name is removed in full before any shorter phrase it
    contains.

    Args:
        text: Raw description. Missing values (``None``/NaN) yield ``""``.
        stop_words: Optional iterable of phrases to remove. When omitted, the
            module-level ``meta_words + brand_words + perfume_words`` is used.

    Returns:
        The remaining words joined by single spaces.
    """
    if pd.isna(text):
        return ""

    if stop_words is None:
        stop_words = meta_words + brand_words + perfume_words
    phrases, longest = _stop_index(stop_words)

    tokens = _normalize_text(str(text)).split()
    kept = []
    pos = 0
    while pos < len(tokens):
        # Try the longest candidate phrase starting here first.
        for size in range(min(longest, len(tokens) - pos), 0, -1):
            if " ".join(tokens[pos:pos + size]) in phrases:
                pos += size
                break
        else:
            kept.append(tokens[pos])
            pos += 1

    return " ".join(kept)

# === 5. Apply preprocessing ===
# Fill missing descriptions BEFORE casting to str: astype(str) on its own
# turns NaN into the literal text "nan", which would then be "cleaned" and
# saved as if it were a real description.
df["description_clean"] = (
    df["description"].fillna("").astype(str).apply(clean_description)
)

# === 6. Save ===
df.to_csv(OUTPUT_PATH, index=False, encoding="utf-8-sig")

print(f"✅ 저장 완료: {OUTPUT_PATH}")
print(df[["description", "description_clean"]].head(15))


✅ 저장 완료: hf_perfumes_mapped_ko_clean_stopwords_saved.csv
                                          description  \
0   A modern aroma, ideal for urban men and that h...   
1   Désinvolte is a generous bouquet of white flow...   
2   Day 1, 5:45 am. " evokes a joyous, exciting an...   
3   Confidence owns the night. A sexy energy that ...   
4   32 is a rich perfume, enhanced by a bouquet of...   
5   Inspired by Gothicism (circa 1144), Opus 1144 ...   
6   Surrender your senses and allow Sans Voir to g...   
7   Rose 01 is a floral fantasy that transports yo...   
8   Blu di Roma, a romantic, intense, and fresh fr...   
9   An intriguing fragrance that is sure to make a...   
10  Issey Miyake knows how to astonish with the ob...   
11  Love is an epidermal tale and the first memory...   
12  Opens on citrus notes, while Mediterranean fru...   
13           A lively, warm scent as sunny as summer.   
14  Celebrate 20 years of Rem with Rem L'Acqua! Im...   

                              