In [5]:
import functools
import re
import unicodedata

import nltk
import pandas as pd
from nltk.corpus import stopwords as nltk_stopwords

# === 1. Download the NLTK stopwords corpus ===
nltk.download("stopwords")
stopwords = nltk_stopwords.words("english")  # full English stop-word list shipped with NLTK

# === 2. Load the CSV ===
INPUT_PATH = "hf_perfumes_mapped_ko.csv"
OUTPUT_PATH = "hf_perfumes_mapped_ko_clean.csv"

df = pd.read_csv(INPUT_PATH, encoding="utf-8-sig")

# === 3. Extract the brand and perfume-name vocabularies ===
# Cast to str before lower-casing so a cell that is not a string cannot raise
# AttributeError, and deduplicate AFTER lower-casing so case variants of the
# same name collapse into a single entry.
brand_words = df["brand"].dropna().astype(str).str.lower().unique().tolist()
perfume_words = (
    df["name"].dropna().astype(str).str.lower().unique().tolist()
    if "name" in df.columns
    else []  # the "name" column is optional; "brand" is required (KeyError if absent)
)

# === 4. Removal vocabulary ===
# Promotional / "masterpiece"-style wording
_PROMOTIONAL_TERMS = [
    "masterpiece", "bestseller", "iconic", "legendary", "timeless",
    "classic", "signature", "exclusive", "limited edition", "collector",
    "special edition", "best", "top", "famous", "well-known", "popular",
    "beloved", "favorite", "unique", "remarkable", "extraordinary",
]

# Launch / authorship information
_PROVENANCE_TERMS = [
    "created by", "designed by", "made by", "crafted", "from the house of",
    "house of", "brand", "line", "series", "launched", "introduced",
    "released", "since", "established", "heritage",
]

# Marketing adjectives
_MARKETING_TERMS = [
    "luxury", "prestigious", "premium", "elegant",
    "sophisticated", "refined", "high quality", "finest", "award-winning",
]

# Generic perfume words (possible noise)
_GENERIC_TERMS = ["perfume", "fragrance", "scent", "aroma", "odor", "smell"]

# Final list, in the same order as the groups above.
meta_words = [
    *_PROMOTIONAL_TERMS,
    *_PROVENANCE_TERMS,
    *_MARKETING_TERMS,
    *_GENERIC_TERMS,
]

# === 5. Preprocessing function ===
def _normalize_text(text: str) -> str:
    """Return ``text`` lower-cased, accent-folded and reduced to a-z words.

    Accented letters are folded to their base letter (``é`` -> ``e``) instead
    of being turned into a space, so a word such as "désinvolte" stays in one
    piece. Every remaining character outside a-z becomes a space and runs of
    whitespace collapse to a single space.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    letters_only = re.sub(r"[^a-z\s]", " ", folded.lower())
    return re.sub(r"\s+", " ", letters_only).strip()


@functools.lru_cache(maxsize=8)
def _build_phrase_index(phrases: tuple) -> dict:
    """Index removal phrases by their first word for whole-word matching.

    Each phrase is normalized exactly like the description text, so names that
    contain digits, punctuation or accents can still match. A variant with a
    trailing "s" on the last word is indexed as well, so simple plurals
    ("fragrances") are covered. Candidates sharing a first word are ordered
    longest first so "house of x" wins over "house of".
    """
    index = {}
    for phrase in phrases:
        words = _normalize_text(str(phrase)).split()
        if not words:
            continue  # phrase had no a-z content (e.g. a purely numeric name)
        plural = words[:-1] + [words[-1] + "s"]
        for variant in (tuple(words), tuple(plural)):
            index.setdefault(variant[0], set()).add(variant)
    return {
        head: tuple(sorted(variants, key=len, reverse=True))
        for head, variants in index.items()
    }


@functools.lru_cache(maxsize=8)
def _build_stopword_set(words: tuple) -> frozenset:
    """Normalize stop words like the text and return their individual tokens."""
    return frozenset(
        token for word in words for token in _normalize_text(str(word)).split()
    )


def clean_description(text: str, remove_phrases=None, stop_words=None) -> str:
    """Strip marketing phrases, brand/perfume names and stop words from a description.

    The text is normalized (lower-case, accents folded, only a-z kept), split
    into words, and then filtered. Phrases are removed only as whole words, so
    "line" no longer truncates "masculine" and a one-letter perfume name no
    longer deletes that letter from every word.

    Args:
        text: Raw description. ``None``/NaN yields ``""``; other non-string
            values are converted with ``str``.
        remove_phrases: Iterable of words or multi-word phrases to drop.
            Defaults to the module-level ``meta_words + brand_words +
            perfume_words``.
        stop_words: Iterable of stop words to drop. Defaults to the
            module-level ``stopwords``.

    Returns:
        The remaining words (each longer than two characters) joined by single
        spaces.

    NOTE(review): with the default vocabulary every brand/perfume name is
    removed from every row, so a perfume called "Rose" also removes the note
    "rose" everywhere — confirm this is intended.
    """
    if not isinstance(text, str):
        if text is None or pd.isna(text):
            return ""
        text = str(text)

    if remove_phrases is None:
        remove_phrases = meta_words + brand_words + perfume_words
    if stop_words is None:
        stop_words = stopwords

    # Tuples make the vocabularies hashable so the indexes are built once and
    # reused across rows instead of being rebuilt on every call.
    phrase_index = _build_phrase_index(tuple(remove_phrases))
    stop_set = _build_stopword_set(tuple(stop_words))

    tokens = _normalize_text(text).split()
    kept = []
    pos = 0
    while pos < len(tokens):
        # Phrases are matched before stop words are dropped, so phrases that
        # themselves contain stop words ("from the house of") still match.
        span = 0
        for phrase in phrase_index.get(tokens[pos], ()):
            if tuple(tokens[pos:pos + len(phrase)]) == phrase:
                span = len(phrase)
                break
        if span:
            pos += span
            continue

        token = tokens[pos]
        if len(token) > 2 and token not in stop_set:
            kept.append(token)
        pos += 1

    return " ".join(kept)

# === 6. Apply the preprocessing ===
# Fill missing descriptions with "" BEFORE casting to str: casting first turns
# NaN into the literal text "nan", which then passes through cleaning and is
# written out as if it were a real description.
df["description_clean"] = (
    df["description"].fillna("").astype(str).apply(clean_description)
)

# === 7. Save ===
df.to_csv(OUTPUT_PATH, index=False, encoding="utf-8-sig")

print(f"✅ 저장 완료: {OUTPUT_PATH}")
print(df[["description", "description_clean"]].head(15))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


✅ 저장 완료: hf_perfumes_mapped_ko_clean.csv
                                          description  \
0   A modern aroma, ideal for urban men and that h...   
1   Désinvolte is a generous bouquet of white flow...   
2   Day 1, 5:45 am. " evokes a joyous, exciting an...   
3   Confidence owns the night. A sexy energy that ...   
4   32 is a rich perfume, enhanced by a bouquet of...   
5   Inspired by Gothicism (circa 1144), Opus 1144 ...   
6   Surrender your senses and allow Sans Voir to g...   
7   Rose 01 is a floral fantasy that transports yo...   
8   Blu di Roma, a romantic, intense, and fresh fr...   
9   An intriguing fragrance that is sure to make a...   
10  Issey Miyake knows how to astonish with the ob...   
11  Love is an epidermal tale and the first memory...   
12  Opens on citrus notes, while Mediterranean fru...   
13           A lively, warm scent as sunny as summer.   
14  Celebrate 20 years of Rem with Rem L'Acqua! Im...   

                                    descriptio

In [4]:
# Use the %pip magic so the package is installed into the running kernel's
# environment, and pin the version for reproducible runs.
%pip install nltk==3.9.1

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m175.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nltk
Successfully installed nltk-3.9.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
