
# Fragrantica 노트 스크래퍼 (.ipynb)

**기능**: 카테고리 앵커(대분류)를 기준으로 각 섹션의 `.notebox`에서 **노트명**, **이미지 src(쿼리 제거)**, **카테고리명**을 추출하여 CSV로 저장합니다. 현재 CSV에는 `category`, `note_name`, `image_url` 세 컬럼만 저장되며, **노트 상세 URL**과 **이미지 alt**는 출력에 포함되지 않습니다.

> 스크래핑 전 반드시 해당 사이트의 이용약관/robots.txt를 확인하세요. 요청 빈도는 낮게 유지하고 `User-Agent`를 지정하세요.


## 1) 환경 설정

In [2]:

# Install the scraping dependencies if they are missing from the current environment.
# NOTE(review): the preview cell at the end of the notebook also imports pandas,
# which is not installed by this command.
%pip install requests beautifulsoup4 lxml


Note: you may need to restart the kernel to use updated packages.


## 2) 라이브러리 임포트 & 헬퍼 함수

In [4]:

import csv
import time
from urllib.parse import urljoin, urlparse, urlunparse
import requests
from bs4 import BeautifulSoup

# Request headers sent by fetch_html with every GET; the only entry is a
# desktop Chrome User-Agent string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

def strip_query(url: str) -> str:
    """Return ``url`` with its query string and fragment removed.

    Falsy inputs (empty string, ``None``) are handed back unchanged.
    """
    if url:
        scheme, netloc, path, params, _query, _fragment = urlparse(url)
        # Rebuild the URL from the kept components with blank query/fragment.
        return urlunparse((scheme, netloc, path, params, "", ""))
    return url

def fetch_html(url: str, retries: int = 3, backoff: float = 1.0) -> str:
    """Fetch ``url`` with HTTP GET, retrying with a linearly growing delay.

    Args:
        url: Address to request.
        retries: Maximum number of attempts.
        backoff: Base delay in seconds; the pause after attempt ``k`` is
            ``backoff * k``. No pause follows the final attempt.

    Returns:
        The response text of the first attempt that yields HTTP 200 with a
        non-empty body.

    Raises:
        requests.RequestException: If the last attempt failed inside
            ``requests`` (connection error, timeout, ...).
        RuntimeError: If the last attempt produced a response that was not a
            non-empty HTTP 200, or if ``retries`` is less than 1.
    """
    last_exc = None
    last_status = None
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=20)
        except requests.RequestException as exc:
            last_exc = exc
            last_status = None
        else:
            if resp.status_code == 200 and resp.text:
                return resp.text
            # Track the most recent failure so the error raised at the end
            # describes the final attempt, not a stale earlier exception.
            last_exc = None
            last_status = resp.status_code
        if attempt < retries:
            # Only wait when another attempt will actually follow.
            time.sleep(backoff * attempt)
    if last_exc is not None:
        raise last_exc
    detail = f" (last HTTP status {last_status})" if last_status is not None else ""
    raise RuntimeError(f"Failed to fetch {url}{detail}")


## 3) 파싱 함수 (카테고리/노트 추출)

In [5]:

def find_category_name_from_anchor(anchor_id: str, soup: BeautifulSoup) -> str:
    """Return the stripped text of the element whose ``id`` is ``anchor_id``.

    An empty string is returned when ``anchor_id`` is falsy or when ``soup``
    contains no element with that id.
    """
    heading = soup.find(id=anchor_id) if anchor_id else None
    if heading:
        return heading.get_text(strip=True)
    return ""

def extract_sections(soup: BeautifulSoup):
    """Collect section ids from the first ``<p data-magellan>`` element.

    Every ``<a href>`` inside that paragraph whose stripped href starts with
    ``#`` contributes its target with the leading ``#`` characters removed.
    Returns an empty list when no such paragraph exists.
    """
    nav = soup.find("p", attrs={"data-magellan": True})
    if not nav:
        return []
    targets = (link["href"].strip() for link in nav.find_all("a", href=True))
    return [target.lstrip("#") for target in targets if target.startswith("#")]

def extract_notes_for_section(section_id: str, soup: BeautifulSoup, base_url: str):
    """Extract the notes listed under one category section.

    Starting at the element whose id is ``section_id``, walk forward through
    the following ``div.grid-x`` elements and collect the ``.notebox a[href]``
    anchors of the first grid that contains any.

    Args:
        section_id: ``id`` attribute of the section's title element.
        soup: Parsed page.
        base_url: URL used to resolve relative image ``src`` values.

    Returns:
        A list of dicts with the keys ``category``, ``note_name`` and
        ``image_url`` (query string and fragment removed; ``""`` when the
        anchor has no ``<img src>``). The list is empty when the section
        element is missing or no following grid holds a notebox anchor.
    """
    title_el = soup.find(id=section_id)
    if title_el is None:
        return []

    # The category label is identical for every note of the section, so it is
    # resolved once instead of searching the whole tree again for each anchor.
    category = find_category_name_from_anchor(section_id, soup)

    grid = title_el.find_next("div", class_="grid-x")
    while grid is not None:
        anchors = grid.select(".notebox a[href]")
        if anchors:
            notes = []
            for a in anchors:
                img = a.find("img")
                src = img.get("src") if img is not None else None
                notes.append({
                    "category": category,
                    "note_name": a.get_text(strip=True),
                    "image_url": strip_query(urljoin(base_url, src.strip())) if src else "",
                })
            # Only the first grid that actually holds notes belongs to this section.
            return notes
        grid = grid.find_next("div", class_="grid-x")
    return []


def scrape_notes(page_url: str):
    """Download ``page_url`` and return its notes as a list of row dicts.

    The page is parsed with the ``lxml`` parser. When the page exposes section
    anchors, notes are collected section by section; otherwise every
    ``.notebox a[href]`` on the page is collected with an empty category.
    In both cases rows are de-duplicated on ``(note_name, image_url)``,
    keeping the first occurrence.

    Args:
        page_url: URL of the page to scrape; also the base for resolving
            relative image URLs.

    Returns:
        A list of dicts with the keys ``category``, ``note_name`` and
        ``image_url``.

    Raises:
        Whatever ``fetch_html`` raises when the page cannot be downloaded.
    """
    html = fetch_html(page_url)
    soup = BeautifulSoup(html, "lxml")

    section_ids = extract_sections(soup)
    results = []

    if section_ids:
        for sid in section_ids:
            results.extend(extract_notes_for_section(sid, soup, page_url))
    else:
        # Fallback: no section navigation found, so gather every notebox on
        # the page without a category.
        for a in soup.select(".notebox a[href]"):
            img = a.find("img")
            src = img.get("src") if img is not None else None
            results.append({
                "category": "",
                "note_name": a.get_text(strip=True),
                # Strip surrounding whitespace so the URL is resolved the same
                # way as in the per-section path.
                "image_url": strip_query(urljoin(page_url, src.strip())) if src else "",
            })

    # Remove duplicates while preserving first-seen order.
    seen = set()
    deduped = []
    for row in results:
        key = (row["note_name"], row["image_url"])
        if key not in seen:
            seen.add(key)
            deduped.append(row)
    return deduped


def save_csv(rows, out_path: str):
    """Write ``rows`` to ``out_path`` as a UTF-8 CSV file with a header line.

    The columns are ``category``, ``note_name`` and ``image_url``, in that
    order; an existing file at ``out_path`` is overwritten.
    """
    columns = ["category", "note_name", "image_url"]
    with open(out_path, "w", newline="", encoding="utf-8") as handle:
        csv_out = csv.DictWriter(handle, fieldnames=columns)
        csv_out.writeheader()
        for record in rows:
            csv_out.writerow(record)


## 4) 실행 파라미터 설정

In [6]:

# Set the URL of the page to scrape and the path of the CSV file to write.
PAGE_URL = "https://www.fragrantica.com/notes/"  # example: the notes index page
OUTPUT_CSV = "fragrantica_notes.csv"             # example output file name

# Echo the settings so the cell output records what the run used.
print("PAGE_URL:", PAGE_URL)
print("OUTPUT_CSV:", OUTPUT_CSV)


PAGE_URL: https://www.fragrantica.com/notes/
OUTPUT_CSV: fragrantica_notes.csv


## 5) 크롤링 실행 (노트 추출 → CSV 저장)

In [7]:
# Scrape the configured page, report how many rows were extracted, then
# write them to the configured CSV path.
rows = scrape_notes(PAGE_URL)
print(f"Extracted {len(rows)} notes")
save_csv(rows, OUTPUT_CSV)
print(f"Saved to {OUTPUT_CSV}")


Extracted 1707 notes
Saved to fragrantica_notes.csv


## 6) 결과 미리보기

In [8]:

import pandas as pd
from pathlib import Path

# Show the first 10 rows of the saved CSV when the file exists; otherwise
# remind the user to run the scraping cell first.
if Path(OUTPUT_CSV).exists():
    df = pd.read_csv(OUTPUT_CSV)
    display(df.head(10))
else:
    print("CSV not found yet. Run the previous cell first.")


Unnamed: 0,category,note_name,image_url
0,CITRUS SMELLS,Bergamot,https://fimgs.net/mdimg/sastojci/m.75.jpg
1,CITRUS SMELLS,Bigarade,https://fimgs.net/mdimg/sastojci/m.1083.jpg
2,CITRUS SMELLS,Bitter Orange,https://fimgs.net/mdimg/sastojci/m.79.jpg
3,CITRUS SMELLS,Blood Orange,https://fimgs.net/mdimg/sastojci/m.286.jpg
4,CITRUS SMELLS,Buddha's hand,https://fimgs.net/mdimg/sastojci/m.1589.jpg
5,CITRUS SMELLS,Calamansi,https://fimgs.net/mdimg/sastojci/m.1028.jpg
6,CITRUS SMELLS,Candied Lemon,https://fimgs.net/mdimg/sastojci/m.1485.jpg
7,CITRUS SMELLS,Chen Pi,https://fimgs.net/mdimg/sastojci/m.1289.jpg
8,CITRUS SMELLS,Chinotto,https://fimgs.net/mdimg/sastojci/m.866.jpg
9,CITRUS SMELLS,Citron,https://fimgs.net/mdimg/sastojci/m.373.jpg
