In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import json

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

import google.generativeai as genai
import re

In [None]:
# HTTP headers passed to the requests.get calls below. Only a User-Agent is
# set (presumably so the site does not reject the default client UA — confirm).
headers = {"User-Agent": "Mozilla/5.0"}

# Step 1. Collect review URLs
def get_review_urls(max_pages=5, timeout=10):
    """Collect album-review URLs from the paginated Pitchfork review index.

    Args:
        max_pages: Number of index pages to scan, starting at page 1.
        timeout: Seconds to wait on each index-page request before giving up.

    Returns:
        A list of absolute review URLs, de-duplicated and kept in first-seen
        order. Index pages that fail to load are reported and skipped.
    """
    site_root = "https://pitchfork.com"
    link_prefix = "/reviews/albums/"
    base_url = site_root + link_prefix + "?page={}"
    # A dict is used as an ordered set so duplicates are dropped while the
    # order in which links were first seen is preserved.
    seen = {}

    for page in tqdm(range(1, max_pages + 1)):
        url = base_url.format(page)
        try:
            # Without a timeout a stalled connection would block forever.
            res = requests.get(url, headers=headers, timeout=timeout)
            res.raise_for_status()
        except requests.RequestException as e:
            # One bad index page should not abort the whole collection.
            print(f"❌ 오류 발생: {url}", e)
        else:
            soup = BeautifulSoup(res.content, 'html.parser')
            for link in soup.select(f'a[href^="{link_prefix}"]'):
                href = link['href']
                slug = href[len(link_prefix):]
                # The prefix selector also matches the index page itself and
                # its pagination links; those are not reviews.
                if not slug or slug.startswith(("?", "#")):
                    continue
                seen.setdefault(site_root + href, None)
        # Pause between index pages regardless of success or failure.
        time.sleep(1)

    return list(seen)

# Step 2. Extract the review body text
def extract_review_text(html):
    """Extract the review body from a review page's HTML.

    Looks at every <div> whose class contains "article__body" and returns the
    text of the first one that has non-empty paragraphs.

    Args:
        html: HTML source of a single review page.

    Returns:
        The paragraphs joined with newlines, or None when no matching div
        contains any paragraph text.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Find divs whose class name contains 'article__body'.
    candidates = soup.find_all("div", class_=lambda x: x and "article__body" in x)

    for c in candidates:
        texts = []
        for p in c.find_all("p"):
            # get_text(strip=True) would glue words together across inline
            # tags such as <em>/<a> ("the<em>album</em>is" -> "thealbumis").
            # Take the raw text and normalise its whitespace instead.
            text = " ".join(p.get_text().split())
            if text:
                texts.append(text)
        if texts:
            return "\n".join(texts)

    return None

# Step 3. Run the full crawl and save the results
def crawl_pitchfork_reviews(max_pages=5, save_path="pitchfork_reviews.json", timeout=10):
    """Crawl review pages and write their body text to a JSON file.

    Args:
        max_pages: Number of index pages passed to get_review_urls.
        save_path: Path of the JSON file to write.
        timeout: Seconds to wait on each review-page request made by this
            function before giving up.

    Returns:
        None. The result is a JSON array of {"url", "review_text"} objects
        written to save_path; a summary line is printed at the end.
    """
    urls = get_review_urls(max_pages=max_pages)
    data = []

    # Sorting makes the order of the saved records reproducible across runs.
    for url in tqdm(sorted(urls)):
        try:
            # Without a timeout a stalled connection would block forever.
            res = requests.get(url, headers=headers, timeout=timeout)
            if res.status_code == 200:
                review = extract_review_text(res.text)
                if review:
                    data.append({
                        "url": url,
                        "review_text": review
                    })
            else:
                # Report skipped pages instead of dropping them silently.
                print(f"❌ 오류 발생: {url}", f"HTTP {res.status_code}")
        except Exception as e:
            # Best-effort per URL: one failing page must not stop the crawl.
            print(f"❌ 오류 발생: {url}", e)
        # Pause between review pages regardless of success or failure.
        time.sleep(0.5)

    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"✅ 수집 완료! {len(data)}개 리뷰 저장됨 → {save_path}")

# Example run: crawl 5 index pages and save the reviews to the default path.
# NOTE(review): the original note estimated about 450-500 reviews for 5 pages — confirm.
crawl_pitchfork_reviews(max_pages=5)

 60%|██████    | 3/5 [00:04<00:03,  1.53s/it]