In [1]:
# %pip install selenium
# %pip install requests

In [2]:
import requests
import os
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementClickInterceptedException, ElementNotInteractableException, TimeoutException, NoSuchElementException
import time
import pandas as pd

def find_elements(driver, xpath, timeout=10):
    """Wait for, then return, every element matching an XPath expression.

    Args:
        driver: Selenium WebDriver handed to ``WebDriverWait``.
        xpath: XPath expression to locate.
        timeout: Maximum number of seconds to wait. Defaults to 10, the value
            that used to be hard-coded.

    Returns:
        Whatever ``EC.presence_of_all_elements_located`` yields for the
        locator (the matching elements).

    Raises:
        TimeoutException: If the condition is not met within ``timeout`` seconds.
    """
    locator = (By.XPATH, xpath)
    return WebDriverWait(driver, timeout).until(EC.presence_of_all_elements_located(locator))

def _xpath_string_literal(text):
    """Return ``text`` as an XPath 1.0 string literal, whatever quotes it contains."""
    if '"' not in text:
        return f'"{text}"'
    if "'" not in text:
        return f"'{text}'"
    # XPath 1.0 has no escape character, so text holding both quote kinds has
    # to be assembled with concat().
    pieces = text.split('"')
    return 'concat(' + ", '\"', ".join(f'"{piece}"' for piece in pieces) + ')'


def save_images(driver, product_list, image_folder):
    """Download one image per product name into ``image_folder``.

    For each name, the first ``<img>`` whose ``alt`` attribute contains every
    whitespace-separated word of the name is located on the page currently
    loaded in ``driver``; its ``src`` is fetched with ``requests`` and written
    to ``<image_folder>/<sanitised name>.png``.

    Failures (no matching image, no ``src``, network error, non-200 response)
    are reported with ``print`` and the loop moves on to the next product.

    Args:
        driver: Selenium WebDriver showing the product listing.
        product_list: Iterable of product-name strings.
        image_folder: Destination directory; created if it does not exist.
    """
    os.makedirs(image_folder, exist_ok=True)

    for product_name in product_list:
        words = product_name.split()
        if not words:
            # An empty predicate ("//img[]") is not a valid XPath expression.
            print(f"이미지를 찾을 수 없습니다: {product_name}")
            continue

        predicate = ' and '.join(
            f'contains(@alt, {_xpath_string_literal(word)})' for word in words
        )
        try:
            image_element = driver.find_element(By.XPATH, f'//img[{predicate}]')
        except NoSuchElementException:
            print(f"이미지를 찾을 수 없습니다: {product_name}")
            continue

        image_url = image_element.get_attribute("src")
        if not image_url:
            print(f"이미지 URL이 없습니다: {product_name}")
            continue

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Referer': driver.current_url,
        }

        try:
            # Without a timeout a stalled connection would block the crawl forever.
            response = requests.get(image_url, headers=headers, timeout=30)
        except requests.exceptions.RequestException as error:
            print(f"이미지 다운로드 실패: {product_name} ({error})")
            continue

        if response.status_code != 200:
            print(f"이미지 다운로드 실패. 응답 코드: {response.status_code}")
            continue

        # Every run of characters outside ASCII alphanumerics / Hangul syllables
        # (spaces included) collapses to a single underscore.
        safe_product_name = re.sub(r'[^a-zA-Z0-9가-힣]+', '_', product_name)
        image_filename = os.path.join(image_folder, f"{safe_product_name}.png")

        with open(image_filename, "wb") as f:
            f.write(response.content)

def extract_item_url(driver, product_list, final_list):
    """Append each product's detail-page URL to its row in ``final_list``.

    The product name is reduced to lowercase words made of ASCII alphanumerics
    and Hangul syllables; the first ``<a>`` on the current page whose
    lowercased ``href`` contains every word is taken as the product link.

    Exactly one value is appended to ``final_list[index]`` for every product:
    the ``href`` on success, ``None`` when no link can be found. Appending in
    both cases keeps every row the same length, so positional access to later
    columns stays valid.

    Args:
        driver: Selenium WebDriver showing the product listing.
        product_list: Product names, index-aligned with ``final_list``.
        final_list: List of row lists, mutated in place.
    """
    lowercase_href = 'translate(@href, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")'

    for index, product_name in enumerate(product_list):
        words = re.sub(r'[^a-zA-Z0-9가-힣]+', ' ', product_name).lower().split()

        href_value = None
        # No usable word would yield "//a[]", which is not valid XPath.
        if words:
            predicate = ' and '.join(
                f'contains({lowercase_href}, "{word}")' for word in words
            )
            try:
                a_element = driver.find_element(By.XPATH, f'//a[{predicate}]')
                href_value = a_element.get_attribute("href")
            except NoSuchElementException:
                href_value = None

        if href_value is None:
            print(f"<a> 태그를 찾을 수 없습니다: {product_name}")
        final_list[index].append(href_value)

def extract_review_url(driver_all_reviews, final_list, chrome_options, restart_every=500):
    """Record the review-widget URL of every product as ``item[4]``.

    For each row, the product page stored in ``item[3]`` is loaded and the
    ``src`` attribute of the element with id ``crema-product-reviews-1`` is
    appended to the row. When the row has no product URL, or the element does
    not appear in time, ``None`` is appended instead so that ``item[4]``
    exists on every row.

    Args:
        driver_all_reviews: Selenium WebDriver to browse with.
        final_list: List of row lists, mutated in place. Rows shorter than
            four entries are padded with ``None`` first.
        chrome_options: Options used when a fresh Chrome instance is started.
        restart_every: Start a fresh browser before every N-th row (default
            500). A falsy value disables restarts.

    Returns:
        The WebDriver that is live when the function finishes. It differs from
        the one passed in if a restart happened, in which case the passed-in
        driver has already been quit; callers should quit the returned one.
    """
    for index, item in enumerate(final_list):
        if restart_every and index > 0 and index % restart_every == 0:
            print("브라우저 인스턴스를 재시작합니다.")
            # Carry the implicit wait over so the new browser behaves like the old one.
            implicit_wait = driver_all_reviews.timeouts.implicit_wait
            driver_all_reviews.quit()
            driver_all_reviews = webdriver.Chrome(options=chrome_options)
            driver_all_reviews.implicitly_wait(implicit_wait)

        # Keep the positional layout: index 3 = product URL, index 4 = review URL.
        while len(item) < 4:
            item.append(None)

        product_url = item[3]
        iframe_src = None

        if not product_url:
            print(f"상품 URL이 없어 건너뜁니다: {item[0]}")
        else:
            driver_all_reviews.get(url=product_url)

            WebDriverWait(driver_all_reviews, 3600).until(
                lambda driver: driver.execute_script('return document.readyState') == 'complete'
            )

            try:
                iframe_element = WebDriverWait(driver_all_reviews, 120).until(
                    EC.presence_of_element_located((By.ID, "crema-product-reviews-1"))
                )
                iframe_src = iframe_element.get_attribute("src")
            except (TimeoutException, NoSuchElementException):
                # WebDriverWait signals a missing element with TimeoutException.
                print("해당 id의 <iframe>을 찾을 수 없습니다.")

        item.append(iframe_src)
        if iframe_src is not None:
            print(index, item[4])

    return driver_all_reviews

# Label shown in a review's option row -> column index in a review record.
_REVIEW_OPTION_COLUMNS = {
    "키": 3,
    "몸무게": 4,
    "평소사이즈-상의": 5,
    "평소사이즈-하의": 6,
    "사이즈": 7,
}


def _scrape_review_rows(driver, key):
    """Build one 8-column record per review on the page loaded in ``driver``.

    Column 0 is ``key``; columns 1-7 start at 0 and are filled with the
    rating, the review text and the option values found on the page.
    """
    li_elements = driver.find_elements(By.CSS_SELECTOR, "ul.reviews li[id^='review_']")
    info = [[key, 0, 0, 0, 0, 0, 0, 0] for _ in li_elements]

    for row, li_element in zip(info, li_elements):
        try:
            for div_element in li_element.find_elements(By.CLASS_NAME, "review_options_v2__option"):
                # One round trip per option instead of one per comparison.
                parts = div_element.text.split()
                if len(parts) < 2:
                    # Empty option, or a label without a value.
                    continue
                column = _REVIEW_OPTION_COLUMNS.get(parts[0])
                if column is not None:
                    row[column] = parts[1]
        except TimeoutException:
            print(f"리뷰 옵션을 읽는 중 시간 초과: {key}")

    # Ratings are paired with reviews by position, as before. zip() stops at
    # the shorter sequence, so extra matches can no longer index past ``info``.
    # NOTE(review): this assumes the page-wide "visually-hidden" matches line
    # up one-to-one with the reviews -- confirm against the widget markup.
    try:
        span_elements = WebDriverWait(driver, 30).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "visually-hidden"))
        )
    except TimeoutException:
        print(f"평점 요소를 찾지 못했습니다: {key}")
        span_elements = []
    for row, span_element in zip(info, span_elements):
        rating_text = span_element.text
        try:
            row[1] = int(rating_text.split()[1].replace("점", ""))
        except (IndexError, ValueError):
            print(f"평점을 해석할 수 없습니다: {rating_text!r}")

    try:
        div_elements2 = WebDriverWait(driver, 120).until(
            EC.visibility_of_all_elements_located((By.CLASS_NAME, "review_list_v2__message.js-collapsed-review-content.js-translate-text"))
        )
    except TimeoutException:
        print(f"리뷰 본문을 찾지 못했습니다: {key}")
        div_elements2 = []
    for row, div_element in zip(info, div_elements2):
        row[2] = div_element.text

    return info


def extract_reviews(driver, final_list, review_list, chrome_options, restart_every=500):
    """Scrape the reviews of every product and append them to ``review_list``.

    For each row of ``final_list`` the review page stored in ``item[4]`` is
    loaded. Rows without a review URL are skipped, as are pages on which no
    element matches ``.js-renewed-no-data-content.hidden`` and pages on which
    no review item appears within 30 seconds. Every review found becomes a
    record ``[item[0], rating, text, height, weight, usual top size,
    usual bottom size, purchased size]``; fields that are not found stay 0.

    Args:
        driver: Selenium WebDriver to browse with.
        final_list: Product rows; ``item[0]`` is the key, ``item[4]`` the
            review page URL.
        review_list: Output list, extended in place.
        chrome_options: Options used when a fresh Chrome instance is started.
        restart_every: Start a fresh browser before every N-th product
            (default 500). A falsy value disables restarts.

    Returns:
        The WebDriver that is live when the function finishes. It differs from
        the one passed in if a restart happened, in which case the passed-in
        driver has already been quit; callers should quit the returned one.
    """
    for product_index, item in enumerate(final_list):
        # Checked first so that skipped products cannot bypass the restart.
        if restart_every and product_index > 0 and product_index % restart_every == 0:
            print("브라우저 인스턴스를 재시작합니다.")
            # The "no data" probe below relies on the implicit wait, so the
            # replacement browser must inherit it.
            implicit_wait = driver.timeouts.implicit_wait
            driver.quit()
            driver = webdriver.Chrome(options=chrome_options)
            driver.implicitly_wait(implicit_wait)

        key = item[0] if item else None
        url = item[4] if len(item) > 4 else None
        if not url:
            print(f"리뷰 URL이 없어 건너뜁니다: {key}")
            continue

        driver.get(url)

        # The page is only processed when its "no data" block carries the
        # ``hidden`` class.
        no_data_elements = driver.find_elements(By.CSS_SELECTOR, ".js-renewed-no-data-content.hidden")
        if not no_data_elements:
            continue

        try:
            WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "ul.reviews li[id^='review_']")))
        except Exception as e:
            print(f"페이지 로드 대기 중 에러 발생: {e}")
            continue

        review_list.extend(_scrape_review_rows(driver, key))

    return driver

def crawling(url_, cat):
    """Crawl one category listing and write its product and review CSV files.

    Stages, each with its own Chrome instance that is always quit afterwards:

    1. Open ``url_``, keep clicking the link at ``//*[@id="contents"]/div[4]/a``
       until it is no longer clickable, then read every product card
       (``li.swiper-slide`` whose id contains ``anchorBoxId_``) and look up
       each product's detail-page URL.
    2. Visit every product page to obtain its review page URL.
    3. Visit every review page and collect the reviews.

    Output, relative to the current working directory:
    ``product_data_<cat>.csv`` (product_id, name, price) and
    ``review_data_<cat>.csv`` (one row per review).

    Args:
        url_: URL of the category listing page.
        cat: Category label used in the output file names.
    """
    chrome_options = Options()
    chrome_options.add_experimental_option("detach", True)
    # chrome_options.add_argument("--headless")

    # ---- Stage 1: product cards and product URLs ----
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.implicitly_wait(3)
        driver.get(url=url_)

        # NOTE(review): presumably this link is the listing's "load more"
        # control; the loop ends once it has not been clickable for 30 seconds.
        while True:
            try:
                element = WebDriverWait(driver, 30).until(
                    EC.element_to_be_clickable((By.XPATH, '//*[@id="contents"]/div[4]/a'))
                )
                driver.execute_script("arguments[0].scrollIntoView(true);", element)
                element.click()
            except (ElementClickInterceptedException, ElementNotInteractableException):
                driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.ARROW_DOWN)
                time.sleep(1)
            except TimeoutException:
                break

        wait = WebDriverWait(driver, 10)
        elements = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//li[contains(@class, "swiper-slide") and contains(@id, "anchorBoxId_")]')))

        # Each row is built from a single card, so the id, name and price in a
        # row always belong to the same product. Cards with fewer than three
        # text lines are left out.
        final_list = []
        for element in elements:
            split_result = element.text.split('\n')
            if len(split_result) < 3:
                continue
            # When the first line is the "품절상품" label, the name sits one
            # line lower.
            name_index = 2 if split_result[0] == "품절상품" else 1
            final_list.append([
                element.get_attribute('id'),
                split_result[name_index],
                split_result[-1].rstrip(),
            ])

        # Debug output
        if final_list:
            print(final_list[0], len(final_list))

        image_folder = os.path.join(os.path.expanduser("~"), "Desktop", "온더룩", "온더룩_대표이미지_"+cat)
        product_list = [item[1] for item in final_list]

        # save_images(driver, product_list, image_folder)
        extract_item_url(driver, product_list, final_list)

        # Debug output
        if final_list:
            print(final_list[0])
    finally:
        # Close the listing browser even if the crawl failed part-way.
        driver.quit()

    # ---- Stage 2: review page URL of every product ----
    # Only rows that actually received a product URL can be visited. The rows
    # are shared objects, so ``final_list`` still sees what gets appended.
    rows_with_product_url = [item for item in final_list if len(item) > 3 and item[3]]

    driver_all_reviews = webdriver.Chrome(options=chrome_options)
    try:
        driver_all_reviews.implicitly_wait(3)
        # extract_review_url may replace the browser while it runs; if it hands
        # a driver back, that is the one that has to be quit.
        live_driver = extract_review_url(driver_all_reviews, rows_with_product_url, chrome_options)
        if live_driver is not None:
            driver_all_reviews = live_driver
    finally:
        driver_all_reviews.quit()

    # Debug output
    if final_list:
        print(final_list[0])

    # ---- Stage 3: reviews ----
    rows_with_review_url = [item for item in rows_with_product_url if len(item) > 4 and item[4]]

    review_list = []
    driver_review = webdriver.Chrome(options=chrome_options)
    try:
        driver_review.implicitly_wait(3)
        # Same hand-back convention as in stage 2.
        live_driver = extract_reviews(driver_review, rows_with_review_url, review_list, chrome_options)
        if live_driver is not None:
            driver_review = live_driver
    finally:
        driver_review.quit()

    # Debug output
    if len(review_list) > 0:
        print(review_list[0])

    # ---- Output ----
    columns = ["product_id", "이름", "가격"]
    flat_final_list = [item[:3] for item in final_list]
    final_df = pd.DataFrame(flat_final_list, columns=columns)
    final_df.to_csv(f'product_data_{cat}.csv', index=False, encoding='utf-8-sig')

    columns = ["product_id", "평점", "리뷰", "키", "몸무게", "평소 사이즈-상의", "평소 사이즈-하의", "구매 사이즈"]
    review_df = pd.DataFrame(review_list, columns=columns)
    review_df.to_csv(f'review_data_{cat}.csv', index=False, encoding='utf-8-sig')


In [3]:
# Category listing pages and the label used in each category's output file
# names. The two lists are index-aligned.
url_list = [
    "https://m.onthelook.store/product/list.html?cate_no=152",
    "https://m.onthelook.store/product/list.html?cate_no=139",
    "https://m.onthelook.store/product/list.html?cate_no=140",
    "https://m.onthelook.store/product/list.html?cate_no=135",
    "https://m.onthelook.store/product/list.html?cate_no=134",
]
cat_list = ["outer", "cap", "acc", "bottom", "top"]

# Only the entry at index 4 is crawled by this cell; widen the range to crawl
# more categories in one go.
for i in range(4, 5):
    url_, cat = url_list[i], cat_list[i]
    crawling(url_, cat)

['anchorBoxId_1875', 'Symbol Dyed Hoodie - Blue', '99,000원'] 1161
['anchorBoxId_1875', 'Symbol Dyed Hoodie - Blue', '99,000원', 'https://m.onthelook.store/product/symbol-dyed-hoodie-blue/1875/category/134/display/1/']
0 https://review8.cre.ma/onthelook.store/mobile/products/reviews?product_code=1875&iframe_id=crema-product-reviews-1&widget_id=14&iframe=1&widget_style=&app=0&device=mobile&parent_url=https%3A%2F%2Fm.onthelook.store%2Fproduct%2Fsymbol-dyed-hoodie-blue%2F1875%2Fcategory%2F134%2Fdisplay%2F1%2F&nonmember_token=&secure_device_token=V264a68efcb5d16f7f3f30cc94604a15524dd8b7aee80f00acbcaa9ba09b842a46612779fd34131e2af28c7143047dd660
1 https://review8.cre.ma/onthelook.store/mobile/products/reviews?product_code=1496&iframe_id=crema-product-reviews-1&widget_id=14&iframe=1&widget_style=&app=0&device=mobile&parent_url=https%3A%2F%2Fm.onthelook.store%2Fproduct%2Four-1989-%25ED%259B%2584%25EB%2593%259Csthstd-0004%2F1496%2Fcategory%2F134%2Fdisplay%2F1%2F&nonmember_token=&secure_device_tok

KeyboardInterrupt: 

In [None]:
## Combined review data: merge the per-category review CSVs into one file.
review_categories = ["acc", "cap", "outer", "top", "bottom"]
review_paths = {category: f"review_data_{category}.csv" for category in review_categories}

# Check every input up front so that one run reports all missing categories,
# instead of stopping at the first unreadable file.
missing_paths = [path for path in review_paths.values() if not os.path.exists(path)]
if missing_paths:
    raise FileNotFoundError(
        "다음 리뷰 파일이 없습니다 (해당 카테고리를 먼저 크롤링하세요): " + ", ".join(missing_paths)
    )

review_frames = {category: pd.read_csv(path) for category, path in review_paths.items()}

# The per-category frames stay available under their usual names.
acc, cap, outer, top, bottom = (review_frames[category] for category in review_categories)

# One concat over all frames: same row order (acc, cap, outer, top, bottom)
# without re-copying the growing result at every step.
review_total_df = pd.concat(list(review_frames.values()), ignore_index=True)

review_total_df.to_csv('review_data_total.csv', index=False, encoding='utf-8-sig')

FileNotFoundError: [Errno 2] No such file or directory: 'review_data_top.csv'