# Various attempts to parse products from Levi's using Selenium with different parallelization approaches.

In [None]:
from IPython.display import clear_output

In [None]:
!apt-get update
!pip install selenium
!pip install webdriver-manager
!apt install firefox-geckodriver
!cp /usr/lib/geckodriver /usr/bin
!cp /usr/lib/firefox /usr/bin
clear_output()

In [None]:
import os
from datetime import datetime
import time
from itertools import repeat

import numpy as np
import pandas as pd

from webdriver_manager.firefox import GeckoDriverManager

from selenium import webdriver
from selenium.webdriver.firefox.service import Service

from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.expected_conditions import presence_of_element_located as ele_present
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException

import multiprocessing as mp
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
# Set before GeckoDriverManager is used in the next cell.
# NOTE(review): presumably "0" turns off webdriver-manager's download
# progress bar so it does not clutter the notebook output - confirm.
os.environ["WDM_PROGRESS_BAR"] = "0"

In [None]:
# Resolve the geckodriver path once; every browser started in this notebook
# is created from this same path.
# NOTE(review): install() presumably downloads the driver when it is not
# cached yet - confirm against webdriver-manager's documentation.
driver_path = GeckoDriverManager().install()

In [None]:
def scroll_to_ele(driver, object):
    """Scroll the window to an element, then back up by 120 pixels.

    Args:
        driver: WebDriver-like object exposing ``execute_script``.
        object: Element-like object whose ``location`` is a mapping with
            ``"x"`` and ``"y"`` entries.
    """
    # Read the location once: on a Selenium WebElement every access to
    # ``location`` is a separate WebDriver command, so two reads cost an extra
    # round trip and could pair an x and a y taken at different moments.
    location = object.location
    x, y = location["x"], location["y"]

    driver.execute_script(f"window.scrollTo({x}, {y});")
    # Scroll back up a little so the element is not hidden at the very top of
    # the viewport (the original code names this "nav out of the way").
    driver.execute_script("window.scrollBy(0, -120);")

In [None]:
def try_to_get_ele(driver, css_selector, sleep_timer=0.3, max_retries=None, return_text=False):
    """Find an element by CSS selector, retrying while the lookup is stale.

    Args:
        driver: Object exposing ``find_element(by, value)`` (a WebDriver or a
            WebElement).
        css_selector: CSS selector passed to ``find_element``.
        sleep_timer: Seconds to sleep after each stale attempt.
        max_retries: Maximum number of attempts. When falsy, it defaults to
            ``int(30 / sleep_timer)``, i.e. about 30 seconds of sleeping.
        return_text: When true, return the element's ``text`` instead of the
            element itself. Reading the text is part of the retried operation.

    Returns:
        The located element, or its text when ``return_text`` is true.

    Raises:
        StaleElementReferenceException: If every attempt hit a stale element.
            Any other exception raised by ``find_element`` propagates
            immediately without a retry.
    """
    if not max_retries:
        max_retries = int(30 / sleep_timer)
    # Always make at least one attempt, even for a zero or negative budget.
    attempts = max(1, max_retries)

    last_error = None
    for _ in range(attempts):
        try:
            element = driver.find_element(By.CSS_SELECTOR, css_selector)
            return element.text if return_text else element
        except StaleElementReferenceException as e:
            # The name bound by ``except ... as`` is deleted when the handler
            # exits, so keep our own reference for the final raise.
            last_error = e
            time.sleep(sleep_timer)

    raise last_error

In [None]:
def setup_worker_browser(driver_path):
  """Start a headless Firefox and publish it as the module-global ``browser``.

  Used as the ``initializer`` of ``mp.Pool`` in this notebook, so it runs once
  in each worker process.

  Args:
    driver_path: Path of the geckodriver executable handed to ``Service``.
  """
  global browser

  firefox_options = webdriver.FirefoxOptions()
  for flag in ("start-maximized", "--headless"):
    firefox_options.add_argument(flag)

  browser = webdriver.Firefox(service=Service(driver_path), options=firefox_options)
  # Limit how long a single WebDriver command may take.
  # NOTE(review): the value 30 is presumably in seconds - confirm.
  browser.command_executor.set_timeout(30)

In [None]:
def get_products_list(section_url):
  """Collect product names and URLs from every page of one catalogue section.

  Pages are requested as ``<section_url>?page=<i>`` starting at ``i = 0`` and
  the crawl stops at the first page that shows the "nothing found" header.
  The module-global ``browser`` (set by ``setup_worker_browser``) is used for
  all requests.

  Args:
    section_url: Base URL of the section. Its last path component is stored as
      the section label of every product found.

  Returns:
    A 2-tuple ``([product_names, product_urls, sections], error_log)``. The
    three inner lists are parallel (one entry per product cell) and
    ``error_log`` holds one message per page that was skipped after repeated
    timeouts.

  Raises:
    TimeoutException: If the search header does not appear within 60 seconds
      (only the wait for product cells is retried).
  """
  all_product_names = []
  all_urls = []
  section = []

  error_log = []

  # The label is identical for every product of this section.
  section_name = section_url.split("/")[-1]

  fails_count_threshold = 3
  fails_count = 0

  i = 0
  while True:
    if fails_count == fails_count_threshold:
      # Give up on this page, record it and move on to the next one.
      error = f"Failed to get products list on page {browser.current_url} after {fails_count_threshold} retries."
      error_log.append(error)
      fails_count = 0
      i += 1
      continue

    current_page = section_url + f"?page={i}"
    print("Current page:", current_page)

    browser.get(current_page)
    browser.refresh()

    header = WebDriverWait(browser, 60).until(ele_present((By.CSS_SELECTOR, "div[class*='search-header__wrapper']")))
    page_is_empty = header.find_elements(By.CSS_SELECTOR, "h1[class='search-header__nothing-found--message']")
    if page_is_empty:
      # Past the last page of results: the crawl is complete.
      break

    # Jump to the bottom of the page before waiting for the product cells.
    html_element = browser.find_element(By.TAG_NAME, "html")
    html_element.send_keys(Keys.END)

    try:
      WebDriverWait(browser, 15).until(ele_present((By.CSS_SELECTOR, "div[class='product-cell']")))
    except TimeoutException:
      fails_count += 1
      browser.refresh()
      continue

    every_product = browser.find_elements(By.CSS_SELECTOR, "div[class='product-cell']")

    for product in every_product:
      if "firefox" in browser.capabilities["browserName"]:
        scroll_to_ele(browser, product)

      product_name = product.find_element(By.CSS_SELECTOR, "div[class='product-name']").text
      product_url = product.find_element(By.CSS_SELECTOR, "a[class*='product-link']").get_attribute("href")

      all_product_names.append(product_name)
      all_urls.append(product_url)
      section.append(section_name)

    # The threshold counts retries of a single page, so timeouts on earlier
    # pages must not count against the next one.
    fails_count = 0
    i += 1

  return [all_product_names, all_urls, section], error_log

In [None]:
# Catalogue sections whose product listings are crawled.
clothing_sections = [
    "https://www.levi.com/US/en_US/clothing/men/c/levi_clothing_men",
    "https://www.levi.com/US/en_US/clothing/women/c/levi_clothing_women",
]

# Each pool worker creates its own browser through the initializer; the
# sections are then mapped across the workers.
with mp.Pool(initializer=setup_worker_browser, initargs=(driver_path,)) as pool:
    results = pool.map(get_products_list, clothing_sections)

Current page: https://www.levi.com/US/en_US/clothing/men/c/levi_clothing_men?page=0
Current page: https://www.levi.com/US/en_US/clothing/women/c/levi_clothing_women?page=0
Current page: https://www.levi.com/US/en_US/clothing/women/c/levi_clothing_women?page=1
Current page: https://www.levi.com/US/en_US/clothing/men/c/levi_clothing_men?page=1
Current page: https://www.levi.com/US/en_US/clothing/men/c/levi_clothing_men?page=2
Current page: https://www.levi.com/US/en_US/clothing/women/c/levi_clothing_women?page=2
Current page: https://www.levi.com/US/en_US/clothing/women/c/levi_clothing_women?page=3
Current page: https://www.levi.com/US/en_US/clothing/men/c/levi_clothing_men?page=3
Current page: https://www.levi.com/US/en_US/clothing/women/c/levi_clothing_women?page=4
Current page: https://www.levi.com/US/en_US/clothing/men/c/levi_clothing_men?page=4
Current page: https://www.levi.com/US/en_US/clothing/women/c/levi_clothing_women?page=5
Current page: https://www.levi.com/US/en_US/clothing

In [None]:
# Flatten the per-section results into parallel lists plus one error list.
all_product_names, all_urls, section, all_errors = [], [], [], []

for subtask_result in results:
  # subtask_result[0] holds [names, urls, sections]; subtask_result[1] the errors.
  for merged, part in zip((all_product_names, all_urls, section), subtask_result[0]):
    merged.extend(part)

  all_errors.extend(subtask_result[1])

In [None]:
# Summary of the listing crawl.
print(f"Products snatched total: {len(all_urls)}")
print("Errors encountered:", len(all_errors))

Products snatched total: 1366
Errors encountered: 0


In [None]:
# One row per scraped (name, URL, section) triple, exact duplicates removed.
all_products_df = pd.DataFrame({"Product_Name": all_product_names,
                                "Product_URL": all_urls,
                                "Product_Section": section})
all_products_df = all_products_df.drop_duplicates().reset_index(drop=True)

# Products that appear under more than one section: collapse them to a single
# row whose Product_Section is the comma-separated list of their sections.
# Aggregating the column directly keeps its name, so no rename of an
# automatically named result column is needed.
product_key = ["Product_Name", "Product_URL"]
multi_cat = all_products_df[all_products_df.duplicated(subset=product_key, keep=False)]
multi_cat = multi_cat.groupby(by=product_key, as_index=False)["Product_Section"].agg(", ".join)

# Keep the single-section products as they are and append the collapsed rows.
all_products_df = all_products_df.drop_duplicates(subset=product_key, keep=False)
all_products_df = pd.concat([all_products_df, multi_cat]).sort_values(by=["Product_Section"]).reset_index(drop=True)
all_products_df

Unnamed: 0,Product_Name,Product_URL,Product_Section
0,Type III Sherpa Trucker Jacket,https://www.levi.com/US/en_US/p/163650162,levi_clothing_men
1,Stock Trucker Jacket,https://www.levi.com/US/en_US/p/A07300006,levi_clothing_men
2,"469 Loose Jean 12.5"" Men's Shorts (Big & Tall)",https://www.levi.com/US/en_US/p/547910007,levi_clothing_men
3,Relaxed Fit Short Sleeve T-Shirt,https://www.levi.com/US/en_US/p/161430729,levi_clothing_men
4,Levi's® x Vote Relaxed Pullover,https://www.levi.com/US/en_US/p/384790020,levi_clothing_men
...,...,...,...
1274,Baggy Pants,https://www.levi.com/US/en_US/p/A46740005,levi_clothing_women
1275,Gold Tab™ Tank Top,https://www.levi.com/US/en_US/p/A37150011,levi_clothing_women
1276,Slacker Trench Coat,https://www.levi.com/US/en_US/p/A44450000,levi_clothing_women
1277,Onion Quilted Liner Jacket,https://www.levi.com/US/en_US/p/595400012,levi_clothing_women


# Common variables for each experiment

In [None]:
# Load the product listing from disk for the timing experiments below.
# NOTE(review): no visible cell writes products_levi.csv - presumably it is
# the table built above, saved in an earlier session; confirm.
all_products_df = pd.read_csv("products_levi.csv")
all_products_df

In [None]:
# Number of products each timing experiment processes. It is defined before
# its first use so the notebook also runs from top to bottom.
prod_num = 50

# Plain tuples (index, *columns) for the first prod_num products.
products_iter = list(all_products_df.itertuples(name=None))[:prod_num]

# Multiprocessing + multithreading

In [None]:
def get_images(df_row, local):
  """Scrape the image URLs of every colour swatch of one product.

  This variant takes its browser from ``local.browser``.

  Args:
    df_row: Indexable product record; items 1, 2 and 3 are read as the product
      name, the product URL and the section.
    local: Object whose ``browser`` attribute is the WebDriver to use.

  Returns:
    A 3-tuple ``(columns, error_log_products, error_log_swatches)``.
    ``columns`` is the list ``[Product_Name, Product_URL,
    Product_original_URL, Product_ID, Product_Color, Product_Categories,
    Image_URL]`` of parallel lists with one entry per image. The two error
    logs are lists of dicts describing a failed product or a failed swatch.
    Exceptions raised while scraping are caught and logged, not re-raised.
  """
  browser = local.browser

  Product_Name = []
  Product_URL = []
  Product_original_URL = []
  Product_ID = []
  Product_Color = []
  Product_Categories = []
  Image_URL = []

  error_log_products = []
  error_log_swatches = []

  cur_name = df_row[1]
  cur_url = df_row[2]
  cur_section = df_row[3]

  try:
    print(f"Current product: {cur_name}, url: {cur_url}")
    browser.get(cur_url)

    # Path components of the product URL; the last one is replaced by each
    # swatch code further down.
    url_template = cur_url.split("/")

    WebDriverWait(browser, 30).until(ele_present((By.CSS_SELECTOR, "div[class='lsco-row product-details__top']")))
    product_details = try_to_get_ele(browser, css_selector="div[class='lsco-row product-details__top']")

    if "firefox" in browser.capabilities["browserName"]:
      scroll_to_ele(browser, product_details)

    every_prod_swatch = WebDriverWait(product_details, 30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,
                                                                                                      "li[class*='swatch swatch-wrapper']")))
    # Read code and colour label of every swatch. If an element goes stale
    # mid-way, look the swatch list up again and restart from scratch,
    # for at most about 5 seconds of sleeping.
    success = False
    try_n = 0
    sleep_timer = 0.2
    max_retries = int(5 / sleep_timer)
    while (not success) and (try_n < max_retries):
      try:
        swatch_codes = []
        swatch_colors = []
        for swatch in every_prod_swatch:
          swatch_border = swatch.find_element(By.TAG_NAME, "button")
          swatch_codes.append(swatch.get_attribute("code"))
          swatch_colors.append(swatch_border.get_attribute("aria-label"))
        success = True
      except StaleElementReferenceException:
        time.sleep(sleep_timer)
        every_prod_swatch = product_details.find_elements(By.CSS_SELECTOR, "li[class*='swatch swatch-wrapper']")
        try_n += 1

    if try_n == max_retries:
      raise TimeoutError(f"Failed to get swatch codes list")

    # Visit every swatch on its own URL. A failure in one swatch is logged and
    # does not stop the remaining swatches.
    for swatch_code, swatch_color in zip(swatch_codes, swatch_colors):
      try:
        url = url_template.copy()
        url[-1] = f"{swatch_code}#swatch"
        swatch_url = "/".join(url)

        # Load the swatch page, retrying up to 5 times when the details
        # container does not show up within 15 seconds.
        max_retries = 5
        try_n = 0
        success = False
        while (not success) and (try_n < max_retries):
          try:
            browser.get(swatch_url)
            browser.refresh()

            WebDriverWait(browser, 15).until(ele_present((By.CSS_SELECTOR, "div[class='lsco-row product-details__top']")))
            product_details = try_to_get_ele(browser, css_selector="div[class='lsco-row product-details__top']")
            success = True
          except TimeoutException:
            try_n += 1

        if try_n == max_retries:
          raise TimeoutError(f"Failed to get swatch")

        if "firefox" in browser.capabilities["browserName"]:
          scroll_to_ele(browser, product_details)

        # Breadcrumb labels: the last one is stored as the product name, the
        # preceding ones as its categories.
        product_details_breadcrumbs = try_to_get_ele(browser, css_selector="div[class='lsco-row product-details__breadcrumbs']")
        WebDriverWait(product_details_breadcrumbs, 30).until(ele_present((By.CSS_SELECTOR, "li[class='pdp-breadcrumbs__list-item'][aria-current='page']")))
        prod_categories_elements = product_details_breadcrumbs.find_elements(By.CSS_SELECTOR, "li[class='pdp-breadcrumbs__list-item']")
        prod_categories = [ele.get_attribute("aria-label") for ele in prod_categories_elements]

        # Collect thumbnail URLs (a set, so repeats are dropped) and keep
        # clicking the "next" button until it is no longer found.
        img_urls = set()
        while True:
          every_prod_thumbnail = product_details.find_elements(By.CSS_SELECTOR, "button[class*='product-media-thumbnail']")
          for prod_thumbnail in every_prod_thumbnail:
            # Thumbnails containing an <svg> are treated as videos and skipped.
            is_video = prod_thumbnail.find_elements(By.TAG_NAME, "svg")
            if not is_video:
              img_url = prod_thumbnail.find_element(By.TAG_NAME, "img").get_attribute("data-src")
              img_urls.add(img_url)
          next_btn = product_details.find_elements(By.CSS_SELECTOR, "button[class='view-next-btn']")
          if next_btn:
            browser.execute_script("arguments[0].click();", next_btn[0])
          else:
            break

        for img_url in img_urls:
          # Rewrite the thumbnail URL: 900x982 instead of 150x150, JPEG
          # instead of AVIF.
          img_url = img_url.replace("wid=150&hei=150", "wid=900&hei=982")
          img_url = img_url.replace("fmt=avif", "fmt=jpg")

          Product_Name.append(prod_categories[-1])
          Product_URL.append(swatch_url)
          Product_original_URL.append(cur_url)
          Product_ID.append(swatch_code)
          Product_Color.append(swatch_color)
          Product_Categories.append(", ".join(prod_categories[:-1]))
          Image_URL.append(img_url)

      except Exception as e:
        # Swatch-level failure: record it and continue with the next swatch.
        print(f"Failed to get swatch {swatch_url} for product {cur_url}, skipping")
        print(e.__class__.__name__ )
        error = {}
        error["Product_URL"] = cur_url
        error["Product_Swatch_URL"] = swatch_url
        error["Product_Section"] = cur_section
        error["Error_Type"] = e.__class__.__name__
        error["Error_Message"] = e
        error_log_swatches.append(error)
  except Exception as e:
      # Product-level failure: images collected so far are still returned.
      print(f"Failed to get product {cur_url}, skipping")
      print(e.__class__.__name__ )
      error = {}
      error["Product_URL"] = cur_url
      error["Product_Section"] = cur_section
      error["Error_Type"] = e.__class__.__name__
      error["Error_Message"] = e
      error_log_products.append(error)
  return [Product_Name, Product_URL, Product_original_URL, Product_ID, Product_Color, Product_Categories, Image_URL], error_log_products, error_log_swatches

In [None]:
def initialize_thread_worker(local, driver_path, created_browsers=None):
  """Start a headless Firefox for the calling thread and store it on ``local``.

  Used as the ``initializer`` of the thread pool in
  ``get_images_with_threads``.

  Args:
    local: ``threading.local`` instance; the browser is stored as
      ``local.browser`` so that each thread sees its own browser.
    driver_path: Path of the geckodriver executable handed to ``Service``.
    created_browsers: Optional list. When given, the new browser is appended
      to it so the caller can quit every browser once the pool is done.
  """
  options = webdriver.FirefoxOptions()
  options.add_argument("start-maximized")
  options.add_argument("--headless")

  driver_service = Service(driver_path)

  browser = webdriver.Firefox(options=options, service=driver_service)
  browser.command_executor.set_timeout(30)
  local.browser = browser
  if created_browsers is not None:
    created_browsers.append(browser)


def get_images_with_threads(split_prod_list, driver_path):
  """Scrape a chunk of products with a pool of six browser threads.

  Args:
    split_prod_list: Chunk of product records; must provide ``tolist()``
      (the notebook passes one piece of ``np.array_split``).
    driver_path: Path of the geckodriver executable.

  Returns:
    A 3-tuple ``(columns, error_log_products, error_log_swatches)`` with the
    same layout as the return value of ``get_images``, merged over all
    products of the chunk.
  """
  split_prod_list = split_prod_list.tolist()

  local = threading.local()
  created_browsers = []
  try:
    with ThreadPoolExecutor(max_workers=6, initializer=initialize_thread_worker,
                            initargs=(local, driver_path, created_browsers)) as pool:
      # Collect the results before the browsers are shut down below.
      results = list(pool.map(get_images, split_prod_list, repeat(local)))
  finally:
    # Every worker thread started its own Firefox; quit them all so no
    # browser process outlives this call.
    for worker_browser in created_browsers:
      try:
        worker_browser.quit()
      except Exception as quit_error:
        # Best-effort cleanup: report the problem, keep quitting the rest.
        print(f"Failed to quit a worker browser: {quit_error.__class__.__name__}")

  # Seven parallel column lists, in the order returned by get_images.
  columns = [[] for _ in range(7)]
  error_log_products = []
  error_log_swatches = []

  for product_columns, product_errors, swatch_errors in results:
    for merged, part in zip(columns, product_columns):
      merged.extend(part)

    error_log_products.extend(product_errors)
    error_log_swatches.extend(swatch_errors)
  return columns, error_log_products, error_log_swatches

In [None]:
# Start the wall-clock timer for the multiprocessing + multithreading run.
start = datetime.now()
print("Start:", f"{start:%H:%M:%S}")

Start: 16:00:54


In [None]:
# One chunk of products per CPU.
split_prod_list = np.array_split(products_iter, os.cpu_count())

# Pool.map accepts a single iterable (its third positional parameter is the
# chunk size), so each chunk is paired with the driver path and starmap
# unpacks the pair into the two arguments of get_images_with_threads.
with mp.Pool() as p:
    results = p.starmap(get_images_with_threads, zip(split_prod_list, repeat(driver_path)))

Current product: Classic Western Shirt (Big), url: https://www.levi.com/US/en_US/p/574230007
Current product: 501® Slim Taper Fit Men's Jeans, url: https://www.levi.com/US/en_US/p/288940241
Current product: Short Sleeve Slouchy Button Up Shirt, url: https://www.levi.com/US/en_US/p/A19210001
Current product: Type III Sherpa Trucker Jacket, url: https://www.levi.com/US/en_US/p/163650162
Current product: Stock Trucker Jacket, url: https://www.levi.com/US/en_US/p/A07300006
Current product: Stay Loose Pocket T-Shirt, url: https://www.levi.com/US/en_US/p/A40770000
Current product: 469 Loose Jean 12.5" Men's Shorts (Big & Tall), url: https://www.levi.com/US/en_US/p/547910007
Current product: Relaxed Fit Short Sleeve T-Shirt, url: https://www.levi.com/US/en_US/p/161430729
Current product: Trucker Jacket, url: https://www.levi.com/US/en_US/p/A31790001
Current product: Levi's® x Vote Relaxed Pullover, url: https://www.levi.com/US/en_US/p/384790020
Current product: Portola Chore Coat, url: https:

In [None]:
# Stop the timer and report the elapsed time without fractional seconds.
finish = datetime.now()
print("Finish:", f"{finish:%H:%M:%S}")
print()
print("Time taken:", str(finish - start).partition(".")[0])

Finish: 17:49:05

Time taken: 1:48:10


In [None]:
# Merge the per-chunk results into flat, parallel column lists.
Product_Name, Product_URL, Product_original_URL = [], [], []
Product_ID, Product_Color, Product_Categories, Image_URL = [], [], [], []

error_log_products, error_log_swatches = [], []

merged_columns = (Product_Name, Product_URL, Product_original_URL, Product_ID,
                  Product_Color, Product_Categories, Image_URL)
for subtask_result in results:
  # subtask_result[0] holds the seven columns in the order listed above.
  for merged, part in zip(merged_columns, subtask_result[0]):
    merged.extend(part)

  error_log_products.extend(subtask_result[1])
  error_log_swatches.extend(subtask_result[2])

In [None]:
# Summary of the multiprocessing + multithreading run.
print(f"Images snatched total: {len(Image_URL)}")
print("Errors encountered during product scrapping:", len(error_log_products))
print("Errors encountered during color swatch scrapping:", len(error_log_swatches))

Images snatched total: 264
Errors encountered during product scrapping: 1
Errors encountered during color swatch scrapping: 0


In [None]:
# products_dict_mp = {
#     "Product_Name": Product_Name,
#     "Product_URL": Product_URL,
#     "Product_original_URL": Product_original_URL,
#     "Product_ID": Product_ID,
#     "Product_Color": Product_Color,
#     "Product_Categories": Product_Categories,
#     "Image_URL": Image_URL
#     }

In [None]:
# df_mp = pd.DataFrame.from_dict(products_dict_mp)
# df_mp

In [None]:
# df_mp.groupby(by=["Product_original_URL"]).ngroups

# Multiprocessing

In [None]:
def get_images(df_row):
  """Scrape the image URLs of every colour swatch of one product.

  This variant uses the module-global ``browser``. In the multiprocessing
  cell that global is created per worker by ``setup_worker_browser``.

  Args:
    df_row: Indexable product record; items 1, 2 and 3 are read as the product
      name, the product URL and the section.

  Returns:
    A 3-tuple ``(columns, error_log_products, error_log_swatches)``.
    ``columns`` is the list ``[Product_Name, Product_URL,
    Product_original_URL, Product_ID, Product_Color, Product_Categories,
    Image_URL]`` of parallel lists with one entry per image. The two error
    logs are lists of dicts describing a failed product or a failed swatch.
    Exceptions raised while scraping are caught and logged, not re-raised.
  """
  Product_Name = []
  Product_URL = []
  Product_original_URL = []
  Product_ID = []
  Product_Color = []
  Product_Categories = []
  Image_URL = []

  error_log_products = []
  error_log_swatches = []

  cur_name = df_row[1]
  cur_url = df_row[2]
  cur_section = df_row[3]

  try:
    print(f"Current product: {cur_name}, url: {cur_url}")
    browser.get(cur_url)

    # Path components of the product URL; the last one is replaced by each
    # swatch code further down.
    url_template = cur_url.split("/")

    WebDriverWait(browser, 30).until(ele_present((By.CSS_SELECTOR, "div[class='lsco-row product-details__top']")))
    product_details = try_to_get_ele(browser, css_selector="div[class='lsco-row product-details__top']")

    if "firefox" in browser.capabilities["browserName"]:
      scroll_to_ele(browser, product_details)

    every_prod_swatch = WebDriverWait(product_details, 30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,
                                                                                                      "li[class*='swatch swatch-wrapper']")))
    # Read code and colour label of every swatch. If an element goes stale
    # mid-way, look the swatch list up again and restart from scratch,
    # for at most about 5 seconds of sleeping.
    success = False
    try_n = 0
    sleep_timer = 0.2
    max_retries = int(5 / sleep_timer)
    while (not success) and (try_n < max_retries):
      try:
        swatch_codes = []
        swatch_colors = []
        for swatch in every_prod_swatch:
          swatch_border = swatch.find_element(By.TAG_NAME, "button")
          swatch_codes.append(swatch.get_attribute("code"))
          swatch_colors.append(swatch_border.get_attribute("aria-label"))
        success = True
      except StaleElementReferenceException:
        time.sleep(sleep_timer)
        every_prod_swatch = product_details.find_elements(By.CSS_SELECTOR, "li[class*='swatch swatch-wrapper']")
        try_n += 1

    if try_n == max_retries:
      raise TimeoutError(f"Failed to get swatch codes list")

    # Visit every swatch on its own URL. A failure in one swatch is logged and
    # does not stop the remaining swatches.
    for swatch_code, swatch_color in zip(swatch_codes, swatch_colors):
      try:
        url = url_template.copy()
        url[-1] = f"{swatch_code}#swatch"
        swatch_url = "/".join(url)

        # Load the swatch page, retrying up to 5 times when the details
        # container does not show up within 15 seconds.
        max_retries = 5
        try_n = 0
        success = False
        while (not success) and (try_n < max_retries):
          try:
            browser.get(swatch_url)
            browser.refresh()

            WebDriverWait(browser, 15).until(ele_present((By.CSS_SELECTOR, "div[class='lsco-row product-details__top']")))
            product_details = try_to_get_ele(browser, css_selector="div[class='lsco-row product-details__top']")
            success = True
          except TimeoutException:
            try_n += 1

        if try_n == max_retries:
          raise TimeoutError(f"Failed to get swatch")

        if "firefox" in browser.capabilities["browserName"]:
          scroll_to_ele(browser, product_details)

        # Breadcrumb labels: the last one is stored as the product name, the
        # preceding ones as its categories.
        product_details_breadcrumbs = try_to_get_ele(browser, css_selector="div[class='lsco-row product-details__breadcrumbs']")
        WebDriverWait(product_details_breadcrumbs, 30).until(ele_present((By.CSS_SELECTOR, "li[class='pdp-breadcrumbs__list-item'][aria-current='page']")))
        prod_categories_elements = product_details_breadcrumbs.find_elements(By.CSS_SELECTOR, "li[class='pdp-breadcrumbs__list-item']")
        prod_categories = [ele.get_attribute("aria-label") for ele in prod_categories_elements]

        # Collect thumbnail URLs (a set, so repeats are dropped) and keep
        # clicking the "next" button until it is no longer found.
        img_urls = set()
        while True:
          every_prod_thumbnail = product_details.find_elements(By.CSS_SELECTOR, "button[class*='product-media-thumbnail']")
          for prod_thumbnail in every_prod_thumbnail:
            # Thumbnails containing an <svg> are treated as videos and skipped.
            is_video = prod_thumbnail.find_elements(By.TAG_NAME, "svg")
            if not is_video:
              img_url = prod_thumbnail.find_element(By.TAG_NAME, "img").get_attribute("data-src")
              img_urls.add(img_url)
          next_btn = product_details.find_elements(By.CSS_SELECTOR, "button[class='view-next-btn']")
          if next_btn:
            browser.execute_script("arguments[0].click();", next_btn[0])
          else:
            break

        for img_url in img_urls:
          # Rewrite the thumbnail URL: 900x982 instead of 150x150, JPEG
          # instead of AVIF.
          img_url = img_url.replace("wid=150&hei=150", "wid=900&hei=982")
          img_url = img_url.replace("fmt=avif", "fmt=jpg")

          Product_Name.append(prod_categories[-1])
          Product_URL.append(swatch_url)
          Product_original_URL.append(cur_url)
          Product_ID.append(swatch_code)
          Product_Color.append(swatch_color)
          Product_Categories.append(", ".join(prod_categories[:-1]))
          Image_URL.append(img_url)

      except Exception as e:
        # Swatch-level failure: record it and continue with the next swatch.
        print(f"Failed to get swatch {swatch_url} for product {cur_url}, skipping")
        print(e.__class__.__name__ )
        error = {}
        error["Product_URL"] = cur_url
        error["Product_Swatch_URL"] = swatch_url
        error["Product_Section"] = cur_section
        error["Error_Type"] = e.__class__.__name__
        error["Error_Message"] = e
        error_log_swatches.append(error)
  except Exception as e:
      # Product-level failure: images collected so far are still returned.
      print(f"Failed to get product {cur_url}, skipping")
      print(e.__class__.__name__ )
      error = {}
      error["Product_URL"] = cur_url
      error["Product_Section"] = cur_section
      error["Error_Type"] = e.__class__.__name__
      error["Error_Message"] = e
      error_log_products.append(error)
  return [Product_Name, Product_URL, Product_original_URL, Product_ID, Product_Color, Product_Categories, Image_URL], error_log_products, error_log_swatches

In [None]:
# Start the wall-clock timer for the multiprocessing run.
start = datetime.now()
print("Start:", f"{start:%H:%M:%S}")

Start: 19:58:12


In [None]:
# Each pool worker creates its own browser through the initializer; the
# products are then mapped across the workers one record at a time.
with mp.Pool(initializer=setup_worker_browser, initargs=(driver_path,)) as pool:
    results = pool.map(get_images, products_iter)

Current product: Type III Sherpa Trucker Jacket, url: https://www.levi.com/US/en_US/p/163650162
Current product: Varsity Relaxed Raglan Crewneck Sweatshirt, url: https://www.levi.com/US/en_US/p/A33140002
Current product: Union Rugby Shirt, url: https://www.levi.com/US/en_US/p/A48450004
Current product: Stock Trucker Jacket, url: https://www.levi.com/US/en_US/p/A07300006
Current product: Chenango Reversible Belt, url: https://www.levi.com/US/en_US/p/380160352
Current product: 469 Loose Jean 12.5" Men's Shorts (Big & Tall), url: https://www.levi.com/US/en_US/p/547910007
Current product: Sportswear Logo Tee Shirt, url: https://www.levi.com/US/en_US/p/396360002
Current product: Relaxed Fit Short Sleeve T-Shirt, url: https://www.levi.com/US/en_US/p/161430729
Current product: Relaxed Pocket Tee, url: https://www.levi.com/US/en_US/p/343100026
Current product: Levi's® x Vote Relaxed Pullover, url: https://www.levi.com/US/en_US/p/384790020


In [None]:
# Stop the timer and report the elapsed time without fractional seconds.
finish = datetime.now()
print("Finish:", f"{finish:%H:%M:%S}")
print()
print("Time taken:", str(finish - start).partition(".")[0])

In [None]:
# Merge the per-product results into flat, parallel column lists.
Product_Name, Product_URL, Product_original_URL = [], [], []
Product_ID, Product_Color, Product_Categories, Image_URL = [], [], [], []

error_log_products, error_log_swatches = [], []

merged_columns = (Product_Name, Product_URL, Product_original_URL, Product_ID,
                  Product_Color, Product_Categories, Image_URL)
for subtask_result in results:
  # subtask_result[0] holds the seven columns in the order listed above.
  for merged, part in zip(merged_columns, subtask_result[0]):
    merged.extend(part)

  error_log_products.extend(subtask_result[1])
  error_log_swatches.extend(subtask_result[2])

In [None]:
# Summary of the multiprocessing run.
print(f"Images snatched total: {len(Image_URL)}")
print("Errors encountered during product scrapping:", len(error_log_products))
print("Errors encountered during color swatch scrapping:", len(error_log_swatches))

# Single process

In [None]:
# Start the wall-clock timer for the single-process run.
start = datetime.now()
print("Start:", f"{start:%H:%M:%S}")

Start: 19:26:39


In [None]:
# One headless Firefox, shared by the whole single-process run.
options = webdriver.FirefoxOptions()
for flag in ("start-maximized", "--headless"):
  options.add_argument(flag)

driver_service = Service(driver_path)

browser = webdriver.Firefox(service=driver_service, options=options)
# Limit how long a single WebDriver command may take.
browser.command_executor.set_timeout(30)

In [None]:
# Accumulators: parallel lists with one entry per scraped image, plus one
# error log for failed products and one for failed swatches.
Product_Name = []
Product_URL = []
Product_original_URL = []
Product_ID = []
Product_Color = []
Product_Categories = []
Image_URL = []

error_log_products = []
error_log_swatches = []

for df_row in products_iter:
  cur_name = df_row[1]
  cur_url = df_row[2]
  cur_section = df_row[3]

  try:
    print(f"Current product: {cur_name}, url: {cur_url}")
    browser.get(cur_url)

    # Path components of the product URL; the last one is replaced by each
    # swatch code further down.
    url_template = cur_url.split("/")

    WebDriverWait(browser, 30).until(ele_present((By.CSS_SELECTOR, "div[class='lsco-row product-details__top']")))
    product_details = try_to_get_ele(browser, css_selector="div[class='lsco-row product-details__top']")

    if "firefox" in browser.capabilities["browserName"]:
      scroll_to_ele(browser, product_details)

    every_prod_swatch = WebDriverWait(product_details, 30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,
                                                                                                      "li[class*='swatch swatch-wrapper']")))
    # Read code and colour label of every swatch. If an element goes stale
    # mid-way, look the swatch list up again and restart from scratch,
    # for at most about 5 seconds of sleeping.
    success = False
    try_n = 0
    sleep_timer = 0.2
    max_retries = int(5 / sleep_timer)
    while (not success) and (try_n < max_retries):
      try:
        swatch_codes = []
        swatch_colors = []
        for swatch in every_prod_swatch:
          swatch_border = swatch.find_element(By.TAG_NAME, "button")
          swatch_codes.append(swatch.get_attribute("code"))
          swatch_colors.append(swatch_border.get_attribute("aria-label"))
        success = True
      except StaleElementReferenceException:
        time.sleep(sleep_timer)
        every_prod_swatch = product_details.find_elements(By.CSS_SELECTOR, "li[class*='swatch swatch-wrapper']")
        try_n += 1

    if try_n == max_retries:
      raise TimeoutError(f"Failed to get swatch codes list")

    # Visit every swatch on its own URL. A failure in one swatch is logged and
    # does not stop the remaining swatches.
    for swatch_code, swatch_color in zip(swatch_codes, swatch_colors):
      try:
        url = url_template.copy()
        url[-1] = f"{swatch_code}#swatch"
        swatch_url = "/".join(url)

        # Load the swatch page, retrying up to 5 times when the details
        # container does not show up within 15 seconds.
        max_retries = 5
        try_n = 0
        success = False
        while (not success) and (try_n < max_retries):
          try:
            browser.get(swatch_url)
            browser.refresh()

            WebDriverWait(browser, 15).until(ele_present((By.CSS_SELECTOR, "div[class='lsco-row product-details__top']")))
            product_details = try_to_get_ele(browser, css_selector="div[class='lsco-row product-details__top']")
            success = True
          except TimeoutException:
            try_n += 1

        if try_n == max_retries:
          raise TimeoutError(f"Failed to get swatch")

        if "firefox" in browser.capabilities["browserName"]:
          scroll_to_ele(browser, product_details)

        # Breadcrumb labels: the last one is stored as the product name, the
        # preceding ones as its categories.
        product_details_breadcrumbs = try_to_get_ele(browser, css_selector="div[class='lsco-row product-details__breadcrumbs']")
        WebDriverWait(product_details_breadcrumbs, 30).until(ele_present((By.CSS_SELECTOR, "li[class='pdp-breadcrumbs__list-item'][aria-current='page']")))
        prod_categories_elements = product_details_breadcrumbs.find_elements(By.CSS_SELECTOR, "li[class='pdp-breadcrumbs__list-item']")
        prod_categories = [ele.get_attribute("aria-label") for ele in prod_categories_elements]

        # Collect thumbnail URLs (a set, so repeats are dropped) and keep
        # clicking the "next" button until it is no longer found.
        img_urls = set()
        while True:
          every_prod_thumbnail = product_details.find_elements(By.CSS_SELECTOR, "button[class*='product-media-thumbnail']")
          for prod_thumbnail in every_prod_thumbnail:
            # Thumbnails containing an <svg> are treated as videos and skipped.
            is_video = prod_thumbnail.find_elements(By.TAG_NAME, "svg")
            if not is_video:
              img_url = prod_thumbnail.find_element(By.TAG_NAME, "img").get_attribute("data-src")
              img_urls.add(img_url)
          next_btn = product_details.find_elements(By.CSS_SELECTOR, "button[class='view-next-btn']")
          if next_btn:
            browser.execute_script("arguments[0].click();", next_btn[0])
          else:
            break

        for img_url in img_urls:
          # Rewrite the thumbnail URL: 900x982 instead of 150x150, JPEG
          # instead of AVIF.
          img_url = img_url.replace("wid=150&hei=150", "wid=900&hei=982")
          img_url = img_url.replace("fmt=avif", "fmt=jpg")

          Product_Name.append(prod_categories[-1])
          Product_URL.append(swatch_url)
          Product_original_URL.append(cur_url)
          Product_ID.append(swatch_code)
          Product_Color.append(swatch_color)
          Product_Categories.append(", ".join(prod_categories[:-1]))
          Image_URL.append(img_url)

      except Exception as e:
        # Swatch-level failure: record it and continue with the next swatch.
        print(f"Failed to get swatch {swatch_url} for product {cur_url}, skipping")
        print(e.__class__.__name__ )
        error = {}
        error["Product_URL"] = cur_url
        error["Product_Swatch_URL"] = swatch_url
        error["Product_Section"] = cur_section
        error["Error_Type"] = e.__class__.__name__
        error["Error_Message"] = e
        error_log_swatches.append(error)
  except Exception as e:
    # Product-level failure: record it and continue with the next product.
    print(f"Failed to get product {cur_url}, skipping")
    print(e.__class__.__name__ )
    error = {}
    error["Product_URL"] = cur_url
    error["Product_Section"] = cur_section
    error["Error_Type"] = e.__class__.__name__
    error["Error_Message"] = e
    error_log_products.append(error)

  # Nothing to merge here: this loop appends straight to the accumulators.
  # Extending them from a `subtask_result` left over from another cell would
  # re-add that stale result once per product and inflate every count.

Current product: Type III Sherpa Trucker Jacket, url: https://www.levi.com/US/en_US/p/163650162
Current product: Stock Trucker Jacket, url: https://www.levi.com/US/en_US/p/A07300006
Current product: 469 Loose Jean 12.5" Men's Shorts (Big & Tall), url: https://www.levi.com/US/en_US/p/547910007
Current product: Relaxed Fit Short Sleeve T-Shirt, url: https://www.levi.com/US/en_US/p/161430729
Failed to get product https://www.levi.com/US/en_US/p/161430729, skipping
TimeoutException
Current product: Levi's® x Vote Relaxed Pullover, url: https://www.levi.com/US/en_US/p/384790020
Current product: Portola Chore Coat, url: https://www.levi.com/US/en_US/p/A06810002
Current product: Bartlett Utility Jacket, url: https://www.levi.com/US/en_US/p/A32080000
Current product: Varsity Relaxed Raglan Crewneck Sweatshirt, url: https://www.levi.com/US/en_US/p/A33140002
Current product: Union Rugby Shirt, url: https://www.levi.com/US/en_US/p/A48450004
Current product: Chenango Reversible Belt, url: https://

In [None]:
# Shut down the browser used by the single-process run.
browser.quit()



In [None]:
# Stop the timer and report the elapsed time without fractional seconds.
finish = datetime.now()
print("Finish:", f"{finish:%H:%M:%S}")
print()
print("Time taken:", str(finish - start).partition(".")[0])

Finish: 19:48:02

Time taken: 0:21:22


In [None]:
# Summary of the single-process run.
print(f"Images snatched total: {len(Image_URL)}")
print("Errors encountered during product scrapping:", len(error_log_products))
print("Errors encountered during color swatch scrapping:", len(error_log_swatches))

Images snatched total: 374
Errors encountered during product scrapping: 3
Errors encountered during color swatch scrapping: 0
