## Imports

In [79]:
# Standard library imports
import csv
import http.client
import logging
import random
import re
import sys
import time
from datetime import datetime
from urllib.request import Request, urlopen

# Related third party imports
import numpy as np
import requests
from bs4 import BeautifulSoup
import extruct
from extruct.microformat import MicroformatExtractor
from selenium import webdriver
from w3lib.html import get_base_url
from random_user_agent.user_agent import UserAgent
from random_user_agent.params import SoftwareName, OperatingSystem

# Local application/library specific imports
# from utilities.scrapping_utility import get_user_agent

## Global Variables

In [43]:
ZARA_BASE_URL = "https://www.zara.com/us/"

# set user agent to imitate browser requests
STOPWORDS = {
    "+ info",
    "about",
    r"gift.*",
    "suppliers",
    "vr store",
    "press",
    r"news.*",
    "movie",
    "careers",
    "annual report",
    "company",
    "contact us",
    "help",
    "stores",
    "zaraseries",
    "stories",
    "zaratribute",
    r"community.*",
    r"join.*",
    "job",
}

## Functions

### Webdriver and Scrapping Utilities

In [64]:
def get_user_agent() -> str:
    """
    Defines a random user agent string for web scraping purposes.

    Returns:
        str: A randomly selected user agent string.
    """

    software_names = [
        SoftwareName.CHROME.value,
        SoftwareName.FIREFOX.value,
        SoftwareName.EDGE.value,
    ]
    operating_systems = [
        OperatingSystem.WINDOWS.value,
        OperatingSystem.MAC.value,
        OperatingSystem.LINUX.value,
        OperatingSystem.ANDROID.value,
        OperatingSystem.IOS.value,
    ]
    user_agent_rotator = UserAgent(
        software_names=software_names, operating_systems=operating_systems, limit=100
    )
    return user_agent_rotator.get_random_user_agent()


def get_metadata(url: str, metadata_type: str = "all-in-one") -> dict:
    """
    Retrieves the metadata from the specified URL.

    Args:
        url (str): The URL of the webpage.
        metadata_type (str): The type of metadata to extract. Supported values are "all-in-one" and "micro".

    Returns:
        dict: The extracted metadata.

    Raises:
        ValueError: If an unsupported metadata_type is provided.
        requests.exceptions.RequestException: If there is an error while making the network request.
    """
    if metadata_type not in ["all-in-one", "micro"]:
        raise ValueError(f"Unsupported metadata_type: {metadata_type}")

    user_agent = get_user_agent()

    try:
        r = requests.get(url, headers={"User-Agent": user_agent}, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Error while making network request: {e}")
        raise

    base_url = get_base_url(r.text, r.url)
    data_extractors = {
        "all-in-one": lambda: extruct.extract(r.text, base_url=base_url),
        "micro": lambda: MicroformatExtractor().extract(r.text),
    }
    return data_extractors[metadata_type]()


def base_url_request(url: str) -> BeautifulSoup:
    """
    Sends a GET request to the specified URL and returns the response content as a string.

    Args:
        url (str): The URL to send the request to.

    Returns:
        str: The response content as a string.
    """
    user_agent = get_user_agent()
    driver_options = prep_driver_options(user_agent)
    r = requests.get(ZARA_BASE_URL, headers={"User-Agent": user_agent}, timeout=30)
    if response.status_code != http.client.OK:
        logging.error("Error retrieving %s: %s", url, response.status_code)
        sys.exit()
    return BeautifulSoup(r.content.decode("utf-8"), "html.parser")

In [None]:
def prep_driver_options(user_agent: str) -> webdriver.ChromeOptions:
    """
    Prepares and returns a ChromeOptions object with the specified user agent.

    Args:
        user_agent (str): The user agent string to be used by the Chrome driver.

    Returns:
        webdriver.ChromeOptions: A ChromeOptions object with the specified user agent and other default options.
    """
    options = webdriver.ChromeOptions()
    arguments = [
        "--headless",
        "--no-sandbox",
        "--incognito",
        "--start-maximized",
        "--enable-automation",
        "--ignore-certificate-errors",
        "--disable-notifications",
        "--disable-extensions",
        "--disable-infobars",
        f"user-agent={user_agent}",
    ]

    for argument in arguments:
        options.add_argument(argument)

    return options

## Zara scrapping

In [68]:
soup = base_url_request(ZARA_BASE_URL)
if len(soup.find_all("a")) = 0:
    print("No links found")

    # selenium driver

[]

In [80]:
driver = webdriver.Chrome(options=prep_driver_options(get_user_agent()))

driver.get(ZARA_BASE_URL)

In [86]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.proxy import Proxy, ProxyType

options = Options()
options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
)

proxy = Proxy()
proxy.proxy_type = ProxyType.MANUAL
proxy.http_proxy = "ip:port"
proxy.ssl_proxy = "ip:port"

capabilities = webdriver.DesiredCapabilities.CHROME
# proxy.add_to_capabilities(capabilities)

driver = webdriver.Chrome(options=options)

driver.get(ZARA_BASE_URL)

In [69]:
driver = webdriver.Chrome(
    ChromeDriverManager().install(), chrome_options=prep_driver_options(get_user_agent())
)

TypeError: WebDriver.__init__() got an unexpected keyword argument 'chrome_options'

In [33]:
base_url = get_base_url(r.text,r.url)
data_extractors = {
    "all-in-one": lambda: extruct.extract(r.text, base_url=base_url),
    "micro": lambda: MicroformatExtractor().extract(r.text),
}

In [36]:
data_extractors["all-in-one"]()



  self.__doc__ = BeautifulSoup(doc)


{'microdata': [],
 'json-ld': [],
 'opengraph': [],
 'microformat': [],
 'rdfa': [],
 'dublincore': [{'namespaces': {}, 'elements': [], 'terms': []}]}