In [1]:
import requests
from bs4 import BeautifulSoup

# Scrape brand links from Sephora's brand list website 

In [2]:
# Download the brand-list page and keep its HTML as a string.
band_lst_link = "https://www.sephora.com/brands-list"
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(band_lst_link, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page
# and writing an empty brand list to disk.
response.raise_for_status()
html = response.text

# Parse the page with BeautifulSoup.
soup = BeautifulSoup(html, 'html.parser')

# Build one "all products" URL per brand entry found on the page.
# NOTE(review): "css-1hhsxaa" looks like a generated class name that may
# change when the site is redeployed -- confirm it still matches.
brand_link_lst = []
for brand in soup.find_all('li', class_="css-1hhsxaa"):
    anchor = brand.a
    # Skip list items that carry no usable link rather than raising.
    if anchor is None or not anchor.get('href'):
        continue
    brand_link_lst.append("https://www.sephora.com" +
                          anchor['href'] + "/all?pageSize=300")

# Write the brand links to a file, one per line.
with open('brand_link.txt', 'w') as f:
    for item in brand_link_lst:
        f.write(f"{item}\n")

## How many brands on Sephora's website?

In [3]:
# Report how many brand pages were collected.
print(f'There are {len(brand_link_lst)} brands in total')

# The count is the number of requests needed to scrape all the product links.
# The result shows 300+ brands. Normally, this will not exceed the number of
# requests that Sephora allows per day, so all the product links can be
# scraped in one pass.

There are 345 brands in total


'It indicates how many requests we need to do to scape all the product links. \nThe result shows 300+ brands. Normally, this will not exceed the request times that Sephora allows per day. \nTherefore, we could scrape all the product links at one time.'

# Scrape product links from each brand link

In [4]:
def scape_product(brand_link, timeout=30):
    """Collect the Sephora product links found on one brand page.

    Parameters
    ----------
    brand_link : str
        URL of the brand page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list of str
        One absolute product URL per matching anchor, in page order.
        Empty when the page contains no matching anchor.

    Raises
    ------
    requests.exceptions.RequestException
        If the request cannot be completed or times out.

    Notes
    -----
    Only anchors present in the HTML returned by the server are seen;
    anything the page would add later in a browser is not visible here.
    HTTP error statuses are not raised: an error page simply yields
    whatever matching anchors it contains (typically none).
    """
    # Without a timeout the request could block indefinitely.
    response = requests.get(brand_link, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    product_link_lst = []
    for product in soup.find_all('a', class_="css-ix8km1"):
        # split() keeps only the part before the first whitespace,
        # dropping trailing text like "grid p12345".
        href_parts = product.get('href', '').split()
        # Ignore anchors with a missing or blank href instead of raising.
        if not href_parts:
            continue
        product_link_lst.append("https://www.sephora.com" + href_parts[0])
    return product_link_lst

In [5]:
# Visit every brand page and record its product links, keyed by brand URL.
# One request is made per brand, so this cell is slow.
product_link_dic = dict()
for brand_url in brand_link_lst:
    product_link_dic[brand_url] = scape_product(brand_url)

In [6]:
# Show how many product links were collected for each brand page.
for brand_url, product_links in product_link_dic.items():
    print(brand_url, len(product_links))

https://www.sephora.com/brand/acqua-di-parma/all?pageSize=300 12
https://www.sephora.com/brand/aerin-perfume/all?pageSize=300 12
https://www.sephora.com/brand/aether-beauty/all?pageSize=300 4
https://www.sephora.com/brand/algenist/all?pageSize=300 12
https://www.sephora.com/brand/alpha-h/all?pageSize=300 12
https://www.sephora.com/brand/alterna/all?pageSize=300 12
https://www.sephora.com/brand/amazing-cosmetics/all?pageSize=300 0
https://www.sephora.com/brand/amika/all?pageSize=300 12
https://www.sephora.com/brand/amorepacific/all?pageSize=300 12
https://www.sephora.com/brand/anastasia-beverly-hills/all?pageSize=300 12
https://www.sephora.com/brand/anthony-logistics-for-men/all?pageSize=300 12
https://www.sephora.com/brand/antonym/all?pageSize=300 6
https://www.sephora.com/brand/apivita/all?pageSize=300 0
https://www.sephora.com/brand/aquis/all?pageSize=300 10
https://www.sephora.com/brand/giorgio-armani-beauty/all?pageSize=300 12
https://www.sephora.com/brand/the-art-of-shaving/all?pa

**There are some brands with no products, so we remove these brands from our brand list.**

In [7]:
# Keep only the brand pages that returned at least one product link.
product_link_dic_new = dict(
    filter(lambda entry: entry[1], product_link_dic.items()))

# Merge the per-brand lists into one flat list of product links.
flat_product_links = [url
                      for brand_urls in product_link_dic_new.values()
                      for url in brand_urls]
print(f'There are {len(flat_product_links)} products in total')

There are 3093 products in total


In [8]:
# Save every product link to product_link.txt, one link per line.
with open('product_link.txt', 'w') as f:
    f.writelines(f"{item}\n" for item in flat_product_links)