In [1]:
import requests
import re
from bs4 import BeautifulSoup
import time
import json
import numpy as np
import pandas as pd

In [2]:
def _text_or_na(node):
    """Return ``node.get_text()``, or ``'na'`` when the lookup found nothing."""
    return 'na' if node is None else node.get_text()


def _category_and_price(soup):
    """Return ``(category, price)`` for a parsed product page.

    The ld+json blocks are preferred because they carry the sub-categories and
    the offer price. When they are missing or malformed, fall back to the
    breadcrumb list and the price box; each fallback is 'na' when not found.
    """
    ld_json = soup.find_all(attrs={"type": "application/ld+json"})
    try:
        json_category = json.loads(ld_json[0].get_text())
        price = json.loads(ld_json[1].get_text())['offers'][0]['price']
        category = '/'.join(item['item']['name']
                            for item in json_category['itemListElement'])
        return category, price
    except (IndexError, KeyError, TypeError, ValueError):
        # Fewer than two blocks, invalid JSON, or an unexpected schema.
        pass

    crumbs = [cate.get_text() for cate in soup.find_all('ol', class_='css-1doqpel')]
    # Always hand back a string so the column has one type in the DataFrame.
    category = '/'.join(crumbs) if crumbs else 'na'
    price = _text_or_na(soup.find('div', attrs={"data-comp": "Price Box"}))
    return category, price


def _item_number(size_and_item):
    """Return the item number from the SizeAndItemNumber box, or 'na'."""
    if size_and_item is None:
        return 'na'
    # With a size present the item text is the second child; with no size it
    # is the first child, so try both positions in that order.
    for position in (1, 0):
        try:
            return size_and_item.contents[position].split(' ')[1]
        except (AttributeError, IndexError, TypeError):
            continue
    return 'na'


def _size(soup, size_and_item):
    """Return the size from the SizeAndItemNumber box, else the description span, else 'na'."""
    try:
        return size_and_item.span.contents[0].split('SIZE ')[1]
    except (AttributeError, IndexError, TypeError):
        pass
    try:
        # str() detaches the value from the parse tree.
        return str(soup.find("span", attrs={"class": "css-12wl10d"}).contents[-1])
    except (AttributeError, IndexError):
        return 'na'


def get_data(product_link, timeout=30):
    """Scrape one product page and return its attributes as a flat dict.

    Parameters
    ----------
    product_link : str
        URL of the product page. The product id is taken from the URL itself.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the scraping loop forever.

    Returns
    -------
    dict
        Keys: category, product_name, brand, price, Product_Id, size,
        item_num, love_count, rating, reviews_count, link.
        Any field that cannot be found on the page is the string 'na'.

    Raises
    ------
    requests.RequestException
        On network failure, timeout, or an HTTP error status. An error page is
        reported instead of being parsed into a row of 'na' values.
    """
    response = requests.get(product_link, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    id_match = re.search(R'P[0-9]{3,6}', product_link)
    product_id = id_match.group(0) if id_match else 'na'

    # Brand and product name are the first two <span>s of the title heading.
    brand = prd_name = 'na'
    for heading in soup.find_all('h1', class_='css-a1jw00'):
        names = [span.get_text() for span in heading.find_all('span')]
        if len(names) >= 2:
            brand, prd_name = names[0], names[1]

    category, price = _category_and_price(soup)

    size_and_item = soup.find(attrs={"data-comp": "SizeAndItemNumber Box"})
    item_num = _item_number(size_and_item)
    size = _size(soup, size_and_item)

    love_counts = _text_or_na(
        soup.find('span', attrs={"data-at": "product_love_count"}))

    # Rating and review count are pulled out of the embedded linkJSON blob.
    json_str = str(soup.find(attrs={"id": "linkJSON"}))
    ratings = re.findall(R'\"rating\"\:(.*?)\,', json_str)
    reviews = re.findall(R'\"reviews\"\:(.*?)\,', json_str)
    rating = ratings[0] if ratings else 'na'
    reviews_count = reviews[0] if reviews else 'na'

    return {
        'category': category,
        'product_name': prd_name,
        'brand': brand,
        'price': price,
        'Product_Id': product_id,
        'size': size,
        'item_num': item_num,
        'love_count': love_counts,
        'rating': rating,
        'reviews_count': reviews_count,
        'link': product_link,
    }

In [4]:
#Test: get first 5 product info
frame = []
# The context manager closes the link file even if a request fails midway.
with open('product_link.txt') as products:
    for i, item in enumerate(products):
        if i == 5:
            break
        frame.append(get_data(item.rstrip()))
        # Random pause of up to 3 seconds between requests.
        time.sleep(np.random.random()*3)
# One constructor call over the list of row dicts; rows are indexed 0..n-1.
result = pd.DataFrame(frame)
result

Unnamed: 0,category,product_name,brand,price,Product_Id,size,item_num,love_count,rating,reviews_count,link
0,Fragrance/Value & Gift Sets/Perfume Gift Sets,Blu Mediterraneo MINIATURE Set,Acqua Di Parma,63.0,P443401,5 x 0.16oz/5mL,2218774,2588,4.0,4,https://www.sephora.com/product/blu-mediterran...
1,Fragrance/Men/Cologne,Colonia,Acqua Di Parma,63.0,P163604,0.7 oz/ 20 mL,2044816,2528,4.3067,75,https://www.sephora.com/product/colonia-P16360...
2,Fragrance/Women/Perfume,Mirto di Panarea,Acqua Di Parma,64.0,P307804,2.5 oz/ 74 mL,1417617,2622,4.6364,22,https://www.sephora.com/product/blu-mediterran...
3,Fragrance/Women/Perfume,Rosa Nobile,Acqua Di Parma,68.0,P388670,3.4 oz/ 101 mL,1638832,4569,4.5652,69,https://www.sephora.com/product/rosa-nobile-P3...
4,Fragrance/Value & Gift Sets/Perfume Gift Sets,Colonia Miniature Set,Acqua Di Parma,69.0,P443400,5 x 0.16oz/5mL,2218766,779,3.5,2,https://www.sephora.com/product/colonia-miniat...


In [5]:
## How many products?
# Count the lines of the link file; the `with` block closes the handle afterwards.
with open('product_link.txt') as products:
    count = sum(1 for _ in products)
print('Product counts: ', count)

Product counts:  3091


### There are over 3000 products on the Sephora website.
### But it seems that Sephora limits the number of requests per IP address per day.
### Therefore, we can try to scrape all the product information using different IP addresses or on different days.

In [6]:
## 1st time
# Read every product link once; blank lines are skipped so no empty URL is requested.
with open('product_link.txt') as products:
    product_links = [line.rstrip() for line in products if line.strip()]

frame1 = []
for link in product_links[:200]: ## raise 200 to scrape more products in this run
    print(link)
    frame1.append(get_data(link))
    # Random pause of up to 3 seconds between requests.
    time.sleep(np.random.random()*3)

# One constructor call over the list of row dicts; rows are indexed 0..n-1,
# matching each link's position in product_links.
result1 = pd.DataFrame(frame1)
result1

https://www.sephora.com/product/blu-mediterraneo-minature-set-P443401?icid2=products
https://www.sephora.com/product/colonia-P163604?icid2=products
https://www.sephora.com/product/blu-mediterraneo-mirto-di-panarea-P307804?icid2=products
https://www.sephora.com/product/rosa-nobile-P388670?icid2=products
https://www.sephora.com/product/colonia-miniature-set-P443400?icid2=products
https://www.sephora.com/product/blu-mediterraneo-fico-di-amalfi-P307801?icid2=products
https://www.sephora.com/product/colonia-essenza-P269110?icid2=products
https://www.sephora.com/product/blu-mediterraneo-fico-di-amalfi-gift-set-P450815?icid2=products
https://www.sephora.com/product/le-nobili-collection-set-P431439?icid2=products
https://www.sephora.com/product/blu-mediterraneo-arancia-di-capri-gift-set-P450817?icid2=products
https://www.sephora.com/product/blu-mediterraneo-mandorlo-di-sicilia-P307803?icid2=products
https://www.sephora.com/product/blu-mediterraneo-arancia-di-capri-P375388?icid2=products
https:

https://www.sephora.com/product/jackie-aina-eyeshadow-palette-P449147?icid2=products
https://www.sephora.com/product/mini-loose-highlighter-set-P451598?icid2=products
https://www.sephora.com/product/norvina-pro-pigment-vol-1-P448161?icid2=products
https://www.sephora.com/product/invigorating-rush-hair-body-wash-P376886?icid2=products
https://www.sephora.com/product/ingrown-hair-treatment-P170545?icid2=products
https://www.sephora.com/product/glycolic-exfoliating-resurfacing-wipes-P397373?icid2=products
https://www.sephora.com/product/glycolic-facial-cleanser-P9038?icid2=products
https://www.sephora.com/product/after-shave-balm-P9024?icid2=products
https://www.sephora.com/product/facial-scrub-P9036?icid2=products
https://www.sephora.com/product/shave-cream-P9022?icid2=products
https://www.sephora.com/product/oil-free-facial-lotion-P112400?icid2=products
https://www.sephora.com/product/astringent-toner-pads-P189131?icid2=products
https://www.sephora.com/product/high-performance-continuou

https://www.sephora.com/product/bareminerals-all-over-face-color-P61005?icid2=products
https://www.sephora.com/product/bareminerals-nothing-beats-original-4-piece-get-started-kit-P421000?icid2=products
https://www.sephora.com/product/complexion-rescue-hydrating-foundation-stick-broad-spectrum-spf-25-P441829?icid2=products
https://www.sephora.com/product/pro-1-25-P451923?icid2=products
https://www.sephora.com/product/collagen-inner-beauty-boost-P420961?icid2=products
https://www.sephora.com/product/glow-advanced-inner-beauty-powder-P420964?icid2=products


Unnamed: 0,category,product_name,brand,price,Product_Id,size,item_num,love_count,rating,reviews_count,link
0,Fragrance/Value & Gift Sets/Perfume Gift Sets,Blu Mediterraneo MINIATURE Set,Acqua Di Parma,63.0,P443401,5 x 0.16oz/5mL,2218774,2588,4,4,https://www.sephora.com/product/blu-mediterran...
1,Fragrance/Men/Cologne,Colonia,Acqua Di Parma,63.0,P163604,0.7 oz/ 20 mL,2044816,2528,4.3067,75,https://www.sephora.com/product/colonia-P16360...
2,Fragrance/Women/Perfume,Mirto di Panarea,Acqua Di Parma,64.0,P307804,2.5 oz/ 74 mL,1417617,2622,4.6364,22,https://www.sephora.com/product/blu-mediterran...
3,Fragrance/Women/Perfume,Rosa Nobile,Acqua Di Parma,68.0,P388670,3.4 oz/ 101 mL,1638832,4569,4.5652,69,https://www.sephora.com/product/rosa-nobile-P3...
4,Fragrance/Value & Gift Sets/Perfume Gift Sets,Colonia Miniature Set,Acqua Di Parma,69.0,P443400,5 x 0.16oz/5mL,2218766,779,3.5,2,https://www.sephora.com/product/colonia-miniat...
...,...,...,...,...,...,...,...,...,...,...,...
195,Makeup/Face/Face Sets,Nothing Beats the Original™ Complexion Kit,bareMinerals,34.0,P421000,- For medium skin with neutral undertones (wi...,1958859,14100,3.8947,38,https://www.sephora.com/product/bareminerals-n...
196,Makeup/Face/Foundation,COMPLEXION RESCUE Hydrating Foundation Stick B...,bareMinerals,32.0,P441829,0.35 oz/ 10 g,2176774,19068,4.3209,1742,https://www.sephora.com/product/complexion-res...
197,Hair/Hair Tools/Curling Irons,"Pro 1.25"" Rotating Curling Iron",Beachwaver,199.0,P451923,na,2311744,2496,3.9286,14,https://www.sephora.com/product/pro-1-25-P4519...
198,Skincare/Wellness/Beauty Supplements,COLLAGEN Inner Beauty Boost,The Beauty Chef,25.0,P420961,16.9 oz/ 500 mL,1971845,11086,4.4128,109,https://www.sephora.com/product/collagen-inner...


In [7]:
# Write the first batch to disk; index=False leaves the row index out of the CSV.
result1.to_csv('result1.csv', index=False)

In [None]:
## 2nd time
# Slice of product_links to scrape in this run. Set `start` to where the
# previous run stopped and `stop` to None to continue to the end of the list.
start, stop = 5, 10
frame2 = []
for link in product_links[start:stop]:
    frame2.append(get_data(link))
    # Random pause of up to 3 seconds between requests.
    time.sleep(np.random.random()*3)
# Index each row by its link's position in product_links.
result2 = pd.DataFrame(frame2, index=range(start, start + len(frame2)))
result2

In [None]:
# Stack the two batches row-wise into a single frame and display it.
result = pd.concat([result1,result2])
result

In [None]:
## Save the result as csv.
# index=False leaves the DataFrame's row index out of the file.
result.to_csv('result.csv', index=False)

In [None]:
# Old version get_data function
# NOTE(review): this definition reuses the name `get_data`. Running this cell
# replaces the newer scraper for every cell executed afterwards.

def get_data(link):
    """Scrape one product page (legacy implementation) and return a flat dict.

    Parameters
    ----------
    link : str
        URL of the product page.

    Returns
    -------
    dict
        Keys: category, product_name, brand, price, Product_Id, size,
        item_num, love_count, rating, reviews_count, link.
        Fields that cannot be found are the string 'na'. `category` is the
        breadcrumb names each followed by '/', or '' when none were found.
        `Product_Id` is the list returned by `re.findall`, not a single string.
    """
    wb_data = requests.get(link)
    soup = BeautifulSoup(wb_data.text,'lxml')

    ## para1 is to get info of sort,price,product name and brand
    para1 = soup.find_all(attrs={"type": "application/ld+json"})
    sort = ''
    # Defaults so a page without a product ld+json block still yields a row.
    price = product_name = brand = 'na'
    for content1 in para1[:2]:
        try:
            content1_dict = json.loads(content1.string)
        except (TypeError, ValueError):
            # Empty script tag (.string is None) or invalid JSON.
            continue
        try:
            for item in content1_dict['itemListElement']:
                sort = sort + item['item']['name'] + '/'
        except (KeyError, TypeError):
            # Not a breadcrumb block, so read it as the product block.
            try:
                price = content1_dict['offers'][0]['price']
            except (KeyError, IndexError, TypeError):
                pass
            if isinstance(content1_dict, dict):
                product_name = content1_dict.get('name', product_name)
                brand = content1_dict.get('brand', brand)

    ## para2 is to get info of size and item number
    para2 = str(soup.find(attrs={"class": "css-1qf1va5"}))
    size_hits = re.findall(R'SIZE.+?\>(.*?)\<span',para2)
    size = size_hits[0] if size_hits else 'na'
    item_hits = re.findall(R'ITEM.+?\>(.*?)\</div\>',para2)
    item_num = item_hits[0] if item_hits else 'na'

    ## para3 is to get info of love counts
    para3 = str(soup.find(attrs={"data-at": "product_love_count"}))
    love_hits = re.findall(R'\>([0-9].+)\</span\>',para3)
    love_count = love_hits[0] if love_hits else 'na'

    ## para4 is to get info of rating and review counts
    para4 = str(soup.find(attrs={"id": "linkJSON"}))
    ratings = re.findall(R'\"rating\"\:(.*?)\,',para4)
    reviews = re.findall(R'\"reviews\"\:(.*?)\,',para4)
    rating = ratings[0] if ratings else 'na'
    reviews_count = reviews[0] if reviews else 'na'

    dic1 = {}
    dic1['category'] = sort
    dic1['product_name']= product_name
    dic1['brand'] = brand
    dic1['price'] = price
    dic1['Product_Id'] = re.findall(R'P[0-9]{3,6}',link)
    dic1['size'] = size
    dic1['item_num'] = item_num
    dic1['love_count'] = love_count
    dic1['rating'] = rating
    dic1['reviews_count'] = reviews_count
    dic1['link'] = link

    return dic1