Scrape beauty product information from [Beautypedia](https://www.beautypedia.com), a website where Paula Begoun and her team post their reviews of beauty products.

In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time
import re
import requests

Collect product links from the main page. The product links are generated dynamically and cannot be found in the HTML source returned by the requests package, so we use Selenium to drive a real browser and read the links from the rendered page.

In [2]:
def get_product_links(start_page, results_per_page='96', delay=5):
    """Collect product URLs and names from every results page of a listing.

    Opens ``start_page`` in a Selenium-driven Firefox window, switches the
    "results per page" selector to ``results_per_page`` and then walks the
    pagination with the "next page" button, harvesting every
    ``<a class="review-product">`` anchor from the rendered HTML.

    Parameters
    ----------
    start_page : str
        URL of the first results page of a category.
    results_per_page : str, optional
        Visible text of the page-size option to select. A larger page size
        means fewer pages to click through. If no option has this text the
        selector is left unchanged.
    delay : int or float, optional
        Seconds to sleep before and after each "next page" click, to go easy
        on the server and give the page time to refresh.

    Returns
    -------
    (list, list)
        ``product_links`` (the anchors' ``href`` values) and ``product_names``
        (the anchors' text), in page order and of equal length.
    """
    # Imported locally so this cell is self-contained. The locator-based
    # ``find_element(By.X, ...)`` API works on Selenium 3 and 4, whereas the
    # ``find_element_by_*`` helpers were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    product_links = []
    product_names = []

    browser = webdriver.Firefox()
    try:
        browser.get(start_page)

        # Select a large page size so there are fewer pages to loop over.
        page_size_select = browser.find_element(By.CLASS_NAME, 'results-per-page')
        for option in page_size_select.find_elements(By.TAG_NAME, 'option'):
            if option.text == results_per_page:
                option.click()
                break
        # NOTE(review): there is no explicit wait after changing the page size;
        # the code relies on the listing having refreshed by the time it is read.

        # The last whitespace-separated token of the pagination selector's
        # text is taken as the total number of pages.
        pagination_text = browser.find_element(By.CLASS_NAME, "archive-pagination-select").text
        npage = int(pagination_text.replace('\n', ' ').split()[-1])
        print('page (%d total) - first product' % npage)

        # Always read at least the page that is currently displayed.
        last_page = max(npage, 1)
        for i in range(1, last_page + 1):
            soup = BeautifulSoup(browser.page_source, "html5lib")
            links = soup.find_all('a', class_="review-product")

            # A page without products must not abort the whole crawl.
            first_product = links[0].text if links else '(no products found)'
            print("%6d  %s" % (i, first_product))

            product_links += [link['href'] for link in links]
            product_names += [link.text for link in links]

            if i < last_page:
                time.sleep(delay)  # wait a few seconds -- be gentle to the server
                browser.find_element(By.CLASS_NAME, 'next-page').click()
                time.sleep(delay)  # give the next page time to render
    finally:
        # quit() ends the whole WebDriver session (window and driver process),
        # and the finally clause guarantees that happens even on an error.
        browser.quit()

    return product_links, product_names

Go to each product's page and collect information.

In [3]:
def _parse_product_page(html):
    """Extract brand, ingredient, size and price text from one product page.

    Returns a dict with the keys ``brand``, ``ingredient``, ``size`` and
    ``price``. A value is ``None`` when the matching element is not present.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning.
    soup = BeautifulSoup(html, "html5lib")

    brand = soup.find('h2')
    ingredient = soup.find('div', class_=re.compile("content-item ingredients"))
    size = soup.find('span', class_=re.compile("size"))
    price = soup.find('span', class_=re.compile("price"))

    return {
        'brand': brand.text if brand is not None else None,
        # Only the ingredient text is stripped of surrounding whitespace.
        'ingredient': ingredient.text.strip() if ingredient is not None else None,
        'size': size.text if size is not None else None,
        'price': price.text if price is not None else None,
    }


def get_product_by_category(category_list, excluded_category):
    """Build a table of products for the given category links.

    For every category anchor not listed in ``excluded_category`` the product
    links and names are collected with ``get_product_links``; each product
    page is then downloaded and its brand, ingredient list, size and price
    are extracted.

    Parameters
    ----------
    category_list : iterable
        Category anchors. Each item must expose ``.text`` (the category name)
        and ``item['href']`` (the category URL), e.g. BeautifulSoup ``<a>`` tags.
    excluded_category : container of str
        Category names (after newlines and tabs are removed) to skip.

    Returns
    -------
    pandas.DataFrame
        One row per product with the columns ``product_links``,
        ``product_names``, ``product_category``, ``brand``, ``ingredient``,
        ``size`` and ``price``. The last four are ``None`` where the page did
        not contain the element or could not be downloaded. If no category is
        processed, an empty frame with these columns is returned.
    """
    listing_columns = ['product_links', 'product_names', 'product_category']
    detail_columns = ['brand', 'ingredient', 'size', 'price']

    # Collect one frame per category and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
    frames = []
    for item in category_list:
        category = item.text.replace('\n', '').replace('\t', '')
        if category in excluded_category:
            continue
        print('collecting data for ', category)
        product_links, product_names = get_product_links(item['href'])
        frames.append(pd.DataFrame({'product_links': product_links,
                                    'product_names': product_names,
                                    'product_category': category}))

    if not frames:
        # Nothing to scrape: return the full schema so callers can rely on it.
        return pd.DataFrame(columns=listing_columns + detail_columns)

    mega_data = pd.concat(frames, ignore_index=True)

    # Gather the per-product details in plain lists and assign whole columns
    # afterwards. Chained assignment (df['col'].iloc[i] = ...) may write to a
    # temporary copy and does nothing under pandas Copy-on-Write.
    records = []
    for url in tqdm(mega_data['product_links'].tolist()):
        try:
            # The timeout keeps one stalled connection from hanging the run.
            r = requests.get(url, timeout=30)
        except requests.RequestException as err:
            # One unreachable page should not discard the rest of a long
            # scrape: report it and leave the detail fields empty.
            print('failed to fetch %s: %s' % (url, err))
            records.append(dict.fromkeys(detail_columns))
            continue
        records.append(_parse_product_page(r.text))

    for column in detail_columns:
        mega_data[column] = [record[column] for record in records]

    return mega_data

Collect skin care products.

In [4]:
# The category links are read from the landing page's sub-menu.
url = 'https://www.beautypedia.com/skin-care'
r = requests.get(url, timeout=30)
r.raise_for_status()  # fail fast instead of silently scraping an error page
# Name the parser explicitly so the result does not depend on which parsers
# happen to be installed (html5lib is the one used for the listing pages too).
soup = BeautifulSoup(r.text, "html5lib")
# NOTE(review): only the first 30 matching sub-menu links are kept --
# presumably later matches are duplicates or unrelated entries; confirm.
skin_care_cat = soup.find_all('a', class_="submenu-item", href=re.compile('/skin-care/'))[:30]
# Categories skipped by name. NOTE(review): the reason for excluding these
# two is not recorded here -- confirm.
excluded_category = ['Best & Worst Skin Care Products','Moisturizer']
skin_care_products = get_product_by_category(skin_care_cat, excluded_category)
skin_care_products.to_csv('skin_care_products.csv',index=False)

collecting data for  Acne & Blemish Treatment
page (3 total) - first product
     1  Clear Complexion Spot Treatment
     2  Essential Control for Blemishes
     3  Alpha Beta Daily Face Peel Two-Step System
collecting data for  Cleansers
page (11 total) - first product
     1  Jelly Glow Ball Radiance-Boosting Cleanser
     2  Age Defying Classic Cleanser
     3  Creme Douceur Cream-to-Oil Massage Cleanser All Skin Types
     4  Gentle Cleansing Bar
     5  HYDRALIGHT One Step Face Cleanser
     6  Apricot Probiotic Cleansing Milk
     7  Luminous Clean Daily Exfoliating Cleanser
     8  Complexion Refining Deep Clean Mousse
     9  Miel-En Mousse Foaming Face Cleanser & Makeup Remover
    10  Cleansing Cream, For All Skin Conditions
    11  Clear Pore Cleanser/Mask
collecting data for  Cleansing Brushes & Devices
page (1 total) - first product
     1  Microdermabrasion System
collecting data for  Face Wipes
page (1 total) - first product
     1  I-Bronze Self-Tanning Cloths
collectin

100%|██████████| 6182/6182 [1:08:14<00:00,  1.51it/s]


In [5]:
# Sanity check: frame dimensions as (rows, columns), then a five-row preview.
print("# of skin care products:", tuple(skin_care_products.shape))
skin_care_products.head(5)

# of skin care products: (6182, 7)


Unnamed: 0,product_links,product_names,product_category,brand,ingredient,size,price
0,https://www.beautypedia.com/products/clear-com...,Clear Complexion Spot Treatment,Acne & Blemish Treatment,Merle Norman,"Active: Salicylic Acid (2%), Other: Alcohol D...",0.50 fl. oz.,20.0
1,https://www.beautypedia.com/products/acne-solu...,Acne Solutions Emergency Gel Lotion,Acne & Blemish Treatment,Clinique,"Active: Benzoyl Peroxide (5%), Other: Water P...",0.50 fl. oz.,17.0
2,https://www.beautypedia.com/products/resist-da...,RESIST Daily Pore-Refining Solution 2% BHA,Acne & Blemish Treatment,Paula's Choice Skincare,"Water (Aqua), Dipropylene Glycol, Salicylic ...",3.00 fl. oz.,33.0
3,https://www.beautypedia.com/products/max-compl...,Max Complexion Correction Pads,Acne & Blemish Treatment,Peter Thomas Roth,"Active: Salicylic Acid (2%), Other: Alcohol D...",60.00,40.0
4,https://www.beautypedia.com/products/naturals-...,Naturals Acne Spot Treatment,Acne & Blemish Treatment,Neutrogena,Active Ingredient: Salicylic Acid 1%. Inactive...,0.75 fl. oz.,8.49


Collect body care products.

In [6]:
# The category links are read from the landing page's sub-menu.
url = 'https://www.beautypedia.com/body-care'
r = requests.get(url, timeout=30)
r.raise_for_status()  # fail fast instead of silently scraping an error page
# Name the parser explicitly so the result does not depend on which parsers
# happen to be installed (html5lib is the one used for the listing pages too).
soup = BeautifulSoup(r.text, "html5lib")
# NOTE(review): only the first 6 matching sub-menu links are kept --
# presumably later matches are duplicates or unrelated entries; confirm.
body_care_cat = soup.find_all('a', class_="submenu-item", href=re.compile('/body-care/'))[:6]
excluded_category = []  # no body-care category is skipped
body_care_products = get_product_by_category(body_care_cat, excluded_category)
body_care_products.to_csv('body_care_products.csv',index=False)

collecting data for  Body Lotion
page (4 total) - first product
     1  Bust Beauty Extra-Lift Gel
     2  Diabetics Dry Skin Relief Lotion
     3  Deadsea Plants Dry Oil Body Mist
     4  Extra-Firming Body Lotion
collecting data for  Body Scrub & Exfoliant
page (1 total) - first product
     1  Sparkle Skin Body Exfoliator
collecting data for  Body Wash
page (2 total) - first product
     1  Yes to Grapefruit Exfoliating Body Wash
     2  Almond Cookie Body Cleansing Gel
collecting data for  Hand Cream
page (1 total) - first product
     1  Hand and Nail Cream
collecting data for  Self Tanner
page (1 total) - first product
     1  I-Bronze Self-Tanning Cloths
collecting data for  Shaving
page (1 total) - first product
     1  Modern Classic Shave Cream


100%|██████████| 420/420 [04:39<00:00,  1.50it/s]


In [7]:
# Sanity check: frame dimensions as (rows, columns), then a five-row preview.
print("# of body care products:", tuple(body_care_products.shape))
body_care_products.head(5)

# of body care products: (420, 7)


Unnamed: 0,product_links,product_names,product_category,brand,ingredient,size,price
0,https://www.beautypedia.com/products/bust-beau...,Bust Beauty Extra-Lift Gel,Body Lotion,Clarins,"Water, Glycerin, PEG-40 Hydrogenated Castor ...",1.70 fl. oz.,59.0
1,https://www.beautypedia.com/products/sa-renewi...,SA Renewing Lotion,Body Lotion,CeraVe,"Purified Water, Glycerin, Mineral Oil, Ammo...",8.00 fl. oz.,15.99
2,https://www.beautypedia.com/products/resurfix-...,Resurfix Skin Barrier Healing Ointment,Body Lotion,Replenix,Active Ingredient: Petrolatum U.S.P. Inactive ...,1.40 fl. oz.,19.0
3,https://www.beautypedia.com/products/soothing-...,Soothing Aloe Refreshing Moisturizer,Body Lotion,Jergens,"Water, Glycerin, Cetearyl Alcohol, Cetyl Es...",16.80 fl. oz.,7.49
4,https://www.beautypedia.com/products/summer-sh...,Summer Shine Body Lotion,Body Lotion,Mario Badescu,"Deionized Water, Peanut Oil, Octyl Palmitate...",6.00 fl. oz.,10.0


In [8]:
# Collect makeup products.
# The category links are read from the landing page's sub-menu.
url = 'https://www.beautypedia.com/makeup'
r = requests.get(url, timeout=30)
r.raise_for_status()  # fail fast instead of silently scraping an error page
# Name the parser explicitly so the result does not depend on which parsers
# happen to be installed (html5lib is the one used for the listing pages too).
soup = BeautifulSoup(r.text, "html5lib")
# NOTE(review): only the first 28 matching sub-menu links are kept --
# presumably later matches are duplicates or unrelated entries; confirm.
makeup_cat = soup.find_all('a', class_="submenu-item", href=re.compile('/makeup/'))[:28]
# Categories skipped by name. NOTE(review): 'Eyes', 'Lips' and 'Face' look
# like umbrella groups whose products reappear in the finer categories --
# confirm.
excluded_category = ['Best & Worst Makeup Products', 'Eyes', 'Lips', 'Face']
makeup_products = get_product_by_category(makeup_cat, excluded_category)
makeup_products.to_csv('makeup_products.csv',index=False)

collecting data for  Eyebrow
page (2 total) - first product
     1  Brow Zings
     2  Eye Brow Defining Pencil
collecting data for  Eyelash Primer & Treatment
page (1 total) - first product
     1  Faux Cils Longest Lash Mascara
collecting data for  Eyeliner
page (4 total) - first product
     1  Round The Clock Waterproof Eyeliner
     2  Eyeliner Pencil
     3  Liquid Eyeliner
     4  Liquid Eyeliner
collecting data for  Eyeshadow
page (3 total) - first product
     1  Eyeshadow
     2  Queen Collection 1-Kit Eye Shadow
     3  Lid Lacquer
collecting data for  Eyeshadow Palette
page (1 total) - first product
     1  Superhero Eye Transforming Anti-Aging Super Palette
collecting data for  Eyeshadow Primer & Base
page (1 total) - first product
     1  Aqua Seal
collecting data for  Mascara
page (4 total) - first product
     1  Hypnose Star Mascara Volume Ultra-Glamorous Regard Cinema Show-Stopping Volume Mascara
     2  Faux Lash Mascara
     3  Hypnose Doll Lashes Mascara
     4  Gr

100%|██████████| 2669/2669 [29:17<00:00,  1.52it/s]


In [9]:
# Sanity check: frame dimensions as (rows, columns), then a five-row preview.
print("# of makeup products:", tuple(makeup_products.shape))
makeup_products.head(5)

# of makeup products: (2669, 7)


Unnamed: 0,product_links,product_names,product_category,brand,ingredient,size,price
0,https://www.beautypedia.com/products/brow-zing...,Brow Zings,Eyebrow,Benefit,,,32.0
1,https://www.beautypedia.com/products/micro-bro...,Micro Brow Pencil,Eyebrow,NYX Cosmetics,"Polyethylene, C12-15 Alkyl Ethylhexanoate, H...",,10.0
2,https://www.beautypedia.com/products/natural-b...,Natural Brow Shaper,Eyebrow,Bobbi Brown,,,22.0
3,https://www.beautypedia.com/products/bc-color-...,BC Color Brow Perfecting Pencil,Eyebrow,BeautiControl,,,12.0
4,https://www.beautypedia.com/products/bombshell...,Bombshell Pow-Der Brow + Liner,Eyebrow,CoverGirl,"Boron Nitride, Dimethicone, Silica, Polyeth...",0.02 fl. oz.,11.99
