Scrape beauty product infomation from [beautypedia](beautypedia.com), a website where Paula Begoun and her team post their review on beauty products.

In [1]:
#from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time
import re
import requests
import os

Collect product links from main page. The product links are dynamically generated and cannot be found in html sourse file using requests package. We will use selenium to simulate browsers and get the links.

In [2]:
def get_product_links(start_page):
    product_links = []
    product_names = []

    browser = webdriver.Firefox()
    browser.get(start_page)

    # select 96 items per page so we can loop less pages
    el = browser.find_element_by_class_name('results-per-page')
    for option in el.find_elements_by_tag_name('option'):
        if option.text == '96':
            option.click() # select() in earlier versions of webdriver
            break

    # find how many pages we have to loop
    i=1
    npage = int(browser.find_element_by_class_name("archive-pagination-select").text.replace('\n',' ').split()[-1])
    print('page (%d total) - first product'%npage)
    while True:
        soup = BeautifulSoup(browser.page_source,"html5lib")
        links = soup.find_all('a',class_="review-product")
        print("%6d  %s"%(i, links[0].text))
        product_links += [link['href'] for link in links]
        product_names += [link.text for link in links]
        if i==npage:
            break
        else:
            time.sleep(5)  # wait a few seconds -- be gentle to the server
            browser.find_element_by_class_name('next-page').click() #click next-page button
            i=i+1
            time.sleep(5)
    browser.close()
    
    return product_links, product_names

def get_product_by_category(category_list, excluded_category):
    
    mega_data = pd.DataFrame()
    for item in category_list:
        category = item.text.replace('\n','').replace('\t','')
        if category in excluded_category:
            continue
        print('collecting data for ', category)
        product_links, product_names = get_product_links(item['href'])
        df = pd.DataFrame({'product_links':product_links, 'product_names':product_names, 'product_category':category})
        mega_data = mega_data.append(df)

    mega_data.reset_index(drop=True, inplace=True)
    
    return mega_data

Go to each product's page and collect information.

In [3]:
def fetch_product(mega_data, image_folder="images/"):
    
    if not os.path.isdir(image_folder):
        os.system("mkdir "+image_folder)
    
    mega_data['brand'] = None
    mega_data['ingredient'] = None
    mega_data['size'] = None
    mega_data['price'] = None
    mega_data['claims'] = None

    for i in tqdm(range(mega_data.shape[0])):
        url = mega_data['product_links'].iloc[i]
        r = requests.get(url)
        soup = BeautifulSoup(r.text)

        #brand
        brand = soup.find('h2')
        if brand is not None:
            mega_data['brand'].iloc[i] = brand.text

        #ingredient
        ingredient = soup.find('div',class_=re.compile("content-item ingredients"))
        if ingredient is not None:
            mega_data['ingredient'].iloc[i] = ingredient.text.strip()
        
        #size
        size = soup.find('span', class_=re.compile("size"))
        if size is not None:
            mega_data['size'].iloc[i] = size.text

        #price
        price = soup.find('span', class_=re.compile("price"))
        if price is not None:
            mega_data['price'].iloc[i] = price.text
            
        #claims
        claims = soup.find('div', id="claims")
        if claims is not None:
            mega_data['claims'].iloc[i] = claims.text
        
        #image
        img_url = soup.find('div', class_="product-image").find('img')['src']
        if claims is not None:
            r_img = requests.get(img_url)
            if r_img.status_code == 200:
                image_path = image_folder+mega_data['product_names'][i].replace(' ','-').replace('/','-')+'_'+mega_data['brand'][i].replace(' ','-').replace('/','-')+'.jpg'
                with open(image_path, 'wb') as f:
                    f.write(r_img.content)
                mega_data['image_path'] = image_path 

Collect skin care products.

In [4]:
if os.path.isfile("skin_care_products.csv"):
    cols = ['product_names','product_category','brand','product_links']
    skin_care_products = pd.read_csv('skin_care_products.csv', usecols=cols)
else:
    url = 'https://www.beautypedia.com/skin-care'
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    skin_care_cat = soup.find_all('a', class_="submenu-item", href=re.compile('/skin-care/'))[:30]
    excluded_category = ['Best & Worst Skin Care Products','Moisturizer']
    skin_care_products = get_product_by_category(skin_care_cat, excluded_category)

fetch_product(skin_care_products, image_folder="images/skin_care/")

In [5]:
skin_care_products.to_csv('skin_care_products.csv',index=False)
print("# of skin care products:", skin_care_products.shape)
skin_care_products.head()

# of skin care products: (6182, 9)


Unnamed: 0,product_links,product_names,product_category,brand,ingredient,size,price,claims,image_path
0,https://www.beautypedia.com/products/clear-com...,Clear Complexion Spot Treatment,Acne & Blemish Treatment,Merle Norman,"Active: Salicylic Acid (2%), Other: Alcohol D...",0.50 fl. oz.,20.0,"Fast-drying, on-the-spot solution with Salicyl...",images/skin_care/Fresh-Pressed-Daily-Booster-w...
1,https://www.beautypedia.com/products/acne-solu...,Acne Solutions Emergency Gel Lotion,Acne & Blemish Treatment,Clinique,"Active: Benzoyl Peroxide (5%), Other: Water P...",0.50 fl. oz.,17.0,Clinique's medicated troubleshooter with benzo...,images/skin_care/Fresh-Pressed-Daily-Booster-w...
2,https://www.beautypedia.com/products/resist-da...,RESIST Daily Pore-Refining Solution 2% BHA,Acne & Blemish Treatment,Paula's Choice Skincare,"Water (Aqua), Dipropylene Glycol, Salicylic ...",3.00 fl. oz.,33.0,This daily-use liquid exfoliant is ideal for t...,images/skin_care/Fresh-Pressed-Daily-Booster-w...
3,https://www.beautypedia.com/products/max-compl...,Max Complexion Correction Pads,Acne & Blemish Treatment,Peter Thomas Roth,"Active: Salicylic Acid (2%), Other: Alcohol D...",60.00,40.0,Regulates healthy cell turnover and delivers p...,images/skin_care/Fresh-Pressed-Daily-Booster-w...
4,https://www.beautypedia.com/products/naturals-...,Naturals Acne Spot Treatment,Acne & Blemish Treatment,Neutrogena,Active Ingredient: Salicylic Acid 1%. Inactive...,0.75 fl. oz.,8.49,Clinically proven acne medicine treats and hel...,images/skin_care/Fresh-Pressed-Daily-Booster-w...


Collect body care products.

In [6]:
if os.path.isfile("body_care_products.csv"):
    cols = ['product_names','product_category','brand','product_links']
    body_care_products = pd.read_csv('body_care_products.csv', usecols=cols)
else:
    url = 'https://www.beautypedia.com/body-care'
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    body_care_cat = soup.find_all('a', class_="submenu-item", href=re.compile('/body-care/'))[:6]
    excluded_category = []
    body_care_products = get_product_by_category(body_care_cat, excluded_category)

fetch_product(body_care_products, image_folder="images/body_care/")

In [7]:
body_care_products.to_csv('body_care_products.csv',index=False)
print("# of body care products:", body_care_products.shape)
body_care_products.head()

# of body care products: (420, 9)


Unnamed: 0,product_links,product_names,product_category,brand,ingredient,size,price,claims,image_path
0,https://www.beautypedia.com/products/bust-beau...,Bust Beauty Extra-Lift Gel,Body Lotion,Clarins,"Water, Glycerin, PEG-40 Hydrogenated Castor ...",1.70 fl. oz.,59.0,A non-oily gel containing special plant extrac...,images/body_care/razor-sharp-extra-close-silic...
1,https://www.beautypedia.com/products/sa-renewi...,SA Renewing Lotion,Body Lotion,CeraVe,"Purified Water, Glycerin, Mineral Oil, Ammo...",8.00 fl. oz.,15.99,CeraVe SA is your daily defense against extrem...,images/body_care/razor-sharp-extra-close-silic...
2,https://www.beautypedia.com/products/resurfix-...,Resurfix Skin Barrier Healing Ointment,Body Lotion,Replenix,Active Ingredient: Petrolatum U.S.P. Inactive ...,1.40 fl. oz.,19.0,Replenix Resurfix Skin Barrier Healing Ointmen...,images/body_care/razor-sharp-extra-close-silic...
3,https://www.beautypedia.com/products/soothing-...,Soothing Aloe Refreshing Moisturizer,Body Lotion,Jergens,"Water, Glycerin, Cetearyl Alcohol, Cetyl Es...",16.80 fl. oz.,7.49,Jergens Soothing Aloe Relief Skin Cooling Mois...,images/body_care/razor-sharp-extra-close-silic...
4,https://www.beautypedia.com/products/summer-sh...,Summer Shine Body Lotion,Body Lotion,Mario Badescu,"Deionized Water, Peanut Oil, Octyl Palmitate...",6.00 fl. oz.,10.0,"Enriched with Vitamin A, this greaseless body ...",images/body_care/razor-sharp-extra-close-silic...


Collect makeup products.

In [8]:
if os.path.isfile("makeup_products.csv"):
    cols = ['product_names','product_category','brand','product_links']
    makeup_products = pd.read_csv('makeup_products.csv', usecols=cols)
else:
    url = 'https://www.beautypedia.com/makeup'
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    makeup_cat = soup.find_all('a', class_="submenu-item", href=re.compile('/makeup/'))[:28]
    excluded_category = ['Best & Worst Makeup Products', 'Eyes', 'Lips', 'Face']
    makeup_products = get_product_by_category(makeup_cat, excluded_category)

fetch_product(makeup_products, image_folder="images/makeup/")

In [9]:
makeup_products.to_csv('makeup_products.csv',index=False)
print("# of makeup products:", makeup_products.shape)
makeup_products.head()

# of makeup products: (2669, 9)


Unnamed: 0,product_links,product_names,product_category,brand,ingredient,size,price,claims,image_path
0,https://www.beautypedia.com/products/brow-zing...,Brow Zings,Eyebrow,Benefit,,,32.0,,images/makeup/beautyblender-original_beautyble...
1,https://www.beautypedia.com/products/micro-bro...,Micro Brow Pencil,Eyebrow,NYX Cosmetics,"Polyethylene, C12-15 Alkyl Ethylhexanoate, H...",,10.0,Build full beautiful brows with our ultra-thin...,images/makeup/beautyblender-original_beautyble...
2,https://www.beautypedia.com/products/natural-b...,Natural Brow Shaper,Eyebrow,Bobbi Brown,,,22.0,,images/makeup/beautyblender-original_beautyble...
3,https://www.beautypedia.com/products/bc-color-...,BC Color Brow Perfecting Pencil,Eyebrow,BeautiControl,,,12.0,,images/makeup/beautyblender-original_beautyble...
4,https://www.beautypedia.com/products/bombshell...,Bombshell Pow-Der Brow + Liner,Eyebrow,CoverGirl,"Boron Nitride, Dimethicone, Silica, Polyeth...",0.02 fl. oz.,11.99,This versatile 2-in-1 brow liner& soft eyebrow...,images/makeup/beautyblender-original_beautyble...
