Scrape beauty product information from [Beautypedia](https://www.beautypedia.com), a website where Paula Begoun and her team post their reviews of beauty products.

In [1]:
#from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time
import re
import requests
import os

Collect product links from the main page. The product links are generated dynamically and cannot be found in the HTML source returned by the requests package, so we use Selenium to drive a real browser and read the links from the rendered page.

In [2]:
def get_product_links(start_page):
    """Collect the link and name of every product listed under one category.

    A Firefox window is driven through Selenium. The listing is switched to
    96 results per page when that option exists, the total page count is
    read from the pagination control, and every page is parsed for
    ``<a class="review-product">`` anchors.

    Args:
        start_page: URL of the first listing page of the category.

    Returns:
        A ``(product_links, product_names)`` tuple of two parallel lists:
        the ``href`` of each product anchor and the anchor's text.
    """
    # Imported locally: the module-level selenium import is commented out, so
    # ``webdriver`` would otherwise be undefined when this function runs.
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    product_links = []
    product_names = []

    browser = webdriver.Firefox()
    try:
        browser.get(start_page)

        # select 96 items per page so we have fewer pages to loop over
        el = browser.find_element(By.CLASS_NAME, 'results-per-page')
        for option in el.find_elements(By.TAG_NAME, 'option'):
            if option.text == '96':
                option.click()
                break

        # the last whitespace-separated token of the pagination text is
        # taken as the number of pages to visit
        # NOTE(review): this is read right after the click above with no
        # wait -- confirm the control already reflects the new page size.
        pagination = browser.find_element(By.CLASS_NAME, "archive-pagination-select")
        npage = int(pagination.text.replace('\n',' ').split()[-1])
        print('page (%d total) - first product'%npage)

        i = 1
        while True:
            soup = BeautifulSoup(browser.page_source,"html5lib")
            links = soup.find_all('a',class_="review-product")
            # a page without products must not abort the whole crawl
            first_product = links[0].text if links else ''
            print("%6d  %s"%(i, first_product))
            product_links += [link['href'] for link in links]
            product_names += [link.text for link in links]
            # ">=" rather than "==" so a page count below 1 still terminates
            if i >= npage:
                break
            time.sleep(5)  # wait a few seconds -- be gentle to the server
            browser.find_element(By.CLASS_NAME, 'next-page').click()
            i += 1
            time.sleep(5)  # give the next page time to load before parsing
    finally:
        # always shut the browser down, even when a lookup or click raises
        browser.quit()

    return product_links, product_names

def get_product_by_category(category_list, excluded_category):
    """Build one table of product links and names across several categories.

    Args:
        category_list: Iterable of category anchors. Each item must expose a
            ``.text`` attribute (the category label) and support
            ``item['href']`` (the category's listing URL).
        excluded_category: Collection of category labels to skip. Labels are
            compared after newlines and tabs are removed from ``.text``.

    Returns:
        A ``pandas.DataFrame`` with columns ``product_links``,
        ``product_names`` and ``product_category`` and a fresh 0..n-1 index.
        The frame has those columns and no rows when every category is
        excluded or ``category_list`` is empty.
    """
    columns = ['product_links', 'product_names', 'product_category']

    # Collect one frame per category and concatenate once at the end:
    # DataFrame.append no longer exists in pandas 2.x and re-copied the
    # accumulated data on every iteration.
    frames = []
    for item in category_list:
        category = item.text.replace('\n','').replace('\t','')
        if category in excluded_category:
            continue
        print('collecting data for ', category)
        product_links, product_names = get_product_links(item['href'])
        frames.append(pd.DataFrame({'product_links':product_links,
                                    'product_names':product_names,
                                    'product_category':category}))

    if not frames:
        # pd.concat rejects an empty list, so return the empty table directly
        return pd.DataFrame(columns=columns)

    return pd.concat(frames, ignore_index=True)

Go to each product's page and collect information.

In [3]:
def _find_image_url(soup):
    """Return the ``src`` of the image inside ``div.product-image``, or None."""
    container = soup.find('div', class_="product-image")
    if container is None:
        return None
    img = container.find('img')
    if img is None:
        return None
    return img.get('src')


def _image_file_name(product_name, brand):
    """Build the ``<name>_<brand>.jpg`` file name, with ' ' and '/' turned into '-'."""
    def clean(text):
        return str(text).replace(' ','-').replace('/','-')

    # a product page without a brand heading still gets a usable file name
    return clean(product_name) + '_' + clean('' if brand is None else brand) + '.jpg'


def fetch_product(mega_data, image_folder="images/", timeout=30):
    """Visit every product page and fill in its details in place.

    The columns ``brand``, ``ingredient``, ``size``, ``price``, ``claims``
    and ``image_path`` are (re)created as ``None`` and then filled row by
    row from the page at ``product_links``. The product image, when it can
    be downloaded, is written into ``image_folder``.

    Args:
        mega_data: DataFrame with at least the columns ``product_links`` and
            ``product_names``. It is modified in place.
        image_folder: Directory for the downloaded images. It is created,
            including missing parent directories, when it does not exist.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None. A product whose page request fails keeps ``None`` in all
        detail columns; a product whose image request fails or does not
        return status 200 keeps ``None`` in ``image_path``.
    """
    # makedirs also creates missing parents (e.g. "images/" for
    # "images/skin_care/") and avoids building a shell command from a path
    os.makedirs(image_folder, exist_ok=True)

    detail_columns = ('brand', 'ingredient', 'size', 'price', 'claims', 'image_path')
    for column in detail_columns:
        mega_data[column] = None

    # Write through a single .iloc[row, column] call: chained
    # ``mega_data[col].iloc[i] = ...`` may assign to a temporary copy.
    col = {column: mega_data.columns.get_loc(column) for column in detail_columns}

    for i in tqdm(range(mega_data.shape[0])):
        url = mega_data['product_links'].iloc[i]
        try:
            r = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException:
            # leave this product's details empty instead of aborting the run
            time.sleep(2)
            continue
        # same parser as the listing pages, instead of whichever one
        # BeautifulSoup happens to pick
        soup = BeautifulSoup(r.text, "html5lib")

        #brand
        brand_text = None
        brand = soup.find('h2')
        if brand is not None:
            brand_text = brand.text
            mega_data.iloc[i, col['brand']] = brand_text

        #ingredient
        ingredient = soup.find('div',class_=re.compile("content-item ingredients"))
        if ingredient is not None:
            mega_data.iloc[i, col['ingredient']] = ingredient.text.strip()

        #size
        size = soup.find('span', class_=re.compile("size"))
        if size is not None:
            mega_data.iloc[i, col['size']] = size.text

        #price
        price = soup.find('span', class_=re.compile("price"))
        if price is not None:
            mega_data.iloc[i, col['price']] = price.text

        #claims
        claims = soup.find('div', id="claims")
        if claims is not None:
            mega_data.iloc[i, col['claims']] = claims.text

        #image
        img_url = _find_image_url(soup)
        if img_url is not None:
            try:
                r_img = requests.get(img_url, timeout=timeout)
            except requests.exceptions.RequestException:
                # reset so a response left over from the previous product is
                # never saved under this product's name
                r_img = None
                time.sleep(2)
            if r_img is not None and r_img.status_code == 200:
                file_name = _image_file_name(mega_data['product_names'].iloc[i], brand_text)
                image_path = os.path.join(image_folder, file_name)
                with open(image_path, 'wb') as f:
                    f.write(r_img.content)
                mega_data.iloc[i, col['image_path']] = image_path

        time.sleep(2)  # pause between products -- be gentle to the server

Collect skin care products.

In [None]:
if os.path.isfile("skin_care_products.csv"):
    # A saved product list exists: load it instead of driving the browser again.
    cols = ['product_names','product_category','brand','product_links']
    skin_care_products = pd.read_csv('skin_care_products.csv', usecols=cols)
else:
    url = 'https://www.beautypedia.com/skin-care'
    r = requests.get(url, timeout=30)
    # Stop on an HTTP error instead of scraping categories from an error page.
    r.raise_for_status()
    # Name the parser explicitly (the same one get_product_links uses).
    soup = BeautifulSoup(r.text, "html5lib")
    # NOTE(review): only the first 30 matching menu links are kept -- confirm
    # this cut-off still matches the site's menu.
    skin_care_cat = soup.find_all('a', class_="submenu-item", href=re.compile('/skin-care/'))[:30]
    excluded_category = ['Best & Worst Skin Care Products','Moisturizer']
    skin_care_products = get_product_by_category(skin_care_cat, excluded_category)

# Fill in the per-product details and download the images.
fetch_product(skin_care_products, image_folder="images/skin_care/")

 20%|█▉        | 1222/6182 [56:04<3:43:43,  2.71s/it]

In [None]:
# Save the scraped table, report its (rows, columns) shape, and preview the
# first rows as the cell output.
skin_care_products.to_csv(path_or_buf='skin_care_products.csv', index=False)
print("# of skin care products:", skin_care_products.shape)
skin_care_products.head(n=5)

Collect body care products.

In [None]:
if os.path.isfile("body_care_products.csv"):
    # A saved product list exists: load it instead of driving the browser again.
    cols = ['product_names','product_category','brand','product_links']
    body_care_products = pd.read_csv('body_care_products.csv', usecols=cols)
else:
    url = 'https://www.beautypedia.com/body-care'
    r = requests.get(url, timeout=30)
    # Stop on an HTTP error instead of scraping categories from an error page.
    r.raise_for_status()
    # Name the parser explicitly (the same one get_product_links uses).
    soup = BeautifulSoup(r.text, "html5lib")
    # NOTE(review): only the first 6 matching menu links are kept -- confirm
    # this cut-off still matches the site's menu.
    body_care_cat = soup.find_all('a', class_="submenu-item", href=re.compile('/body-care/'))[:6]
    excluded_category = []
    body_care_products = get_product_by_category(body_care_cat, excluded_category)

# Fill in the per-product details and download the images.
fetch_product(body_care_products, image_folder="images/body_care/")

In [None]:
# Bare expression: the notebook renders the body-care table as the cell output.
body_care_products

In [None]:
# Save the scraped table, report its (rows, columns) shape, and preview the
# first rows as the cell output.
body_care_products.to_csv(path_or_buf='body_care_products.csv', index=False)
print("# of body care products:", body_care_products.shape)
body_care_products.head(n=5)

Collect makeup products.

In [None]:
if os.path.isfile("makeup_products.csv"):
    # A saved product list exists: load it instead of driving the browser again.
    cols = ['product_names','product_category','brand','product_links']
    makeup_products = pd.read_csv('makeup_products.csv', usecols=cols)
else:
    url = 'https://www.beautypedia.com/makeup'
    r = requests.get(url, timeout=30)
    # Stop on an HTTP error instead of scraping categories from an error page.
    r.raise_for_status()
    # Name the parser explicitly (the same one get_product_links uses).
    soup = BeautifulSoup(r.text, "html5lib")
    # NOTE(review): only the first 28 matching menu links are kept -- confirm
    # this cut-off still matches the site's menu.
    makeup_cat = soup.find_all('a', class_="submenu-item", href=re.compile('/makeup/'))[:28]
    excluded_category = ['Best & Worst Makeup Products', 'Eyes', 'Lips', 'Face']
    makeup_products = get_product_by_category(makeup_cat, excluded_category)

# Fill in the per-product details and download the images.
fetch_product(makeup_products, image_folder="images/makeup/")

In [None]:
# Save the scraped table, report its (rows, columns) shape, and preview the
# first rows as the cell output.
makeup_products.to_csv(path_or_buf='makeup_products.csv', index=False)
print("# of makeup products:", makeup_products.shape)
makeup_products.head(n=5)