Collect ingredient information from https://www.paulaschoice.com/ingredient-dictionary

In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time
import re
import requests

Get each ingredient's name, rating, categories, and the link to its individual page.

In [2]:
# Load the ingredient dictionary in Firefox via Selenium and parse the rendered
# page source with BeautifulSoup.
# NOTE(review): presumably a real browser is used because the listing is built
# client-side -- confirm; otherwise a plain HTTP request would be enough.
browser = webdriver.Firefox()
try:
    browser.get('https://www.paulaschoice.com/ingredient-dictionary')
    soup = BeautifulSoup(browser.page_source, "html5lib")
finally:
    # quit() ends the whole WebDriver session (all windows plus the driver
    # process); close() only closes the current window. Running it in `finally`
    # releases the browser even when loading or parsing raises.
    browser.quit()

In [3]:
# --- name and link: both are read from the <h2> heading elements ---
ingredients = soup.find_all('h2', class_="name ingredient-name")
ingredient_links = []
ingredient_names = []
for heading in ingredients:
    # The href comes from the first <a> inside the heading; the name is the
    # heading's text with surrounding whitespace removed.
    ingredient_links.append(heading.find('a')['href'])
    ingredient_names.append(heading.get_text().strip())

# --- category: each matching <div> yields one list of its link texts ---
categories = soup.find_all('div', class_="categories ingredient-categories")
category_names = [
    [anchor.get_text() for anchor in block.find_all('a')]
    for block in categories
]

# --- rating: <td> cells whose class matches the rating pattern ---
rating_class = re.compile("col-rating ingredient-rating")
ratings_items = soup.find_all('td', class_=rating_class)
ratings = [cell.get_text() for cell in ratings_items]

In [4]:
# Combine the four parallel lists into one table; the dict order fixes the
# column order (name, link, category, rating).
ingredient_columns = {
    'name': ingredient_names,
    'link': ingredient_links,
    'category': category_names,
    'rating': ratings,
}
ingredient_df = pd.DataFrame(data=ingredient_columns)

# Persist the table (index included) and preview the first rows.
ingredient_df.to_csv('ingredients.csv')
ingredient_df.head()

Unnamed: 0,name,link,category,rating
0,"1, 2-Hexanediol ...",https://www.paulaschoice.com/ingredient-dictio...,[Preservatives],Good
1,10-Hydroxydecanoic Acid ...,https://www.paulaschoice.com/ingredient-dictio...,[Emollients],Good
2,4-T-butylcyclohexanol ...,https://www.paulaschoice.com/ingredient-dictio...,"[Emollients, Skin-Soothing]",Good
3,Acacia farnesiana extract ...,https://www.paulaschoice.com/ingredient-dictio...,"[Plant Extracts, Fragrance: Synthetic and Frag...",Poor
4,acacia senegal gum ...,https://www.paulaschoice.com/ingredient-dictio...,"[Texture Enhancer, Plant Extracts, Skin-Soothing]",Good


Go to each ingredient's page and extract its description.

In [5]:
def get_ingredient_discription(ingredient_df, timeout=10):
    """Fetch every ingredient's page and extract its description text.

    Parameters
    ----------
    ingredient_df : pandas.DataFrame
        Must contain a 'name' column and a 'link' column; each 'link' value is
        requested over HTTP.
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up (default 10).
        Without a timeout a single stalled connection would block the whole run.

    Returns
    -------
    pandas.DataFrame
        Same index as ``ingredient_df`` with the columns 'name' and
        'discription'. 'discription' holds the stripped text of the page's
        ``<div class="upper-body">`` element, or None when the request failed
        or the element was not found.
    """
    discription_df = ingredient_df[['name']].copy()
    discription_df['discription'] = None
    # Column position for single-step positional writes below. The former
    # chained form (df['discription'].iloc[i] = ...) writes to a temporary
    # object and is a silent no-op under pandas Copy-on-Write.
    discription_col = discription_df.columns.get_loc('discription')

    # One Session reuses the connection across the many requests to one host.
    with requests.Session() as session:
        for i, link in enumerate(tqdm(ingredient_df['link'])):
            try:
                response = session.get(link, timeout=timeout)
            except requests.RequestException as exc:
                # Best effort: report the failure and leave this row as None
                # rather than aborting the whole scrape.
                tqdm.write('Request failed for {}: {}'.format(link, exc))
                continue

            # Name the parser explicitly so results do not depend on which
            # parsers happen to be installed; "html5lib" is the parser this
            # notebook already uses for the dictionary page.
            soup = BeautifulSoup(response.text, "html5lib")
            discription = soup.find('div', class_="upper-body")
            if discription is not None:
                discription_df.iloc[i, discription_col] = discription.text.strip()

    return discription_df

# Scrape every ingredient page (one HTTP request per row of ingredient_df),
# save the result including its index, then preview the first five rows.
ingredient_discription_df = get_ingredient_discription(ingredient_df=ingredient_df)
ingredient_discription_df.to_csv(path_or_buf='ingredient_discriptions.csv')
ingredient_discription_df.head(n=5)

100%|██████████| 1750/1750 [12:51<00:00,  2.27it/s]


Unnamed: 0,name,discription
0,"1, 2-Hexanediol ...",A synthetic preservative and moisture-binding ...
1,10-Hydroxydecanoic Acid ...,A synthetic ingredient that functions as a ski...
2,4-T-butylcyclohexanol ...,A synthetic fatty alcohol that functions as an...
3,Acacia farnesiana extract ...,A fragrant extract from a type of acacia tree....
4,acacia senegal gum ...,"Herb that can have skin soothing properties, b..."
