In [1]:
from time import sleep
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

browser = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
browser.implicitly_wait(10)

Product_name = []
Product_type_list = []
Product_price = []
Product_ingredients = []
Source_url = []


Product_type = ["Face wash", "Face Oil", "Face serum",
                "Face Cleanser - Milk Based", "Face Cleanser - Water Based",
                "Face Cream - Day creams", "Face Cream - Night Cream", "Sunscreen", "Toner"]

Product_type_page = ["https://www.amazon.in/s?k=Face+wash&rh=n%3A1374414031&ref=nb_sb_noss",
                     "https://www.amazon.in/s?k=Face+Oil&rh=n%3A1374414031&ref=nb_sb_noss",
                     "https://www.amazon.in/s?k=Face+serum&rh=n%3A1374414031&ref=nb_sb_noss",
                     "https://www.amazon.in/s?k=Milk+Based+Face+Cleanser&rh=n%3A1374414031&ref=nb_sb_noss",
                     "https://www.amazon.in/s?k=Water+Based+Face+Cleanser&rh=n%3A1374414031&ref=nb_sb_noss",
                     "https://www.amazon.in/s?k=Day+creams+Face+Cream&rh=n%3A1374414031&ref=nb_sb_noss",
                     "https://www.amazon.in/s?k=Night+creams+Face+Cream&rh=n%3A1374414031&ref=nb_sb_noss",
                     "https://www.amazon.in/s?k=Sunscreen&rh=n%3A1374414031&ref=nb_sb_noss",
                     "https://www.amazon.in/s?k=Toner&rh=n%3A1374414031&ref=nb_sb_noss"]


def get_product_urls(soup):
    product_urls = []
    links = soup.find_all("a", class_="a-link-normal s-no-outline")
    for link in links:
        href = link.get('href')
        if href and href.startswith("/"):
            product_urls.append("https://www.amazon.in" + href)
    return product_urls

def get_title(soup):
    title = ""
    try:
        title_element = soup.find("span", id="productTitle")
        if title_element:
            title = title_element.text.strip()
    except Exception as e:
        print(f"Error extracting title: {e}")
    return title

def get_price(soup):
    price = ""
    try:
        price_element = soup.find("span", class_="a-price-whole")
        if price_element:
            price = price_element.text.strip()
    except Exception as e:
        print(f"Error extracting price: {e}")
    return price



def get_ingredients(soup):
    ingredients = ""
    try:
        important_info = soup.find("div", id="important-information")
        if important_info:
            div_children = important_info.find_all("div", recursive=False)
            for content_div in div_children:
                # Check if div contains the ingredients header
                if "ingredients" in content_div.text.lower():
                    p_tags = content_div.find_all("p")
                    if p_tags:
                        ingredients = " ".join([p.text.strip() for p in p_tags])
                        break  
    except Exception as e:
        print(f"Error extracting ingredients: {e}")
    return ingredients



def get_source_url(soup):
    source_url = ""
    try:
        canonical_link = soup.find('link', rel='canonical')

        if canonical_link:
            source_url = canonical_link.get('href')
            return source_url
        else:
            return None
    except Exception as e:
        print(f"Error extracting source url: {e}")
    return source_url

# Loop for each product type and corresponding URL
for p_type, p_url in zip(Product_type, Product_type_page):
    for i in range(1, 3):  #first 2 pages for each product type
        url = f"{p_url}&page={i}"
        browser.get(url)
        sleep(np.random.randint(2, 10))
        
        html_source_code = browser.execute_script("return document.body.innerHTML;")
        html_soup = BeautifulSoup(html_source_code, "html.parser")
        
        #product URLs
        product_urls = get_product_urls(html_soup)
        
        for product_url in product_urls:
            try:
                browser.get(product_url)
                sleep(np.random.randint(2, 10))
                
                product_page_html = browser.execute_script("return document.body.innerHTML;")
                product_page_soup = BeautifulSoup(product_page_html, "html.parser")
                
                
                title = get_title(product_page_soup)
                price = get_price(product_page_soup)
                ingredients = get_ingredients(product_page_soup)
                source_url = get_source_url(product_page_soup)
                
                
                Product_name.append(title)
                Product_price.append(price)
                Product_ingredients.append(ingredients)
                Product_type_list.append(p_type)
                Source_url.append(source_url)
                
                browser.back()
                sleep(np.random.randint(2, 10))
            except Exception as e:
                print(f"Error processing product URL {product_url}: {e}")


browser.quit()

#Dataframe
data = {
    "Product Name": Product_name,
    "Product Type": Product_type_list,
    "Ingredients": Product_ingredients,
    "Price": Product_price,
    "Source URL": Source_url
}
df = pd.DataFrame(data)

print(df)

df.to_csv("skin-care.csv")


                                           Product Name Product Type  \
0     Mamaearth Rice Face Wash With Rice Water & Nia...    Face wash   
1     Neutrogena Deep Clean Foaming Cleanser- Advanc...    Face wash   
2     DOT & KEY Barrier Repair + Hydrating Gentle Fa...    Face wash   
3     DOT & KEY Vitamin C + E Super Bright Gel Face ...    Face wash   
4             Himalaya Purifying Neem Face Wash, 400 ml    Face wash   
...                                                 ...          ...   
1139  L'Oreal Paris Revitalift Crystal Micro-Essence...        Toner   
1140  Lacto Calamine Rose Water Toner For Glowing Sk...        Toner   
1141  Khadi Essentials 100% Pure Wild Rose Water For...        Toner   
1142  Seer Secrets Rose Face Toner Mist Spray Made w...        Toner   
1143  Lotus Herbals Rosetone Rose Petals Facial Skin...        Toner   

                                            Ingredients Price  \
0      Myristic Acid, Glycerin, Aqua, Potassium Hydr...  232.   
1    

In [5]:
df.insert(1, 'Category', 'Skin Care')
df.to_csv("skin_care.csv")