## Web Scraping

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import os

# --- Scraper configuration ---------------------------------------------------
csv_file = "incidecoder_hair_products.csv"
CSV_COLUMNS = ["Brand", "Product Title", "Details", "Ingredient", "What-it-does"]
FIRST_PAGE = 1
LAST_PAGE = 94       # last search-result page to visit (inclusive)
WAIT_TIMEOUT_S = 10  # seconds an explicit wait polls before giving up
LOAD_PAUSE_S = 2     # fixed pause after every navigation
MISSING = "N/A"      # placeholder written when a field cannot be scraped


def _waited_text(wait, css_selector):
    """Return the stripped text of the element matching ``css_selector``.

    Args:
        wait: the WebDriverWait bound to the current driver.
        css_selector: CSS selector of the element to read.

    Returns:
        The element's stripped text, or "N/A" when the lookup fails
        (for example because the wait timed out).
    """
    try:
        element = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))
        )
        return element.text.strip()
    except Exception:
        # `Exception` rather than a bare `except:` so that Ctrl-C
        # (KeyboardInterrupt) still stops the crawl instead of being
        # recorded as a missing field.
        return MISSING


def _scrape_product(driver, wait, link):
    """Open one product page and collect its ingredient rows.

    Args:
        driver: the Chrome driver used for navigation.
        wait: the WebDriverWait bound to ``driver``.
        link: URL of the product page.

    Returns:
        A ``(product_title, rows)`` tuple. ``rows`` holds one dict per
        ingredient-table row that has at least two cells, keyed like
        ``CSV_COLUMNS``. When the table cannot be read, a row with "N/A" in
        the two ingredient columns is added instead.

    Raises:
        Whatever ``driver.get`` raises when the page cannot be opened.
    """
    driver.get(link)
    time.sleep(LOAD_PAUSE_S)

    brand = _waited_text(wait, "span#product-brand-title")
    product_title = _waited_text(wait, "span#product-title")

    try:
        # Looked up once, without an explicit wait.
        details = driver.find_element(By.CSS_SELECTOR, "span#product-details").text.strip()
    except Exception:
        details = MISSING

    rows = []
    try:
        table = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "table.product-skim.fs16 tbody"))
        )
        for table_row in table.find_elements(By.TAG_NAME, "tr"):
            cells = table_row.find_elements(By.TAG_NAME, "td")
            if len(cells) >= 2:
                rows.append({
                    "Brand": brand,
                    "Product Title": product_title,
                    "Details": details,
                    "Ingredient": cells[0].text.strip(),
                    "What-it-does": cells[1].text.strip()
                })
    except Exception:
        # Keep the product in the output even when its table is unreadable.
        rows.append({
            "Brand": brand,
            "Product Title": product_title,
            "Details": details,
            "Ingredient": MISSING,
            "What-it-does": MISSING
        })

    return product_title, rows


# Create the CSV with its header row if it doesn't exist yet; pages are
# appended to it below without a header.
if not os.path.exists(csv_file):
    pd.DataFrame(columns=CSV_COLUMNS).to_csv(csv_file, index=False)

for page in range(FIRST_PAGE, LAST_PAGE + 1):
    print(f"\nðŸ”Ž Visiting page {page}")
    url = f"https://incidecoder.com/search?query=hair&activetab=products&ppage={page}"

    # Start a fresh headless driver for each search page.
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")   # run without GUI
    driver = webdriver.Chrome(options=options)
    wait = WebDriverWait(driver, WAIT_TIMEOUT_S)

    page_data = []  # rows gathered from this search page

    try:
        driver.get(url)
        time.sleep(LOAD_PAUSE_S)

        # Read every product URL up front: navigating to a product page
        # leaves the search page, so the card elements cannot be reused.
        product_cards = wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.klavika.simpletextlistitem"))
        )
        product_links = [card.get_attribute("href") for card in product_cards]
        print(f"âž¡ Found {len(product_links)} products")

        for link in product_links:
            try:
                product_title, product_rows = _scrape_product(driver, wait, link)
            except Exception as e:
                # One unreachable product must not discard the rest of the page.
                print(f"Skipping product {link}: {e}")
                continue

            page_data.extend(product_rows)
            print(f"âœ” Done: {product_title}")

    except Exception as e:
        print(f"âš  Failed on page {page}: {e}")

    finally:
        # Always release the browser, even when the page failed.
        driver.quit()

    # Save after every page (append to CSV) so an interrupted run keeps the
    # pages that were already finished.
    if page_data:
        # `columns=` pins the column order to the header written above.
        df = pd.DataFrame(page_data, columns=CSV_COLUMNS)
        df.to_csv(csv_file, mode="a", header=False, index=False)
        print(f"ðŸ’¾ Saved {len(page_data)} rows from page {page} to CSV")

print(f"âœ… Final data saved to {csv_file}")


## Remove the empty data

In [35]:
import pandas as pd

# Read the scraped rows back from disk and preview the first five of them.
scraped_csv = 'incidecoder_hair_products.csv'
prod_df = pd.read_csv(scraped_csv)
prod_df.head(n=5)


Unnamed: 0,Brand,Product Title,Details,Ingredient,What-it-does
0,Hair Gain,Hair Mask,Nourishing miracle treatment for your hair. Lo...,Aqua (Water),solvent
1,Hair Gain,Hair Mask,Nourishing miracle treatment for your hair. Lo...,Cetearyl Alcohol,"emollient, viscosity controlling, emulsifying,..."
2,Hair Gain,Hair Mask,Nourishing miracle treatment for your hair. Lo...,Betaine,moisturizer/humectant
3,Hair Gain,Hair Mask,Nourishing miracle treatment for your hair. Lo...,Behentrimonium Chloride,preservative
4,Hair Gain,Hair Mask,Nourishing miracle treatment for your hair. Lo...,,


## Remove all the rows that have no ingredients and what-it-does

In [39]:
import pandas as pd

# Load the scraped rows.
csv_file = "incidecoder_hair_products.csv"
df = pd.read_csv(csv_file)

# A value counts as missing when it is NaN, blank after stripping whitespace,
# or the literal placeholder "N/A". (pd.read_csv's default NA handling is
# expected to have turned empty fields and "N/A" into NaN already; the string
# comparisons are kept as a safety net.)
ingredient = df["Ingredient"]
what_it_does = df["What-it-does"]
ingredient_missing = ingredient.isna() | ingredient.str.strip().eq("") | ingredient.eq("N/A")
what_it_does_missing = what_it_does.isna() | what_it_does.str.strip().eq("") | what_it_does.eq("N/A")

# Keep only the rows where BOTH columns hold a value, i.e. a row is dropped as
# soon as either column is missing.
# NOTE(review): the section heading reads as if only rows missing both values
# should be removed; this filter is stricter than that. Confirm which is intended.
df_clean = df.loc[~ingredient_missing & ~what_it_does_missing]

# Write the filtered rows to a separate file so the raw scrape stays untouched.
clean_file = "incidecoder_hair_products_clean.csv"
df_clean.to_csv(clean_file, index=False)

print(f"âœ… Cleaned file saved as: {clean_file}")
print(f"ðŸ“Š Rows before: {len(df)}, Rows after cleaning: {len(df_clean)}")


âœ… Cleaned file saved as: incidecoder_hair_products_clean.csv
ðŸ“Š Rows before: 84663, Rows after cleaning: 12519


## List all the unique what-it-does values

In [45]:

# Replace missing entries with the "N/A" placeholder and force every value to
# str so each cell can be split below (this updates the df column in place).
df["What-it-does"] = df["What-it-does"].fillna("N/A").astype(str)

# A cell may list several comma-separated functions; collect each trimmed
# label exactly once.
seen_labels = set()
for cell_text in df["What-it-does"]:
    for label in cell_text.split(","):
        seen_labels.add(label.strip())

# Leave the placeholder out of the final alphabetical list.
seen_labels.discard("N/A")
unique_functions = sorted(seen_labels)

print("Unique What-it-does values:")
for func in unique_functions:
    print(f"- {func}")


Unique What-it-does values:
- abrasive/scrub
- anti-acne
- antimicrobial/antibacterial
- antioxidant
- buffering
- cell-communicating ingredient
- chelating
- colorant
- emollient
- emulsifying
- exfoliant
- moisturizer/humectant
- perfuming
- preservative
- skin brightening
- skin-identical ingredient
- solvent
- soothing
- sunscreen
- surfactant/cleansing
- viscosity controlling
