### Project Summary:
 This project analyzes a collection of cosmetic or product data to identify relationships 
 between products based on shared ingredients. The workflow involves:
 - Flattening nested product information from JSON format into a structured DataFrame.
 - Extracting barcode information (UPC/EAN) and consolidating into a single 'BARCODE' column.
 - Identifying and matching products that share similar ingredients.
 - Counting and listing shared ingredients and matching products.
 - Grouping and labeling products with common ingredients for better interpretability.
 - Visualizing the distribution of shared ingredients using an interactive histogram.
 This analysis helps reveal ingredient-level similarities across products, useful for 
 clustering, recommendation systems, or product formulation insights.


In [62]:
#Import all necessary Libraries
import requests
from bs4 import BeautifulSoup
import time
import re
import pandas as pd

#visualization libraries
import matplotlib.pylab as plt
import seaborn as sns

In [63]:
%%time

# This are common ingredient use in skin care product
ingredient_list = [
    "aqua", "glycerin", "hyaluronic acid", "niacinamide", "vitamin b3","retinol", "vitamin a", 
    "ascorbic acid", "vitamin c", "tocopherol", "vitamin e","salicylic acid", "glycolic acid", "lactic acid",
    "ceramides", "peptides", "panthenol", "dimethicone", "petrolatum", "shea butter", "cocoa butter",
    "aloe vera", "squalane", "zinc oxide", "titanium dioxide", "benzoyl peroxide", "urea", "allantoin", 
    "green tea extract", "chamomile extract", "licorice root extract", "tea tree oil", "jojoba oil", 
    "argan oil", "rosehip oil", "coconut oil", "avocado oil", "almond oil", "castor oil", "witch hazel", 
    "alcohol denat", "fragrance", "sodium hyaluronate", "beta-glucan", "centella asiatica extract",
    "ferulic acid", "azelaic acid", "alpha arbutin", "kojic acid", "resveratrol", "bha", "aha", "pha", 
    "retinaldehyde", "retinyl palmitate", "tranexamic acid", "caffeine", "sulfur", "kaolin", "bentonite clay",
    "charcoal", "manuka honey","seaweed extract", "propolis", "snail mucin", "bakuchiol", "olive oil", "bees wax"
    ]

headers = {'User-Agent': 'Mozilla/5.0'} # Helps bypass basic blocks
response = requests.get(page_url, headers=headers)


def get_product_links(page_url):
    """
    This function extract all the product link from the provided site to scrap
    And returns all the links set for better optimization
    """
    soup = BeautifulSoup(response.text, 'html.parser')

    product_links = set()
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        if "/en-gb/product/" in href:
            if href.startswith("/"):
                href = "https://snapklik.com" + href
            product_links.add(href)
    return product_links

def extract_product_details(product_url):
    """
    This function extract all required information from each product link
    and it return a dictionary of the product information that we are able to scrap
    """
    time.sleep(5)  # This allow the webpage we are scrapping to load before the scrapping process begins
    response = requests.get(product_url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract image_url    
    first_div = soup.find('div', class_='carousel-image')
    product_image = first_div.find('img') if first_div else None     # Get the img inside it (only the first one)
    image_url = product_image['src'] if product_image else "N/A"

    
    # Extract title
    full_title = soup.find('h1')
    full_title = full_title.text.strip() if full_title else "N/A"
    title = re.split(r'[#,@|;\.]', full_title)[0] # This split the title just to extract the main product name
    Product_Line_Name = re.split(r'[#,@|;\.]', full_title)[1:] # The reminded of the title that was splitted is the product line name

    # Extract price of each product
    price_tag = soup.find('span', class_='product-price large')
    price = price_tag.text.strip() if price_tag else "N/A"

    # Extract Description
    product_detail = soup.find('li', class_='mat-subtitle-2 ng-star-inserted')
    text = product_detail.get_text(strip=True)  # Removes leading/trailing whitespace
    description = text.split('.')[0] #This split the product details in order to extract the first (first two) sentence(s)
    if len(description) < 100:
        description = description + text.split('.')[1]
    
    # Match ingredients from predefined list
    page_text = soup.get_text(separator=' ').lower()
    ingredients = [i for i in ingredient_list if i in page_text]

    # Extract table data
    table = soup.find('table')
    table_data = {}
    if table:
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) == 2:
                key = cells[0].get_text(strip=True)
                value = cells[1].get_text(strip=True)
                table_data[key] = value

    return {
        "url": product_url,
        "title": title,
        "image_url": image_url,
        "price": price,
        "description": description,
        "ingredients": ingredients,
        "Product_Line_Name": Product_Line_Name,
        "table_data": table_data
    }

# Scraping only products with ingredients from first 3 pages since the project is more
#specific on product with the same 
base_url = "https://snapklik.com/en-gb/g/c/skin-care?i={}&n=8&id=11060451"
all_products_with_ingredients = []

for i in range(0, 3):  # Adjust range as needed
    page_url = base_url.format(i * 16)
    product_links = get_product_links(page_url)
    for link in product_links:
        product = extract_product_details(link)
        if product["ingredients"] != "N/A":
            all_products_with_ingredients.append(product)



CPU times: total: 30.8 s
Wall time: 6min 32s


### Summary:
 - This cell flattens a list of product dictionaries into a structured pandas DataFrame.
 - Extracts key product details (e.g., title, image, price, description, ingredients, etc.).
 - Merges additional metadata from the 'table_data' field if it exists.
 - Each product becomes a row in the final DataFrame.
 - The result is a clean and analysis-ready DataFrame for further processing or visualization.


In [72]:

# Flatten the product dictionaries
flat_products = []
for prod in all_products_with_ingredients[:]:
    flat_dict = {
        "Source_URL": prod.get("url", "N/A"),
        "Product_Name": prod.get("title", "N/A"),
        "Product_image": prod.get("image_url", "N/A"),
        "Price": "$" + prod.get("price", "N/A"),
        "Product_Description": prod.get("description", "N/A"),
        "Ingredients": prod.get("ingredients", "N/A"),
        "Product_Line_Name:": prod["Product_Line_Name"]

    }
    
    # Merge table_data (expand keys like SKID, Manufacturer, etc.)
    if "table_data" in prod and isinstance(prod["table_data"], dict):
        flat_dict.update(prod["table_data"])
    
    flat_products.append(flat_dict)
    


# Convert to DataFrame
df = pd.DataFrame(flat_products)

df.head()

Unnamed: 0,Source_URL,Product_Name,Product_image,Price,Product_Description,Ingredients,Product_Line_Name:,Item weight,SKID,Manufacturer,Model,EAN,UPC,Part number
0,https://snapklik.com/en-gb/product/cerave-pm-f...,CeraVe PM Facial Moisturizing Lotion,https://m.media-amazon.com/images/I/71jdcrAdwb...,$29,[ OVERNIGHT MOISTURIZER ] Cerave’s PM face cre...,"[hyaluronic acid, niacinamide, ceramides, frag...",[ Night Cream],0.1 kg,05W04PP7I6G15,CeraVe,VALEANT432229,3606000537453.0,,
1,https://snapklik.com/en-gb/product/la-roche-po...,La Roche-Posay Toleriane Hydrating Gentle Faci...,https://m.media-amazon.com/images/I/61E45LOdhW...,$29,Its gentle cream formula helps restore skin co...,"[aqua, glycerin, niacinamide, tocopherol, cera...",[ Daily Face Wash],0.2 kg,02P64PW7RHMY5,AmazonUs/LOSQH,B07WS6BTDK,3337875685900.0,,
2,https://snapklik.com/en-gb/product/basics-liqu...,Basics Liquid Hand Soap Refill,https://m.media-amazon.com/images/I/61p2PTBG7k...,$24,One 50 fluid ounce bottle of milk and honey li...,[],"[ Milk And Honey Scent, Triclosan-free, 50 F...",,09DP4PI7PZ8Z5,Amazon.com Services LLC.,,,195515025225.0,
3,https://snapklik.com/en-gb/product/anua-heartl...,ANUA Heartleaf Quercetinol Pore Deep Cleansing...,https://m.media-amazon.com/images/I/51Kpw2r-pI...,$29,05% BHA : BHA effectively exfoliate dead skin ...,"[glycerin, hyaluronic acid, bha]","[ Facial Cleanser, For Double Cleansing, BHA...",0.1 kg,0PP84PF7VDKD5,,AN000089,8809640733642.0,,
4,https://snapklik.com/en-gb/product/hawaiian-tr...,Hawaiian Tropic After Sun Body Butter,https://m.media-amazon.com/images/I/71PfiAX-J+...,$24,THOUGHTFULLY made — Hawaiian Tropic after sun ...,"[shea butter, coconut oil, avocado oil, fragra...",[],0.3 kg,02W44P37CZWP5,Edgewell Personal Care,Hawaiian Tropic,,75486091668.0,


### Summary:
 This cell creates a unified 'BARCODE' column by prioritizing the 'UPC' value.
 - If 'UPC' is available, it is used.
 - If 'UPC' is missing but 'EAN' is available, 'EAN' is used instead.
 - If both are missing, the value is set to None.
 - The original 'UPC' and 'EAN' columns are then dropped from the DataFrame to avoid redundancy.


In [73]:
def barcode(row):
    if pd.notna(row["UPC"]):
        return row["UPC"]
    elif pd.notna(row["EAN"]):
        return row["EAN"]
    else:
        return None  

df["BARCODE"] = df.apply(barcode, axis=1)

df = df.drop(['UPC', "EAN"], axis=1)

df.columns

In [65]:
df.columns

Index(['Source_URL', 'Product_Name', 'Product_image', 'Price',
       'Product_Description', 'Ingredients', 'Product_Line_Name:',
       'Item weight', 'SKID', 'Manufacturer', 'Model', 'EAN', 'UPC',
       'Part number'],
      dtype='object')

In [74]:
df.rename(columns={"Item weight": "Size/Volume"}, inplace=True)
df.rename(columns={"Part number": "Product ID"}, inplace=True)
df.rename(columns={"Manufacturer": "Brand Name "}, inplace=True)



In [81]:
df.to_csv("Scraped product data.csv")

### Summary:
This cell defines a function to find products that share at least one ingredient with a given product.
For each product in the DataFrame:
 - It compares the product's ingredient list with all other products.
 - It identifies products with overlapping ingredients (excluding self-comparison).
 - It collects:
     - A comma-separated list of matching product names.
     - A comma-separated list of shared ingredients.
     - The number of matching products.
     - The number of unique shared ingredients.
 The results are stored in four new columns:
 'Products_With_Same_Ingredients', 'Shared_Ingredients', 
 'Num_Matching_Products', and 'Num_Unique_Shared_Ingredients'.


In [75]:
def find_matches_and_ingredients(current_index, current_ingredients, df):
    product_matches = []
    shared_ingredients = []
    current_set = set(current_ingredients)
    
    for idx, row in df.iterrows():
        if idx == current_index:
            continue
        other_set = set(row["Ingredients"])
        intersection = current_set & other_set
        if intersection:
            product_matches.append(row["Product_Name"])
            shared_ingredients.extend(intersection)
    
    # Convert lists to comma-separated strings (avoid storing raw lists)
    products_str = ", ".join(product_matches)
    shared_ingredients_str = ", ".join(sorted(set(shared_ingredients)))

    return (
        products_str,                          # Comma-separated product names
        shared_ingredients_str,                # Comma-separated shared ingredients
        len(product_matches),                  # Number of matched products
        len(set(shared_ingredients))           # Number of unique shared ingredients
    )

# Apply the function and assign to 4 columns
df[[
    "Products_With_Same_Ingredients", 
    "Shared_Ingredients", 
    "Num_Matching_Products", 
    "Num_Unique_Shared_Ingredients"
]] = df.apply(
    lambda row: pd.Series(find_matches_and_ingredients(row.name, row["Ingredients"], df)),
    axis=1
)
df.head()

### Summary:
 This cell sorts the DataFrame (`df`) in descending order based on the 
 number of unique shared ingredients (`Num_Unique_Shared_Ingredients`).
 - The result is stored in a new DataFrame called `df_sorted`, with the index reset for clarity.
 - This helps prioritize or rank products by the richness of shared ingredient connections.


In [77]:
df_sorted = df.sort_values(by='Num_Unique_Shared_Ingredients', ascending=False).reset_index(drop=True)
df_sorted


Unnamed: 0,Source_URL,Product_Name,Product_image,Price,Product_Description,Ingredients,Product_Line_Name:,Size/Volume,SKID,Brand Name,Model,Product ID,BARCODE,Products_With_Same_Ingredients,Shared_Ingredients,Num_Matching_Products,Num_Unique_Shared_Ingredients
0,https://snapklik.com/en-gb/product/la-roche-po...,La Roche-Posay Toleriane Hydrating Gentle Faci...,https://m.media-amazon.com/images/I/61E45LOdhW...,$29,Its gentle cream formula helps restore skin co...,"[aqua, glycerin, niacinamide, tocopherol, cera...",[ Daily Face Wash],0.2 kg,02P64PW7RHMY5,AmazonUs/LOSQH,B07WS6BTDK,,3337875685900,"CeraVe PM Facial Moisturizing Lotion, La Roche...","aqua, ceramides, fragrance, glycerin, niacinam...",26,7
1,https://snapklik.com/en-gb/product/la-roche-po...,La Roche-Posay Toleriane Hydrating Gentle Faci...,https://m.media-amazon.com/images/I/61E45LOdhW...,$29,Its gentle cream formula helps restore skin co...,"[aqua, glycerin, niacinamide, tocopherol, cera...",[ Daily Face Wash],0.2 kg,02P64PW7RHMY5,AmazonUs/LOSQH,B07WS6BTDK,,3337875685900,"CeraVe PM Facial Moisturizing Lotion, ANUA Hea...","aqua, ceramides, fragrance, glycerin, niacinam...",26,7
2,https://snapklik.com/en-gb/product/la-roche-po...,La Roche-Posay Toleriane Hydrating Gentle Faci...,https://m.media-amazon.com/images/I/61E45LOdhW...,$29,Formulated with La Roche-Posay prebiotic therm...,"[aqua, glycerin, niacinamide, tocopherol, cera...",[ Daily Face Wash],0.2 kg,02P64PW7RHMY5,AmazonUs/LOSQH,B07WS6BTDK,,3337875685900,"CeraVe PM Facial Moisturizing Lotion, La Roche...","aqua, ceramides, fragrance, glycerin, niacinam...",26,7
3,https://snapklik.com/en-gb/product/cerave-skin...,Skin Renewing Night Cream,https://m.media-amazon.com/images/I/71r7bLsvaV...,$34,[ PEPTIDE COMPLEX ] Help reduced the look of f...,"[hyaluronic acid, niacinamide, ceramides, pept...","[ Niacinamide, Peptide Complex, And Hyaluron...",0.3 kg,05IJ4PB7FKHH5,CeraVe,B00SNPCSUY,,3606000537606,"CeraVe PM Facial Moisturizing Lotion, La Roche...","ceramides, fragrance, hyaluronic acid, niacina...",23,5
4,https://snapklik.com/en-gb/product/cerave-skin...,Skin Renewing Night Cream,https://m.media-amazon.com/images/I/71r7bLsvaV...,$34,[ PEPTIDE COMPLEX ] Help reduced the look of f...,"[hyaluronic acid, niacinamide, ceramides, pept...","[ Niacinamide, Peptide Complex, And Hyaluron...",0.3 kg,05IJ4PB7FKHH5,CeraVe,B00SNPCSUY,,3606000537606,"CeraVe PM Facial Moisturizing Lotion, La Roche...","ceramides, fragrance, hyaluronic acid, niacina...",23,5
5,https://snapklik.com/en-gb/product/cerave-skin...,Skin Renewing Night Cream,https://m.media-amazon.com/images/I/71r7bLsvaV...,$34,[ PEPTIDE COMPLEX ] Help reduced the look of f...,"[hyaluronic acid, niacinamide, ceramides, pept...","[ Niacinamide, Peptide Complex, And Hyaluron...",0.3 kg,05IJ4PB7FKHH5,CeraVe,B00SNPCSUY,,3606000537606,"CeraVe PM Facial Moisturizing Lotion, La Roche...","ceramides, fragrance, hyaluronic acid, niacina...",23,5
6,https://snapklik.com/en-gb/product/cerave-pm-f...,CeraVe PM Facial Moisturizing Lotion,https://m.media-amazon.com/images/I/71jdcrAdwb...,$29,[ OVERNIGHT MOISTURIZER ] Cerave’s PM face cre...,"[hyaluronic acid, niacinamide, ceramides, frag...",[ Night Cream],0.1 kg,05W04PP7I6G15,CeraVe,VALEANT432229,,3606000537453,"CeraVe PM Facial Moisturizing Lotion, La Roche...","ceramides, fragrance, hyaluronic acid, niacina...",23,4
7,https://snapklik.com/en-gb/product/hawaiian-tr...,Hawaiian Tropic After Sun Body Butter,https://m.media-amazon.com/images/I/71PfiAX-J+...,$24,THOUGHTFULLY made — Hawaiian Tropic after sun ...,"[shea butter, coconut oil, avocado oil, fragra...",[],0.3 kg,02W44P37CZWP5,Edgewell Personal Care,Hawaiian Tropic,,75486091668,"CeraVe PM Facial Moisturizing Lotion, La Roche...","avocado oil, coconut oil, fragrance, shea butter",17,4
8,https://snapklik.com/en-gb/product/cerave-pm-f...,CeraVe PM Facial Moisturizing Lotion,https://m.media-amazon.com/images/I/71jdcrAdwb...,$29,[ OVERNIGHT MOISTURIZER ] Cerave’s PM face cre...,"[hyaluronic acid, niacinamide, ceramides, frag...",[ Night Cream],0.1 kg,05W04PP7I6G15,CeraVe,VALEANT432229,,3606000537453,"CeraVe PM Facial Moisturizing Lotion, La Roche...","ceramides, fragrance, hyaluronic acid, niacina...",23,4
9,https://snapklik.com/en-gb/product/hawaiian-tr...,Hawaiian Tropic After Sun Body Butter,https://m.media-amazon.com/images/I/71PfiAX-J+...,$24,ULTRA-RICH MOISTURIZERS leave skin feeling sil...,"[shea butter, coconut oil, avocado oil, fragra...",[],0.3 kg,02W44P37CZWP5,Edgewell Personal Care,Hawaiian Tropic,,75486091668,"CeraVe PM Facial Moisturizing Lotion, La Roche...","avocado oil, coconut oil, fragrance, shea butter",17,4


### Summary:
 - This code creates an interactive histogram showing the distribution of the
 - 'Num_Unique_Shared_Ingredients' values in the dataset.
 - It divides the data into 30 bins, colors the bars blue, and adds spacing between bars
 - for clearer visualization.
 - The chart helps identify how common various counts of unique shared ingredients are among products.


In [78]:
import plotly.express as px

fig = px.histogram(
    df,
    x="Num_Unique_Shared_Ingredients",
    nbins=30,
    title="Distribution of Unique Shared Ingredients",
    labels={"Num_Unique_Shared_Ingredients": "Number of Unique Shared Ingredients"},
    color_discrete_sequence=["#636EFA"]
)

fig.update_layout(bargap=0.2)
fig.show()


### Summary:
- This code groups products by their unique shared ingredients and creates a summary table.
- It combines all product names that share the same ingredients.
- It removes duplicates and sorts product names alphabetically.
- It assigns each group a label (A, B, C, ...) for easier reference.
- The final output is a clean, readable DataFrame with:
Group | Shared Ingredients | Product Names


In [79]:
import string

# Group by unique Shared_Ingredients
grouped_df = df.groupby("Shared_Ingredients")["Products_With_Same_Ingredients"].apply(lambda x: ", ".join(sorted(set(", ".join(x).split(", "))))).reset_index()

# Assign Group Labels: A, B, C...
grouped_df.insert(0, "Group", list(string.ascii_uppercase[:len(grouped_df)]))

# Rename columns to match desired output
grouped_df.columns = ["Group", "Shared Ingredients", "Product Names"]

# Sort alphabetically by Shared Ingredients
grouped_df = grouped_df.sort_values("Shared Ingredients").reset_index(drop=True)


In [80]:
grouped_df.to_csv("Grouped Ingredients.csv")
grouped_df

Unnamed: 0,Group,Shared Ingredients,Product Names
0,A,,
1,B,"aqua, ceramides, fragrance, glycerin, niacinam...",ANUA Heartleaf Quercetinol Pore Deep Cleansing...
2,C,"aqua, glycerin, petrolatum",ANUA Heartleaf Quercetinol Pore Deep Cleansing...
3,D,"avocado oil, coconut oil, fragrance, shea butter","CeraVe PM Facial Moisturizing Lotion, Dove Bar..."
4,E,"bha, glycerin, hyaluronic acid",ANUA Heartleaf Quercetinol Pore Deep Cleansing...
5,F,"ceramides, fragrance, hyaluronic acid, niacina...",ANUA Heartleaf Quercetinol Pore Deep Cleansing...
6,G,"ceramides, fragrance, hyaluronic acid, niacina...",ANUA Heartleaf Quercetinol Pore Deep Cleansing...
7,H,fragrance,"CeraVe PM Facial Moisturizing Lotion, Dove Bar..."
8,I,niacinamide,"CeraVe PM Facial Moisturizing Lotion, Grace & ..."
9,J,retinol,Glass Skin Starter Set
