In [8]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

# --- Crawl settings ---
LIMIT: int = 100  # upper bound on pages counted by crawler() in one run; adjust as needed
base_url: str = "https://www.muscleblaze.com"  # seed page; crawler() only follows links on this host

# --- Mutable crawl state, read and updated by crawler() ---
products: list = []            # scraped product dicts with keys name / price / url / img
seen_products: set = set()     # (title, price) keys already stored in `products`
visited_urls: set = set()      # URLs that have already been processed
to_visit: list = [base_url]    # pending URLs; crawler() pops from the end of this list

def _extract_product(block, page_url):
    """Build one product record from a ``variant-container`` element.

    Parameters
    ----------
    block : bs4 element
        The ``div.variant-container`` node describing a single product.
    page_url : str
        URL of the page the block was found on; relative links and image
        sources are resolved against it.

    Returns
    -------
    dict
        Keys ``name``, ``price``, ``url`` and ``img``. ``price``, ``url`` and
        ``img`` fall back to ``"N/A"`` when the matching element is absent.
    """
    # --- Title: brand text plus flavour/weight text ---
    info_elem = block.find("div", class_="info")
    brand = info_elem.get_text(strip=True) if info_elem else ""

    flav_elem = block.find("span", class_="flavor-wt")
    flavor = flav_elem.get_text(strip=True) if flav_elem else ""

    # Join only the parts that exist so a missing flavour does not leave a dangling " |".
    title = " | ".join(part for part in (brand, flavor) if part)

    # --- Price ---
    price_elem = block.find("div", class_="offer-price")
    price = price_elem.get_text(strip=True) if price_elem else "N/A"

    # --- Product URL ---
    a_tag = block.find("a", class_="float-wrapper", href=True)
    product_url = urljoin(page_url, a_tag["href"]) if a_tag else "N/A"

    # --- Image URL: prefer the "data-original" attribute, fall back to "src" ---
    image_container = block.find("div", class_="image-container")
    img_tag = image_container.find("img") if image_container else None
    img_src = (img_tag.get("data-original") or img_tag.get("src")) if img_tag else None
    # Without a usable source, urljoin would just echo the page URL, so report "N/A" instead.
    img_url = urljoin(page_url, img_src) if img_src else "N/A"

    return {"name": title, "price": price, "url": product_url, "img": img_url}


def crawler(verbose=True):
    """Crawl same-host pages starting from ``to_visit`` and collect products.

    Works on the module-level state: URLs are popped from the end of
    ``to_visit``, every processed URL is added to ``visited_urls`` (including
    URLs whose fetch failed, so they are not retried), and each product with a
    new ``(title, price)`` key is appended to ``products`` and remembered in
    ``seen_products``. The loop stops when the queue is empty or ``LIMIT``
    pages have been fetched successfully.

    Parameters
    ----------
    verbose : bool, default True
        When true, print one line per newly stored product. Page-level
        progress and error messages are printed regardless.
    """
    crawl_count = 0
    site_host = urlparse(base_url).netloc  # loop-invariant, so computed once
    queued = set(to_visit)  # mirrors to_visit for O(1) "already queued?" checks

    while to_visit and crawl_count < LIMIT:
        current_url = to_visit.pop()
        queued.discard(current_url)

        if current_url in visited_urls:
            continue
        # Mark before fetching: a URL that fails is not queued and fetched again.
        visited_urls.add(current_url)
        print(f'Crawling: {current_url}')

        try:
            response = requests.get(current_url, timeout=10)
            # Treat 4xx/5xx as failures instead of scraping the error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
        except Exception as e:
            # Best effort: one bad page must not abort the whole crawl.
            print(f"Failed to fetch {current_url}: {e}")
            continue

        crawl_count += 1

        for block in soup.find_all("div", class_="variant-container"):
            try:
                product = _extract_product(block, current_url)

                # --- Unique Product Check ---
                unique_key = (product["name"], product["price"])
                if unique_key not in seen_products:
                    products.append(product)
                    seen_products.add(unique_key)
                    if verbose:
                        print(f"{product['name']} | {product['price']} | {product['url']} | {product['img']}")

            except Exception as e:
                print(f"Error extracting product from {current_url}: {e}")

        # --- Find and Queue New URLs ---
        for link in soup.find_all('a', href=True):
            try:
                # Resolve against the page the link lives on, then drop any
                # "#fragment" so anchors to the same page are not crawled twice.
                parsed = urlparse(urljoin(current_url, link['href']))._replace(fragment="")
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal); skip it.
                continue
            if parsed.netloc != site_host:
                continue
            full_url = parsed.geturl()
            if full_url not in visited_urls and full_url not in queued:
                to_visit.append(full_url)
                queued.add(full_url)

    print(f"\nCrawling complete. Visited {crawl_count} pages.")
    print(f"Total Unique Products Scraped: {len(products)}")

# Start the crawl with per-product lines suppressed; page progress is still printed.
crawler(verbose=False)


Crawling: https://www.muscleblaze.com
Crawling: https://www.muscleblaze.com/sale/oats-range?itracker=w:home|top-banner-slider|;p:6|;e:122893|;
Crawling: https://www.muscleblaze.com/sv/muscleblaze-high-protein-oats/SP-98284?navKey=VRNT-183068
Crawling: https://www.muscleblaze.com/loyality?itracker=w:|pdp|;c:loyality|;
Crawling: https://www.muscleblaze.com/categories/fit-foods/oats-and-cereals?navKey=CL-5718&itracker=w:||;
Crawling: https://www.muscleblaze.com/categories/pre-post-workout?navKey=CL-1709
Crawling: https://www.muscleblaze.com/categories/pre-post-workout/pre-workout?navKey=CL-5710
Crawling: https://www.muscleblaze.com/categories/pre-post-workout/creatine?navKey=CL-5714
Crawling: https://www.muscleblaze.com/categories/pre-post-workout/bcaa-n-eaa?navKey=CL-5712
Crawling: https://www.muscleblaze.com/categories/fit-foods/protein-bars?navKey=CL-5722
Crawling: https://www.muscleblaze.com/categories/proteins/whey-proteins?navKey=CL-5692
Crawling: https://www.muscleblaze.com/categor

Crawling: https://www.muscleblaze.com/sv/koshaveda-prash-pro-by-muscleblaze/SP-124837?navKey=VRNT-238169&itracker=w:menuLanding||;e:238169|;
Crawling: https://www.muscleblaze.com/categories/ayurveda/chyawanprash?navKey=CL-5915&itracker=w:||;
Crawling: https://www.muscleblaze.com/sv/muscleblaze-biozyme-5-in-1-multivitamin/SP-103974?navKey=VRNT-194572&itracker=w:menuLanding||;e:194572|;
Crawling: https://www.muscleblaze.com/pk/muscleblaze-creatine-monohydrate-creamp-100g-n-koshaveda-shilajit-pro-20g-combo?navKey=PA-130391&itracker=w:menuLanding||;e:130391|;
Crawling: https://www.muscleblaze.com/sv/muscleblaze-creatine-monohydrate-creamp/SP-33852?navKey=VRNT-63864
Crawling: https://www.muscleblaze.com/categories/pre-post-workout/creatine?navKey=CL-5714&itracker=w:||;
Crawling: https://www.muscleblaze.com/sv/muscleblaze-fish-oil-1000mg-n-mb-vite-multivitamin-combo/SP-84955?navKey=VRNT-159103&itracker=w:menuLanding||;e:159103|;
Crawling: https://www.muscleblaze.com/sv/koshaveda-shilajit-pro

In [9]:
import pandas as pd
# Tabulate the scraped product dicts (keys: name, price, url, img), one row per product.
df = pd.DataFrame(products)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,name,price,url,img
0,"MuscleBlaze High Protein Oats, 2 kg, Dark Choc...","₹1,199",,https://img10.hkrtcdn.com/30236/prd_3023529-Mu...
1,"MuscleBlaze High Protein Oats, 1 kg, Dark Choc...",₹649,,https://img8.hkrtcdn.com/39672/prd_3967117-Mus...
2,"MuscleBlaze High Protein Oats, 1 kg, Unflavoured",₹499,,https://img6.hkrtcdn.com/29836/prd_2983545_c_s...
3,"MuscleBlaze High Protein Oats, 1 kg, Fruit & S...",₹649,,https://img10.hkrtcdn.com/39607/prd_3960649-Mu...
4,"MuscleBlaze Instant Oats, 1 kg, Unflavoured",₹275,,https://img8.hkrtcdn.com/39080/prd_3907967-Mus...
...,...,...,...,...
96,"MuscleBlazeBiozyme 5 in 1 Multivitamin, 90 tab...",₹949,https://www.muscleblaze.com/sv/muscleblaze-bio...,https://img4.hkrtcdn.com/35430/prd_3542923-Mus...
97,"MuscleBlazeMB-VITE Daily Multivitamin, for Enh...",₹949,https://www.muscleblaze.com/sv/muscleblaze-mb-...,https://img8.hkrtcdn.com/40262/prd_4026127-Mus...
98,"MuscleBlazeMB-VITE Daily Multivitamin, for Enh...",₹329,https://www.muscleblaze.com/sv/muscleblaze-mb-...,https://img2.hkrtcdn.com/30875/prd_3087491-Mus...
99,MuscleBlazeFish Oil 1000mg & MB-Vite Multivita...,₹899,https://www.muscleblaze.com/sv/muscleblaze-fis...,https://img4.hkrtcdn.com/40259/prd_4025883-Mus...


In [12]:
def product_lookup(dataframe, name_lookup, regex=True):
    """Return the products whose name matches ``name_lookup``, cheapest first.

    Parameters
    ----------
    dataframe : DataFrame or any input accepted by ``pd.DataFrame``
        Must provide ``name`` and ``price`` columns. The input is copied and
        never modified.
    name_lookup : str
        Text to search for in ``name``; the match ignores case.
    regex : bool, default True
        When true (the default), ``name_lookup`` is a regular expression.
        Pass ``False`` to match it literally, e.g. for text containing ``(``
        or ``+``.

    Returns
    -------
    DataFrame
        Matching rows sorted by ascending numeric ``price`` with a fresh
        0..n-1 index. Prices that cannot be parsed (such as the ``"N/A"``
        placeholder) become NaN and sort last; when every price parses as a
        whole number the column keeps an integer dtype.
    """
    frame = pd.DataFrame(dataframe).copy()

    # Strip the rupee sign, thousands separators and any whitespace
    # (including non-breaking spaces) before converting.
    cleaned = frame['price'].astype(str).str.replace(r"[₹,\s]", "", regex=True)
    # errors="coerce" turns unparseable prices into NaN instead of raising.
    frame['price'] = pd.to_numeric(cleaned, errors="coerce")

    matches = frame['name'].str.contains(name_lookup, case=False, na=False, regex=regex)
    return frame[matches].sort_values('price').reset_index(drop=True)


In [17]:
# Products whose name contains "Protein" (case-insensitive), sorted by ascending price.
a = product_lookup(df, 'Protein')

In [23]:
# Show the image-URL column of the lookup result.
a.img

0     https://img4.hkrtcdn.com/31085/prd_3108483-Mus...
1     https://img4.hkrtcdn.com/31085/prd_3108483-Mus...
2     https://img10.hkrtcdn.com/38703/prd_3870289-Mu...
3     https://img8.hkrtcdn.com/31056/prd_3105507-Mus...
4     https://img3.hkrtcdn.com/22853/prd_2285202-Mus...
5     https://img6.hkrtcdn.com/29836/prd_2983545_c_s...
6     https://img6.hkrtcdn.com/29836/prd_2983545_c_s...
7     https://img10.hkrtcdn.com/39607/prd_3960649-Mu...
8     https://img8.hkrtcdn.com/39672/prd_3967117-Mus...
9     https://img10.hkrtcdn.com/39607/prd_3960649-Mu...
10    https://img8.hkrtcdn.com/39672/prd_3967117-Mus...
11    https://img9.hkrtcdn.com/22852/prd_2285188-Mus...
12    https://img2.hkrtcdn.com/35786/prd_3578531-Mus...
13    https://img10.hkrtcdn.com/35963/prd_3596259-Mu...
14    https://img4.hkrtcdn.com/35871/pck_3587003_c_s...
15    https://img10.hkrtcdn.com/30236/prd_3023529-Mu...
16    https://img10.hkrtcdn.com/30236/prd_3023529-Mu...
17    https://img2.hkrtcdn.com/30239/pck_3023841