In [1]:
# Switch the kernel's working directory to the project folder; the scraping
# cell below writes "fashion_images/" and "fashion_dataset_full.csv" relative
# to the current working directory.
# NOTE(review): this is an absolute, machine-specific Windows path — anyone
# running the notebook elsewhere must edit it (or remove this cell).
%cd "D:\GenAI-powered Smart Retail Experience"

D:\GenAI-powered Smart Retail Experience


In [4]:
pip install requests beautifulsoup4 pandas

Note: you may need to restart the kernel to use updated packages.


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time
import re

# 🗂 Output locations (relative to the current working directory)
IMAGE_DIR = "fashion_images"
OUTPUT_CSV = "fashion_dataset_full.csv"
os.makedirs(IMAGE_DIR, exist_ok=True)

# 🌐 Request settings
BASE_URL = "https://www.flipkart.com"
PAGES_PER_CATEGORY = 2          # scrape first 2 pages per category
REQUEST_TIMEOUT = 15            # seconds; without a timeout a stalled request hangs forever
REQUEST_DELAY = 1               # seconds to pause after every product request
PRODUCT_LINK_CLASSES = ("IRpwTa", "WKTcLC")   # anchor classes used on search result pages
IMAGE_CLASSES = ("_396cs4 _2amPTt _3qGmMb", "_396cs4 _2amPTt")  # tried in this order

# One session for the whole run: reuses connections and sends the same headers
# on every request.
# NOTE(review): a browser-like User-Agent is sent because many sites reject the
# default python-requests one — confirm this is sufficient (and permitted) for
# the target site.
session = requests.Session()
session.headers.update({
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
})

# 🔍 Fashion categories to scrape
queries = [
    "men+t+shirts",
    "men+jeans",
    "men+shoes",
    "women+dresses",
    "women+tops",
    "women+sarees",
    "women+handbags",
    "unisex+sneakers",
    "kids+clothes"
]

# 🧾 List to hold all data
all_data = []

# Image file names already handed out in this run (prevents overwrites)
used_image_names = set()


def fetch_soup(url):
    """Download ``url`` and return its HTML parsed with BeautifulSoup.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status (so blocked/error pages are not parsed
            as if they were real content).
    """
    response = session.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def first_text(soup, tag, css_class, default=None):
    """Return the stripped text of the first ``tag`` with ``css_class``, or ``default``."""
    node = soup.find(tag, {"class": css_class})
    if node is None:
        return default
    return node.text.strip()


def collect_product_links(soup):
    """Return absolute product URLs found on a search page, de-duplicated in page order."""
    links = []
    seen = set()
    for css_class in PRODUCT_LINK_CLASSES:
        for anchor in soup.find_all("a", {"class": css_class}):
            href = anchor.get("href")   # .get: an anchor without href must not abort the run
            if not href:
                continue
            product_url = BASE_URL + href
            if product_url not in seen:
                seen.add(product_url)
                links.append(product_url)
    return links


def find_image_url(soup):
    """Return the ``src`` of the main product image, or ``None`` if not found."""
    for css_class in IMAGE_CLASSES:
        image_tag = soup.find("img", {"class": css_class})
        if image_tag is not None and image_tag.get("src"):
            return image_tag["src"]
    return None


def unique_image_name(name):
    """Build a file name from ``name`` that has not been used earlier in this run.

    The stem is the product name with non-word runs replaced by ``_`` and cut
    to 40 characters; a numeric suffix is appended on collision (e.g. several
    products named "Unknown", or names sharing the same first 40 characters).
    """
    stem = re.sub(r'\W+', '_', name)[:40]
    candidate = stem + ".jpg"
    suffix = 1
    while candidate in used_image_names:
        suffix += 1
        candidate = f"{stem}_{suffix}.jpg"
    used_image_names.add(candidate)
    return candidate


def download_image(image_url, image_path):
    """Download ``image_url`` to ``image_path``.

    Raises:
        requests.RequestException: on network failure or non-2xx status, so an
            error page is never written to disk as a ``.jpg``.
    """
    response = session.get(image_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    with open(image_path, "wb") as f:
        f.write(response.content)


def scrape_product(product_url, category):
    """Scrape one product page and download its image.

    Returns:
        dict with the dataset columns, or ``None`` when the page has no
        price or no image (such products are skipped).

    Raises:
        requests.RequestException: if the page or the image cannot be fetched.
    """
    psoup = fetch_soup(product_url)

    # name ("or" also covers a present-but-empty title, which would otherwise
    # break the brand extraction below)
    name = first_text(psoup, "span", "B_NuCI") or "Unknown"

    # brand: first word of the product name
    brand = name.split()[0] if name != "Unknown" else "Unknown"

    # price, with currency symbol and thousands separators removed
    price = first_text(psoup, "div", "_30jeq3 _16Jk6d")
    if price is not None:
        price = price.replace("₹", "").replace(",", "").strip()

    # rating
    rating = first_text(psoup, "div", "_3LWZlK")

    # description: find_all returns an empty list (it does not raise) when
    # there are no highlights, so the fallback is applied to the joined text
    highlights = psoup.find_all("li", {"class": "_21Ahn-"})
    description = ", ".join(h.text for h in highlights) or "No description"

    # image
    image_url = find_image_url(psoup)

    if not image_url or not price:
        return None

    # download image
    image_name = unique_image_name(name)
    download_image(image_url, os.path.join(IMAGE_DIR, image_name))

    return {
        "category": category,
        "product_name": name,
        "brand": brand,
        "price": price,
        "rating": rating,
        "description": description,
        "image_name": image_name,
        "image_url": image_url,
        "product_link": product_url
    }


# 🌍 Loop through each category
for query in queries:
    base_url = f"{BASE_URL}/search?q={query}"
    print("\n==============================")
    print(f"🔹 SCRAPING CATEGORY: {query.upper()}")
    print("==============================")

    for page in range(1, PAGES_PER_CATEGORY + 1):
        print(f"\n➡ Page {page}")
        url = base_url + f"&page={page}"

        # A failed search page must not abort the run: everything scraped so
        # far is only written to disk at the very end.
        try:
            soup = fetch_soup(url)
        except requests.RequestException as e:
            print("❌ Error:", e)
            continue

        # collect all product page links
        product_links = collect_product_links(soup)
        if not product_links:
            print("⚠ No product links found on this page (page layout changed or request blocked?)")

        for product_url in product_links:
            try:
                record = scrape_product(product_url, query.replace("+", " "))
                if record is not None:
                    all_data.append(record)
                    print(f"✅ Saved: {record['product_name'][:40]}")
            except Exception as e:
                # Exception (not a bare except) so Ctrl-C / kernel interrupt still works
                print("❌ Error:", e)
            finally:
                # Pause after every product request, including skipped or
                # failed ones, so the site is never hit back-to-back.
                time.sleep(REQUEST_DELAY)

# 📦 Save final dataset
df = pd.DataFrame(all_data)
df.to_csv(OUTPUT_CSV, index=False)
print("\n✅ ALL DONE!")
print(f"Total products scraped: {len(df)}")
print("Saved as fashion_dataset_full.csv")



🔹 SCRAPING CATEGORY: MEN+T+SHIRTS

➡ Page 1

➡ Page 2
