In [1]:
# üè≠ B2B Marketplace Data Scraper & Analysis
# TradeIndia Web Crawler with Anti-Blocking Mechanisms

"""
This notebook implements a robust web scraping solution for B2B marketplaces
with the following features:
- Multiple product category targeting
- Anti-blocking mechanisms (rotating user agents, delays, session management)
- Clean structured data output (JSON/CSV)
- Comprehensive EDA with animated visualizations
"""

# Import Required Libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import csv
import time
import random
import re
from datetime import datetime
from urllib.parse import urljoin, quote
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Core libraries imported successfully!")

‚úÖ Core libraries imported successfully!


In [41]:
# üé® Enhanced Visualization Theme & Color Configuration
# Professional styling for all charts in this notebook

import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Custom Professional Color Palette
CHART_THEME = {
    # Primary colors - Modern gradient palette
    'primary': '#667eea',
    'secondary': '#764ba2',
    'accent': '#f093fb',
    'success': '#00d9a5',
    'warning': '#ffd93d',
    'danger': '#ff6b6b',
    
    # Background colors
    'bg_dark': '#1a1a2e',
    'bg_medium': '#16213e',
    'bg_light': '#0f3460',
    'bg_paper': 'rgba(26, 26, 46, 0.95)',
    
    # Text colors
    'text_primary': '#ffffff',
    'text_secondary': '#a0aec0',
    
    # Chart-specific palettes
    'gradient_warm': ['#ff6b6b', '#ffd93d', '#6bcb77', '#4d96ff'],
    'gradient_cool': ['#667eea', '#764ba2', '#f093fb', '#00d9a5'],
    'rainbow': ['#ff6b6b', '#ff9f43', '#ffd93d', '#6bcb77', '#4d96ff', '#a55eea', '#fd79a8'],
    'categorical': ['#667eea', '#00d9a5', '#ff6b6b', '#ffd93d', '#764ba2', '#4d96ff', '#ff9f43', '#a55eea', '#6bcb77', '#fd79a8'],
}

# Custom Plotly template
def create_custom_template():
    """Creates a beautiful dark theme template for Plotly"""
    custom_template = go.layout.Template()
    
    custom_template.layout = go.Layout(
        paper_bgcolor='rgba(26, 26, 46, 0.97)',
        plot_bgcolor='rgba(22, 33, 62, 0.85)',
        font=dict(
            family='Segoe UI, Arial, sans-serif',
            size=12,
            color='#ffffff'
        ),
        title=dict(
            font=dict(size=22, color='#ffffff', family='Segoe UI Semibold'),
            x=0.5,
            xanchor='center'
        ),
        xaxis=dict(
            gridcolor='rgba(160, 174, 192, 0.15)',
            linecolor='rgba(160, 174, 192, 0.3)',
            tickfont=dict(color='#a0aec0'),
            title=dict(font=dict(color='#ffffff', size=14))
        ),
        yaxis=dict(
            gridcolor='rgba(160, 174, 192, 0.15)',
            linecolor='rgba(160, 174, 192, 0.3)',
            tickfont=dict(color='#a0aec0'),
            title=dict(font=dict(color='#ffffff', size=14))
        ),
        legend=dict(
            bgcolor='rgba(22, 33, 62, 0.8)',
            bordercolor='rgba(160, 174, 192, 0.3)',
            font=dict(color='#ffffff')
        ),
        hoverlabel=dict(
            bgcolor='rgba(26, 26, 46, 0.95)',
            bordercolor='rgba(102, 126, 234, 0.8)',
            font=dict(color='#ffffff', size=13)
        ),
        colorway=CHART_THEME['categorical']
    )
    
    return custom_template

# Register custom template
pio.templates['slooze_dark'] = create_custom_template()
pio.templates.default = 'slooze_dark'

# Helper function for consistent chart styling
def style_figure(fig, title, height=500, show_legend=True):
    """Apply consistent professional styling to any Plotly figure"""
    fig.update_layout(
        title=dict(
            text=title,
            font=dict(size=20, color='#ffffff', family='Segoe UI Semibold'),
            x=0.5,
            xanchor='center',
            y=0.95
        ),
        height=height,
        margin=dict(t=80, b=60, l=60, r=40),
        showlegend=show_legend,
        legend=dict(
            orientation='h',
            yanchor='bottom',
            y=-0.25,
            xanchor='center',
            x=0.5,
            bgcolor='rgba(22, 33, 62, 0.8)',
            bordercolor='rgba(102, 126, 234, 0.5)',
            borderwidth=1
        ),
        hoverlabel=dict(
            bgcolor='rgba(26, 26, 46, 0.95)',
            bordercolor='#667eea',
            font=dict(size=13, color='white')
        )
    )
    return fig

def add_gradient_fill(fig, trace_idx=0, color1='#667eea', color2='#764ba2'):
    """Add gradient styling to traces"""
    fig.data[trace_idx].update(
        marker=dict(
            color=f'rgba(102, 126, 234, 0.8)',
            line=dict(color='white', width=1)
        )
    )
    return fig

print("‚úÖ Enhanced visualization theme loaded!")
print("üé® Custom dark theme: 'slooze_dark'")
print("üåà Color palettes available: categorical, rainbow, gradient_warm, gradient_cool")

‚úÖ Enhanced visualization theme loaded!
üé® Custom dark theme: 'slooze_dark'
üåà Color palettes available: categorical, rainbow, gradient_warm, gradient_cool


## üì¶ Section 1: Configuration & Anti-Blocking Setup

This section sets up the core configuration for the web scraper including:
- Rotating User Agents to mimic different browsers
- Request headers to appear as legitimate traffic
- Delay mechanisms to avoid rate limiting
- Session management for persistent connections

In [21]:
# üîß Anti-Blocking Configuration Class

class ScraperConfig:
    """Configuration class for web scraper with anti-blocking mechanisms"""
    
    # Rotating User Agents Pool (mimics different browsers/devices)
    USER_AGENTS = [
        # Chrome on Windows
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        # Firefox on Windows
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0',
        # Edge on Windows
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
        # Chrome on Mac
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        # Safari on Mac
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
        # Chrome on Linux
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    ]
    
    # Target Product Categories - Using search keywords that work on TradeIndia
    PRODUCT_CATEGORIES = {
        'industrial_machinery': 'industrial machinery',
        'electronics': 'electronic components',
        'textiles': 'textile machinery',
        'chemicals': 'chemical equipment',
        'food_beverages': 'food processing machine',
        'construction': 'construction equipment'
    }
    
    # Request delay settings (seconds) - Increased to be more respectful
    MIN_DELAY = 3
    MAX_DELAY = 6
    
    # Maximum retries for failed requests
    MAX_RETRIES = 3
    
    # Timeout settings
    REQUEST_TIMEOUT = 30
    
    # Base URL
    BASE_URL = "https://www.tradeindia.com"
    
    @classmethod
    def get_random_user_agent(cls):
        """Returns a random user agent from the pool"""
        return random.choice(cls.USER_AGENTS)
    
    @classmethod
    def get_headers(cls):
        """Returns randomized headers for requests"""
        return {
            'User-Agent': cls.get_random_user_agent(),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0',
            'DNT': '1',
            'Referer': 'https://www.tradeindia.com/',
        }
    
    @classmethod
    def get_random_delay(cls):
        """Returns a random delay value to avoid rate limiting"""
        return random.uniform(cls.MIN_DELAY, cls.MAX_DELAY)

print("‚úÖ Scraper configuration loaded!")
print(f"üìã Available categories: {list(ScraperConfig.PRODUCT_CATEGORIES.keys())}")

‚úÖ Scraper configuration loaded!
üìã Available categories: ['industrial_machinery', 'electronics', 'textiles', 'chemicals', 'food_beverages', 'construction']


## üåê Section 2: Session Manager & Request Handler

This section implements:
- Persistent session management for efficient requests
- Automatic retry logic with exponential backoff
- Error handling for network issues
- Request logging for debugging

In [22]:
# üåê Session Manager with Anti-Blocking Features

class SessionManager:
    """Manages HTTP sessions with anti-blocking mechanisms"""
    
    def __init__(self):
        self.session = requests.Session()
        self.request_count = 0
        self.failed_requests = 0
        self.successful_requests = 0
        
    def refresh_session(self):
        """Creates a new session to avoid session-based blocking"""
        self.session = requests.Session()
        print("üîÑ Session refreshed")
        
    def make_request(self, url, method='GET', **kwargs):
        """
        Makes an HTTP request with retry logic and anti-blocking measures
        
        Args:
            url: Target URL
            method: HTTP method (GET/POST)
            **kwargs: Additional request parameters
        
        Returns:
            Response object or None if failed
        """
        for attempt in range(ScraperConfig.MAX_RETRIES):
            try:
                # Add random delay before request
                delay = ScraperConfig.get_random_delay()
                time.sleep(delay)
                
                # Get fresh headers for each request
                headers = ScraperConfig.get_headers()
                
                # Make the request
                if method.upper() == 'GET':
                    response = self.session.get(
                        url,
                        headers=headers,
                        timeout=ScraperConfig.REQUEST_TIMEOUT,
                        **kwargs
                    )
                else:
                    response = self.session.post(
                        url,
                        headers=headers,
                        timeout=ScraperConfig.REQUEST_TIMEOUT,
                        **kwargs
                    )
                
                self.request_count += 1
                
                # Check for blocking indicators
                if response.status_code == 403:
                    print(f"‚ö†Ô∏è Access forbidden (403) - Attempt {attempt + 1}/{ScraperConfig.MAX_RETRIES}")
                    self.refresh_session()
                    time.sleep(delay * 2)  # Exponential backoff
                    continue
                    
                if response.status_code == 429:
                    print(f"‚ö†Ô∏è Rate limited (429) - Waiting longer...")
                    time.sleep(delay * 3)
                    continue
                    
                if response.status_code == 200:
                    self.successful_requests += 1
                    return response
                    
                print(f"‚ö†Ô∏è Unexpected status code: {response.status_code}")
                
            except requests.exceptions.Timeout:
                print(f"‚è±Ô∏è Request timeout - Attempt {attempt + 1}/{ScraperConfig.MAX_RETRIES}")
            except requests.exceptions.ConnectionError:
                print(f"üîå Connection error - Attempt {attempt + 1}/{ScraperConfig.MAX_RETRIES}")
            except Exception as e:
                print(f"‚ùå Error: {str(e)} - Attempt {attempt + 1}/{ScraperConfig.MAX_RETRIES}")
        
        self.failed_requests += 1
        return None
    
    def get_stats(self):
        """Returns request statistics"""
        return {
            'total_requests': self.request_count,
            'successful': self.successful_requests,
            'failed': self.failed_requests,
            'success_rate': f"{(self.successful_requests/max(1, self.request_count))*100:.1f}%"
        }

# Initialize session manager
session_manager = SessionManager()
print("‚úÖ Session manager initialized!")
print(f"üìä Initial stats: {session_manager.get_stats()}")

‚úÖ Session manager initialized!
üìä Initial stats: {'total_requests': 0, 'successful': 0, 'failed': 0, 'success_rate': '0.0%'}


## üï∑Ô∏è Section 3: Core Web Scraper Classes

This section contains the main scraping logic:
- HTML Parser for extracting product data
- Product data model with all relevant fields
- Category crawler for navigating product listings
- Pagination handling for complete data extraction

In [28]:
# üï∑Ô∏è TradeIndia Product Scraper - Enhanced Parser v2

class TradeIndiaParser:
    """Enhanced parser for extracting product data from TradeIndia pages"""
    
    # Indian cities for location detection
    INDIAN_CITIES = [
        'Mumbai', 'Delhi', 'Ahmedabad', 'Chennai', 'Kolkata', 'Bengaluru', 
        'Bangalore', 'Pune', 'Hyderabad', 'Jaipur', 'Surat', 'Kanpur',
        'Coimbatore', 'Rajkot', 'Vadodara', 'Jamshedpur', 'Ballabgarh',
        'Chhatrapati Sambhajinagar', 'Ludhiana', 'Nagpur', 'Indore', 'Thane',
        'Bhopal', 'Visakhapatnam', 'Patna', 'Agra', 'Varanasi', 'Meerut',
        'Nashik', 'Faridabad', 'Ghaziabad', 'Aurangabad', 'Jodhpur', 'Kochi',
        'Guwahati', 'Chandigarh', 'Thiruvananthapuram', 'Gurugram', 'Noida',
        'Ranchi', 'Jalandhar', 'Howrah', 'Amritsar', 'Allahabad', 'Raipur',
        'Dehradun', 'Madurai', 'Vijayawada', 'Mysore', 'Tiruchirappalli',
        'Warangal', 'Guntur', 'Hubli', 'Bikaner', 'Udaipur', 'Bhilai',
        'Bhavnagar', 'Moradabad', 'Saharanpur', 'Siliguri', 'Tiruppur',
        'Sambalpur', 'Bilaspur', 'Kakinada', 'Rohtak', 'Bhiwandi', 'Anand',
        'Vapi', 'Morbi', 'Gandhinagar', 'Silvassa', 'Panipat', 'Sonipat'
    ]
    
    @staticmethod
    def clean_text(text):
        """Cleans and normalizes text content"""
        if not text:
            return ""
        text = re.sub(r'\s+', ' ', text.strip())
        text = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text)
        return text.strip()
    
    @staticmethod
    def extract_price(text):
        """Extracts numerical price from price string"""
        if not text:
            return None, 'INR', None
        
        text = TradeIndiaParser.clean_text(text)
        
        # Try different price patterns
        # Pattern: "7500 INR" or "‚Çπ7500" or "Rs 7500"
        price_patterns = [
            r'(\d[\d,]*(?:\.\d+)?)\s*(?:INR|Rs\.?|‚Çπ)',
            r'(?:INR|Rs\.?|‚Çπ)\s*(\d[\d,]*(?:\.\d+)?)',
            r'Price\s*[:\s]*(\d[\d,]*(?:\.\d+)?)',
            r'(\d[\d,]*(?:\.\d+)?)\s*(?:Approx)',
        ]
        
        for pattern in price_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                try:
                    price = float(match.group(1).replace(',', ''))
                    # Get unit
                    unit_match = re.search(r'/(\w+)', text)
                    unit = unit_match.group(1) if unit_match else 'Unit'
                    return price, 'INR', unit
                except:
                    continue
        
        return None, 'INR', None
    
    @staticmethod
    def extract_moq(text):
        """Extract minimum order quantity"""
        if not text:
            return None
        
        patterns = [
            r'MOQ[:\s-]*(\d+)',
            r'(\d+)\s*(?:Unit|Piece|Set|Box)/(?:Unit|Piece|Set|Box)',
            r'Min[.\s]*Order[:\s]*(\d+)',
        ]
        
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return int(match.group(1))
        
        return None
    
    @staticmethod
    def extract_years_in_business(text):
        """Extract years in business"""
        if not text:
            return None
        
        match = re.search(r'(\d+)\s*Years?', text, re.IGNORECASE)
        if match:
            return int(match.group(1))
        return None
    
    @staticmethod
    def extract_response_rate(text):
        """Extract response rate percentage"""
        if not text:
            return None
        
        match = re.search(r'Response\s*Rate[:\s]*(\d+(?:\.\d+)?)\s*%', text, re.IGNORECASE)
        if match:
            return float(match.group(1))
        return None
    
    @staticmethod
    def find_location(text):
        """Find location/city in text"""
        if not text:
            return None
        
        for city in TradeIndiaParser.INDIAN_CITIES:
            if city.lower() in text.lower():
                return city
        return None
    
    @staticmethod
    def parse_product_from_html(html_content, base_url):
        """Parse product information from the raw HTML content"""
        products = []
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Get the full page text for reference
        page_text = soup.get_text(' ', strip=True)
        
        # Find all product links - TradeIndia uses links to /products/ pages
        product_links = soup.find_all('a', href=re.compile(r'/products/[^/]+\.html'))
        
        # Track processed URLs to avoid duplicates
        processed_urls = set()
        
        for link in product_links:
            href = link.get('href', '')
            if not href:
                continue
            
            # Get full product URL
            product_url = urljoin(base_url, href)
            
            # Skip duplicates
            if product_url in processed_urls:
                continue
            
            # Get product name from link text
            product_name = TradeIndiaParser.clean_text(link.get_text())
            
            # Skip invalid entries
            if not product_name or len(product_name) < 5:
                continue
            
            # Skip navigation/action links
            skip_words = ['send inquiry', 'view number', 'next', 'prev', 'click here', 
                          'read more', 'see all', 'view all', 'show more']
            if any(sw in product_name.lower() for sw in skip_words):
                continue
            
            processed_urls.add(product_url)
            
            product = {
                'scraped_at': datetime.now().isoformat(),
                'product_name': product_name,
                'product_url': product_url,
                'price': None,
                'price_currency': 'INR',
                'price_unit': None,
                'min_order_qty': None,
                'supplier_name': None,
                'supplier_location': None,
                'supplier_url': None,
                'product_image': None,
                'description': None,
                'category': None,
                'verified_supplier': False,
                'response_rate': None,
                'years_in_business': None
            }
            
            # Walk up the DOM to find container with more data
            # TradeIndia typically has product cards with all info in a parent div
            container = None
            parent = link.parent
            depth = 0
            while parent and depth < 10:
                parent_text = parent.get_text(' ', strip=True) if parent else ''
                # Found a good container if it has price or company info
                if parent_text and ('INR' in parent_text or 'Years' in parent_text or 'MOQ' in parent_text):
                    container = parent
                    break
                parent = parent.parent
                depth += 1
            
            if container:
                container_text = container.get_text(' ', strip=True)
                
                # Extract price
                price, currency, unit = TradeIndiaParser.extract_price(container_text)
                if price:
                    product['price'] = price
                    product['price_currency'] = currency
                    product['price_unit'] = unit
                
                # Extract MOQ
                product['min_order_qty'] = TradeIndiaParser.extract_moq(container_text)
                
                # Extract years in business
                product['years_in_business'] = TradeIndiaParser.extract_years_in_business(container_text)
                
                # Extract response rate
                product['response_rate'] = TradeIndiaParser.extract_response_rate(container_text)
                
                # Check for verified/trusted seller
                if 'TrustedSeller' in container_text or 'Trusted Seller' in container_text:
                    product['verified_supplier'] = True
                
                # Find location
                product['supplier_location'] = TradeIndiaParser.find_location(container_text)
                
                # Find supplier name - look for company links
                # Pattern: links ending with company ID like /company-name-12345/
                supplier_links = container.find_all('a', href=re.compile(r'tradeindia\.com/[a-z0-9-]+-\d+/?$'))
                for sup_link in supplier_links:
                    sup_name = TradeIndiaParser.clean_text(sup_link.get_text())
                    if sup_name and len(sup_name) > 3 and sup_name != product_name:
                        # Check it's likely a company name (has uppercase or common suffixes)
                        if any(x in sup_name.upper() for x in ['LTD', 'PVT', 'PRIVATE', 'LIMITED', 'INDUSTRIES', 
                                                                'ENGINEERING', 'ENTERPRISE', 'COMPANY', 'CO.', 
                                                                'WORKS', 'SYSTEMS', 'SOLUTIONS', 'EQUIPMENT',
                                                                'MANUFACTURERS', 'TRADERS', 'AGENCIES']):
                            product['supplier_name'] = sup_name
                            product['supplier_url'] = urljoin(base_url, sup_link.get('href', ''))
                            break
                
                # If no supplier found with pattern, try looking for all caps text
                if not product['supplier_name']:
                    # Look for text patterns that look like company names
                    all_text = container.get_text(' ')
                    # Company names often appear in all caps or after product info
                    company_patterns = [
                        r'\n([A-Z][A-Z0-9\s\.&,]+(?:LTD|PVT|LIMITED|PRIVATE|INDUSTRIES|ENGINEERING|ENTERPRISE|WORKS|SYSTEMS)\.?)',
                        r'([A-Z][A-Za-z0-9\s\.&,]+(?:Ltd|Pvt|Limited|Private|Industries|Engineering|Enterprise|Works|Systems)\.?)',
                    ]
                    for cp in company_patterns:
                        match = re.search(cp, all_text)
                        if match:
                            potential_name = TradeIndiaParser.clean_text(match.group(1))
                            if potential_name and len(potential_name) > 5:
                                product['supplier_name'] = potential_name
                                break
                
                # Find product image
                imgs = container.find_all('img')
                for img in imgs:
                    img_src = img.get('data-src') or img.get('src') or img.get('data-lazy')
                    if img_src and not img_src.startswith('data:'):
                        # Skip small icons
                        if 'icon' not in img_src.lower() and 'logo' not in img_src.lower():
                            product['product_image'] = urljoin(base_url, img_src)
                            break
            
            products.append(product)
        
        return products

print("‚úÖ TradeIndia parser loaded (enhanced v2)!")

‚úÖ TradeIndia parser loaded (enhanced v2)!


In [29]:
# üîç Main Scraper Class - TradeIndia Crawler (Improved)

class TradeIndiaScraper:
    """Main scraper class for TradeIndia B2B marketplace - Updated version"""
    
    def __init__(self, session_manager):
        self.session = session_manager
        self.all_products = []
        self.scrape_log = []
        
    def get_search_url(self, query, page=1):
        """Generates search URL for a product query - Fixed URL structure"""
        encoded_query = quote(query)
        # TradeIndia uses this URL format: /search.html/?keyword=...&page=
        return f"{ScraperConfig.BASE_URL}/search.html/?keyword={encoded_query}&page={page}"
    
    def scrape_listing_page(self, url, category_name=None):
        """Scrapes a single listing page for products using improved parser"""
        products = []
        
        print(f"üìÑ Scraping: {url}")
        response = self.session.make_request(url)
        
        if not response:
            self.scrape_log.append({
                'url': url,
                'status': 'failed',
                'timestamp': datetime.now().isoformat()
            })
            return products
        
        # Use the improved parser
        products = TradeIndiaParser.parse_product_from_html(
            response.content, 
            ScraperConfig.BASE_URL
        )
        
        # Add category to all products
        for product in products:
            product['category'] = category_name
            product['source_url'] = url
        
        print(f"   Found {len(products)} products")
        
        self.scrape_log.append({
            'url': url,
            'status': 'success',
            'products_found': len(products),
            'timestamp': datetime.now().isoformat()
        })
        
        return products
    
    def scrape_category(self, category_key, max_pages=3):
        """Scrapes a category using search queries"""
        if category_key not in ScraperConfig.PRODUCT_CATEGORIES:
            print(f"‚ùå Unknown category: {category_key}")
            return []
        
        search_query = ScraperConfig.PRODUCT_CATEGORIES[category_key]
        category_products = []
        
        print(f"\n{'='*60}")
        print(f"üè∑Ô∏è Scraping category: {category_key}")
        print(f"   Search query: {search_query}")
        print(f"{'='*60}")
        
        for page in range(1, max_pages + 1):
            url = self.get_search_url(search_query, page)
            products = self.scrape_listing_page(url, category_key)
            
            # Remove duplicates based on product_url
            existing_urls = {p['product_url'] for p in category_products}
            new_products = [p for p in products if p['product_url'] not in existing_urls]
            category_products.extend(new_products)
            
            print(f"   Page {page}: {len(new_products)} unique products added (Total: {len(category_products)})")
            
            # If no new products found, likely reached end of listings
            if len(new_products) == 0:
                print(f"   No new products found, stopping pagination")
                break
            
            # Respect rate limits
            time.sleep(ScraperConfig.get_random_delay())
        
        self.all_products.extend(category_products)
        return category_products
    
    def scrape_search(self, query, max_pages=3):
        """Scrapes search results for a specific query"""
        search_products = []
        
        print(f"\n{'='*60}")
        print(f"üîç Searching for: {query}")
        print(f"{'='*60}")
        
        for page in range(1, max_pages + 1):
            url = self.get_search_url(query, page)
            products = self.scrape_listing_page(url, f"search:{query}")
            
            # Remove duplicates
            existing_urls = {p['product_url'] for p in search_products}
            new_products = [p for p in products if p['product_url'] not in existing_urls]
            search_products.extend(new_products)
            
            print(f"   Page {page}: {len(new_products)} unique products added")
            
            if len(new_products) == 0:
                break
            
            time.sleep(ScraperConfig.get_random_delay())
        
        self.all_products.extend(search_products)
        return search_products
    
    def scrape_all_categories(self, max_pages_per_category=2):
        """Scrapes all configured categories"""
        print("\n" + "üöÄ " + "="*58)
        print("   STARTING FULL CATEGORY SCRAPE")
        print("üöÄ " + "="*58)
        
        for category_key in ScraperConfig.PRODUCT_CATEGORIES.keys():
            self.scrape_category(category_key, max_pages_per_category)
            # Extra delay between categories
            time.sleep(ScraperConfig.get_random_delay())
        
        print(f"\n‚úÖ Scraping complete! Total products: {len(self.all_products)}")
        return self.all_products
    
    def get_dataframe(self):
        """Returns collected products as a pandas DataFrame"""
        return pd.DataFrame(self.all_products)
    
    def export_to_json(self, filename=None):
        """Exports collected data to JSON file"""
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"tradeindia_products_{timestamp}.json"
        
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.all_products, f, indent=2, ensure_ascii=False)
        
        print(f"üíæ Data exported to: {filename}")
        return filename
    
    def export_to_csv(self, filename=None):
        """Exports collected data to CSV file"""
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"tradeindia_products_{timestamp}.csv"
        
        df = self.get_dataframe()
        df.to_csv(filename, index=False, encoding='utf-8-sig')
        
        print(f"üíæ Data exported to: {filename}")
        return filename
    
    def get_scrape_summary(self):
        """Returns a summary of the scraping session"""
        df = self.get_dataframe()
        
        summary = {
            'total_products': len(self.all_products),
            'unique_suppliers': df['supplier_name'].nunique() if 'supplier_name' in df.columns else 0,
            'categories_scraped': df['category'].nunique() if 'category' in df.columns else 0,
            'products_with_price': df['price'].notna().sum() if 'price' in df.columns else 0,
            'verified_suppliers': df['verified_supplier'].sum() if 'verified_supplier' in df.columns else 0,
            'unique_locations': df['supplier_location'].nunique() if 'supplier_location' in df.columns else 0,
            'session_stats': self.session.get_stats()
        }
        
        return summary

# Initialize scraper
scraper = TradeIndiaScraper(session_manager)
print("‚úÖ TradeIndia scraper initialized!")

‚úÖ TradeIndia scraper initialized!


## üöÄ Section 4: Enhanced Scraper with Direct Product Links

This section implements an enhanced scraping approach that:
- Directly targets product detail pages for more complete data
- Uses multiple search queries per category
- Extracts detailed product specifications
- Handles dynamic content more effectively

In [6]:
# üöÄ Enhanced Product Detail Scraper

class EnhancedProductScraper:
    """
    Enhanced scraper that extracts detailed product information
    by visiting individual product pages
    """
    
    def __init__(self, session_manager):
        self.session = session_manager
        self.detailed_products = []
        
    def scrape_product_detail(self, product_url):
        """Scrapes detailed information from a product page"""
        response = self.session.make_request(product_url)
        
        if not response:
            return None
            
        soup = BeautifulSoup(response.content, 'html.parser')
        
        detail = {
            'url': product_url,
            'scraped_at': datetime.now().isoformat()
        }
        
        # Extract detailed specifications
        spec_tables = soup.find_all(['table', 'div'], class_=re.compile(r'spec|detail|feature|attr', re.I))
        specs = {}
        
        for table in spec_tables:
            rows = table.find_all(['tr', 'div'])
            for row in rows:
                cells = row.find_all(['td', 'span', 'div'])
                if len(cells) >= 2:
                    key = TradeIndiaParser.clean_text(cells[0].get_text())
                    value = TradeIndiaParser.clean_text(cells[1].get_text())
                    if key and value:
                        specs[key] = value
        
        detail['specifications'] = specs
        
        # Extract full description
        desc_elem = soup.find(class_=re.compile(r'description|about|detail-text', re.I))
        if desc_elem:
            detail['full_description'] = TradeIndiaParser.clean_text(desc_elem.get_text())[:2000]
        
        # Extract all images
        images = []
        for img in soup.find_all('img', src=re.compile(r'product|image', re.I)):
            src = img.get('data-src') or img.get('src')
            if src and not src.startswith('data:'):
                images.append(urljoin(ScraperConfig.BASE_URL, src))
        detail['all_images'] = images[:10]  # Limit to 10 images
        
        # Extract contact/supplier details
        contact_elem = soup.find(class_=re.compile(r'contact|supplier-info|seller-info', re.I))
        if contact_elem:
            detail['contact_info'] = TradeIndiaParser.clean_text(contact_elem.get_text())[:500]
        
        return detail

# Initialize enhanced scraper
enhanced_scraper = EnhancedProductScraper(session_manager)
print("‚úÖ Enhanced product scraper initialized!")

‚úÖ Enhanced product scraper initialized!


## üéØ Section 5: Live Scraping Execution

This section runs the actual live scraping:
- Executes real-time data collection from TradeIndia
- Targets multiple product categories
- Collects comprehensive product details
- Exports data in structured formats (JSON/CSV)

In [30]:
# üéØ LIVE SCRAPING EXECUTION (IMPROVED)
# This cell performs real-time data collection from TradeIndia with better parsing

print("üöÄ Starting Live Scraping Session (Improved)...")
print(f"‚è∞ Start Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("="*60)

# Reset scraper for fresh run
session_manager = SessionManager()  # Fresh session
scraper = TradeIndiaScraper(session_manager)

# Define search queries for comprehensive data collection
# Using specific product terms that work well on TradeIndia
search_queries = [
    "industrial pumps",
    "cnc machine",
    "textile machinery",
    "packaging machine",
    "solar panels",
    "led lights",
    "food processing equipment",
    "printing machine",
    "agricultural equipment",
    "construction materials",
    "hydraulic press",
    "electric motors"
]

# Execute search-based scraping for more targeted results
all_collected_products = []

for query in search_queries:
    try:
        products = scraper.scrape_search(query, max_pages=2)
        print(f"   ‚úì '{query}': {len(products)} products found")
    except Exception as e:
        print(f"   ‚úó Error with '{query}': {str(e)}")

# Remove global duplicates based on product_url
seen_urls = set()
unique_products = []
for product in scraper.all_products:
    url = product.get('product_url')
    if url and url not in seen_urls:
        seen_urls.add(url)
        unique_products.append(product)

scraper.all_products = unique_products

print("\n" + "="*60)
print(f"‚è∞ End Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"üìä Total Unique Products Collected: {len(scraper.all_products)}")
print(f"üìà Session Stats: {session_manager.get_stats()}")

# Show data quality summary
df_temp = scraper.get_dataframe()
if len(df_temp) > 0:
    print("\nüìã Data Quality Summary:")
    print(f"   ‚Ä¢ Products with supplier name: {df_temp['supplier_name'].notna().sum()}")
    print(f"   ‚Ä¢ Products with supplier location: {df_temp['supplier_location'].notna().sum()}")
    print(f"   ‚Ä¢ Products with price: {df_temp['price'].notna().sum()}")
    print(f"   ‚Ä¢ Verified suppliers: {df_temp['verified_supplier'].sum()}")
    print(f"   ‚Ä¢ Products with years in business: {df_temp['years_in_business'].notna().sum()}")
print("="*60)

üöÄ Starting Live Scraping Session (Improved)...
‚è∞ Start Time: 2025-12-24 23:47:28

üîç Searching for: industrial pumps
üìÑ Scraping: https://www.tradeindia.com/search.html/?keyword=industrial%20pumps&page=1
   Found 36 products
   Page 1: 36 unique products added
üìÑ Scraping: https://www.tradeindia.com/search.html/?keyword=industrial%20pumps&page=2
   Found 35 products
   Page 2: 27 unique products added
   ‚úì 'industrial pumps': 63 products found

üîç Searching for: cnc machine
üìÑ Scraping: https://www.tradeindia.com/search.html/?keyword=cnc%20machine&page=1
   Found 39 products
   Page 1: 39 unique products added
üìÑ Scraping: https://www.tradeindia.com/search.html/?keyword=cnc%20machine&page=2
   Found 40 products
   Page 2: 28 unique products added
   ‚úì 'cnc machine': 67 products found

üîç Searching for: textile machinery
üìÑ Scraping: https://www.tradeindia.com/search.html/?keyword=textile%20machinery&page=1
   Found 29 products
   Page 1: 29 unique products adde

In [31]:
# üíæ DATA EXPORT - Save collected data to files

# Create DataFrame from collected products
df_products = scraper.get_dataframe()

# Display data sample
print("üìä Data Sample (First 10 rows):")
print("="*60)
if len(df_products) > 0:
    display(df_products.head(10))
else:
    print("No products collected. Using sample data for demonstration.")
    # Create sample data for demonstration if scraping didn't collect real data
    sample_data = []
    categories = ['industrial_machinery', 'electronics', 'textiles', 'chemicals', 'construction']
    locations = ['Mumbai, Maharashtra', 'Delhi, Delhi', 'Chennai, Tamil Nadu', 'Ahmedabad, Gujarat', 
                 'Bangalore, Karnataka', 'Pune, Maharashtra', 'Kolkata, West Bengal', 'Hyderabad, Telangana',
                 'Jaipur, Rajasthan', 'Surat, Gujarat']
    
    for i in range(100):
        sample_data.append({
            'scraped_at': datetime.now().isoformat(),
            'product_name': f"Product {i+1} - {random.choice(['Machine', 'Equipment', 'Tool', 'Device', 'System'])}",
            'product_url': f"https://www.tradeindia.com/product{i+1}",
            'price': random.uniform(1000, 500000) if random.random() > 0.3 else None,
            'price_currency': 'INR',
            'price_unit': random.choice(['piece', 'kg', 'unit', 'set', 'lot']),
            'min_order_qty': random.randint(1, 100) if random.random() > 0.4 else None,
            'supplier_name': f"Supplier Company {random.randint(1, 50)}",
            'supplier_location': random.choice(locations),
            'supplier_url': f"https://www.tradeindia.com/supplier{random.randint(1, 50)}",
            'product_image': f"https://images.tradeindia.com/product{i+1}.jpg",
            'description': f"High quality industrial product with excellent features and specifications.",
            'category': random.choice(categories),
            'verified_supplier': random.random() > 0.5,
            'response_rate': random.randint(70, 100) if random.random() > 0.3 else None,
            'years_in_business': random.randint(1, 25) if random.random() > 0.4 else None
        })
    
    df_products = pd.DataFrame(sample_data)
    print("üìù Generated sample data for demonstration purposes")
    display(df_products.head(10))

# Export data
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Export to CSV
csv_filename = f"tradeindia_products_{timestamp}.csv"
df_products.to_csv(csv_filename, index=False, encoding='utf-8-sig')
print(f"\nüíæ CSV exported: {csv_filename}")

# Export to JSON
json_filename = f"tradeindia_products_{timestamp}.json"
df_products.to_json(json_filename, orient='records', indent=2, force_ascii=False)
print(f"üíæ JSON exported: {json_filename}")

print(f"\nüìä Total records exported: {len(df_products)}")

üìä Data Sample (First 10 rows):


Unnamed: 0,scraped_at,product_name,product_url,price,price_currency,price_unit,min_order_qty,supplier_name,supplier_location,supplier_url,product_image,description,category,verified_supplier,response_rate,years_in_business,source_url
0,2025-12-24T23:47:35.917377,Black Jts-Ctss Series Ss Centrifugal Monoblock...,https://www.tradeindia.com/products/jts-ctss-s...,7500.0,INR,Units,1.0,,,,,,search:industrial pumps,False,,2.0,https://www.tradeindia.com/search.html/?keywor...
1,2025-12-24T23:47:35.918316,Centrifugal Feed Water Pump - Capacity up to 2...,https://www.tradeindia.com/products/centrifuga...,12000.0,INR,hr,1.0,,,,,,search:industrial pumps,False,,2.0,https://www.tradeindia.com/search.html/?keywor...
2,2025-12-24T23:47:35.918722,Honda Gx80 Petrol Engine Pumpset - Application...,https://www.tradeindia.com/products/honda-gx80...,15800.0,INR,Pieces,1.0,,,,,,search:industrial pumps,False,,4.0,https://www.tradeindia.com/search.html/?keywor...
3,2025-12-24T23:47:35.918933,Ec240 Hydraulic Pump - Color: Red,https://www.tradeindia.com/products/ec240-hydr...,,INR,,1000.0,,,,,,search:industrial pumps,False,,2.0,https://www.tradeindia.com/search.html/?keywor...
4,2025-12-24T23:47:35.919553,10Hp Industrial Pump - Color: Grey,https://www.tradeindia.com/products/10hp-indus...,18000.0,INR,Unit,,Tech Systems,,,,,search:industrial pumps,False,,,https://www.tradeindia.com/search.html/?keywor...
5,2025-12-24T23:47:35.919919,Industrial Pump - Discharge Pressure: 10 Lpm,https://www.tradeindia.com/products/industrial...,20000.0,INR,Unit,,Deshanjay Exports Pvt. Ltd.,,,,,search:industrial pumps,False,,,https://www.tradeindia.com/search.html/?keywor...
6,2025-12-24T23:47:35.920070,Industrial Industrial Monoblock Pump - Color: ...,https://www.tradeindia.com/products/industrial...,8000.0,INR,Unit,,,,,,,search:industrial pumps,False,,,https://www.tradeindia.com/search.html/?keywor...
7,2025-12-24T23:47:35.920230,Electric Industrial Pump - 220-240 Volt Rated ...,https://www.tradeindia.com/products/electric-i...,25000.0,INR,Unit,,Duty Use in Various Industries,,,,,search:industrial pumps,False,,,https://www.tradeindia.com/search.html/?keywor...
8,2025-12-24T23:47:35.920383,Triplex High Pressure Plunger Pumps Equipment ...,https://www.tradeindia.com/products/triplex-hi...,118000.0,INR,Sets,1.0,,,,https://cpimg.tistatic.com/02592233/b/16/Tripl...,,search:industrial pumps,False,,,https://www.tradeindia.com/search.html/?keywor...
9,2025-12-24T23:47:35.920537,Industrial Rubber Bellows - Color: Black,https://www.tradeindia.com/products/industrial...,400.0,INR,Units,1.0,,,,https://cpimg.tistatic.com/03402603/b/5/Indust...,,search:industrial pumps,False,,,https://www.tradeindia.com/search.html/?keywor...



üíæ CSV exported: tradeindia_products_20251224_235318.csv
üíæ JSON exported: tradeindia_products_20251224_235318.json

üìä Total records exported: 761


---

# üìä PART B: Exploratory Data Analysis (EDA)

This section performs comprehensive exploratory data analysis on the scraped data to uncover meaningful insights including:
- Summary statistics and data quality assessment
- Category and price distribution analysis
- Regional/geographic patterns
- Supplier analysis and verification trends
- Animated visualizations for better insights

In [9]:
# üìä EDA Setup - Import Visualization Libraries

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio

# Set default template for beautiful charts
pio.templates.default = "plotly_white"

# Color palette for consistent styling
COLORS = {
    'primary': '#3498db',
    'secondary': '#e74c3c',
    'success': '#2ecc71',
    'warning': '#f39c12',
    'info': '#9b59b6',
    'dark': '#34495e',
    'gradient': ['#667eea', '#764ba2', '#f093fb', '#f5576c', '#4facfe']
}

print("‚úÖ Visualization libraries loaded!")
print("üìä Ready for Exploratory Data Analysis")

‚úÖ Visualization libraries loaded!
üìä Ready for Exploratory Data Analysis


## üìà Section 6: Data Overview & Summary Statistics

Comprehensive overview of the collected dataset including:
- Data shape and structure
- Missing value analysis
- Basic statistical measures
- Data type information

In [32]:
# üìà Data Overview & Summary Statistics

print("=" * 70)
print("üìä DATASET OVERVIEW")
print("=" * 70)

# Basic info
print(f"\nüìå Dataset Shape: {df_products.shape[0]} rows √ó {df_products.shape[1]} columns")
print(f"üìå Memory Usage: {df_products.memory_usage(deep=True).sum() / 1024:.2f} KB")

# Column information
print("\n" + "=" * 70)
print("üìã COLUMN INFORMATION")
print("=" * 70)
print(f"\n{'Column':<25} {'Type':<15} {'Non-Null':<12} {'Null %':<10}")
print("-" * 62)

for col in df_products.columns:
    non_null = df_products[col].notna().sum()
    null_pct = (df_products[col].isna().sum() / len(df_products)) * 100
    print(f"{col:<25} {str(df_products[col].dtype):<15} {non_null:<12} {null_pct:.1f}%")

# Numerical statistics
print("\n" + "=" * 70)
print("üìä NUMERICAL STATISTICS")
print("=" * 70)

numerical_cols = df_products.select_dtypes(include=[np.number]).columns.tolist()
if numerical_cols:
    display(df_products[numerical_cols].describe().round(2))
else:
    print("No numerical columns found")
    
# Categorical statistics
print("\n" + "=" * 70)
print("üìä CATEGORICAL STATISTICS")
print("=" * 70)

if 'category' in df_products.columns:
    print("\nüè∑Ô∏è Products by Category:")
    print(df_products['category'].value_counts().to_string())
    
if 'supplier_location' in df_products.columns:
    print("\nüìç Top 10 Supplier Locations:")
    print(df_products['supplier_location'].value_counts().head(10).to_string())

üìä DATASET OVERVIEW

üìå Dataset Shape: 761 rows √ó 17 columns
üìå Memory Usage: 659.20 KB

üìã COLUMN INFORMATION

Column                    Type            Non-Null     Null %    
--------------------------------------------------------------
scraped_at                object          761          0.0%
product_name              object          761          0.0%
product_url               object          761          0.0%
price                     float64         539          29.2%
price_currency            object          761          0.0%
price_unit                object          539          29.2%
min_order_qty             float64         597          21.6%
supplier_name             object          126          83.4%
supplier_location         object          123          83.8%
supplier_url              object          108          85.8%
product_image             object          674          11.4%
description               object          0            100.0%
category             

Unnamed: 0,price,min_order_qty,response_rate,years_in_business
count,539.0,597.0,20.0,202.0
mean,464486800.0,58.74,88.5,9.36
std,10768240000.0,313.28,4.43,7.61
min,1.0,1.0,71.39,1.0
25%,4500.0,1.0,87.5,2.0
50%,65000.0,1.0,88.48,7.0
75%,540000.0,5.0,91.3,17.0
max,250000000000.0,5000.0,91.3,23.0



üìä CATEGORICAL STATISTICS

üè∑Ô∏è Products by Category:
category
search:hydraulic press              69
search:solar panels                 68
search:electric motors              68
search:cnc machine                  67
search:led lights                   67
search:packaging machine            64
search:printing machine             63
search:industrial pumps             63
search:agricultural equipment       59
search:food processing equipment    59
search:textile machinery            57
search:construction materials       57

üìç Top 10 Supplier Locations:
supplier_location
Mumbai        40
Delhi         20
Ahmedabad     19
Surat          8
Kolkata        4
Pune           4
Ludhiana       4
Ghaziabad      3
Coimbatore     3
Rajkot         2


## üìä Section 7: Data Quality Assessment

Analysis of data completeness, anomalies, and quality gaps:
- Missing value visualization
- Data completeness by category
- Identification of data quality issues

In [55]:
# üìä Data Quality Assessment - Simple & Clear

# Calculate missing values
missing_data = pd.DataFrame({
    'Column': df_products.columns,
    'Missing Pct': (df_products.isnull().sum().values / len(df_products) * 100).round(1),
    'Present Pct': (df_products.notna().sum().values / len(df_products) * 100).round(1)
})
missing_data = missing_data.sort_values('Present Pct', ascending=True)

# Simple stacked bar chart
fig_missing = go.Figure()

fig_missing.add_trace(go.Bar(
    y=missing_data['Column'],
    x=missing_data['Present Pct'],
    name='‚úì Present',
    orientation='h',
    marker=dict(
        color=missing_data['Present Pct'],
        colorscale=[[0, '#ff6b6b'], [0.5, '#ffd93d'], [1, '#00d9a5']],
        line=dict(color='white', width=1)
    ),
    text=missing_data['Present Pct'].apply(lambda x: f'{x:.0f}%'),
    textposition='inside',
    textfont=dict(color='white', size=9),
    hovertemplate="<b>%{y}</b><br>Present: %{x:.1f}%<extra></extra>"
))

fig_missing.add_trace(go.Bar(
    y=missing_data['Column'],
    x=missing_data['Missing Pct'],
    name='‚úó Missing',
    orientation='h',
    marker=dict(color='rgba(255, 107, 107, 0.5)', line=dict(color='#ff6b6b', width=1)),
    text=missing_data['Missing Pct'].apply(lambda x: f'{x:.0f}%' if x > 10 else ''),
    textposition='inside',
    textfont=dict(color='white', size=9),
    hovertemplate="<b>%{y}</b><br>Missing: %{x:.1f}%<extra></extra>"
))

fig_missing.update_layout(
    title=dict(text='üìä Data Completeness', font=dict(size=18, color='white'), x=0.5),
    barmode='stack',
    xaxis=dict(title=dict(text='Percentage', font=dict(size=11, color='white')),
        tickfont=dict(color='#a0aec0', size=10), gridcolor='rgba(160, 174, 192, 0.1)', range=[0, 105]),
    yaxis=dict(tickfont=dict(color='white', size=9)),
    height=max(350, len(missing_data) * 25),
    legend=dict(orientation='h', y=1.02, x=0.5, xanchor='center', font=dict(size=10, color='white'),
        bgcolor='rgba(22, 33, 62, 0.8)'),
    paper_bgcolor='rgba(26, 26, 46, 0.97)', plot_bgcolor='rgba(22, 33, 62, 0.85)',
    margin=dict(l=120, r=40, t=70, b=50), bargap=0.2
)
fig_missing.show()

# Summary
complete_cols = missing_data[missing_data['Missing Pct'] == 0]['Column'].tolist()
sparse_cols = missing_data[missing_data['Missing Pct'] >= 50]['Column'].tolist()

print("\n" + "‚ïê" * 60)
print("üìã DATA QUALITY SUMMARY")
print("‚ïê" * 60)
print(f"üü¢ Complete (0% missing): {len(complete_cols)} columns")
print(f"üî¥ Sparse (‚â•50% missing): {len(sparse_cols)} columns")


‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
üìã DATA QUALITY SUMMARY
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
üü¢ Complete (0% missing): 7 columns
üî¥ Sparse (‚â•50% missing): 6 columns


## üè∑Ô∏è Section 8: Category Distribution Analysis

Interactive visualization of product distribution across categories:
- Animated pie charts and bar charts
- Category-wise product counts
- Comparative analysis between categories

In [53]:
# üè∑Ô∏è Category Distribution Analysis - Simple & Clear

if 'category' in df_products.columns:
    category_counts = df_products['category'].value_counts().reset_index()
    category_counts.columns = ['Category', 'Count']
    category_counts['Percentage'] = (category_counts['Count'] / category_counts['Count'].sum() * 100).round(1)
    
    # Shorten category names for better display
    category_counts['ShortName'] = category_counts['Category'].apply(
        lambda x: x.replace('search:', '').title()[:20] + ('...' if len(x.replace('search:', '')) > 20 else '')
    )
    
    n_cats = len(category_counts)
    colors = px.colors.qualitative.Set3[:n_cats] if n_cats <= 12 else px.colors.sample_colorscale('Turbo', [i/(n_cats-1) for i in range(n_cats)])
    
    # Simple Horizontal Bar Chart - Most Understandable
    fig_category = go.Figure()
    
    fig_category.add_trace(go.Bar(
        y=category_counts['ShortName'][::-1],  # Reverse for top-to-bottom
        x=category_counts['Count'][::-1],
        orientation='h',
        marker=dict(
            color=colors[::-1],
            line=dict(color='rgba(255,255,255,0.5)', width=1)
        ),
        text=category_counts.apply(lambda x: f"{x['Count']} ({x['Percentage']:.0f}%)", axis=1)[::-1],
        textposition='outside',
        textfont=dict(color='white', size=10),
        hovertemplate="<b>%{y}</b><br>Products: %{x:,}<br>Share: %{text}<extra></extra>"
    ))
    
    fig_category.update_layout(
        title=dict(
            text='üìä Products by Category',
            font=dict(size=18, color='white'),
            x=0.5
        ),
        xaxis=dict(
            title=dict(text='Number of Products', font=dict(size=11, color='white')),
            tickfont=dict(color='#a0aec0', size=10),
            gridcolor='rgba(160, 174, 192, 0.15)'
        ),
        yaxis=dict(
            tickfont=dict(color='white', size=9),
            ticklabelposition='outside'
        ),
        height=max(400, n_cats * 35),  # Dynamic height
        paper_bgcolor='rgba(26, 26, 46, 0.97)',
        plot_bgcolor='rgba(22, 33, 62, 0.85)',
        margin=dict(l=140, r=80, t=60, b=50),
        bargap=0.25
    )
    fig_category.show()
    
    # Summary stats
    print("\n" + "‚ïê" * 60)
    print("üìä CATEGORY SUMMARY")
    print("‚ïê" * 60)
    print(f"Total Categories: {n_cats} | Total Products: {category_counts['Count'].sum():,}")
    print(f"\nüèÜ Top 3: {', '.join(category_counts['ShortName'].head(3).tolist())}")
else:
    print("‚ö†Ô∏è No category data available")


‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
üìä CATEGORY SUMMARY
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
Total Categories: 12 | Total Products: 761

üèÜ Top 3: Hydraulic Press, Solar Panels, Electric Motors


## üí∞ Section 9: Price Analysis & Distribution

Detailed analysis of product pricing:
- Price distribution visualization
- Price ranges by category
- Price anomaly detection
- Statistical price analysis

In [54]:
# üí∞ Price Analysis - Clear & Properly Distributed

if 'price' in df_products.columns:
    df_with_price = df_products[df_products['price'].notna() & (df_products['price'] > 0)].copy()
    
    if len(df_with_price) > 0:
        print(f"üìä Analyzing {len(df_with_price):,} products with valid price data")
        
        # Calculate statistics
        prices = df_with_price['price']
        q1, median, q3 = prices.quantile([0.25, 0.5, 0.75])
        iqr = q3 - q1
        
        # Remove extreme outliers for better visualization (keep within 3*IQR)
        lower_bound = max(0, q1 - 3 * iqr)
        upper_bound = q3 + 3 * iqr
        df_filtered = df_with_price[(prices >= lower_bound) & (prices <= upper_bound)]
        
        print(f"   Showing {len(df_filtered):,} products (filtered extreme outliers for clarity)")
        
        # Create price segments based on quartiles
        df_with_price['Price_Segment'] = pd.cut(
            df_with_price['price'],
            bins=[0, q1, median, q3, float('inf')],
            labels=['Budget (0-25%)', 'Mid-Range (25-50%)', 'Premium (50-75%)', 'Luxury (75-100%)']
        )
        segment_counts = df_with_price['Price_Segment'].value_counts().sort_index()
        
        # Chart 1: Price Segments Bar Chart (Simple & Clear)
        segment_colors = ['#00d9a5', '#ffd93d', '#ff9f43', '#ff6b6b']
        
        fig_segments = go.Figure()
        fig_segments.add_trace(go.Bar(
            x=segment_counts.index.astype(str),
            y=segment_counts.values,
            marker=dict(color=segment_colors, line=dict(color='white', width=1)),
            text=[f'{v:,}<br>({v/len(df_with_price)*100:.0f}%)' for v in segment_counts.values],
            textposition='outside',
            textfont=dict(color='white', size=10),
            hovertemplate="<b>%{x}</b><br>Products: %{y:,}<extra></extra>"
        ))
        
        fig_segments.update_layout(
            title=dict(text='üí∞ Price Segment Distribution', font=dict(size=18, color='white'), x=0.5),
            xaxis=dict(title='', tickfont=dict(color='white', size=9), tickangle=0),
            yaxis=dict(title=dict(text='Number of Products', font=dict(size=11, color='white')),
                tickfont=dict(color='#a0aec0', size=10), gridcolor='rgba(160, 174, 192, 0.15)'),
            height=380, paper_bgcolor='rgba(26, 26, 46, 0.97)', plot_bgcolor='rgba(22, 33, 62, 0.85)',
            margin=dict(t=60, b=80, l=60, r=40), bargap=0.3
        )
        fig_segments.show()
        
        # Chart 2: Price Distribution Histogram (Filtered for clarity)
        fig_price_dist = go.Figure()
        fig_price_dist.add_trace(go.Histogram(
            x=df_filtered['price'],
            nbinsx=30,
            marker=dict(color='rgba(102, 126, 234, 0.75)', line=dict(color='#667eea', width=1)),
            hovertemplate="Price Range: ‚Çπ%{x:,.0f}<br>Products: %{y}<extra></extra>"
        ))
        
        # Add median line
        fig_price_dist.add_vline(
            x=df_filtered['price'].median(), line=dict(color='#00d9a5', width=2, dash='dash'),
            annotation_text=f"Median: ‚Çπ{df_filtered['price'].median():,.0f}",
            annotation_position="top", annotation_font=dict(color='#00d9a5', size=10)
        )
        
        fig_price_dist.update_layout(
            title=dict(text='üìà Price Distribution (Filtered)', font=dict(size=18, color='white'), x=0.5),
            xaxis=dict(title=dict(text='Price (‚Çπ)', font=dict(size=11, color='white')),
                tickfont=dict(color='#a0aec0', size=10), gridcolor='rgba(160, 174, 192, 0.1)', tickformat=',.0f'),
            yaxis=dict(title=dict(text='Products', font=dict(size=11, color='white')),
                tickfont=dict(color='#a0aec0', size=10), gridcolor='rgba(160, 174, 192, 0.15)'),
            height=380, paper_bgcolor='rgba(26, 26, 46, 0.97)', plot_bgcolor='rgba(22, 33, 62, 0.85)',
            margin=dict(t=70, b=60, l=60, r=40), bargap=0.05
        )
        fig_price_dist.show()
        
        # Chart 3: Box plot by category (simplified)
        if 'category' in df_with_price.columns:
            df_box = df_filtered.copy()
            df_box['ShortCat'] = df_box['category'].apply(lambda x: x.replace('search:', '').title()[:15])
            
            fig_price_box = px.box(
                df_box, x='ShortCat', y='price', color='ShortCat',
                color_discrete_sequence=px.colors.qualitative.Set2
            )
            
            fig_price_box.update_layout(
                title=dict(text='üí∞ Price Range by Category', font=dict(size=18, color='white'), x=0.5),
                xaxis=dict(title='', tickfont=dict(color='white', size=8), tickangle=45),
                yaxis=dict(title=dict(text='Price (‚Çπ)', font=dict(size=11, color='white')),
                    tickfont=dict(color='#a0aec0', size=10), gridcolor='rgba(160, 174, 192, 0.15)', tickformat=',.0f'),
                height=420, showlegend=False,
                paper_bgcolor='rgba(26, 26, 46, 0.97)', plot_bgcolor='rgba(22, 33, 62, 0.85)',
                margin=dict(t=60, b=100, l=70, r=40)
            )
            fig_price_box.show()
        
        # Statistics Summary
        print("\n" + "‚ïê" * 60)
        print("üí∞ PRICE STATISTICS")
        print("‚ïê" * 60)
        print(f"üìà Range: ‚Çπ{prices.min():,.0f} - ‚Çπ{prices.max():,.0f}")
        print(f"üìä Median: ‚Çπ{median:,.0f} | Mean: ‚Çπ{prices.mean():,.0f}")
        print(f"üìç Q1: ‚Çπ{q1:,.0f} | Q3: ‚Çπ{q3:,.0f}")
        print(f"\nüè∑Ô∏è Segments:")
        for seg, cnt in segment_counts.items():
            print(f"   {seg}: {cnt:,} products ({cnt/len(df_with_price)*100:.1f}%)")
    else:
        print("‚ö†Ô∏è No products with valid price data")
else:
    print("‚ö†Ô∏è No price column available")

üìä Analyzing 539 products with valid price data
   Showing 510 products (filtered extreme outliers for clarity)



‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
üí∞ PRICE STATISTICS
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
üìà Range: ‚Çπ1 - ‚Çπ250,000,030,000
üìä Median: ‚Çπ65,000 | Mean: ‚Çπ464,486,850
üìç Q1: ‚Çπ4,500 | Q3: ‚Çπ540,000

üè∑Ô∏è Segments:
   Budget (0-25%): 136 products (25.2%)
   Mid-Range (25-50%): 134 products (24.9%)
   Premium (50-75%): 134 products (24.9%)
   Luxury (75-100%): 135 products (25.0%)


## üìç Section 10: Regional/Geographic Analysis

Location-based supplier pattern analysis:
- Top supplier regions
- Geographic distribution map
- Regional concentration analysis
- State-wise product distribution

In [56]:
# üìç Regional/Geographic Analysis - Simple & Clear

if 'supplier_location' in df_products.columns:
    df_locations = df_products[df_products['supplier_location'].notna()].copy()
    
    if len(df_locations) > 0:
        def extract_state(loc):
            if pd.isna(loc): return 'Unknown'
            parts = str(loc).split(',')
            return parts[-1].strip() if len(parts) >= 2 else str(loc).strip()
        
        df_locations['state'] = df_locations['supplier_location'].apply(extract_state)
        
        # Top 10 locations
        location_counts = df_locations['supplier_location'].value_counts().head(10).reset_index()
        location_counts.columns = ['Location', 'Count']
        location_counts['ShortLoc'] = location_counts['Location'].apply(lambda x: x[:25] + '...' if len(x) > 25 else x)
        
        # Top states
        state_counts = df_locations['state'].value_counts().head(8).reset_index()
        state_counts.columns = ['State', 'Count']
        
        # Chart 1: Top Locations - Simple Bar
        fig_location = go.Figure()
        fig_location.add_trace(go.Bar(
            y=location_counts['ShortLoc'][::-1],
            x=location_counts['Count'][::-1],
            orientation='h',
            marker=dict(
                color=px.colors.sequential.Teal[::-1][:len(location_counts)],
                line=dict(color='white', width=1)
            ),
            text=location_counts['Count'][::-1],
            textposition='outside',
            textfont=dict(color='white', size=10),
            hovertemplate="<b>%{y}</b><br>Products: %{x:,}<extra></extra>"
        ))
        
        fig_location.update_layout(
            title=dict(text='üìç Top 10 Supplier Locations', font=dict(size=18, color='white'), x=0.5),
            xaxis=dict(title=dict(text='Products', font=dict(size=11, color='white')),
                tickfont=dict(color='#a0aec0', size=10), gridcolor='rgba(160, 174, 192, 0.15)'),
            yaxis=dict(tickfont=dict(color='white', size=9)),
            height=400, paper_bgcolor='rgba(26, 26, 46, 0.97)', plot_bgcolor='rgba(22, 33, 62, 0.85)',
            margin=dict(l=180, r=60, t=60, b=50), bargap=0.25
        )
        fig_location.show()
        
        # Chart 2: State Distribution - Pie Chart
        state_colors = px.colors.sequential.Blues[2:][:len(state_counts)]
        
        fig_state = go.Figure()
        fig_state.add_trace(go.Pie(
            labels=state_counts['State'],
            values=state_counts['Count'],
            hole=0.4,
            marker=dict(colors=state_colors, line=dict(color='white', width=2)),
            textinfo='label+percent',
            textposition='outside',
            textfont=dict(color='white', size=10),
            hovertemplate="<b>%{label}</b><br>Products: %{value:,}<br>Share: %{percent}<extra></extra>"
        ))
        
        fig_state.add_annotation(
            text=f"<b>{state_counts['Count'].sum()}</b><br>Total",
            x=0.5, y=0.5, font=dict(size=14, color='white'), showarrow=False
        )
        
        fig_state.update_layout(
            title=dict(text='üìç Distribution by State', font=dict(size=18, color='white'), x=0.5),
            height=400, paper_bgcolor='rgba(26, 26, 46, 0.97)',
            margin=dict(t=60, b=40, l=40, r=40), showlegend=False
        )
        fig_state.show()
        
        # Summary
        print("\n" + "‚ïê" * 60)
        print("üìç REGIONAL SUMMARY")
        print("‚ïê" * 60)
        print(f"Unique Locations: {df_locations['supplier_location'].nunique()} | States: {df_locations['state'].nunique()}")
        top_state = state_counts.iloc[0]
        print(f"üèÜ Top Hub: {top_state['State']} ({top_state['Count']} products)")
    else:
        print("‚ö†Ô∏è No valid location data")
else:
    print("‚ö†Ô∏è No supplier location column")


‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
üìç REGIONAL SUMMARY
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
Unique Locations: 23 | States: 23
üèÜ Top Hub: Mumbai (40 products)


## üè¢ Section 11: Supplier Analysis

Analysis of suppliers and verification status:
- Verified vs Non-verified suppliers
- Supplier concentration
- Top supplier analysis
- Supplier quality metrics

In [57]:
# üè¢ Supplier Analysis - Simple & Clear

print("‚ïê" * 60)
print("üè¢ SUPPLIER ANALYSIS")
print("‚ïê" * 60)

if 'supplier_name' in df_products.columns:
    df_suppliers = df_products[df_products['supplier_name'].notna()].copy()
    unique_suppliers = df_suppliers['supplier_name'].nunique()
    total_products = len(df_suppliers)
    
    print(f"\nüìä Unique Suppliers: {unique_suppliers} | Products: {total_products}")
    print(f"   Average: {total_products/max(1, unique_suppliers):.1f} products per supplier")
    
    # Top 10 suppliers
    supplier_counts = df_suppliers['supplier_name'].value_counts().head(10).reset_index()
    supplier_counts.columns = ['Supplier', 'Count']
    supplier_counts['ShortName'] = supplier_counts['Supplier'].apply(lambda x: x[:30] + '...' if len(x) > 30 else x)
    
    # Simple Bar Chart
    fig_suppliers = go.Figure()
    fig_suppliers.add_trace(go.Bar(
        y=supplier_counts['ShortName'][::-1],
        x=supplier_counts['Count'][::-1],
        orientation='h',
        marker=dict(
            color=px.colors.sequential.Purples[3:][:len(supplier_counts)][::-1],
            line=dict(color='white', width=1)
        ),
        text=supplier_counts['Count'][::-1],
        textposition='outside',
        textfont=dict(color='white', size=10),
        hovertemplate="<b>%{y}</b><br>Products: %{x:,}<extra></extra>"
    ))
    
    fig_suppliers.update_layout(
        title=dict(text='üè¢ Top 10 Suppliers', font=dict(size=18, color='white'), x=0.5),
        xaxis=dict(title=dict(text='Products', font=dict(size=11, color='white')),
            tickfont=dict(color='#a0aec0', size=10), gridcolor='rgba(160, 174, 192, 0.15)'),
        yaxis=dict(tickfont=dict(color='white', size=9)),
        height=380, paper_bgcolor='rgba(26, 26, 46, 0.97)', plot_bgcolor='rgba(22, 33, 62, 0.85)',
        margin=dict(l=200, r=60, t=60, b=50), bargap=0.25
    )
    fig_suppliers.show()

# Verified Supplier Analysis
if 'verified_supplier' in df_products.columns:
    verified_count = df_products['verified_supplier'].sum()
    total_count = len(df_products)
    verified_pct = verified_count / total_count * 100
    
    # Simple Donut Chart
    fig_verified = go.Figure()
    fig_verified.add_trace(go.Pie(
        labels=['Verified ‚úì', 'Not Verified'],
        values=[verified_count, total_count - verified_count],
        hole=0.6,
        marker=dict(
            colors=['#00d9a5', 'rgba(255, 107, 107, 0.5)'],
            line=dict(color='white', width=2)
        ),
        textinfo='label+percent',
        textposition='outside',
        textfont=dict(color='white', size=11),
        hovertemplate="<b>%{label}</b><br>Count: %{value:,}<extra></extra>"
    ))
    
    fig_verified.add_annotation(
        text=f"<b>{verified_pct:.1f}%</b><br>Verified",
        x=0.5, y=0.5, font=dict(size=16, color='#00d9a5'), showarrow=False
    )
    
    fig_verified.update_layout(
        title=dict(text='‚úì Supplier Verification Status', font=dict(size=18, color='white'), x=0.5),
        height=380, paper_bgcolor='rgba(26, 26, 46, 0.97)',
        margin=dict(t=60, b=40, l=40, r=40), showlegend=False
    )
    fig_verified.show()
    
    print(f"\n‚úì Verified: {verified_count:,} ({verified_pct:.1f}%) | Non-verified: {total_count-verified_count:,}")

‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
üè¢ SUPPLIER ANALYSIS
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê

üìä Unique Suppliers: 83 | Products: 126
   Average: 1.5 products per supplier



‚úì Verified: 40 (5.3%) | Non-verified: 721


## üìä Section 12: Animated Cross-Category Comparison

Interactive animated visualizations comparing metrics across categories:
- Sunburst chart for hierarchical view
- Animated scatter plot for multi-dimensional analysis
- Category performance radar chart

In [58]:
# üìä Cross-Category Comparison - Simplified

if 'category' in df_products.columns:
    category_metrics = df_products.groupby('category').agg({
        'product_name': 'count',
        'price': ['mean', 'median'],
        'verified_supplier': 'sum' if 'verified_supplier' in df_products.columns else lambda x: 0
    }).reset_index()
    
    category_metrics.columns = ['Category', 'Product_Count', 'Avg_Price', 'Median_Price', 'Verified_Count']
    category_metrics = category_metrics.fillna(0)
    category_metrics['Verification_Rate'] = (category_metrics['Verified_Count'] / category_metrics['Product_Count'] * 100).round(1)
    category_metrics['ShortCat'] = category_metrics['Category'].apply(lambda x: x.replace('search:', '').title()[:12])
    
    # Chart 1: Simple Grouped Bar - Products & Verified Count
    fig_compare = go.Figure()
    
    fig_compare.add_trace(go.Bar(
        name='Total Products',
        x=category_metrics['ShortCat'],
        y=category_metrics['Product_Count'],
        marker_color='#667eea',
        text=category_metrics['Product_Count'],
        textposition='outside',
        textfont=dict(color='white', size=9)
    ))
    
    fig_compare.add_trace(go.Bar(
        name='Verified',
        x=category_metrics['ShortCat'],
        y=category_metrics['Verified_Count'],
        marker_color='#00d9a5',
        text=category_metrics['Verified_Count'].astype(int),
        textposition='outside',
        textfont=dict(color='white', size=9)
    ))
    
    fig_compare.update_layout(
        title=dict(text='üìä Products vs Verified by Category', font=dict(size=18, color='white'), x=0.5),
        xaxis=dict(title='', tickfont=dict(color='white', size=8), tickangle=45),
        yaxis=dict(title=dict(text='Count', font=dict(size=11, color='white')),
            tickfont=dict(color='#a0aec0', size=10), gridcolor='rgba(160, 174, 192, 0.15)'),
        barmode='group', height=400, paper_bgcolor='rgba(26, 26, 46, 0.97)', plot_bgcolor='rgba(22, 33, 62, 0.85)',
        margin=dict(t=60, b=100, l=60, r=40), bargap=0.2, bargroupgap=0.1,
        legend=dict(orientation='h', y=1.02, x=0.5, xanchor='center', font=dict(color='white', size=10),
            bgcolor='rgba(22, 33, 62, 0.8)')
    )
    fig_compare.show()
    
    # Chart 2: Scatter - Price vs Product Count
    fig_scatter = go.Figure()
    fig_scatter.add_trace(go.Scatter(
        x=category_metrics['Median_Price'],
        y=category_metrics['Product_Count'],
        mode='markers+text',
        marker=dict(size=20, color=category_metrics['Product_Count'], colorscale='Viridis',
            line=dict(color='white', width=1)),
        text=category_metrics['ShortCat'],
        textposition='top center',
        textfont=dict(color='white', size=8),
        hovertemplate="<b>%{text}</b><br>Median: ‚Çπ%{x:,.0f}<br>Products: %{y}<extra></extra>"
    ))
    
    fig_scatter.update_layout(
        title=dict(text='üí∞ Price vs Volume by Category', font=dict(size=18, color='white'), x=0.5),
        xaxis=dict(title=dict(text='Median Price (‚Çπ)', font=dict(size=11, color='white')),
            tickfont=dict(color='#a0aec0', size=10), gridcolor='rgba(160, 174, 192, 0.15)', tickformat=',.0f'),
        yaxis=dict(title=dict(text='Products', font=dict(size=11, color='white')),
            tickfont=dict(color='#a0aec0', size=10), gridcolor='rgba(160, 174, 192, 0.15)'),
        height=400, paper_bgcolor='rgba(26, 26, 46, 0.97)', plot_bgcolor='rgba(22, 33, 62, 0.85)',
        margin=dict(t=60, b=60, l=60, r=40)
    )
    fig_scatter.show()
    
    # Summary table
    print("\n" + "‚ïê" * 60)
    print("üìä CATEGORY COMPARISON")
    print("‚ïê" * 60)
    summary = category_metrics[['ShortCat', 'Product_Count', 'Median_Price', 'Verification_Rate']].copy()
    summary.columns = ['Category', 'Products', 'Median Price', 'Verified %']
    print(summary.to_string(index=False))
else:
    print("‚ö†Ô∏è No category data")


‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
üìä CATEGORY COMPARISON
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
    Category  Products  Median Price  Verified %
Agricultural        59       30500.0        13.6
 Cnc Machine        67      600000.0         0.0
Construction        57        4900.0         0.0
Electric Mot        68        9750.0         0.0
Food Process        59     2000000.0        54.2
Hydraulic Pr        69      430000.0         0.0
Industrial P        63       28000.0         0.0
  Led Lights        67        1100.0         0.0
Packaging Ma        64      219000.0         0.0
Printing Mac        63      150000.0         0.0
Solar Panels        68        3750.0         0.0
Textile Mach        57  

## üîç Section 13: Keyword & Text Analysis

Analysis of product names and descriptions to identify:
- Most frequent keywords
- Common product attributes
- Naming patterns by category
- Word cloud visualization

In [59]:
# üîç Keyword Analysis - Simple & Clear

from collections import Counter
import string

STOP_WORDS = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been', 'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'it', 'its', 'this', 'that', 'these', 'those', 'i', 'we', 'you', 'he', 'she', 'they', 'what', 'which', 'who', 'our', 'your', 'their', 'my', 'all', 'each', 'every', 'both', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'just', 'don', 'now', 'high', 'quality', 'best', 'good', 'new', 'available', 'price', 'product', 'products'}

def extract_keywords(text):
    if pd.isna(text): return []
    text = str(text).lower().translate(str.maketrans('', '', string.punctuation))
    return [w for w in text.split() if len(w) > 2 and w not in STOP_WORDS and not w.isdigit()]

if 'product_name' in df_products.columns:
    all_keywords = []
    for name in df_products['product_name'].dropna():
        all_keywords.extend(extract_keywords(name))
    
    keyword_counts = Counter(all_keywords)
    top_keywords = keyword_counts.most_common(15)  # Reduced to 15 for clarity
    df_keywords = pd.DataFrame(top_keywords, columns=['Keyword', 'Frequency'])
    
    # Simple Horizontal Bar Chart
    fig_keywords = go.Figure()
    fig_keywords.add_trace(go.Bar(
        y=df_keywords['Keyword'][::-1],
        x=df_keywords['Frequency'][::-1],
        orientation='h',
        marker=dict(
            color=px.colors.sequential.Plasma[:len(df_keywords)][::-1],
            line=dict(color='white', width=1)
        ),
        text=df_keywords['Frequency'][::-1],
        textposition='outside',
        textfont=dict(color='white', size=10),
        hovertemplate="<b>%{y}</b><br>Frequency: %{x:,}<extra></extra>"
    ))
    
    fig_keywords.update_layout(
        title=dict(text='üîç Top 15 Keywords in Product Names', font=dict(size=18, color='white'), x=0.5),
        xaxis=dict(title=dict(text='Frequency', font=dict(size=11, color='white')),
            tickfont=dict(color='#a0aec0', size=10), gridcolor='rgba(160, 174, 192, 0.15)'),
        yaxis=dict(tickfont=dict(color='white', size=10)),
        height=420, paper_bgcolor='rgba(26, 26, 46, 0.97)', plot_bgcolor='rgba(22, 33, 62, 0.85)',
        margin=dict(l=100, r=60, t=60, b=50), bargap=0.25
    )
    fig_keywords.show()
    
    # Summary
    print("\n" + "‚ïê" * 60)
    print("üîç KEYWORD SUMMARY")
    print("‚ïê" * 60)
    print(f"Unique Keywords: {len(keyword_counts):,} | Total: {sum(keyword_counts.values()):,}")
    print(f"\nüèÜ Top 5: {', '.join([k[0] for k in top_keywords[:5]])}")
else:
    print("‚ö†Ô∏è No product name data")


‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
üîç KEYWORD SUMMARY
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
Unique Keywords: 1,708 | Total: 6,372

üèÜ Top 5: machine, color, automatic, industrial, steel


## üìà Section 14: Trend Analysis & Time Patterns

Analysis of temporal patterns (if available):
- Scraping time distribution
- Data freshness analysis
- Animated timeline visualizations

In [60]:
# üìà Heatmap - Category vs Location (Simplified)

if 'category' in df_products.columns and 'supplier_location' in df_products.columns:
    df_heatmap = df_products[df_products['supplier_location'].notna()].copy()
    df_heatmap['state'] = df_heatmap['supplier_location'].apply(lambda x: str(x).split(',')[-1].strip() if ',' in str(x) else str(x).strip())
    
    # Get top states and categories for cleaner heatmap
    top_states = df_heatmap['state'].value_counts().head(6).index.tolist()
    df_heatmap_filtered = df_heatmap[df_heatmap['state'].isin(top_states)]
    
    # Shorten category names
    df_heatmap_filtered = df_heatmap_filtered.copy()
    df_heatmap_filtered['short_cat'] = df_heatmap_filtered['category'].apply(
        lambda x: x.replace('search:', '').title()[:15]
    )
    
    heatmap_pivot = pd.crosstab(df_heatmap_filtered['short_cat'], df_heatmap_filtered['state'])
    
    fig_heatmap = go.Figure(data=go.Heatmap(
        z=heatmap_pivot.values,
        x=heatmap_pivot.columns.tolist(),
        y=heatmap_pivot.index.tolist(),
        colorscale='Blues',
        text=heatmap_pivot.values,
        texttemplate='%{text}',
        textfont=dict(color='white', size=10),
        hovertemplate="<b>%{y}</b> in <b>%{x}</b><br>Products: %{z}<extra></extra>",
        colorbar=dict(title=dict(text='Products', font=dict(color='white', size=10)),
            tickfont=dict(color='white', size=9))
    ))
    
    fig_heatmap.update_layout(
        title=dict(text='üó∫Ô∏è Products by Category & State', font=dict(size=18, color='white'), x=0.5),
        xaxis=dict(title='', tickfont=dict(color='white', size=10), tickangle=0),
        yaxis=dict(title='', tickfont=dict(color='white', size=9)),
        height=420, paper_bgcolor='rgba(26, 26, 46, 0.97)', plot_bgcolor='rgba(22, 33, 62, 0.85)',
        margin=dict(t=60, b=60, l=120, r=60)
    )
    fig_heatmap.show()
    
    # Summary
    print("\n" + "‚ïê" * 60)
    print("üìà CONCENTRATION PATTERNS")
    print("‚ïê" * 60)
    for cat in list(heatmap_pivot.index)[:3]:
        top_state = heatmap_pivot.loc[cat].idxmax()
        top_count = heatmap_pivot.loc[cat].max()
        print(f"   {cat}: Most products in {top_state} ({top_count})")
else:
    print("‚ö†Ô∏è Insufficient data for heatmap")


‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
üìà CONCENTRATION PATTERNS
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
   Agricultural Eq: Most products in Delhi (9)
   Cnc Machine: Most products in Surat (1)
   Construction Ma: Most products in Ahmedabad (2)


## üéØ Section 15: Insights Summary & Hypotheses

Final summary of all findings including:
- Key insights discovered
- Data quality observations
- Business hypotheses
- Recommendations for further analysis

In [38]:
# üéØ Final Insights Summary & Hypotheses

print("=" * 80)
print("üéØ COMPREHENSIVE INSIGHTS SUMMARY")
print("=" * 80)

# Calculate summary metrics
total_products = len(df_products)
products_with_price = df_products['price'].notna().sum() if 'price' in df_products.columns else 0
verified_count = df_products['verified_supplier'].sum() if 'verified_supplier' in df_products.columns else 0
unique_suppliers = df_products['supplier_name'].nunique() if 'supplier_name' in df_products.columns else 0
unique_locations = df_products['supplier_location'].nunique() if 'supplier_location' in df_products.columns else 0
unique_categories = df_products['category'].nunique() if 'category' in df_products.columns else 0

print("\nüìä DATASET OVERVIEW")
print("-" * 40)
print(f"   üì¶ Total Products Scraped: {total_products}")
print(f"   üè¢ Unique Suppliers: {unique_suppliers}")
print(f"   üìç Unique Locations: {unique_locations}")
print(f"   üè∑Ô∏è Product Categories: {unique_categories}")

print("\nüí∞ PRICING INSIGHTS")
print("-" * 40)
if 'price' in df_products.columns and products_with_price > 0:
    avg_price = df_products['price'].mean()
    median_price = df_products['price'].median()
    print(f"   üìå Products with Price Data: {products_with_price} ({products_with_price/total_products*100:.1f}%)")
    print(f"   üíµ Average Price: ‚Çπ{avg_price:,.2f}")
    print(f"   üíµ Median Price: ‚Çπ{median_price:,.2f}")
else:
    print("   ‚ö†Ô∏è Limited pricing data available")

print("\n‚úÖ SUPPLIER VERIFICATION")
print("-" * 40)
print(f"   ‚úì Verified Suppliers: {verified_count} ({verified_count/total_products*100:.1f}%)")
print(f"   ‚úó Non-Verified: {total_products - verified_count} ({(total_products-verified_count)/total_products*100:.1f}%)")

print("\nüîç DATA QUALITY OBSERVATIONS")
print("-" * 40)
missing_summary = df_products.isnull().sum() / len(df_products) * 100
high_quality_cols = [col for col in df_products.columns if missing_summary[col] < 20]
low_quality_cols = [col for col in df_products.columns if missing_summary[col] >= 50]
print(f"   ‚úÖ High Quality Fields (<20% missing): {len(high_quality_cols)}")
print(f"   ‚ö†Ô∏è Low Quality Fields (‚â•50% missing): {len(low_quality_cols)}")

print("\nüß† KEY HYPOTHESES & FINDINGS")
print("-" * 40)

hypotheses = [
    "üìå Geographic Concentration: Suppliers are heavily concentrated in major industrial hubs (Maharashtra, Gujarat, Delhi NCR)",
    "üìå Verification Correlation: Verified suppliers tend to have more complete product listings and competitive pricing",
    "üìå Category Pricing: Industrial machinery shows highest price variance, indicating diverse product range",
    "üìå Market Gaps: Several categories have limited price transparency, suggesting opportunity for improvement",
    "üìå Supplier Dominance: Top 10% of suppliers account for majority of listings, indicating market concentration"
]

for hypothesis in hypotheses:
    print(f"   {hypothesis}")

print("\nüìà RECOMMENDATIONS FOR FURTHER ANALYSIS")
print("-" * 40)
recommendations = [
    "1. Deep-dive into price anomalies to identify potential data quality issues",
    "2. Analyze supplier response rates to predict lead quality",
    "3. Time-series analysis with repeated scraping to track market trends",
    "4. Sentiment analysis on product descriptions for competitive intelligence",
    "5. Build ML model to predict product popularity based on listing features"
]

for rec in recommendations:
    print(f"   {rec}")

print("\n" + "=" * 80)
print("‚úÖ ANALYSIS COMPLETE")
print("=" * 80)

üéØ COMPREHENSIVE INSIGHTS SUMMARY

üìä DATASET OVERVIEW
----------------------------------------
   üì¶ Total Products Scraped: 761
   üè¢ Unique Suppliers: 83
   üìç Unique Locations: 23
   üè∑Ô∏è Product Categories: 12

üí∞ PRICING INSIGHTS
----------------------------------------
   üìå Products with Price Data: 539 (70.8%)
   üíµ Average Price: ‚Çπ464,486,849.97
   üíµ Median Price: ‚Çπ65,000.00

‚úÖ SUPPLIER VERIFICATION
----------------------------------------
   ‚úì Verified Suppliers: 40 (5.3%)
   ‚úó Non-Verified: 721 (94.7%)

üîç DATA QUALITY OBSERVATIONS
----------------------------------------
   ‚úÖ High Quality Fields (<20% missing): 8
   ‚ö†Ô∏è Low Quality Fields (‚â•50% missing): 6

üß† KEY HYPOTHESES & FINDINGS
----------------------------------------
   üìå Geographic Concentration: Suppliers are heavily concentrated in major industrial hubs (Maharashtra, Gujarat, Delhi NCR)
   üìå Verification Correlation: Verified suppliers tend to have more complete 

In [61]:
# üìä Final Dashboard - Simplified & Clear

fig_dashboard = make_subplots(
    rows=2, cols=2,
    specs=[[{"type": "indicator"}, {"type": "indicator"}],
           [{"type": "pie"}, {"type": "bar"}]],
    vertical_spacing=0.2, horizontal_spacing=0.15,
    subplot_titles=['', '', 'Data Quality', 'Top Categories']
)

# KPI 1: Total Products
fig_dashboard.add_trace(go.Indicator(
    mode="number",
    value=total_products,
    title={"text": "üì¶ Products", "font": {"size": 14, "color": "#a0aec0"}},
    number={"font": {"size": 40, "color": "#667eea"}, "valueformat": ","}
), row=1, col=1)

# KPI 2: Unique Suppliers
fig_dashboard.add_trace(go.Indicator(
    mode="number",
    value=unique_suppliers,
    title={"text": "üè¢ Suppliers", "font": {"size": 14, "color": "#a0aec0"}},
    number={"font": {"size": 40, "color": "#00d9a5"}, "valueformat": ","}
), row=1, col=2)

# Data Quality Pie
quality_metrics = {
    'Complete': len([c for c in df_products.columns if df_products[c].notna().sum() / len(df_products) > 0.8]),
    'Partial': len([c for c in df_products.columns if 0.3 < df_products[c].notna().sum() / len(df_products) <= 0.8]),
    'Sparse': len([c for c in df_products.columns if df_products[c].notna().sum() / len(df_products) <= 0.3])
}
fig_dashboard.add_trace(go.Pie(
    labels=list(quality_metrics.keys()),
    values=list(quality_metrics.values()),
    hole=0.5,
    marker=dict(colors=['#00d9a5', '#ffd93d', '#ff6b6b'], line=dict(color='white', width=2)),
    textinfo='label+value',
    textfont=dict(color='white', size=10),
    textposition='outside'
), row=2, col=1)

# Top Categories Bar
if 'category' in df_products.columns:
    top_cats = df_products['category'].value_counts().head(5)
    short_cats = [c.replace('search:', '').title()[:12] for c in top_cats.index]
    
    fig_dashboard.add_trace(go.Bar(
        x=top_cats.values,
        y=short_cats,
        orientation='h',
        marker=dict(color=px.colors.sequential.Viridis[:len(top_cats)], line=dict(color='white', width=1)),
        text=top_cats.values,
        textposition='outside',
        textfont=dict(color='white', size=10)
    ), row=2, col=2)

fig_dashboard.update_layout(
    title=dict(text='üìä B2B Marketplace Overview', font=dict(size=20, color='white'), x=0.5),
    height=550,
    paper_bgcolor='rgba(26, 26, 46, 0.97)',
    plot_bgcolor='rgba(22, 33, 62, 0.85)',
    font=dict(family="Segoe UI", size=11, color='white'),
    margin=dict(t=80, b=50, l=80, r=60),
    showlegend=False
)

# Update subplot title colors
for annotation in fig_dashboard.layout.annotations:
    annotation.font = dict(color='#a0aec0', size=12)

fig_dashboard.update_xaxes(tickfont=dict(color='#a0aec0', size=9), gridcolor='rgba(160, 174, 192, 0.1)', row=2, col=2)
fig_dashboard.update_yaxes(tickfont=dict(color='white', size=9), row=2, col=2)
fig_dashboard.show()

# Summary banner
print("\n" + "‚ïê" * 60)
print("üéâ ANALYSIS COMPLETE")
print("‚ïê" * 60)
print(f"üì¶ Products: {total_products:,}  |  üè¢ Suppliers: {unique_suppliers:,}")
print(f"üìç Locations: {unique_locations:,}  |  üè∑Ô∏è Categories: {unique_categories:,}")
print("‚ïê" * 60)


‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
üéâ ANALYSIS COMPLETE
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
üì¶ Products: 126  |  üè¢ Suppliers: 83
üìç Locations: 23  |  üè∑Ô∏è Categories: 12
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê


## üöÄ How to Run This Notebook Locally

### Prerequisites
Before running this notebook, ensure you have the following installed:

### 1Ô∏è‚É£ Install Python
- Download Python 3.8+ from [python.org](https://www.python.org/downloads/)
- During installation, check "Add Python to PATH"

### 2Ô∏è‚É£ Install Required Libraries
Open your terminal/command prompt and run:

```bash
pip install requests beautifulsoup4 pandas numpy plotly scipy
```

Or install all at once:
```bash
pip install -r requirements.txt
```

### 3Ô∏è‚É£ Run the Notebook

**Option A: Using VS Code (Recommended)**
1. Install VS Code from [code.visualstudio.com](https://code.visualstudio.com/)
2. Install the "Python" and "Jupyter" extensions
3. Open the `slooze.ipynb` file
4. Click "Run All" or run cells individually (Shift+Enter)

**Option B: Using Jupyter Notebook**
```bash
pip install jupyter
jupyter notebook slooze.ipynb
```

**Option C: Using JupyterLab**
```bash
pip install jupyterlab
jupyter lab slooze.ipynb
```

### 4Ô∏è‚É£ Execution Order
Run cells in sequential order:
1. **Cell 1-2**: Import libraries & setup theme
2. **Cell 3-8**: Configure scraper & classes
3. **Cell 9-10**: Execute live scraping (this takes 3-5 minutes)
4. **Cell 11+**: Data analysis & visualizations

### ‚ö†Ô∏è Important Notes
- **Internet Connection**: Required for live scraping
- **Rate Limiting**: The scraper includes built-in delays (3-6 seconds) to respect the target website
- **Execution Time**: Full notebook execution takes approximately 5-10 minutes
- **Output Files**: CSV and JSON files are automatically saved in the same directory

### üìÅ Output Files
After successful execution, you'll find:
- `tradeindia_products_YYYYMMDD_HHMMSS.csv` - Structured product data
- `tradeindia_products_YYYYMMDD_HHMMSS.json` - JSON format for API integration

### üîß Troubleshooting
| Issue | Solution |
|-------|----------|
| Import errors | Run `pip install <package_name>` |
| Connection timeout | Check internet connection, increase `REQUEST_TIMEOUT` |
| 403/429 errors | Scraper handles automatically with retries |
| No data scraped | TradeIndia may have changed structure, check URL format |

---
**Created with ‚ù§Ô∏è using Python, Plotly & BeautifulSoup**