In [6]:
#!/usr/bin/env python3
"""
StarTech FAST Complete Scraper - Speed Optimized
Multi-threaded approach with minimal delays for maximum efficiency
Compatible with Google Colab
"""

# Quick setup for Colab
import sys
import subprocess
import importlib

def install_package(package, import_name=None):
    """Install *package* with pip unless its module can already be imported.

    Args:
        package: A pip requirement string such as ``"tqdm"`` or
            ``"pandas==2.2.0"``. It is passed to pip unchanged.
        import_name: Module name to probe with ``importlib``. When omitted it
            is derived from *package* by cutting off any version specifier,
            extras or marker, and by translating distribution names whose
            module is named differently (``beautifulsoup4`` -> ``bs4``).

    Raises:
        subprocess.CalledProcessError: If the pip invocation fails.
    """
    if import_name is None:
        # Reduce "pkg==1.0", "pkg>=1.0", "pkg[extra]" ... to the bare "pkg".
        dist_name = package
        for separator in ("==", ">=", "<=", "~=", "!=", "<", ">", "[", ";", " "):
            dist_name = dist_name.split(separator, 1)[0]
        dist_name = dist_name.strip()

        # Without this mapping the probe for beautifulsoup4 always fails and
        # pip is re-run on every execution even though bs4 is installed.
        known_aliases = {"beautifulsoup4": "bs4"}
        import_name = known_aliases.get(dist_name.lower(), dist_name)

    try:
        importlib.import_module(import_name)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package, "--quiet"])

print("🚀 Setting up FAST scraper...")
# Only third-party distributions belong in this list: concurrent.futures and
# threading ship with the standard library and are not pip-installable.
required = ["requests", "beautifulsoup4", "pandas", "tqdm"]
for pkg in required:
    install_package(pkg)

import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue
import random
from urllib.parse import urljoin
from dataclasses import dataclass, asdict
from typing import List, Dict, Optional
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

@dataclass
class Product:
    """One product card scraped from a category listing page.

    All fields are plain strings exactly as extracted; nothing is parsed or
    normalised (``price`` stays text, for example).
    """
    name: str  # product title text
    price: str  # price text; the extractor stores "N/A" when no price element is found
    brand: str  # brand text, "" when the card has no brand element
    category: str  # name of the category whose listing was scraped
    subcategory: str  # NOTE(review): the scraper passes the category's *parent* name here - confirm intent
    availability: str  # stock text, "" when the card has no stock element
    image_url: str  # URL of the card's first <img>, joined onto the site base URL
    product_url: str  # URL of the card's first link, joined onto the site base URL
    model: str = ""  # not populated by the extractor in this file
    rating: str = ""  # not populated by the extractor in this file

class FastStarTechScraper:
    """Ultra-fast multi-threaded StarTech scraper"""

    def __init__(self, max_workers=15, max_pages=5):
        self.base_url = "https://www.startech.com.bd"
        self.max_workers = max_workers  # Increased for speed
        self.max_pages = max_pages

        # User agents for rotation - MOVED BEFORE _create_session()
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        ]

        self.session = self._create_session()

        # Data storage
        self.all_categories = []
        self.all_products = []
        self.failed_urls = []

        # Thread safety
        self.lock = threading.Lock()
        self.progress_bar = None

    def _create_session(self):
        """Return a requests.Session with a pooled, retrying adapter and browser-like headers."""
        # One adapter (20 pooled connections, 3 retries) serves both schemes.
        pooled_adapter = requests.adapters.HTTPAdapter(
            max_retries=3,
            pool_maxsize=20,
            pool_connections=20,
        )

        http = requests.Session()
        for scheme in ('http://', 'https://'):
            http.mount(scheme, pooled_adapter)

        # The User-Agent is picked once here, not per request.
        default_headers = {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
        }
        http.headers.update(default_headers)

        return http

    def get_all_categories(self):
        """FAST category extraction"""
        print("🔍 Extracting ALL categories...")

        try:
            response = self.session.get(self.base_url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            categories = []

            # Find all navigation items
            nav_items = soup.select('li.nav-item.has-child')

            for nav_item in nav_items:
                main_link = nav_item.find('a', class_='nav-link')
                if not main_link:
                    continue

                cat_name = main_link.get_text(strip=True)
                cat_url = urljoin(self.base_url, main_link.get('href', ''))

                if len(cat_name) < 2:
                    continue

                # Add main category
                categories.append({
                    'name': cat_name,
                    'url': cat_url,
                    'level': 'main',
                    'parent': None
                })

                # Add subcategories
                dropdown = nav_item.find('ul', class_='drop-down')
                if dropdown:
                    sub_items = dropdown.find_all('li', class_='nav-item', recursive=False)

                    for sub_item in sub_items:
                        sub_link = sub_item.find('a', class_='nav-link')
                        if not sub_link:
                            continue

                        sub_name = sub_link.get_text(strip=True)
                        sub_url = urljoin(self.base_url, sub_link.get('href', ''))

                        if 'show all' in sub_name.lower() or len(sub_name) < 2:
                            continue

                        categories.append({
                            'name': sub_name,
                            'url': sub_url,
                            'level': 'sub',
                            'parent': cat_name
                        })

                        # Add sub-subcategories
                        sub_dropdown = sub_item.find('ul', class_='drop-down')
                        if sub_dropdown:
                            subsub_items = sub_dropdown.find_all('li', class_='nav-item', recursive=False)

                            for subsub_item in subsub_items:
                                subsub_link = subsub_item.find('a', class_='nav-link')
                                if not subsub_link:
                                    continue

                                subsub_name = subsub_link.get_text(strip=True)
                                subsub_url = urljoin(self.base_url, subsub_link.get('href', ''))

                                if 'show all' in subsub_name.lower() or len(subsub_name) < 2:
                                    continue

                                categories.append({
                                    'name': subsub_name,
                                    'url': subsub_url,
                                    'level': 'subsub',
                                    'parent': sub_name
                                })

            self.all_categories = categories
            print(f"✅ Found {len(categories)} total categories/subcategories")

            # Show breakdown
            main_count = len([c for c in categories if c['level'] == 'main'])
            sub_count = len([c for c in categories if c['level'] == 'sub'])
            subsub_count = len([c for c in categories if c['level'] == 'subsub'])

            print(f"   📁 Main categories: {main_count}")
            print(f"   📂 Subcategories: {sub_count}")
            print(f"   📄 Sub-subcategories: {subsub_count}")

            return categories

        except Exception as e:
            print(f"❌ Error extracting categories: {e}")
            return []

    def scrape_category_fast(self, category_data):
        """FAST single category scraping"""
        category_name = category_data['name']
        category_url = category_data['url']
        parent = category_data.get('parent', '')

        products = []

        try:
            # Create fresh session for this thread
            thread_session = requests.Session()
            thread_session.headers.update({
                'User-Agent': random.choice(self.user_agents),
                'Accept': 'text/html,application/xhtml+xml',
                'Connection': 'keep-alive',
            })

            for page in range(1, self.max_pages + 1):
                try:
                    # Construct page URL
                    if page == 1:
                        page_url = category_url
                    else:
                        page_url = f"{category_url}?page={page}" if '?' not in category_url else f"{category_url}&page={page}"

                    # Fast request with short timeout
                    response = thread_session.get(page_url, timeout=8)

                    if response.status_code != 200:
                        break

                    soup = BeautifulSoup(response.content, 'html.parser')

                    # Fast product extraction
                    page_products = self._extract_products_fast(soup, category_name, parent)

                    if not page_products:
                        break

                    products.extend(page_products)

                    # Quick check for next page
                    if not soup.select('.pagination .next:not(.disabled)'):
                        break

                except Exception as e:
                    print(f"⚠️ Error on page {page} of {category_name}: {e}")
                    break

            thread_session.close()
            return products

        except Exception as e:
            print(f"❌ Error scraping {category_name}: {e}")
            return []

    def _extract_products_fast(self, soup, category, subcategory):
        """FAST product extraction - essential fields only"""
        products = []

        # Fast selectors
        product_elements = soup.select('.p-item') or soup.select('.product-item') or soup.select('.main-product')

        for element in product_elements:
            try:
                # Quick extraction - only essential fields
                name_elem = element.select_one('.p-item-name a, .product-title a, h3 a, h4 a')
                name = name_elem.get_text(strip=True) if name_elem else ""

                price_elem = element.select_one('.p-item-price, .price, .current-price')
                price = price_elem.get_text(strip=True) if price_elem else "N/A"

                brand_elem = element.select_one('.brand, .p-item-brand, .manufacturer')
                brand = brand_elem.get_text(strip=True) if brand_elem else ""

                availability_elem = element.select_one('.stock, .availability, .p-item-stock')
                availability = availability_elem.get_text(strip=True) if availability_elem else ""

                img_elem = element.find('img')
                image_url = urljoin(self.base_url, img_elem.get('src', '')) if img_elem else ""

                link_elem = element.find('a', href=True)
                product_url = urljoin(self.base_url, link_elem.get('href', '')) if link_elem else ""

                if name and len(name) > 2:
                    product = Product(
                        name=name[:200],  # Limit length
                        price=price[:50],
                        brand=brand[:50],
                        category=category,
                        subcategory=subcategory,
                        availability=availability[:50],
                        image_url=image_url,
                        product_url=product_url
                    )
                    products.append(product)

            except Exception as e:
                continue  # Skip problematic products

        return products

    def scrape_all_products_fast(self):
        """FAST multi-threaded scraping of ALL products"""
        if not self.all_categories:
            print("❌ No categories found. Run get_all_categories() first.")
            return []

        print(f"\n🚀 Starting FAST multi-threaded scraping...")
        print(f"⚡ Using {self.max_workers} threads")
        print(f"📄 Max {self.max_pages} pages per category")
        print(f"📊 Processing {len(self.all_categories)} categories...")

        start_time = time.time()
        self.all_products = []

        # Progress bar
        self.progress_bar = tqdm(total=len(self.all_categories), desc="Categories", unit="cat")

        # Multi-threaded execution
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit all tasks
            future_to_category = {
                executor.submit(self.scrape_category_fast, cat_data): cat_data
                for cat_data in self.all_categories
            }

            # Collect results as they complete
            for future in as_completed(future_to_category):
                category_data = future_to_category[future]

                try:
                    products = future.result()

                    # Thread-safe update
                    with self.lock:
                        self.all_products.extend(products)

                    self.progress_bar.update(1)
                    self.progress_bar.set_description(f"Categories ({len(self.all_products)} products)")

                except Exception as e:
                    print(f"❌ Failed: {category_data['name']} - {e}")
                    self.progress_bar.update(1)

        self.progress_bar.close()

        # Remove duplicates quickly
        seen = set()
        unique_products = []
        for product in self.all_products:
            identifier = f"{product.name}|{product.product_url}"
            if identifier not in seen:
                seen.add(identifier)
                unique_products.append(product)

        self.all_products = unique_products

        elapsed_time = time.time() - start_time
        rate = len(self.all_products) / elapsed_time * 60  # products per minute

        print(f"\n✅ FAST SCRAPING COMPLETED!")
        print(f"⏱️ Time: {elapsed_time:.1f} seconds ({elapsed_time/60:.1f} minutes)")
        print(f"🛍️ Products: {len(self.all_products):,}")
        print(f"📈 Rate: {rate:.1f} products/minute")
        print(f"🗑️ Duplicates removed: {len(self.all_products) - len(unique_products)}")

        return self.all_products

    def save_results(self, filename_base="startech_fast"):
        """Save results in multiple formats"""
        if not self.all_products:
            print("❌ No products to save")
            return

        timestamp = time.strftime("%Y%m%d_%H%M%S")

        try:
            # CSV for Excel
            csv_file = f"{filename_base}_{timestamp}.csv"
            df = pd.DataFrame([asdict(product) for product in self.all_products])
            df.to_csv(csv_file, index=False, encoding='utf-8-sig')
            print(f"✅ Saved CSV: {csv_file}")

            # JSON for developers
            json_file = f"{filename_base}_{timestamp}.json"
            with open(json_file, 'w', encoding='utf-8') as f:
                json.dump([asdict(product) for product in self.all_products], f, indent=2, ensure_ascii=False)
            print(f"✅ Saved JSON: {json_file}")

            # Categories list
            cat_file = f"{filename_base}_categories_{timestamp}.json"
            with open(cat_file, 'w', encoding='utf-8') as f:
                json.dump(self.all_categories, f, indent=2, ensure_ascii=False)
            print(f"✅ Saved categories: {cat_file}")

            # Summary report
            self._generate_quick_report(filename_base, timestamp)

        except Exception as e:
            print(f"❌ Error saving: {e}")

    def _generate_quick_report(self, filename_base, timestamp):
        """Generate quick summary report"""
        try:
            # Basic statistics
            total_products = len(self.all_products)
            categories_with_products = len(set(p.category for p in self.all_products))
            top_categories = {}
            top_brands = {}

            for product in self.all_products:
                # Category count
                cat = product.category
                top_categories[cat] = top_categories.get(cat, 0) + 1

                # Brand count
                if product.brand:
                    top_brands[product.brand] = top_brands.get(product.brand, 0) + 1

            # Sort top items
            top_categories = dict(sorted(top_categories.items(), key=lambda x: x[1], reverse=True)[:20])
            top_brands = dict(sorted(top_brands.items(), key=lambda x: x[1], reverse=True)[:20])

            report = {
                'scraping_summary': {
                    'timestamp': timestamp,
                    'total_products': total_products,
                    'total_categories_found': len(self.all_categories),
                    'categories_with_products': categories_with_products,
                    'scraping_method': 'fast_multi_threaded'
                },
                'top_categories': top_categories,
                'top_brands': top_brands,
                'sample_products': [asdict(p) for p in self.all_products[:10]]
            }

            report_file = f"{filename_base}_report_{timestamp}.json"
            with open(report_file, 'w', encoding='utf-8') as f:
                json.dump(report, f, indent=2, ensure_ascii=False)
            print(f"✅ Saved report: {report_file}")

        except Exception as e:
            print(f"⚠️ Report generation error: {e}")

# CORRECTED FUNCTION NAMES FOR EASY USAGE
def extract_categories():
    """Fetch the StarTech category list and dump it to a timestamped JSON file.

    The file is named ``startech_categories_<YYYYMMDD_HHMMSS>.json`` and is
    written to the current directory. Returns the list produced by
    FastStarTechScraper.get_all_categories().
    """
    found = FastStarTechScraper().get_all_categories()

    out_path = "startech_categories_{}.json".format(time.strftime("%Y%m%d_%H%M%S"))
    with open(out_path, 'w', encoding='utf-8') as handle:
        json.dump(found, handle, indent=2, ensure_ascii=False)

    return found

def scrape_all_fast(max_workers=15, max_pages=5):
    """Run the full pipeline: discover categories, scrape products, save files.

    Returns None when no categories could be extracted. Otherwise returns a
    dict with the keys ``categories``, ``products`` and ``scraper``. When
    products were found, a top-10 category ranking is printed as well.
    """
    print("🚀 STARTECH FAST COMPLETE SCRAPER")
    print("=" * 50)

    scraper = FastStarTechScraper(max_workers=max_workers, max_pages=max_pages)

    # Step 1: Get categories
    print("Step 1: Extracting categories...")
    categories = scraper.get_all_categories()
    if not categories:
        print("❌ Failed to extract categories")
        return None

    # Step 2: Scrape products
    print("\nStep 2: Scraping all products...")
    products = scraper.scrape_all_products_fast()

    # Step 3: Save results
    print("\nStep 3: Saving results...")
    scraper.save_results()

    outcome = {
        'categories': categories,
        'products': products,
        'scraper': scraper
    }
    if not products:
        return outcome

    # Summary
    print(f"\n🎉 SUCCESS! Scraped {len(products):,} products")
    print(f"📁 Categories processed: {len(categories)}")

    tally = {}
    for item in products:
        tally[item.category] = tally.get(item.category, 0) + 1

    print("\n🏆 TOP 10 CATEGORIES:")
    ranking = sorted(tally.items(), key=lambda entry: entry[1], reverse=True)[:10]
    for position, (label, quantity) in enumerate(ranking, start=1):
        print(f"   {position:2d}. {label[:40]:40s} {quantity:4d} products")

    return outcome

def scrape_sample_fast(max_categories=10, max_pages=3):
    """Trial run that scrapes only the first ``max_categories`` categories.

    Uses 8 worker threads and saves its output under the "startech_sample"
    file prefix. Returns the scraped products, or None when no categories
    could be extracted.
    """
    print(f"🧪 SAMPLE FAST SCRAPING ({max_categories} categories)")

    sampler = FastStarTechScraper(max_workers=8, max_pages=max_pages)

    discovered = sampler.get_all_categories()
    if not discovered:
        return None

    # Keep only the leading slice of the discovered categories.
    sampler.all_categories = discovered[:max_categories]

    harvested = sampler.scrape_all_products_fast()
    sampler.save_results("startech_sample")

    print(f"✅ Sample complete: {len(harvested)} products from {max_categories} categories")
    return harvested

def show_usage():
    """Print the usage guide for the module's entry-point functions."""
    guide = (
        "🚀 STARTECH FAST SCRAPER - USAGE",
        "=" * 40,
        "",
        "📖 MAIN FUNCTIONS:",
        "",
        "1️⃣ COMPLETE SCRAPING (FAST):",
        "   result = scrape_all_fast()",
        "   • Scrapes ALL categories (15 threads)",
        "   • Takes 15-30 minutes typically",
        "   • Auto-saves results",
        "",
        "2️⃣ SAMPLE SCRAPING (TEST):",
        "   result = scrape_sample_fast(max_categories=10)",
        "   • Test with first 10 categories",
        "   • Takes 2-5 minutes",
        "",
        "3️⃣ CATEGORIES ONLY:",
        "   categories = extract_categories()",
        "   • Just get category structure",
        "   • Takes 30 seconds",
        "",
        "⚙️ CUSTOMIZATION:",
        "   # More threads (faster, but higher load)",
        "   result = scrape_all_fast(max_workers=20)",
        "",
        "   # More pages per category",
        "   result = scrape_all_fast(max_pages=10)",
        "",
        "📁 OUTPUT FILES:",
        "   • startech_fast_YYYYMMDD_HHMMSS.csv",
        "   • startech_fast_YYYYMMDD_HHMMSS.json",
        "   • startech_fast_categories_YYYYMMDD_HHMMSS.json",
        "   • startech_fast_report_YYYYMMDD_HHMMSS.json",
    )
    # One print per entry, so every guide line ends with its own newline.
    for text in guide:
        print(text)

# Initialize
# Banner printed once when the cell/module is executed.
for banner_line in (
    "⚡ FAST StarTech Scraper Ready!",
    "📚 Run show_usage() for instructions",
    "🚀 Run scrape_all_fast() to start",
    "🧪 Run scrape_sample_fast() to test",
):
    print(banner_line)

"""
⚡ FAST STARTECH SCRAPER - OPTIMIZED FOR SPEED
=============================================

🎯 KEY IMPROVEMENTS:
✅ 15 concurrent threads (vs 3 before)
✅ Minimal delays (vs long waits before)
✅ Direct requests (vs heavy crawl4ai)
✅ Essential fields only (vs comprehensive extraction)
✅ One HTTP session per category (connections reused across that category's pages)
✅ Quick duplicate removal
✅ Progress tracking with tqdm

🚀 SPEED ESTIMATES:
• Sample (10 categories): 2-5 minutes
• Complete scraping: 15-30 minutes (vs 4-8 hours before)
• Category extraction: 30 seconds

📊 EXPECTED RESULTS:
• 10,000-50,000+ products
• All categories and subcategories
• High-quality essential data
• Multiple output formats

⚠️ NOTES:
• Uses respectful but aggressive threading
• Optimized for speed over comprehensive data
• Falls back gracefully on errors
• Saves results automatically once scraping has finished (no intermediate saves)
"""

🚀 Setting up FAST scraper...
⚡ FAST StarTech Scraper Ready!
📚 Run show_usage() for instructions
🚀 Run scrape_all_fast() to start
🧪 Run scrape_sample_fast() to test




In [7]:
# Smoke test: scrape only the first 5 discovered categories
# (scrape_sample_fast defaults to 3 pages per category and 8 threads).
result = scrape_sample_fast(max_categories=5)

# If that works well, then run the complete scraping
# result = scrape_all_fast()

🧪 SAMPLE FAST SCRAPING (5 categories)
🔍 Extracting ALL categories...
✅ Found 1712 total categories/subcategories
   📁 Main categories: 144
   📂 Subcategories: 988
   📄 Sub-subcategories: 580

🚀 Starting FAST multi-threaded scraping...
⚡ Using 8 threads
📄 Max 3 pages per category
📊 Processing 5 categories...


Categories:   0%|          | 0/5 [00:00<?, ?cat/s]


✅ FAST SCRAPING COMPLETED!
⏱️ Time: 1.9 seconds (0.0 minutes)
🛍️ Products: 31
📈 Rate: 994.4 products/minute
🗑️ Duplicates removed: 0
✅ Saved CSV: startech_sample_20250903_193808.csv
✅ Saved JSON: startech_sample_20250903_193808.json
✅ Saved categories: startech_sample_categories_20250903_193808.json
✅ Saved report: startech_sample_report_20250903_193808.json
✅ Sample complete: 31 products from 5 categories


In [8]:
# Test with a small sample first
# result = scrape_sample_fast(max_categories=5)

# Full run with the defaults (15 threads, up to 5 pages per category).
# `result` is None if category extraction fails, otherwise a dict with the
# keys 'categories', 'products' and 'scraper'.
result = scrape_all_fast()

🚀 STARTECH FAST COMPLETE SCRAPER
Step 1: Extracting categories...
🔍 Extracting ALL categories...
✅ Found 1712 total categories/subcategories
   📁 Main categories: 144
   📂 Subcategories: 988
   📄 Sub-subcategories: 580

Step 2: Scraping all products...

🚀 Starting FAST multi-threaded scraping...
⚡ Using 15 threads
📄 Max 5 pages per category
📊 Processing 1712 categories...


Categories:   0%|          | 0/1712 [00:00<?, ?cat/s]


✅ FAST SCRAPING COMPLETED!
⏱️ Time: 700.8 seconds (11.7 minutes)
🛍️ Products: 8,462
📈 Rate: 724.5 products/minute
🗑️ Duplicates removed: 0

Step 3: Saving results...
✅ Saved CSV: startech_fast_20250903_195133.csv
✅ Saved JSON: startech_fast_20250903_195133.json
✅ Saved categories: startech_fast_categories_20250903_195133.json
✅ Saved report: startech_fast_report_20250903_195133.json

🎉 SUCCESS! Scraped 8,462 products
📁 Categories processed: 1712

🏆 TOP 10 CATEGORIES:
    1. Hikvision                                 110 products
    2. HP                                        101 products
    3. Canon                                      96 products
    4. Dahua                                      92 products
    5. TP-Link                                    88 products
    6. Samsung                                    73 products
    7. Epson                                      69 products
    8. Sony                                       69 products
    9. Transcend               