# In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import time
import concurrent.futures
import re


# ----------------------------------------------------
# Fast Product Scanner for Skill Council India
# ----------------------------------------------------
def fast_scan_skillcouncil():
    """Discover course (product) URLs on the Skill Council India website.

    A fixed list of likely listing pages plus ``product-category`` and
    ``category`` pages is fetched one by one. Every link containing
    ``/product/`` is recorded, and pagination links found on a page are
    queued so that they get scanned as well.

    Returns:
        list[str]: Sorted, de-duplicated course URLs with query strings and
        fragments stripped. An empty list is returned when the home page
        cannot be fetched or does not answer with HTTP 200.
    """
    # Function-scope import: the queue type is only needed here.
    from collections import deque

    base_url = "https://skillcouncil.in"
    all_courses = set()

    print("🚀 FAST Scanning Skill Council India for courses...")

    # Direct product listing pages (common in WooCommerce sites)
    product_pages = [
        f"{base_url}/shop/",
        f"{base_url}/products/",
        f"{base_url}/courses/",
        f"{base_url}/online-courses/",
        f"{base_url}/all-courses/",
        f"{base_url}/certification-courses/",
    ]

    # Add category pages directly
    categories = [
        "banking", "accounting", "finance", "administration", "clerical",
        "management", "computer", "it", "information-technology",
        "education", "training", "business", "construction", "civil",
        "hotel", "tourism", "hospitality", "healthcare", "fitness", "medical",
        "art", "design", "journalism", "media", "publishing",
        "transportation", "logistics", "supply-chain", "fire", "safety",
        "apparel", "textile", "fashion", "yoga", "meditation", "gsdci"
    ]

    for category in categories:
        product_pages.append(f"{base_url}/product-category/{category}/")
        product_pages.append(f"{base_url}/category/{category}/")

    # Find product links - common WooCommerce selectors
    product_selectors = [
        'a.woocommerce-LoopProduct-link',
        'a[href*="/product/"]',
        '.product a[href]',
        '.course a[href]',
        '.product-item a[href]',
        'li.product a'
    ]

    def scan_product_page(session, page_url):
        """Record product links on *page_url* and return its pagination URLs."""
        next_pages = []
        try:
            print(f"🔍 Scanning: {page_url}")
            response = session.get(page_url, timeout=10)
            if response.status_code != 200:
                return next_pages

            soup = BeautifulSoup(response.content, 'html.parser')

            for selector in product_selectors:
                for link in soup.select(selector):
                    href = link.get('href', '')
                    if not href or '/product/' not in href:
                        continue
                    # Resolve against the page actually served (after any
                    # redirect) so relative links keep their directory.
                    full_url = urljoin(response.url, href)
                    clean_url = full_url.split('?')[0].split('#')[0]
                    if clean_url not in all_courses:
                        all_courses.add(clean_url)
                        print(f"   ✅ Found: {clean_url}")

            # Check for pagination
            for next_link in soup.select('a.next, a.page-numbers:not(.current)'):
                href = next_link.get('href', '')
                if href and 'page' in href.lower():
                    next_pages.append(urljoin(response.url, href))

        except Exception as e:
            # Best effort: one broken page must not abort the whole scan.
            print(f"   ⚠️  Error scanning {page_url}: {e}")
        return next_pages

    # The context manager closes the pooled connections when scanning ends.
    with requests.Session() as session:
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        # Test website accessibility with the same headers as the real scan,
        # so the probe and the scan are treated alike by the server.
        try:
            response = session.get(base_url, timeout=10)
        except requests.RequestException as e:
            print(f"❌ Cannot connect to website: {e}")
            return []
        if response.status_code != 200:
            print(f"❌ Website not accessible. Status code: {response.status_code}")
            return []
        print("✅ Website is accessible")

        # Explicit FIFO queue plus a "seen" set instead of appending to a
        # list while iterating over it.
        pending = deque(product_pages)
        queued = set(product_pages)
        while pending:
            page_url = pending.popleft()
            for next_url in scan_product_page(session, page_url):
                if next_url not in queued:
                    queued.add(next_url)
                    pending.append(next_url)
            time.sleep(0.5)  # Small delay to be respectful

    print(f"📊 Total course URLs found: {len(all_courses)}")
    # Sorted so repeated runs produce the same order.
    return sorted(all_courses)


# ----------------------------------------------------
# Fast Course Details Extractor
# ----------------------------------------------------
def fast_get_course_details(course_urls, output_file):
    """Fetch each course page and collect its display name.

    The name is taken from the first heading/title element whose text is
    longer than three characters. If the page cannot be fetched, does not
    answer with HTTP 200, or has no usable title, a name derived from the
    last path segment of the URL is used instead.

    Args:
        course_urls: Sequence of course page URLs (must support ``len`` and
            slicing, e.g. a list).
        output_file: Path of the Excel workbook that receives a progress
            snapshot after every batch.

    Returns:
        list[dict]: One ``{'course_name': ..., 'course_link': ...}`` entry per
        URL, in input order. Empty when ``course_urls`` is empty.
    """
    if not course_urls:
        print("❌ No course URLs to process")
        return []

    print(f"🔄 Fast processing {len(course_urls)} courses...")

    columns = ['course_name', 'course_link']

    # Quick title extraction, most specific selector first
    title_selectors = (
        'h1.product_title',
        'h1.entry-title',
        'h1.title',
        'h1',
        'title',
    )

    def name_from_url(url):
        """Build a readable name from the last path segment of *url*."""
        path = url.split('?')[0].split('#')[0]
        # rstrip first so URLs with and without a trailing slash give the
        # same slug instead of the parent segment (e.g. "product").
        slug = path.rstrip('/').rsplit('/', 1)[-1]
        return re.sub(r'\s+', ' ', slug.replace('-', ' ')).strip().title()

    def extract_course_info(session, url):
        """Return the course record for *url*; never raises."""
        course_name = None
        try:
            response = session.get(url, timeout=10)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                for selector in title_selectors:
                    el = soup.select_one(selector)
                    text = el.get_text(strip=True) if el else ''
                    # Only accept a title that passes the length check; a
                    # rejected one must not leak through as the final name.
                    if len(text) > 3:
                        course_name = text
                        break
        except Exception as e:
            # Best effort: fall through to the URL-derived name.
            print(f"⚠️  Error extracting {url}: {e}")

        return {
            'course_name': course_name or name_from_url(url),
            'course_link': url
        }

    all_courses = []

    # Process in smaller batches so progress can be saved between them
    batch_size = 10
    total_batches = (len(course_urls) - 1) // batch_size + 1

    # The context manager closes the pooled connections when done.
    with requests.Session() as session:
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        for i in range(0, len(course_urls), batch_size):
            batch = course_urls[i:i + batch_size]
            print(f"📖 Processing batch {i//batch_size + 1}/{total_batches}")

            for url in batch:
                course_data = extract_course_info(session, url)
                all_courses.append(course_data)
                print(f"✅ {course_data['course_name']}")

            # Save progress after each batch
            try:
                df = pd.DataFrame(all_courses, columns=columns)
                df.to_excel(output_file, index=False)
                print(f"💾 Saved {len(all_courses)} courses so far...")
            except Exception as e:
                print(f"⚠️  Error saving: {e}")

            time.sleep(1)  # Small delay between batches

    return all_courses


# ----------------------------------------------------
# Ultra Fast Main Function
# ----------------------------------------------------
def ultra_fast_main(save_path=None):
    """Run course discovery and extraction, then save the unique results.

    Args:
        save_path: Destination Excel workbook. When omitted, the original
            hard-coded Downloads location is used, so existing callers keep
            working unchanged.

    Returns:
        None. Progress and results are printed; the workbook is written to
        ``save_path``.
    """
    print("🚀 ULTRA FAST Skill Council Scraper")
    print("=" * 40)

    start_time = time.time()
    if save_path is None:
        save_path = r"C:\Users\taslim.siddiqui\Downloads\skillcouncil_courses_fast.xlsx"

    try:
        # Step 1: Fast scanning
        print("🎯 Step 1: Fast course discovery...")
        course_urls = fast_scan_skillcouncil()

        if not course_urls:
            print("❌ No courses found!")
            return

        print(f"✅ Found {len(course_urls)} course URLs")

        # Step 2: Quick extraction
        print("🎯 Step 2: Fast course extraction...")
        courses = fast_get_course_details(course_urls, save_path)

        if not courses:
            print("❌ No courses extracted")
            return

        # Remove duplicates; URLs differing only by a trailing slash are
        # treated as the same course, and the first occurrence wins.
        unique_courses = []
        seen_urls = set()
        for course in courses:
            clean_url = course['course_link'].rstrip('/')
            if clean_url not in seen_urls:
                seen_urls.add(clean_url)
                unique_courses.append(course)

        # Final save
        df_final = pd.DataFrame(unique_courses, columns=['course_name', 'course_link'])
        df_final.to_excel(save_path, index=False)

        # Measured after the final save so the reported time covers the
        # whole run.
        end_time = time.time()

        print(f"\n🎉 COMPLETED IN {end_time - start_time:.1f} SECONDS!")
        print(f"📊 Final Results: {len(unique_courses)} unique courses")
        print(f"💾 Saved to: {save_path}")

        # Show sample
        print("\n📋 SAMPLE (first 5 courses):")
        print("=" * 50)
        for i, course in enumerate(unique_courses[:5], 1):
            print(f"{i}. {course['course_name']}")
            print(f"   {course['course_link']}")
            print("-" * 50)

    except KeyboardInterrupt:
        print("\n⏹️ Stopped by user")
    except Exception as e:
        # Top-level boundary of the script: report instead of a traceback.
        print(f"❌ Error: {e}")


# ----------------------------------------------------
# SUPER FAST VERSION - Minimal scanning
# ----------------------------------------------------
def super_fast_version(save_path=None):
    """Even faster version - scans only main product pages.

    Collects every ``/product/`` link from a handful of listing pages
    (skipping links that mention ``surewinindia.com``), fetches each course
    page for its ``<h1>`` or ``<title>`` text, and writes the result to an
    Excel workbook.

    Args:
        save_path: Destination Excel workbook. When omitted, the original
            hard-coded Downloads location is used.

    Returns:
        None. Nothing is written when no course links are found.
    """
    print("⚡ SUPER FAST Version - Scanning main pages only")

    base_url = "https://skillcouncil.in"
    if save_path is None:
        save_path = r"C:\Users\taslim.siddiqui\Downloads\skillcouncil_courses_super_fast.xlsx"
    all_courses = set()

    # Only scan these key pages
    key_pages = [
        f"{base_url}/shop/",
        f"{base_url}/products/",
        f"{base_url}/courses/",
        f"{base_url}/online-courses/",
    ]

    def name_from_url(url):
        """Build a readable name from the last path segment of *url*."""
        # rstrip first so URLs with and without a trailing slash give the
        # same slug instead of the parent segment (e.g. "product").
        slug = url.rstrip('/').rsplit('/', 1)[-1]
        return slug.replace('-', ' ').title()

    courses_data = []

    # The context manager closes the pooled connections when done.
    with requests.Session() as session:
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        for page in key_pages:
            try:
                print(f"🔍 Scanning: {page}")
                response = session.get(page, timeout=10)
                soup = BeautifulSoup(response.content, 'html.parser')

                # Get all links with /product/ in them
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if '/product/' in href and 'surewinindia.com' not in href:
                        # Resolve against the page actually served so
                        # relative links keep their directory.
                        full_url = urljoin(response.url, href)
                        clean_url = full_url.split('?')[0].split('#')[0]
                        all_courses.add(clean_url)
                        print(f"✅ Found: {clean_url}")

            except Exception as e:
                print(f"⚠️  Error: {e}")

        # Sorted so repeated runs produce the same row order.
        course_list = sorted(all_courses)
        print(f"📊 Found {len(course_list)} courses")

        if not course_list:
            # Selecting the two columns from an empty DataFrame would raise
            # KeyError, so stop before the save step.
            print("❌ No courses found!")
            return

        # Quick extraction
        for url in course_list:
            name = ''
            try:
                response = session.get(url, timeout=5)
                # Skip error pages: their heading is not a course name.
                if response.status_code == 200:
                    soup = BeautifulSoup(response.content, 'html.parser')
                    title_el = soup.find('h1') or soup.find('title')
                    if title_el:
                        name = title_el.get_text(strip=True)
            except Exception as e:
                # Not a bare except: Ctrl+C must still stop the run.
                print(f"⚠️  Error extracting {url}: {e}")

            name = name or name_from_url(url)
            courses_data.append({'course_name': name, 'course_link': url})
            print(f"✅ Extracted: {name}")

    # Save
    df = pd.DataFrame(courses_data, columns=['course_name', 'course_link'])
    df.to_excel(save_path, index=False)

    print(f"🎉 DONE! Saved {len(courses_data)} courses to {save_path}")


# ----------------------------------------------------
# Choose which version to run
# ----------------------------------------------------
if __name__ == "__main__":
    # Show the menu of available scraper variants.
    for menu_line in (
        "Choose scanning speed:",
        "1. Ultra Fast (Recommended)",
        "2. Super Fast (Quickest)",
    ):
        print(menu_line)

    selection = input("Enter choice (1 or 2): ").strip()

    # Any answer other than "2" runs the recommended variant.
    runner = super_fast_version if selection == "2" else ultra_fast_main
    runner()