In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import time
import re
import os


# ----------------------------------------------------
# Step 1: Find all course/product links on SureWin India
# ----------------------------------------------------
def find_all_course_pages_surewin():
    """Crawl surewinindia.com and collect course/product page URLs.

    Starting from a fixed list of landing pages, every pagination and
    category page found on the same site is fetched, and each link whose href
    contains ``/product/``, ``/course/`` or ``/program/`` is recorded.

    Implementation notes:
      * hrefs are resolved with ``urljoin`` *before* the same-site check, so
        relative links such as ``/product/foo/`` are not dropped;
      * the same-site check compares the parsed host name rather than
        searching the raw href for the domain string;
      * pages are walked with an explicit work list instead of recursion, so
        a long chain of listing pages cannot exhaust the recursion limit;
      * the summary counts only seed pages that actually returned HTTP 200.

    Returns:
        list: Unique course URLs with query string and fragment removed.
        An empty list is returned when the site cannot be reached.
    """
    # Only urljoin is imported at file level, so urlparse is imported here.
    from urllib.parse import urlparse

    base_url = "https://surewinindia.com"
    site_host = urlparse(base_url).hostname
    all_courses = set()

    print("🔍 Scanning SureWin India website for course pages...")

    # Bail out early when the site itself is unreachable.
    try:
        test_response = requests.get(base_url, timeout=10)
    except Exception as e:
        print(f"❌ Cannot connect to website: {e}")
        return []
    if test_response.status_code != 200:
        print(f"❌ Website not accessible. Status code: {test_response.status_code}")
        return []
    print("✅ Website is accessible")

    # Landing pages used as crawl seeds.
    seed_slugs = [
        "shop", "courses", "products", "online-courses", "all-courses",
        "programs", "trainings", "certification-courses", "diploma-courses",
        "professional-courses", "it-courses", "technical-courses",
        "management-courses", "skill-development-courses",
        "office-management-courses", "teacher-training-courses",
        "job-oriented-courses", "safety-management-courses",
        "advance-diploma", "vocation-courses", "yoga-courses",
        "certificate-courses", "industrial-courses", "hobby-courses",
        "edp-courses", "language-courses",
    ]
    main_pages = [base_url] + [f"{base_url}/{slug}/" for slug in seed_slugs]

    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    })

    course_markers = ('/product/', '/course/', '/program/')
    # Pagination selectors followed by category/listing selectors.
    follow_selector = ', '.join([
        '.page-numbers a', '.pagination a', 'a.next', 'a.prev',
        '.nav-links a', 'a.page-numbers',
        'a[href*="category"]', 'a[href*="shop"]', 'a[href*="courses"]',
    ])

    def is_same_site(url):
        """Return True when *url* points at the target host or a subdomain."""
        host = urlparse(url).hostname or ''
        return host == site_host or host.endswith('.' + site_host)

    def scan_page(url):
        """Fetch one page, record its course links, return (ok, follow_ups)."""
        try:
            print(f"🔍 Scanning: {url}")
            response = session.get(url, timeout=15)

            if response.status_code != 200:
                print(f"   ❌ Failed to load: HTTP {response.status_code}")
                return False, []

            soup = BeautifulSoup(response.content, 'html.parser')
            all_links = soup.find_all('a', href=True)
            print(f"   Found {len(all_links)} total links on page")

            found_here = 0
            for link in all_links:
                href = link.get('href', '').strip()
                if not any(marker in href for marker in course_markers):
                    continue
                # Resolve first so relative hrefs pass the same-site check.
                full_url = urljoin(url, href)
                if not is_same_site(full_url):
                    continue
                clean_url = full_url.split('?')[0].split('#')[0]
                if clean_url not in all_courses:
                    all_courses.add(clean_url)
                    found_here += 1
                    print(f"   ✅ Found course: {clean_url}")

            print(f"   📊 Total course links found on this page: {found_here}")

            follow_ups = []
            for anchor in soup.select(follow_selector):
                href = (anchor.get('href') or '').strip()
                if not href:
                    continue
                # Drop the fragment so "#top"-style variants are not re-fetched.
                target = urljoin(url, href).split('#')[0]
                if is_same_site(target):
                    follow_ups.append(target)
            return True, follow_ups

        except Exception as e:
            # Best-effort crawl: one broken page must not abort the whole scan.
            print(f"   ⚠️  Error scanning {url}: {e}")
            return False, []

    page_ok = {}  # url -> True when the page loaded with HTTP 200

    for page in main_pages:
        pending = [page]
        while pending:
            current = pending.pop()
            if current in page_ok:
                continue
            ok, discovered = scan_page(current)
            page_ok[current] = ok
            pending.extend(u for u in discovered if u not in page_ok)
        time.sleep(1)

    successful_scans = sum(1 for page in main_pages if page_ok.get(page))

    print(f"\n📊 Scan Summary: {successful_scans}/{len(main_pages)} pages scanned successfully")
    print(f"📊 Total course URLs found: {len(all_courses)}")

    return list(all_courses)


# ----------------------------------------------------
# Step 2: Extract details from each course page
# ----------------------------------------------------
def get_course_details_surewin(course_urls, output_file):
    """Fetch the title of each course page and persist results to Excel.

    Results already present in *output_file* are loaded first and their URLs
    are skipped, so an interrupted run can be resumed. Progress is written
    back to *output_file* after every third URL and once more at the end.
    All input URLs are also dumped to ``surewin_course_urls_backup.txt``.

    Args:
        course_urls: Iterable of course page URLs to process.
        output_file: Path of the Excel workbook used for resume and output.

    Returns:
        list: Dicts with ``course_name`` and ``course_link`` keys (previously
        saved rows first). An empty list when *course_urls* is empty.
    """
    if not course_urls:
        print("❌ No course URLs to process")
        return []

    # Ordered from most to least specific; the <title> tag is the last resort.
    title_selectors = [
        'h1.product_title', '.product_title',
        'h1.entry-title', '.entry-title',
        'h1.title', 'h1',
        '.product-title', '.course-title',
        'title'
    ]

    def name_from_url(url):
        """Derive a readable name from the last path segment of *url*."""
        # rstrip first so URLs with and without a trailing slash both work.
        slug = url.rstrip('/').rsplit('/', 1)[-1]
        return re.sub(r'\s+', ' ', slug.replace('-', ' ').title()).strip()

    def extract_course_name(soup, url):
        """Return the first title longer than 3 chars, else the URL slug."""
        for selector in title_selectors:
            el = soup.select_one(selector)
            if el:
                text = el.get_text(strip=True)
                if len(text) > 3:
                    return text
        return name_from_url(url)

    # Load existing progress if file exists
    existing_courses = []
    if os.path.exists(output_file):
        try:
            existing_courses = pd.read_excel(output_file).to_dict('records')
            print(f"📁 Loaded {len(existing_courses)} existing courses from {output_file}")
        except Exception as e:
            print(f"📁 Starting fresh - could not read {output_file}: {e}")

    # .get() tolerates a workbook that lacks the course_link column.
    processed_urls = {course.get('course_link') for course in existing_courses}
    urls_to_process = [url for url in course_urls if url not in processed_urls]

    print(f"🔄 Processing {len(urls_to_process)} new URLs")

    # Save backup of all input URLs
    with open('surewin_course_urls_backup.txt', 'w', encoding='utf-8') as f:
        f.writelines(f"{url}\n" for url in course_urls)
    print("💾 URLs backup saved to 'surewin_course_urls_backup.txt'")

    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    })

    all_courses = existing_courses.copy()
    max_retries = 3

    for i, url in enumerate(urls_to_process, 1):
        course_data = None

        for attempt in range(1, max_retries + 1):
            try:
                print(f"📖 Fetching: {url}")
                response = session.get(url, timeout=20)

                if response.status_code == 200:
                    soup = BeautifulSoup(response.content, 'html.parser')
                    course_name = extract_course_name(soup, url)
                    course_data = {
                        'course_name': course_name,
                        'course_link': url
                    }
                    print(f"✅ {i}/{len(urls_to_process)}: {course_name}")
                    break

                print(f"⚠️  HTTP {response.status_code} for {url}")

            except requests.exceptions.Timeout:
                print(f"⏰ Timeout (attempt {attempt}/{max_retries}) for {url}")
            except Exception as e:
                print(f"⚠️  Error (attempt {attempt}): {e}")

            # Back off before retrying, whatever the failure was; no point
            # in waiting after the final attempt.
            if attempt < max_retries:
                time.sleep(2)

        if course_data:
            all_courses.append(course_data)

        # Auto-save every 3
        if i % 3 == 0:
            try:
                pd.DataFrame(all_courses).to_excel(output_file, index=False)
                print(f"💾 AUTO-SAVED: {len(all_courses)} courses to {output_file}")
            except Exception as e:
                print(f"⚠️  Error auto-saving: {e}")

        time.sleep(1)

    try:
        pd.DataFrame(all_courses).to_excel(output_file, index=False)
        print(f"💾 FINAL SAVE: {len(all_courses)} courses to {output_file}")
    except Exception as e:
        print(f"⚠️  Error in final save: {e}")

    return all_courses


# ----------------------------------------------------
# Step 3: Main runner
# ----------------------------------------------------
def main():
    """Run the SureWin scrape end to end.

    Discovers course URLs, fetches their titles (with resume/auto-save into
    ``surewin_courses.xlsx``), writes a de-duplicated copy to
    ``surewin_courses_final.xlsx`` and prints summary statistics plus the
    first ten courses.
    """
    print("🚀 Scraping SureWin India Website")
    print("=" * 50)

    started = time.time()
    save_path = r"C:\Users\taslim.siddiqui\Downloads\surewin_courses.xlsx"

    try:
        print("Step 1: Finding course pages...")
        course_urls = find_all_course_pages_surewin()
        if not course_urls:
            print("\n❌ No course pages found.")
            return
        print(f"\n✅ Step 1 Complete: Found {len(course_urls)} course URLs")

        print("\nStep 2: Extracting course details...")
        courses = get_course_details_surewin(course_urls, save_path)
    except KeyboardInterrupt:
        print("\n⏹️ Process interrupted by user")
        return
    except Exception as e:
        print(f"❌ Critical error: {e}")
        return

    # Timing stops here so the report excludes de-duplication and saving.
    elapsed = time.time() - started

    if not courses:
        print("❌ No courses were processed")
        return

    # Keep the first record seen for each URL, ignoring a trailing slash.
    first_by_url = {}
    for record in courses:
        first_by_url.setdefault(record['course_link'].rstrip('/'), record)
    unique_courses = list(first_by_url.values())

    try:
        final_save_path = r"C:\Users\taslim.siddiqui\Downloads\surewin_courses_final.xlsx"
        pd.DataFrame(unique_courses).to_excel(final_save_path, index=False)
        print(f"💾 FINAL: {len(unique_courses)} courses to {final_save_path}")
    except Exception as e:
        print(f"⚠️  Error saving final file: {e}")

    print("\n🎯 FINAL STATISTICS:")
    print(f"Total course URLs found: {len(course_urls)}")
    print(f"Unique courses extracted: {len(unique_courses)}")
    print(f"Time taken: {elapsed:.2f} seconds")

    print("\n📋 SAMPLE COURSES (first 10):")
    separator = "-" * 80
    print("=" * 80)
    for position, record in enumerate(unique_courses[:10], 1):
        print(f"{position}. {record['course_name']}")
        print(f"   {record['course_link']}")
        print(separator)


# Entry point: run the scrape only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import time
import re
import os

def find_all_product_pages_surewin():
    """Crawl surewinindia.com and collect every ``/product/`` page URL.

    Seeds are a fixed list of landing pages plus guessed category URLs
    (``/product-category/<name>/``, ``/course-category/<name>/`` and
    ``/category/<name>/``). From each fetched page, pagination links that
    contain ``/page/`` and product/course category links are followed.

    Implementation notes:
      * hrefs are resolved with ``urljoin`` before the same-site check, so
        relative links are not dropped;
      * the same-site check compares the parsed host name rather than
        searching the raw href for the domain string;
      * pages are walked with an explicit work list instead of recursion, so
        deep pagination cannot exhaust the recursion limit.

    Returns:
        list: Unique product URLs with query string and fragment removed.
    """
    # Only urljoin is imported at file level, so urlparse is imported here.
    from urllib.parse import urlparse

    base_url = "https://surewinindia.com"
    site_host = urlparse(base_url).hostname
    all_products = set()

    print("🔍 Scanning SureWin India website for product pages...")

    # Main pages to scan on SureWin website
    seed_slugs = [
        "shop", "courses", "products", "online-courses", "all-courses",
        "programs", "trainings", "certification-courses", "diploma-courses",
        "professional-courses", "it-courses", "technical-courses",
        "management-courses", "skill-development-courses",
        "office-management-courses", "teacher-training-courses",
        "job-oriented-courses", "safety-management-courses",
        "advance-diploma", "vocation-courses", "yoga-courses",
        "certificate-courses", "industrial-courses", "hobby-courses",
        "edp-courses", "language-courses",
    ]
    main_pages = [base_url] + [f"{base_url}/{slug}/" for slug in seed_slugs]

    # Common course categories for SureWin
    categories = [
        "computer", "management", "skill", "office", "teacher", "job",
        "safety", "yoga", "certificate", "diploma", "professional",
        "it", "technical", "industrial", "hobby", "language", "web",
        "programming", "accounting", "finance", "beauty", "health",
        "training", "education", "development", "graphic", "hardware",
        "networking", "software", "tally"
    ]
    for category in categories:
        main_pages.extend([
            f"{base_url}/product-category/{category}/",
            f"{base_url}/course-category/{category}/",
            f"{base_url}/category/{category}/"
        ])

    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    })

    def is_same_site(url):
        """Return True when *url* points at the target host or a subdomain."""
        host = urlparse(url).hostname or ''
        return host == site_host or host.endswith('.' + site_host)

    def scan_page(url):
        """Fetch one page, record its product links, return pages to follow."""
        try:
            print(f"Scanning: {url}")
            response = session.get(url, timeout=15)
            if response.status_code != 200:
                return []

            soup = BeautifulSoup(response.content, 'html.parser')

            # Find all product links on SureWin
            for link in soup.find_all('a', href=True):
                href = link.get('href', '').strip()
                if '/product/' not in href:
                    continue
                # Resolve first so relative hrefs pass the same-site check.
                full_url = urljoin(url, href)
                if not is_same_site(full_url):
                    continue
                # Clean URL - remove parameters and fragments
                clean_url = full_url.split('?')[0].split('#')[0]
                if clean_url not in all_products:
                    all_products.add(clean_url)
                    print(f"Found product: {clean_url}")

            follow_ups = []

            # Look for pagination
            for page_link in soup.select('.page-numbers a, .pagination a, a.next, a.prev, .nav-links a'):
                page_href = (page_link.get('href') or '').strip()
                if page_href and '/page/' in page_href:
                    target = urljoin(url, page_href).split('#')[0]
                    if is_same_site(target):
                        follow_ups.append(target)

            # Look for category links
            for cat_link in soup.select('a[href*="product-category"], a[href*="course-category"]'):
                cat_href = (cat_link.get('href') or '').strip()
                if cat_href:
                    target = urljoin(url, cat_href).split('#')[0]
                    if is_same_site(target):
                        follow_ups.append(target)

            return follow_ups

        except Exception as e:
            # Best-effort crawl: one broken page must not abort the whole scan.
            print(f"⚠️  Error scanning {url}: {e}")
            return []

    visited_pages = set()

    # Scan all main pages
    for page in main_pages:
        pending = [page]
        while pending:
            current = pending.pop()
            if current in visited_pages:
                continue
            visited_pages.add(current)
            pending.extend(u for u in scan_page(current) if u not in visited_pages)
        time.sleep(2)

    return list(all_products)

def get_course_details_surewin(product_urls, output_file='surewin_courses_auto.xlsx'):
    """Get course details from SureWin product pages with auto-save to Excel.

    Rows already present in *output_file* are loaded and their URLs skipped,
    so an interrupted run can be resumed. Progress is written back after
    every fifth URL and once more at the end.

    A URL whose final attempt times out or raises is still recorded, with
    ``" (Timeout)"`` or ``" (Error)"`` appended to a name derived from the
    URL. A URL that only ever returns a non-200 status is not recorded.

    Args:
        product_urls: Iterable of product page URLs to process.
        output_file: Path of the Excel workbook used for resume and output.

    Returns:
        list: Dicts with ``course_name`` and ``course_link`` keys, previously
        saved rows first.
    """
    # Ordered from most to least specific.
    title_selectors = [
        'h1.product_title',
        '.product_title',
        'h1.entry-title',
        '.entry-title',
        'h1.title',
        'h1',
        '.product-title',
        '.course-title'
    ]

    def name_from_url(url):
        """Derive a readable name from the last path segment of *url*."""
        # rstrip first so URLs with and without a trailing slash both work.
        slug = url.rstrip('/').rsplit('/', 1)[-1]
        return re.sub(r'\s+', ' ', slug.replace('-', ' ').title()).strip()

    def extract_course_name(soup, url):
        """Return the first title longer than 3 chars, else the URL slug."""
        for selector in title_selectors:
            title_element = soup.select_one(selector)
            if title_element:
                text = title_element.get_text(strip=True)
                if len(text) > 3:
                    return text
        return name_from_url(url)

    # Load existing progress if file exists
    existing_courses = []
    if os.path.exists(output_file):
        try:
            existing_courses = pd.read_excel(output_file).to_dict('records')
            print(f"📁 Loaded {len(existing_courses)} existing courses from {output_file}")
        except Exception as e:
            print(f"📁 Starting fresh - could not read {output_file}: {e}")

    # Create set of already processed URLs; .get() tolerates a workbook
    # that lacks the course_link column.
    processed_urls = {course.get('course_link') for course in existing_courses}

    # Filter out already processed URLs
    urls_to_process = [url for url in product_urls if url not in processed_urls]

    print(f"🔄 Processing {len(urls_to_process)} new URLs (skipping {len(processed_urls)} already processed)")

    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    })

    all_courses = existing_courses.copy()
    max_retries = 2

    for i, url in enumerate(urls_to_process, 1):
        course_data = None

        for attempt in range(1, max_retries + 1):
            is_last_attempt = attempt == max_retries
            try:
                response = session.get(url, timeout=25)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.content, 'html.parser')
                    course_name = extract_course_name(soup, url)
                    course_data = {
                        'course_name': course_name,
                        'course_link': url
                    }
                    print(f"✅ {i}/{len(urls_to_process)}: {course_name}")
                    break

                print(f"⚠️  HTTP {response.status_code} for {url}")
                # Give the server a moment before retrying a bad status.
                if not is_last_attempt:
                    time.sleep(2)

            except requests.exceptions.Timeout:
                print(f"⏰ Timeout (attempt {attempt}/{max_retries}) for {url}")
                if is_last_attempt:
                    print(f"❌ Failed after {max_retries} attempts: {url}")
                    course_data = {
                        'course_name': name_from_url(url) + " (Timeout)",
                        'course_link': url
                    }
                else:
                    time.sleep(3)

            except Exception as e:
                print(f"⚠️  Error (attempt {attempt}): {url} - {e}")
                if is_last_attempt:
                    print(f"❌ Failed after {max_retries} attempts: {url}")
                    course_data = {
                        'course_name': name_from_url(url) + " (Error)",
                        'course_link': url
                    }
                else:
                    time.sleep(2)

        # Add course data if available
        if course_data:
            all_courses.append(course_data)

        # AUTO-SAVE to Excel every 5 courses
        if i % 5 == 0:
            try:
                pd.DataFrame(all_courses).to_excel(output_file, index=False)
                print(f"💾 AUTO-SAVED: {len(all_courses)} courses to {output_file}")
            except Exception as e:
                print(f"⚠️  Error auto-saving: {e}")

        # Progress tracking
        if i % 10 == 0:
            print(f"📊 Progress: {i}/{len(urls_to_process)} new URLs processed | Total: {len(all_courses)} courses")

        time.sleep(1)

    # Final save to Excel
    try:
        pd.DataFrame(all_courses).to_excel(output_file, index=False)
        print(f"💾 FINAL SAVE: {len(all_courses)} courses to {output_file}")
    except Exception as e:
        print(f"⚠️  Error in final save: {e}")

    return all_courses

def main():
    """Scrape all SureWin product pages and export their titles to Excel.

    Steps: discover product URLs, back them up to
    ``surewin_product_urls_backup.txt``, fetch titles (auto-saved to
    ``surewin_courses.xlsx``), write a de-duplicated copy to
    ``surewin_course_finals.xlsx`` and print statistics plus a 15-row sample.
    """
    print("🚀 Scraping ALL course links from SureWin India website (surewinindia.com)")
    print("Target: https://surewinindia.com/product/... pattern\n")

    start_time = time.time()
    # Save to Excel file in Downloads folder
    save_path = r"C:\Users\taslim.siddiqui\Downloads\surewin_courses.xlsx"

    try:
        # Find all product pages on SureWin
        product_urls = find_all_product_pages_surewin()

        if not product_urls:
            print("❌ No product pages found on SureWin website.")
            return

        print(f"\n✅ Found {len(product_urls)} product page URLs on SureWin")

        # Save URLs backup
        with open('surewin_product_urls_backup.txt', 'w', encoding='utf-8') as f:
            f.writelines(f"{url}\n" for url in product_urls)
        print("💾 URLs backup saved to 'surewin_product_urls_backup.txt'")

        # Get course details with automatic saving to Excel
        courses = get_course_details_surewin(product_urls, save_path)

    except KeyboardInterrupt:
        print("\n⏹️ Process interrupted by user")
        # An interrupt inside get_course_details_surewin means it never
        # returned, so there is no in-memory result to save here. Whatever it
        # auto-saved is already on disk.
        if os.path.exists(save_path):
            print(f"💾 Progress up to the last auto-save is in {save_path}")
        return
    except Exception as e:
        print(f"❌ Critical error: {e}")
        return

    end_time = time.time()

    if not courses:
        print("❌ No courses were processed from SureWin")
        return

    # Remove duplicates from final list (URLs compared without trailing slash)
    unique_courses = []
    seen_urls = set()
    for course in courses:
        clean_url = course['course_link'].rstrip('/')
        if clean_url not in seen_urls:
            seen_urls.add(clean_url)
            unique_courses.append(course)

    # Save final deduplicated version to Excel
    try:
        final_save_path = r"C:\Users\taslim.siddiqui\Downloads\surewin_course_finals.xlsx"
        pd.DataFrame(unique_courses).to_excel(final_save_path, index=False)
        print(f"💾 FINAL DEDUPLICATED: {len(unique_courses)} courses to {final_save_path}")
    except Exception as e:
        print(f"⚠️  Error saving final file: {e}")

    print("\n🎯 Final Statistics for SureWin India:")
    print(f"Total product URLs found: {len(product_urls)}")
    print(f"Unique courses extracted: {len(unique_courses)}")
    print(f"Time taken: {end_time - start_time:.2f} seconds")

    # Display sample
    print("\n📋 Sample of SureWin courses:")
    print("=" * 80)
    for i, course in enumerate(unique_courses[:15], 1):
        print(f"{i}. {course['course_name']}")
        print(f"   {course['course_link']}")
        print("-" * 80)

    # Final confirmation
    print(f"\nData saved to {save_path}")

# Entry point: run the scrape only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import time
import re
import os

def find_all_course_pages_iisdt():
    """Crawl the configured site and collect course/product page URLs.

    Seeds are a few landing pages plus guessed category URLs. From each
    fetched page, pagination and category/listing links on the same site are
    followed, and every link whose href contains ``/product/``, ``/course/``
    or ``/program/`` is recorded.

    Implementation notes:
      * the same-site filter is derived from ``base_url``; a hard-coded
        ``'iisdt.com'`` test would reject every link on the configured site;
      * ``base_url`` carries no trailing slash, so seed URLs have no ``//``;
      * hrefs are resolved with ``urljoin`` before the same-site check, so
        relative links are not dropped;
      * pages are walked with an explicit work list instead of recursion;
      * the summary counts only seed pages that actually returned HTTP 200.

    Returns:
        list: Unique course URLs with query string and fragment removed.
    """
    # Only urljoin is imported at file level, so urlparse is imported here.
    from urllib.parse import urlparse

    # NOTE(review): the function name and messages say IISDT, but the
    # configured site is surewinindia.com - confirm which site is intended.
    base_url = "https://surewinindia.com"
    site_host = urlparse(base_url).hostname
    all_courses = set()

    print("🔍 Scanning IISDT website for course pages...")

    # Main pages to scan on IISDT website
    main_pages = [
        base_url,
        f"{base_url}/shop/",
        f"{base_url}/courses/",
        f"{base_url}/products/",
        f"{base_url}/online-courses/",
        f"{base_url}/all-courses/",
        f"{base_url}/programs/",
        f"{base_url}/trainings/",
    ]

    # Common course categories for IISDT
    categories = [
        "human-rights", "law", "legal", "diploma", "certificate",
        "management", "development", "social-work", "education",
        "professional", "training", "course", "program", "sanitizer",
        "manufacturing", "technology", "health", "safety", "bakery",
        "hobby", "industrial", "folklore", "technical", "language", "hobby"
    ]

    # dict.fromkeys drops the repeated category while keeping the order.
    for category in dict.fromkeys(categories):
        main_pages.extend([
            f"{base_url}/product-category/{category}/",
            f"{base_url}/course-category/{category}/",
            f"{base_url}/category/{category}/",
        ])

    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    })

    course_markers = ('/product/', '/course/', '/program/')
    # Pagination selectors followed by category/listing selectors.
    follow_selector = ', '.join([
        '.page-numbers a', '.pagination a', 'a.next', 'a.prev',
        '.nav-links a', 'a.page-numbers',
        'a[href*="category"]', 'a[href*="shop"]', 'a[href*="courses"]',
    ])

    def is_same_site(url):
        """Return True when *url* points at the target host or a subdomain."""
        host = urlparse(url).hostname or ''
        return host == site_host or host.endswith('.' + site_host)

    def scan_page(url):
        """Fetch one page, record its course links, return (ok, follow_ups)."""
        try:
            print(f"Scanning: {url}")
            response = session.get(url, timeout=15)
            if response.status_code != 200:
                print(f"   ❌ Failed to load: HTTP {response.status_code}")
                return False, []

            soup = BeautifulSoup(response.content, 'html.parser')

            # Find ALL links first to see what's available
            all_links = soup.find_all('a', href=True)
            print(f"   Found {len(all_links)} total links on page")

            found_here = 0
            for link in all_links:
                href = link.get('href', '').strip()
                if not any(marker in href for marker in course_markers):
                    continue
                # Resolve first so relative hrefs pass the same-site check.
                full_url = urljoin(url, href)
                if not is_same_site(full_url):
                    continue
                clean_url = full_url.split('?')[0].split('#')[0]
                if clean_url not in all_courses:
                    all_courses.add(clean_url)
                    found_here += 1
                    print(f"   ✅ Found course: {clean_url}")

            print(f"   📊 Total course links found on this page: {found_here}")

            # Look for pagination and category links
            follow_ups = []
            for anchor in soup.select(follow_selector):
                href = (anchor.get('href') or '').strip()
                if not href:
                    continue
                # Drop the fragment so "#top"-style variants are not re-fetched.
                target = urljoin(url, href).split('#')[0]
                if is_same_site(target):
                    follow_ups.append(target)
            return True, follow_ups

        except Exception as e:
            # Best-effort crawl: one broken page must not abort the whole scan.
            print(f"   ⚠️  Error scanning {url}: {e}")
            return False, []

    page_ok = {}  # url -> True when the page loaded with HTTP 200

    # Scan all main pages
    for page in main_pages:
        pending = [page]
        while pending:
            current = pending.pop()
            if current in page_ok:
                continue
            ok, discovered = scan_page(current)
            page_ok[current] = ok
            pending.extend(u for u in discovered if u not in page_ok)
        time.sleep(1)

    successful_scans = sum(1 for page in main_pages if page_ok.get(page))

    print(f"\n📊 Scan Summary: {successful_scans}/{len(main_pages)} pages scanned successfully")
    print(f"📊 Total course URLs found: {len(all_courses)}")

    return list(all_courses)

def _iisdt_name_from_url(url):
    """Derive a readable course name from the last path segment of *url*.

    Works for URLs with or without a trailing slash and ignores any query
    string or fragment. Falls back to the host name when the path is empty.
    """
    from urllib.parse import urlsplit

    parts = urlsplit(url)
    # Strip the trailing slash first so ".../my-course" and ".../my-course/"
    # both resolve to the "my-course" slug instead of its parent segment.
    slug = parts.path.rstrip('/').rsplit('/', 1)[-1] or parts.netloc
    name = re.sub(r'\s+', ' ', slug.replace('-', ' ').title()).strip()
    # Remove numbers at the end like "-1", "-2"
    return re.sub(r'\s+\d+$', '', name)


def _iisdt_extract_title(soup):
    """Return the first usable course title found in *soup*, or None."""
    title_selectors = [
        'h1',  # Most common
        '.title-page h1',
        'h1.product_title',
        '.product_title',
        'h1.entry-title',
        '.entry-title',
        '.product-title',
        '.course-title',
        'title',  # Fallback to page title
    ]

    for selector in title_selectors:
        title_element = soup.select_one(selector)
        if not title_element:
            continue
        candidate = title_element.get_text(strip=True)
        # Very short texts are not accepted as titles; try the next selector
        # rather than letting a rejected candidate leak out of the loop.
        if candidate and len(candidate) > 3:
            # Remove website name if present
            candidate = re.sub(r'\s*-\s*IISDT\s*$', '', candidate, flags=re.IGNORECASE).strip()
            if candidate:
                return candidate
    return None


def get_course_details_iisdt(course_urls, output_file):
    """Fetch each course page and record its name and link in an Excel file.

    Progress is resumable: rows already present in ``output_file`` are loaded
    first and their URLs are skipped. The file is rewritten every 3 processed
    URLs and once more at the end.

    Args:
        course_urls: Iterable of course page URLs to process.
        output_file: Path of the Excel file used for loading and saving progress.

    Returns:
        A list of dicts with ``course_name`` and ``course_link`` keys (existing
        rows followed by newly fetched ones), or ``[]`` when ``course_urls``
        is empty.
    """
    if not course_urls:
        print("❌ No course URLs to process")
        return []

    # Load existing progress if file exists
    existing_courses = []
    if os.path.exists(output_file):
        try:
            existing_courses = pd.read_excel(output_file).to_dict('records')
            print(f"📁 Loaded {len(existing_courses)} existing courses from {output_file}")
        except Exception as e:
            # The file exists but could not be read; say so instead of
            # claiming it is missing, and do not swallow KeyboardInterrupt.
            print(f"📁 Starting fresh - could not read {output_file}: {e}")

    # URLs already present in the saved file; .get() tolerates a file that
    # lacks the 'course_link' column.
    processed_urls = {course.get('course_link') for course in existing_courses}
    urls_to_process = [url for url in course_urls if url not in processed_urls]
    total = len(urls_to_process)

    print(f"🔄 Processing {total} new URLs")

    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    })

    all_courses = existing_courses.copy()
    max_retries = 2

    for i, url in enumerate(urls_to_process, 1):
        course_data = None

        for attempt in range(max_retries):
            failure_label = None
            try:
                print(f"📖 Fetching: {url}")
                response = session.get(url, timeout=20)

                if response.status_code == 200:
                    soup = BeautifulSoup(response.content, 'html.parser')
                    # Prefer a title from the page; otherwise build one from the URL slug
                    course_name = _iisdt_extract_title(soup) or _iisdt_name_from_url(url)
                    course_data = {
                        'course_name': course_name,
                        'course_link': url
                    }
                    print(f"✅ {i}/{total}: {course_name}")
                    break

                print(f"⚠️  HTTP {response.status_code} for {url}")

            except requests.exceptions.Timeout:
                print(f"⏰ Timeout (attempt {attempt + 1}/{max_retries}) for {url}")
                failure_label = "Timeout"

            except Exception as e:
                print(f"⚠️  Error (attempt {attempt + 1}): {e}")
                failure_label = "Error"

            if attempt < max_retries - 1:
                # Back off briefly before retrying instead of hitting the server again at once
                time.sleep(2)
                continue

            # Last attempt failed. Timeouts/errors are recorded with a marker
            # suffix; a non-200 response records nothing, so the URL is
            # picked up again on the next run.
            if failure_label:
                print(f"❌ Failed after {max_retries} attempts: {url}")
                course_data = {
                    'course_name': f"{_iisdt_name_from_url(url)} ({failure_label})",
                    'course_link': url
                }

        # Add course data if available
        if course_data:
            all_courses.append(course_data)

        # AUTO-SAVE to Excel every 3 courses
        if i % 3 == 0:
            try:
                pd.DataFrame(all_courses).to_excel(output_file, index=False)
                print(f"💾 AUTO-SAVED: {len(all_courses)} courses to {output_file}")
            except Exception as e:
                print(f"⚠️  Error auto-saving: {e}")

        # Be polite to the server between pages
        time.sleep(1)

    # Final save
    try:
        pd.DataFrame(all_courses).to_excel(output_file, index=False)
        print(f"💾 FINAL SAVE: {len(all_courses)} courses to {output_file}")
    except Exception as e:
        print(f"⚠️  Error in final save: {e}")

    return all_courses

def main(save_path=r"C:\Users\siddi\Downloads\iisdt_courses.xlsx",
         final_save_path=r"C:\Users\siddi\Downloads\iisdt_courses_final.xlsx"):
    """Run the IISDT scrape end to end and print a summary.

    Steps: discover course URLs, write them to a text backup, fetch each
    course's details (saved incrementally to ``save_path``), then write a
    de-duplicated copy to ``final_save_path``.

    Args:
        save_path: Excel file used for incremental/resumable progress.
        final_save_path: Excel file receiving the de-duplicated result.
    """
    print("🚀 Scraping IISDT Website - Improved Course Detection")
    print("=" * 50)

    start_time = time.time()

    try:
        # Find all course pages
        print("Step 1: Finding course pages...")
        course_urls = find_all_course_pages_iisdt()

        if not course_urls:
            print("\n❌ No course pages found.")
            return

        print(f"\n✅ Step 1 Complete: Found {len(course_urls)} course URLs")

        # Save URLs backup. UTF-8 is set explicitly so a URL containing
        # non-ASCII characters cannot fail on a platform whose default
        # encoding is a legacy code page.
        with open('iisdt_course_urls_backup.txt', 'w', encoding='utf-8') as f:
            f.writelines(f"{url}\n" for url in course_urls)
        print("💾 URLs backup saved to 'iisdt_course_urls_backup.txt'")

        # Get course details
        print("\nStep 2: Extracting course details...")
        courses = get_course_details_iisdt(course_urls, save_path)

    except KeyboardInterrupt:
        print("\n⏹️ Process interrupted by user")
        return
    except Exception as e:
        print(f"❌ Critical error: {e}")
        return

    end_time = time.time()

    if not courses:
        print("❌ No courses were processed")
        return

    # Remove duplicates, treating URLs that differ only by a trailing slash
    # as the same course and keeping the first occurrence.
    unique_courses = []
    seen_urls = set()
    for course in courses:
        clean_url = course['course_link'].rstrip('/')
        if clean_url not in seen_urls:
            seen_urls.add(clean_url)
            unique_courses.append(course)

    # Save final version
    try:
        pd.DataFrame(unique_courses).to_excel(final_save_path, index=False)
        print(f"💾 FINAL: {len(unique_courses)} courses to {final_save_path}")
    except Exception as e:
        print(f"⚠️  Error saving final file: {e}")

    print(f"\n🎯 FINAL STATISTICS:")
    print(f"Total course URLs found: {len(course_urls)}")
    print(f"Unique courses extracted: {len(unique_courses)}")
    print(f"Time taken: {end_time - start_time:.2f} seconds")

    # Display sample
    print(f"\n📋 SAMPLE COURSES (showing first 15):")
    print("=" * 80)
    for i, course in enumerate(unique_courses[:15], 1):
        print(f"{i}. {course['course_name']}")
        print(f"   {course['course_link']}")
        print("-" * 80)

    print(f"\nData saved to {save_path}")

# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()