# In [None]:
import time
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime
from urllib.parse import urljoin

def get_driver():
    """Create the Chrome WebDriver used by the scraper.

    Returns:
        A ``webdriver.Chrome`` instance started with a 1920x1080 window and
        the command-line flags listed below. The chromedriver binary is
        resolved through ``ChromeDriverManager().install()``.
    """
    chrome_options = webdriver.ChromeOptions()
    for flag in (
        "--disable-blink-features=AutomationControlled",
        "--window-size=1920,1080",
        "--log-level=3",
    ):
        chrome_options.add_argument(flag)
    # NOTE: "--headless" is deliberately not passed so the browser window
    # stays visible while debugging.

    chromedriver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(chromedriver_path), options=chrome_options)

def find_all_courses_comprehensive():
    """
    Find ALL courses from the entire website by exploring every possible page.

    The crawl runs in six steps with a single Chrome session:

    1. Open the main store page and collect every link on it.
    2. Visit each link whose text looks like a course category.
    3. Probe a list of common course-listing paths under the site root.
    4. Read candidate sitemap URLs and pick out course URLs.
    5. De-duplicate everything that was collected.
    6. Categorize the courses and save them to Excel.

    Returns:
        A list of course dicts (``url``, ``course_name``, ``category``), or
        an empty list when nothing was found or an unexpected error
        occurred. The browser is always closed before returning.
    """
    print("="*80)
    print("EXTRACTING ALL COURSES FROM ENTIRE WEBSITE")
    print("="*80)

    driver = None
    all_courses = []
    base_url = "https://www.smartonlinecourse.co.in"

    # Link texts containing any of these words are treated as category pages.
    # Defined once here instead of being rebuilt for every link.
    category_keywords = ('risk', 'insurance', 'skill', 'mock', 'banking',
                         'legal', 'technology', 'life', 'magazine', 'productivity')

    try:
        # Initialize driver
        print("üöÄ Initializing Chrome browser...")
        driver = get_driver()

        # STEP 1: Start from main store and find all navigation links
        print("\nüìå STEP 1: Exploring main navigation...")
        main_store_url = f"{base_url}/s/store"
        print(f"üåê Visiting: {main_store_url}")
        driver.get(main_store_url)
        time.sleep(5)

        # Take screenshot for debugging
        driver.save_screenshot("main_store.png")
        print("üì∏ Screenshot saved: main_store.png")

        # Get all links on the main page
        all_links = get_all_links_on_page(driver, base_url)
        print(f"   Found {len(all_links)} total links on main page")

        # STEP 2: Visit all category pages
        print("\nüìå STEP 2: Visiting all category pages...")

        # Extract category links
        category_links = []
        for link_info in all_links:
            url = link_info['url']
            text = link_info['text'].lower()

            if any(keyword in text for keyword in category_keywords) and len(text) > 2:
                category_links.append(url)

        # Remove duplicates while keeping page order, so every run visits the
        # categories in the same (deterministic) sequence.
        category_links = list(dict.fromkeys(category_links))
        print(f"   Found {len(category_links)} category pages to explore")

        for i, category_url in enumerate(category_links, 1):
            print(f"\n   {i}/{len(category_links)}: Visiting category: {category_url}")

            try:
                driver.get(category_url)
                time.sleep(4)

                # Scroll to load all content
                scroll_to_load_all(driver)

                # Extract courses from this category
                courses = extract_courses_from_current_page(driver, base_url)
                print(f"     Found {len(courses)} courses on this page")

                all_courses.extend(courses)

                # Try to find and follow pagination
                more_courses = handle_pagination(driver, base_url)
                all_courses.extend(more_courses)

            except Exception as e:
                # One broken category must not abort the whole crawl.
                print(f"     Error: {e}")
                continue

        # STEP 3: Try to find course listing pages
        print("\nüìå STEP 3: Searching for course listing pages...")

        # Common course listing patterns
        listing_patterns = [
            "courses", "all-courses", "course-list", "online-courses",
            "programs", "learn", "training", "certificate"
        ]

        for pattern in listing_patterns:
            listing_url = f"{base_url}/{pattern}"
            print(f"\n   Checking: {listing_url}")

            try:
                driver.get(listing_url)
                time.sleep(4)

                # Check if page exists and has content
                if "404" not in driver.title.lower() and "not found" not in driver.page_source.lower():
                    scroll_to_load_all(driver)

                    courses = extract_courses_from_current_page(driver, base_url)
                    print(f"     Found {len(courses)} courses")

                    all_courses.extend(courses)

                    # Handle pagination
                    more_courses = handle_pagination(driver, base_url)
                    all_courses.extend(more_courses)

            except Exception as e:
                print(f"     Error: {e}")
                continue

        # STEP 4: Try to discover via sitemap or robots.txt
        print("\nüìå STEP 4: Checking for sitemap...")

        sitemap_urls = [
            f"{base_url}/sitemap.xml",
            f"{base_url}/sitemap_index.xml",
            f"{base_url}/wp-sitemap.xml",
        ]

        for sitemap_url in sitemap_urls:
            try:
                print(f"\n   Checking sitemap: {sitemap_url}")
                driver.get(sitemap_url)
                time.sleep(3)

                # Parse XML content
                page_source = driver.page_source
                if 'xml' in page_source:
                    soup = BeautifulSoup(page_source, 'xml')
                    urls = soup.find_all('loc')

                    course_urls = []
                    for url_tag in urls:
                        # Strip surrounding whitespace/newlines: the ID regex
                        # below is anchored with "$" and would otherwise miss
                        # URLs from pretty-printed sitemaps.
                        url = url_tag.get_text(strip=True)
                        if '/courses/' in url and re.search(r'-[a-f0-9]{24}$', url):
                            course_urls.append(url)

                    print(f"     Found {len(course_urls)} course URLs in sitemap")

                    for course_url in course_urls:
                        course_name = extract_course_name_from_url(course_url)
                        all_courses.append({
                            'url': course_url,
                            'course_name': course_name
                        })

            except Exception as e:
                print(f"     Error: {e}")
                continue

        # STEP 5: Remove duplicates and process results
        print("\nüìå STEP 5: Processing results...")

        unique_courses = remove_duplicates(all_courses)
        print(f"‚úÖ Total unique courses found: {len(unique_courses)}")

        # STEP 6: Categorize and save
        if unique_courses:
            categorized_courses = categorize_courses(unique_courses)
            save_all_courses_to_excel(categorized_courses)
            return categorized_courses
        else:
            print("\n‚ùå No courses found")
            return []

    except Exception as e:
        # Top-level boundary: report the failure and return an empty result
        # so the caller can fall back to other discovery methods.
        print(f"\nüí• Error: {str(e)}")
        import traceback
        traceback.print_exc()
        return []
    finally:
        # Always release the browser, even when the crawl failed midway.
        if driver:
            driver.quit()

def get_all_links_on_page(driver, base_url):
    """Collect every usable hyperlink from the page the driver is showing.

    Args:
        driver: Object exposing the rendered HTML as ``page_source``.
        base_url: Site root used to resolve hrefs that start with ``/``.

    Returns:
        A list of ``{'url': ..., 'text': ...}`` dicts in document order.
        URLs have their query string, fragment and trailing slash removed.
        Hrefs that start with neither ``/`` nor ``http`` are skipped.
    """
    parsed_page = BeautifulSoup(driver.page_source, 'html.parser')
    collected = []

    for anchor in parsed_page.find_all('a', href=True):
        target = anchor.get('href', '')
        label = anchor.get_text(strip=True)

        # Empty hrefs carry no destination.
        if not target:
            continue

        if target.startswith('/'):
            absolute = urljoin(base_url, target)
        elif target.startswith('http'):
            absolute = target
        else:
            # e.g. "mailto:", "javascript:" or bare relative paths
            continue

        without_query = absolute.split('?')[0]
        without_fragment = without_query.split('#')[0]
        collected.append({
            'url': without_fragment.rstrip('/'),
            'text': label
        })

    return collected

def scroll_to_load_all(driver, max_rounds=50):
    """Scroll to the bottom repeatedly so lazy-loaded content gets rendered.

    Each round scrolls to the bottom of the page, clicks any visible button
    whose text contains "Load", "More" or "Show", and compares the document
    height with the previous round. Scrolling stops once the height has been
    unchanged for five consecutive rounds, or after ``max_rounds`` rounds.

    Args:
        driver: Selenium WebDriver positioned on the page to expand.
        max_rounds: Hard upper bound on scroll rounds. Without it a page
            whose height grows forever (endless feed) would never let the
            loop finish, because the stable-height counter resets on every
            change.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    scroll_attempts = 0  # consecutive rounds without a height change
    rounds_done = 0

    while scroll_attempts < 5 and rounds_done < max_rounds:
        rounds_done += 1

        # Scroll down
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

        # Check for "Load More" buttons. Clicking is best-effort: failures are
        # ignored, but only ordinary exceptions are swallowed so Ctrl+C and
        # interpreter shutdown still work.
        try:
            buttons = driver.find_elements(By.XPATH,
                "//button[contains(., 'Load') or contains(., 'More') or contains(., 'Show')]")
        except Exception:
            buttons = []

        for button in buttons:
            try:
                if button.is_displayed():
                    driver.execute_script("arguments[0].scrollIntoView();", button)
                    time.sleep(1)
                    button.click()
                    print("       Clicked 'Load More' button")
                    time.sleep(3)
            except Exception:
                # Stale, covered or non-clickable button: try the next one.
                continue

        # Calculate new scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")

        if new_height == last_height:
            scroll_attempts += 1
        else:
            scroll_attempts = 0
            last_height = new_height

        time.sleep(1)

def extract_courses_from_current_page(driver, base_url):
    """Extract all course links from the page currently loaded in the driver.

    Two passes are made over ``driver.page_source``:

    1. A regex scan of the raw source for ``/courses/<slug>-<24 hex chars>``
       URLs (absolute or site-relative), named from the URL.
    2. A scan of ``<a href>`` tags, which prefers the link text as the
       course name when it is longer than five characters.

    Args:
        driver: Object exposing the rendered HTML as ``page_source``.
        base_url: Site root used to resolve hrefs that start with ``/``.

    Returns:
        A list of ``{'url': ..., 'course_name': ...}`` dicts. The same course
        may appear more than once; callers de-duplicate afterwards.
    """
    courses = []

    # Get page source
    page_source = driver.page_source

    # METHOD 1: Search for course URL patterns
    patterns = [
        r'https?://[^"\']+/courses/[^"\']+?-[a-f0-9]{24}[^"\']*',
        r'/courses/[^"\']+?-[a-f0-9]{24}',
    ]

    for pattern in patterns:
        # The patterns contain no capture groups, so findall yields strings.
        for match in re.findall(pattern, page_source, re.IGNORECASE):
            # Construct full URL
            if match.startswith('/'):
                full_url = urljoin(base_url, match)
            elif match.startswith('http'):
                full_url = match
            else:
                continue

            # Clean URL
            clean_url = full_url.split('?')[0].split('#')[0].rstrip('/')

            courses.append({
                'url': clean_url,
                'course_name': extract_course_name_from_url(clean_url)
            })

    # METHOD 2: Also look for links in anchor tags
    soup = BeautifulSoup(page_source, 'html.parser')

    for tag in soup.find_all('a', href=True):
        href = tag.get('href', '')
        text = tag.get_text(strip=True)

        if not href or '/courses/' not in href:
            continue

        # Make full URL
        if href.startswith('/'):
            full_url = urljoin(base_url, href)
        elif href.startswith('http'):
            full_url = href
        else:
            continue

        clean_url = full_url.split('?')[0].split('#')[0].rstrip('/')

        # Validate the ID suffix on the CLEANED url. Checking the raw href
        # rejected links such as "/courses/x-<id>?ref=home" or
        # "/courses/x-<id>/" because the regex is anchored with "$".
        if not re.search(r'-[a-f0-9]{24}$', clean_url):
            continue

        # Use link text if available, otherwise extract from URL
        if text and len(text) > 5:
            course_name = text
        else:
            course_name = extract_course_name_from_url(clean_url)

        courses.append({
            'url': clean_url,
            'course_name': course_name
        })

    return courses

def handle_pagination(driver, base_url):
    """Visit pages 2-5 of the current listing, if it appears to be paginated.

    Pagination is assumed when the page contains a link whose href holds
    ``page=``/``p=`` or whose text contains "2" or "Next". Page URLs are
    built by appending ``page=<n>`` to the URL the driver was on when this
    function was called.

    Args:
        driver: Selenium WebDriver positioned on the first listing page.
        base_url: Site root, forwarded to the course extractor.

    Returns:
        A list of ``{'url': ..., 'course_name': ...}`` dicts gathered from the
        follow-up pages (empty when no pagination was detected or an error
        occurred). Pagination stops early when a page yields no course URL
        that was not already seen on an earlier follow-up page.
    """
    more_courses = []

    try:
        # Look for pagination links
        pagination_links = driver.find_elements(By.XPATH,
            "//a[contains(@href, 'page=') or contains(@href, 'p=') or contains(text(), '2') or contains(text(), 'Next')]")

        if not pagination_links:
            return more_courses

        print("     Found pagination, checking next pages...")

        # Capture the starting URL ONCE. Re-reading driver.current_url after
        # each navigation made the URLs pile up ("?page=2&page=3&page=4").
        first_page_url = driver.current_url.split('#')[0]
    except Exception as e:
        print(f"     Pagination check failed: {e}")
        return more_courses

    separator = '&' if '?' in first_page_url else '?'
    seen_urls = set()

    # Try to visit next few pages
    for i in range(2, 6):  # Check pages 2-5
        page_url = f"{first_page_url}{separator}page={i}"

        try:
            print(f"       Checking page {i}: {page_url}")
            driver.get(page_url)
            time.sleep(4)

            # Scroll to load
            scroll_to_load_all(driver)

            # Extract courses
            courses = extract_courses_from_current_page(driver, base_url)
        except Exception as e:
            print(f"       Stopping pagination at page {i}: {e}")
            break

        print(f"         Found {len(courses)} courses on page {i}")

        # A page that adds nothing new means we ran past the last page, or
        # the site ignores the "page" parameter and keeps serving the same
        # content; further requests would only waste time.
        new_urls = {course['url'] for course in courses} - seen_urls
        if not new_urls:
            break

        seen_urls |= new_urls
        more_courses.extend(courses)

    return more_courses

def extract_course_name_from_url(url):
    """Derive a human-readable course name from a course URL.

    Resolution order:

    1. ``/courses/<slug>-<24 hex chars>`` with a plain alphanumeric/hyphen
       slug: the slug is turned into capitalized words.
    2. Any other ``/courses/<rest>`` URL: query string, fragment, trailing
       slash and a trailing 24-hex ID are removed, percent-escapes are
       decoded, then the rest is turned into capitalized words.
    3. Otherwise the last path segment is title-cased.

    Args:
        url: The URL to inspect.

    Returns:
        The derived name, or ``"Course"`` when ``url`` is not a string or no
        non-empty name can be derived.
    """
    from urllib.parse import unquote

    # Non-string input (None, numbers, bytes) cannot be parsed; use the
    # generic placeholder instead of relying on a catch-all except.
    if not isinstance(url, str):
        return "Course"

    def _slug_to_words(slug):
        """Turn 'risk-management' into 'Risk Management'."""
        return ' '.join(word.capitalize() for word in slug.replace('-', ' ').split())

    # Pattern: /courses/[course-name]-[24-char-id]
    match = re.search(r'/courses/([a-zA-Z0-9\-]+)-[a-f0-9]{24}', url)
    if match:
        return _slug_to_words(match.group(1)) or "Course"

    # Drop query/fragment/trailing slash first so the "$"-anchored ID regex
    # below still works for URLs like ".../name-<id>?ref=home".
    path_only = url.split('?')[0].split('#')[0].rstrip('/')

    # Alternative pattern: slug contains characters outside [a-zA-Z0-9-]
    match = re.search(r'/courses/(.+)', path_only)
    if match:
        slug = re.sub(r'-[a-f0-9]{24}$', '', match.group(1))
        # Decode every percent-escape (not just %20) before splitting words.
        return _slug_to_words(unquote(slug)) or "Course"

    last_segment = unquote(path_only.split('/')[-1])
    return last_segment.replace('-', ' ').title() or "Course"

def remove_duplicates(courses):
    """Return the courses with repeated URLs dropped (first occurrence wins).

    URLs are compared after removing the query string, the fragment and any
    trailing slash. Each returned entry is a new dict holding only the
    normalized ``url`` and the ``course_name`` of the first occurrence; the
    input dicts are not modified.
    """
    first_by_url = {}

    for entry in courses:
        normalized = entry['url'].split('?')[0].split('#')[0].rstrip('/')
        if normalized in first_by_url:
            continue
        first_by_url[normalized] = {
            'url': normalized,
            'course_name': entry['course_name']
        }

    # dicts preserve insertion order, so this keeps first-seen ordering.
    return list(first_by_url.values())

def categorize_courses(courses):
    """Assign a category to every course based on keywords.

    The lower-cased course name and URL are searched for each category's
    keywords; the first category (in the order listed below) with a matching
    keyword wins, and courses with no match fall into ``'Other'``.

    Args:
        courses: List of dicts with ``course_name`` and ``url`` keys.

    Returns:
        A list containing the same dict objects, each with a ``category``
        key added (the input dicts are updated in place).
    """
    # Order matters: more specific categories must come before the generic
    # ones they overlap with. 'Life Insurance' used to be listed after
    # 'Insurance', whose keyword "insurance" always matched first, so the
    # 'Life Insurance' category could never be assigned.
    categories = [
        ('Risk Management', ['risk management', 'risk']),
        ('Life Insurance', ['life insurance']),
        ('Insurance', ['insurance', 'insure', 'policy']),
        ('Legal and Regulatory Compliance', ['legal', 'law', 'regulation', 'compliance']),
        ('Banking', ['banking', 'bank', 'finance']),
        ('MAGAZINE', ['magazine']),
        ('Mock Test', ['mock test', 'exam', 'quiz']),
        ('Skill Building and Productivity', ['skill', 'productivity', 'soft skills']),
        ('Technology', ['technology', 'tech', 'cyber', 'digital', 'ai', 'artificial intelligence']),
    ]

    def _keyword_matches(keyword, text):
        """Substring match, except very short keywords must be whole words."""
        if len(keyword) <= 2:
            # "ai" as a plain substring also hits "training", "maintain", ...
            return re.search(rf'\b{re.escape(keyword)}\b', text) is not None
        return keyword in text

    categorized = []

    for course in courses:
        search_text = (course['course_name'] + ' ' + course['url']).lower()

        assigned_category = 'Other'
        for category, keywords in categories:
            if any(_keyword_matches(keyword, search_text) for keyword in keywords):
                assigned_category = category
                break

        course['category'] = assigned_category
        categorized.append(course)

    return categorized

def save_all_courses_to_excel(courses, output_path=None):
    """Write the categorized courses to an Excel workbook and print a summary.

    Args:
        courses: List of dicts with ``course_name``, ``url`` and ``category``.
        output_path: Destination ``.xlsx`` file. When omitted, the original
            hard-coded Downloads location is used, so existing callers keep
            their behavior while other machines can pass their own path.

    Returns:
        The ``pandas.DataFrame`` that was written (columns: S.No, Course Name,
        URL, Category, Scraped_Date).
    """
    if output_path is None:
        output_path = r"C:\Users\taslim.siddiqui\Downloads\all_courses_complete.xlsx"

    # One timestamp for the whole export, so all rows agree even when the
    # loop straddles a minute boundary.
    scraped_at = datetime.now().strftime('%d-%m-%Y %H:%M')

    # Prepare data
    data = [
        {
            'S.No': i,
            'Course Name': course['course_name'],
            'URL': course['url'],
            'Category': course['category'],
            'Scraped_Date': scraped_at
        }
        for i, course in enumerate(courses, 1)
    ]

    # Explicit columns keep the frame well-formed for an empty course list;
    # otherwise df['Category'] below would raise KeyError.
    df = pd.DataFrame(
        data,
        columns=['S.No', 'Course Name', 'URL', 'Category', 'Scraped_Date'],
    )

    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name='All Courses', index=False)

        # Auto-adjust columns: widest cell + 2, capped at 50 characters.
        worksheet = writer.sheets['All Courses']
        for column in worksheet.columns:
            max_length = 0
            column_letter = column[0].column_letter
            for cell in column:
                try:
                    if cell.value:
                        max_length = max(max_length, len(str(cell.value)))
                except Exception:
                    # Sizing is cosmetic; never fail the export over it.
                    pass
            adjusted_width = min(max_length + 2, 50)
            worksheet.column_dimensions[column_letter].width = adjusted_width

        # Bold header
        for cell in worksheet[1]:
            cell.font = cell.font.copy(bold=True)

    print(f"\nüíæ Saved {len(df)} courses to: {output_path}")

    # Print comprehensive summary
    print("\nüìä COMPREHENSIVE CATEGORY DISTRIBUTION:")
    category_counts = df['Category'].value_counts()
    for category, count in category_counts.items():
        print(f"   {category}: {count} courses")

    # Show sample from each category
    print("\nüìã SAMPLE FROM EACH CATEGORY:")
    for category in category_counts.index:
        category_courses = df[df['Category'] == category].head(3)
        print(f"\n   {category} (showing {len(category_courses)} of {category_counts[category]}):")
        for _, row in category_courses.iterrows():
            print(f"     ‚Ä¢ {row['Course Name']}")

    return df

# Advanced discovery using API/JSON endpoints
def discover_courses_via_api():
    """Try to discover courses via API endpoints.

    Sends a GET request to a list of commonly used API/JSON paths under the
    site root. For every response with status 200 the body is parsed as JSON
    and scanned for course entries; if the body is not valid JSON it is
    scanned as plain text/HTML instead.

    Returns:
        A list of ``{'url': ..., 'course_name': ...}`` dicts collected from
        all endpoints (possibly containing duplicates).
    """
    print("\n" + "="*80)
    print("ADVANCED DISCOVERY VIA API ENDPOINTS")
    print("="*80)

    import requests

    base_url = "https://www.smartonlinecourse.co.in"
    all_courses = []

    # Common API endpoints
    api_endpoints = [
        f"{base_url}/api/courses",
        f"{base_url}/api/v1/courses",
        f"{base_url}/wp-json/wp/v2/courses",
        f"{base_url}/graphql",
        f"{base_url}/courses.json",
        f"{base_url}/data/courses.json",
    ]

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    for endpoint in api_endpoints:
        try:
            print(f"\nüîç Checking API: {endpoint}")
            response = requests.get(endpoint, headers=headers, timeout=10)

            if response.status_code == 200:
                try:
                    data = response.json()
                except ValueError:
                    # Catch ValueError rather than json.JSONDecodeError: the
                    # decode error raised by response.json() depends on the
                    # requests version and on whether simplejson is
                    # installed, but it is always a ValueError subclass.
                    # Try to parse as text/HTML
                    courses = extract_courses_from_text(response.text, base_url)
                    print(f"   Found {len(courses)} courses in response text")
                    all_courses.extend(courses)
                else:
                    courses = extract_courses_from_json(data, base_url)
                    print(f"   Found {len(courses)} courses via API")
                    all_courses.extend(courses)

        except Exception as e:
            # Network errors or unexpected payloads: move on to the next endpoint.
            print(f"   Error: {e}")
            continue

    return all_courses

def extract_courses_from_json(data, base_url):
    """Extract courses from arbitrarily nested JSON data.

    Every dict in ``data`` (at any depth) that has a string ``url`` containing
    ``/courses/`` and ending, after cleanup, in ``-<24 hex chars>`` is treated
    as a course.

    Args:
        data: Parsed JSON (dicts, lists and scalars).
        base_url: Site root used to turn relative URLs into absolute ones.

    Returns:
        A list of ``{'url': ..., 'course_name': ...}`` dicts. URLs are
        absolute and have query string, fragment and trailing slash removed,
        matching the other extractors in this file. The name comes from the
        dict's ``name``, ``title`` or ``course_name`` key when that is a
        non-empty string, otherwise it is derived from the URL.
    """
    courses = []

    def traverse(obj):
        if isinstance(obj, dict):
            # Check for course data
            url = obj.get('url')
            if isinstance(url, str) and '/courses/' in url:
                # Resolve against base_url (previously unused) and clean the
                # URL BEFORE validating the ID, so "/courses/x-<id>/?ref=1"
                # is accepted and stored in the same form as everywhere else.
                clean_url = urljoin(base_url, url).split('?')[0].split('#')[0].rstrip('/')

                if re.search(r'-[a-f0-9]{24}$', clean_url):
                    course_name = obj.get('name') or obj.get('title') or obj.get('course_name', '')
                    # Names may be nested objects or numbers in some APIs;
                    # later steps concatenate the name as a string.
                    if not isinstance(course_name, str) or not course_name.strip():
                        course_name = extract_course_name_from_url(clean_url)

                    courses.append({
                        'url': clean_url,
                        'course_name': course_name
                    })

            # Recursively traverse
            for value in obj.values():
                traverse(value)

        elif isinstance(obj, list):
            for item in obj:
                traverse(item)

    traverse(data)
    return courses

def extract_courses_from_text(text, base_url):
    """Scan raw text or HTML for course URLs.

    Args:
        text: The text to search.
        base_url: Site root used to resolve matches that start with ``/``.

    Returns:
        A list of ``{'url': ..., 'course_name': ...}`` dicts, one per regex
        hit (duplicates are not removed). URLs have query string, fragment
        and trailing slash stripped; names are derived from the URL.
    """
    # Absolute URLs first, then site-relative ones; both require a
    # "-<24 hex chars>" course ID.
    url_patterns = (
        r'https?://[^"\']+/courses/[^"\']+?-[a-f0-9]{24}[^"\']*',
        r'/courses/[^"\']+?-[a-f0-9]{24}',
    )

    found = []

    for url_pattern in url_patterns:
        # No capture groups in the patterns, so every hit is a plain string.
        for hit in re.findall(url_pattern, text):
            is_relative = hit.startswith('/')
            if not is_relative and not hit.startswith('http'):
                continue

            absolute = urljoin(base_url, hit) if is_relative else hit
            tidy_url = absolute.split('?')[0].split('#')[0].rstrip('/')

            found.append({
                'url': tidy_url,
                'course_name': extract_course_name_from_url(tidy_url)
            })

    return found

# Main execution
if __name__ == "__main__":
    print("="*80)
    print("SMART ONLINE COURSE - COMPLETE WEBSITE COURSE EXTRACTION")
    print("="*80)
    print("Finding ALL courses in format:")
    print("https://www.smartonlinecourse.co.in/courses/[Course-Name]-[24-char-ID]")
    print("="*80)

    # Method 1: Comprehensive Selenium exploration
    print("\nüéØ METHOD 1: Comprehensive website exploration...")
    courses = find_all_courses_comprehensive()

    # Method 2: API discovery, used as a fallback when the browser crawl
    # found fewer than 50 courses.
    if len(courses) < 50:
        print("\nüéØ METHOD 2: API endpoint discovery...")
        api_courses = discover_courses_via_api()

        # Merge results. The URL set is updated as we go so duplicates inside
        # the API results are skipped too, and the number of courses actually
        # added is counted instead of reporting every API hit.
        existing_urls = {course['url'] for course in courses}
        added_count = 0
        for course in api_courses:
            if course['url'] not in existing_urls:
                existing_urls.add(course['url'])
                courses.append(course)
                added_count += 1

        print(f"\n‚ûï Added {added_count} courses from API discovery")

        # Save final results
        if courses:
            categorized = categorize_courses(courses)
            save_all_courses_to_excel(categorized)

    print("\n" + "="*80)
    if courses:
        print(f"üéâ EXTRACTION COMPLETE!")
        print(f"üìä Total courses extracted: {len(courses)}")
        print(f"üìÅ Output saved to: C:\\Users\\taslim.siddiqui\\Downloads\\all_courses_complete.xlsx")
    else:
        print("‚ùå No courses found")
        print("\nüí° The website might:")
        print("   1. Require login to see all courses")
        print("   2. Use complex JavaScript loading")
        print("   3. Have courses behind different URLs")
    print("="*80)