In [15]:
import time
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def get_driver(headless=True):
    """Create and return a Chrome WebDriver instance.

    The browser is launched with the ``AutomationControlled`` blink feature
    disabled, a 1280x720 window and log level 3. When ``headless`` is true
    the ``--headless=new`` flag is added as well.
    """
    chrome_flags = [
        "--disable-blink-features=AutomationControlled",
        "--window-size=1280,720",
        "--log-level=3",
    ]
    if headless:
        chrome_flags.append("--headless=new")

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    # webdriver_manager resolves (and installs if needed) the chromedriver binary.
    chromedriver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(chromedriver_path), options=chrome_options)

def clean_text(text):
    """Collapse every whitespace run to one space and trim both ends.

    Anything that is not a string yields an empty string.
    """
    if isinstance(text, str):
        return re.sub(r'\s+', ' ', text).strip()
    return ""

def extract_course_name(soup):
    """Return the course title read from the page's <h1>.

    An <h1> carrying the ``size-38`` class is preferred; otherwise the first
    <h1> in the document is used. A placeholder string is returned when no
    heading exists or when the lookup raises.
    """
    not_found = "Course name not found"
    try:
        # Most specific selector first, then any <h1> at all.
        for criteria in ({'class_': 'size-38'}, {}):
            heading = soup.find('h1', **criteria)
            if heading:
                return clean_text(heading.get_text())
    except Exception as e:
        print(f"‚ö†Ô∏è Error extracting course name: {e}")
    return not_found

def extract_price(soup):
    """Return the course price text, or a placeholder when none is found.

    Lookup order: a ``span.curr`` whose inline style contains
    ``line-through``, then any ``span.curr``, then the first text node that
    matches the currency pattern. Errors are reported on stdout and also
    produce the placeholder.
    """
    fallback = "Price not found"

    def is_struck_through(style):
        return style and 'line-through' in style

    try:
        struck = soup.find('span', class_='curr', style=is_struck_through)
        if struck:
            return clean_text(struck.get_text())

        any_price = soup.find('span', class_='curr')
        if any_price:
            return clean_text(any_price.get_text())

        currency_text = soup.find(string=re.compile(r'‚Çπ|INR|Rs\.', re.IGNORECASE))
        return clean_text(currency_text) if currency_text else fallback
    except Exception as e:
        print(f"‚ö†Ô∏è Error extracting price: {e}")
        return fallback

def extract_language(soup):
    """Return the course language with any leading "Language:" label removed.

    The ``<p id="languageBlock">`` element is tried first; failing that, the
    first text node containing "Language:" is used. A placeholder is returned
    when neither exists or an error occurs.
    """
    default = "Language not specified"
    label = re.compile(r'^Language:\s*', re.IGNORECASE)
    try:
        block = soup.find('p', id='languageBlock')
        if block:
            return label.sub('', clean_text(block.get_text()))

        mention = soup.find(string=re.compile(r'Language:', re.IGNORECASE))
        if mention:
            return label.sub('', clean_text(mention))

        return default
    except Exception as e:
        print(f"‚ö†Ô∏è Error extracting language: {e}")
        return default

def extract_duration(soup):
    """Return the course validity period text.

    The paragraph with the exact validity class string is tried first;
    failing that, the first text node mentioning "Validity Period",
    "Duration" or "Access" is used. A leading "Validity Period:" label is
    stripped from the result. A placeholder is returned when nothing matches
    or an error occurs.
    """
    unknown = "Duration not specified"
    label = re.compile(r'^Validity Period:\s*', re.IGNORECASE)
    try:
        paragraph = soup.find('p', class_='ml-0 mb-0 validityPeriod size-16')
        if paragraph:
            return label.sub('', clean_text(paragraph.get_text()))

        mention = soup.find(string=re.compile(r'Validity Period|Duration|Access', re.IGNORECASE))
        if mention:
            return label.sub('', clean_text(mention))

        return unknown
    except Exception as e:
        print(f"‚ö†Ô∏è Error extracting duration: {e}")
        return unknown

def extract_about_course(soup):
    """Return the "about this course" text (at most 2000 characters).

    Four strategies are tried in order:
      1. the div with id ``1638793281030``,
      2. the first ``column full`` div,
         (for both, the first two skrollable paragraphs are joined),
      3. an 800-character window of page text starting at
         "Smart Online Course", cut at the next known section marker,
      4. the first paragraph containing one of the key phrases, plus its
         next sibling paragraph when that is longer than 50 characters.

    A placeholder is returned when every strategy fails or an error occurs.
    """
    missing = "About course information not found"
    limit = 2000

    def first_two_paragraphs(container):
        """Join the first two skrollable paragraphs, or None if fewer exist."""
        found = container.find_all('p', class_='skrollable skrollable-between')
        if len(found) < 2:
            return None
        lead = clean_text(found[0].get_text()).replace('üåç', '').strip()
        follow = clean_text(found[1].get_text())
        return f"{lead} {follow}"[:limit]

    try:
        # Strategies 1 and 2 differ only in how the container is located.
        container_lookups = (
            ({'id': '1638793281030'}, "‚úÖ Found div with specific ID"),
            ({'class_': 'column full'}, "‚úÖ Found column full div"),
        )
        for criteria, found_message in container_lookups:
            container = soup.find('div', **criteria)
            if not container:
                continue
            print(found_message)
            joined = first_two_paragraphs(container)
            if joined is not None:
                return joined

        # Strategy 3: slice the flattened page text.
        page_text = clean_text(soup.get_text())
        anchor = 'Smart Online Course'
        if anchor in page_text and 'Risk Management Association of India' in page_text:
            print("‚úÖ Found specific text pattern")
            begin = page_text.find(anchor)
            excerpt = page_text[begin:begin + 800]
            # Only the first marker present is used as the cut point.
            for stop_marker in ('What You Will Learn', 'üéØ'):
                if stop_marker in excerpt:
                    excerpt = excerpt.split(stop_marker)[0].strip()
                    break
            return excerpt[:limit]

        # Strategy 4: first paragraph carrying one of the key phrases.
        for para in soup.find_all('p'):
            body = clean_text(para.get_text())
            names_provider = 'Smart Online Course' in body and 'Risk Management Association' in body
            if not (names_provider or 'comprehensive overview of ISO 31000' in body):
                continue
            pieces = [body]
            sibling = para.find_next_sibling('p')
            if sibling:
                sibling_text = clean_text(sibling.get_text())
                if len(sibling_text) > 50:
                    pieces.append(sibling_text)
            return " ".join(pieces)[:limit]

        return missing
    except Exception as e:
        print(f"‚ö†Ô∏è Error extracting about course: {e}")
        return missing

def extract_who_should_take_it(soup):
    """Return the target-audience text (at most 1000 characters).

    Strategies, in order:
      1. a heading containing "who should": its next sibling paragraph, or a
         checkmark paragraph inside the heading's parent,
      2. any paragraph with checkmarks that mentions professionals/leaders,
      3. a "who should" heading inside the div with id ``1638793281030``,
      4. the page text following "Who Should Enroll?" up to the next known
         section marker.

    Checkmarks are rewritten as bullets. A placeholder is returned when
    nothing matches or an error occurs.
    """
    missing = "Target audience information not found"
    limit = 1000

    def to_bullets(raw):
        return re.sub(r'‚úîÔ∏è\s*', '‚Ä¢ ', raw)

    def unescape_amp(raw):
        return re.sub(r'&amp;', '&', raw)

    def full_format(raw):
        # bullets, then ampersands, then literal <br> markers to newlines
        return re.sub(r'<br\s*/?>', '\n', unescape_amp(to_bullets(raw)))

    def names_audience(lowered):
        return 'professionals' in lowered or 'leaders' in lowered

    try:
        # Strategy 1: headings anywhere in the document.
        for heading in soup.find_all(['h3', 'h2', 'h4', 'strong']):
            title = clean_text(heading.get_text())
            if 'who should' not in title.lower():
                continue
            print(f"‚úÖ Found heading: {title}")

            following = heading.find_next_sibling('p')
            if following:
                candidate = clean_text(following.get_text())
                if candidate and ('‚úîÔ∏è' in candidate or 'professionals' in candidate.lower()):
                    return full_format(candidate)[:limit]

            enclosing = heading.parent
            if enclosing:
                for para in enclosing.find_all('p'):
                    candidate = clean_text(para.get_text())
                    if '‚úîÔ∏è' in candidate and names_audience(candidate.lower()):
                        return full_format(candidate)[:limit]

        # Strategy 2: any checkmark paragraph on the page.
        for para in soup.find_all('p'):
            candidate = clean_text(para.get_text())
            if not ('‚úîÔ∏è' in candidate and names_audience(candidate.lower())):
                continue
            print("‚úÖ Found paragraph with checkmarks")
            bullet_lines = [
                piece.strip()
                for piece in full_format(candidate).split('\n')
                if '‚Ä¢' in piece
            ]
            if bullet_lines:
                return "\n".join(bullet_lines)[:limit]

        # Strategy 3: headings inside the known content div.
        known_div = soup.find('div', id='1638793281030')
        if known_div:
            print("‚úÖ Checking specific div for target audience")
            for heading in known_div.find_all(['h3', 'strong']):
                if 'who should' not in clean_text(heading.get_text()).lower():
                    continue
                following = heading.find_next_sibling('p')
                if following:
                    candidate = clean_text(following.get_text())
                    if candidate:
                        return unescape_amp(to_bullets(candidate))[:limit]

        # Strategy 4: slice the flattened page text after the section title.
        page_text = clean_text(soup.get_text())
        marker = 'Who Should Enroll?'
        if marker in page_text:
            print("‚úÖ Found 'Who Should Enroll?' in page text")
            tail = page_text.split(marker)[1]
            for section in ('Course Highlights', 'What You Will Learn', 'üéì', 'üì¢'):
                tail = tail.split(section)[0]
            tail = clean_text(tail)
            if tail:
                return to_bullets(tail)[:limit]

        return missing
    except Exception as e:
        print(f"‚ö†Ô∏è Error extracting target audience: {e}")
        return missing

# Matches a "Time: ..." fragment, optionally preceded by a pipe separator.
# Each amount is a number followed by an optional *whole-word* unit, so
# neighbouring words that merely share letters with the units (for example
# "sessions" or "more") are no longer swallowed, which the previous character
# class ``[\d\sminsechour]*`` did.
_MODULE_TIME_RE = re.compile(
    r'(?:\|\s*)?Time:\s*'
    r'(?:\d+\s*(?:hours?|hrs?|h|min(?:ute)?s?|m|sec(?:ond)?s?|s)?\b\s*)*'
)


def _strip_time_info(module_info):
    """Drop timing details from a module info line, keeping everything else."""
    module_info = _MODULE_TIME_RE.sub('', module_info)
    # Bare durations that are not introduced by a "Time:" label.
    module_info = re.sub(r'\d+\s*hour\s*\d*\s*min\s*\d*\s*sec', '', module_info)
    module_info = re.sub(r'\d+\s*min\s*\d*\s*sec', '', module_info)
    module_info = clean_text(module_info)
    # Removing a fragment can leave a dangling separator at either end.
    module_info = re.sub(r'^\|\s*', '', module_info)
    module_info = re.sub(r'\|\s*$', '', module_info)
    return clean_text(module_info)


def extract_syllabus_content(soup):
    """Render the course curriculum as plain text.

    Looks up the ``curriculum2`` container and walks its
    ``course-course-content-box`` children. Boxes without a
    ``course-content-blue-title`` header are ignored, as are modules whose
    title mentions feedback, review, "enjoyed" or "final test". For every
    remaining module the title, an underline, the info line (timing details
    removed, session information kept) and the entries produced by
    ``extract_module_content`` are emitted.

    Args:
        soup: Parsed page (BeautifulSoup document).

    Returns:
        The newline-joined syllabus text, "Syllabus content not available"
        when the container is missing, or "No syllabus content found" when
        no module produced any output.
    """
    curriculum_container = soup.find('div', class_='curriculum2')
    if not curriculum_container:
        print("‚ùå No curriculum container found")
        return "Syllabus content not available"

    print("‚úÖ Found curriculum container")

    course_boxes = curriculum_container.find_all('div', class_='course-course-content-box')
    print(f"üìö Found {len(course_boxes)} course modules")

    syllabus_parts = []
    for i, box in enumerate(course_boxes, 1):
        header = box.find('div', class_='course-content-blue-title')
        if not header:
            continue

        module_title_elem = header.find('h3')
        module_info_elem = header.find('p')
        module_title = clean_text(module_title_elem.get_text()) if module_title_elem else f"Module {i}"
        module_info = clean_text(module_info_elem.get_text()) if module_info_elem else ""

        # "expand_more" is the text of the expand icon inside the header.
        module_info = module_info.replace('expand_more', '').strip()

        title_lower = module_title.lower()
        if any(word in title_lower for word in ('feedback', 'review', 'enjoyed')):
            print(f"‚è© Skipping feedback section: {module_title}")
            continue
        if 'final test' in title_lower:
            print(f"‚è© Skipping final test section: {module_title}")
            continue

        syllabus_parts.append(f"üéØ {module_title}")
        syllabus_parts.append("-" * 40)

        if module_info:
            module_info = _strip_time_info(module_info)
            if module_info:
                syllabus_parts.append(f"üìù {module_info}")
        syllabus_parts.append("")

        for content_body in box.find_all('div', class_='course-content-body'):
            content = extract_module_content(content_body)
            if content:
                syllabus_parts.append(content)
                syllabus_parts.append("")

    return "\n".join(syllabus_parts) if syllabus_parts else "No syllabus content found"

# A duration such as "1 hour 5 min 3 sec", "1 hour 30 min" or "12 min 40 sec".
# The previous implementation tested for the substrings "min"/"sec"/"hour",
# which also discarded ordinary titles like "Administration of the Second
# Schedule"; a digit-anchored, whole-word pattern avoids that.
_LESSON_DURATION_RE = re.compile(
    r'\d+\s*hours?\s*\d*\s*min(?:ute)?s?(?:\s*\d*\s*sec(?:ond)?s?)?\b'
    r'|\d+\s*min(?:ute)?s?\s*\d*\s*sec(?:ond)?s?\b',
    re.IGNORECASE,
)


def extract_module_content(content_body):
    """Return one formatted syllabus entry for a ``course-content-body`` div.

    The entry text is the first ``<p>`` inside the
    ``desc1 > is-box-9 > boxa1`` chain, with any duration removed. It is
    prefixed with a symbol chosen from the material-icons name found in the
    same box (a bullet when the icon is unknown or absent).

    Args:
        content_body: Tag for one ``course-content-body`` element.

    Returns:
        The formatted entry, or an empty string when the expected structure
        is missing or nothing but a duration (or nothing at all) was found.
    """
    desc = content_body.find('div', class_='desc1')
    content_box = desc.find('div', class_='is-box-9') if desc else None
    box_content = content_box.find('div', class_='boxa1') if content_box else None
    if not box_content:
        return ""

    icon = box_content.find('span', class_='material-icons')
    icon_type = clean_text(icon.get_text()) if icon else ""

    content_text_elem = box_content.find('p')
    content_text = clean_text(content_text_elem.get_text()) if content_text_elem else ""

    # Strip durations first; only skip the entry when no real text remains.
    content_text = clean_text(_LESSON_DURATION_RE.sub('', content_text))
    if not content_text:
        return ""

    # Map material-icons names to the symbols used in the text output.
    icon_map = {
        'label_important': 'üìå',
        'ondemand_video': 'üé•',
        'description': 'üìÑ',
        'picture_as_pdf': 'üìé'
    }
    emoji = icon_map.get(icon_type, '‚Ä¢')

    return f"{emoji} {content_text}"

def debug_page_content(soup):
    """Print a diagnostic summary of the markup the extractors look for.

    Reports on the ``column full`` div, the div with id ``1638793281030``,
    skrollable paragraphs, audience-related headings, the
    "Smart Online Course" phrase and checkmark characters.
    """
    rule = "=" * 60
    print("\nüîç DEBUGGING PAGE CONTENT:")
    print(rule)

    # 1. column full div
    column_div = soup.find('div', class_='column full')
    column_status = 'YES' if column_div else 'NO'
    print(f"1. Column full div found: {column_status}")
    if column_div:
        preview = clean_text(column_div.get_text())[:200]
        print(f"   Div text preview: {preview}...")

    # 2. div with the known numeric id
    id_status = 'YES' if soup.find('div', id='1638793281030') else 'NO'
    print(f"2. Div with id='1638793281030' found: {id_status}")

    # 3. skrollable paragraphs (first three previewed)
    skrollable = soup.find_all('p', class_='skrollable skrollable-between')
    print(f"3. Skrollable paragraphs found: {len(skrollable)}")
    for number, para in enumerate(skrollable[:3], start=1):
        snippet = clean_text(para.get_text())[:100]
        print(f"   Paragraph {number} (first 100 chars): {snippet}...")

    # 4. headings that look audience-related
    headings = soup.find_all(['h3', 'h2', 'h4'])
    print(f"4. Headings found: {len(headings)}")
    for heading in headings:
        label = clean_text(heading.get_text())
        lowered = label.lower()
        if 'who' in lowered or 'enroll' in lowered:
            print(f"   Found relevant heading: {label}")

    # 5. anchor phrase in the flattened page text
    page_text = clean_text(soup.get_text())
    position = page_text.find('Smart Online Course')
    if position != -1:
        print("5. 'Smart Online Course' text found in page")
        print(f"   Context: {page_text[position:position + 150]}...")

    # 6. checkmark characters
    if '‚úîÔ∏è' in page_text:
        print("6. Checkmarks (‚úîÔ∏è) found in page")

    print(rule)

def test_single_course(url):
    """Load one course page in a visible browser and extract its fields.

    Returns a dict with the keys ``Course_URL``, ``Course_Name``, ``Price``,
    ``Language``, ``Duration``, ``About_Course``, ``Who_Should_Take_It`` and
    ``Syllabus_Content``. Returns ``None`` when the page title looks like a
    404 or when any step raises. The browser is always closed.
    """
    print(f"üåê Testing extraction from: {url}")
    browser = get_driver(headless=False)
    try:
        browser.get(url)
        body_present = EC.presence_of_element_located((By.TAG_NAME, "body"))
        WebDriverWait(browser, 20).until(body_present)
        # Fixed pause after <body> appears (presumably to let scripts render).
        time.sleep(5)

        if "404" in browser.title or "not found" in browser.title.lower():
            print("‚ùå Page not found (404)")
            return None

        soup = BeautifulSoup(browser.page_source, "html.parser")

        # Dump what the page contains before running the extractors.
        debug_page_content(soup)

        record = {"Course_URL": url}

        record["Course_Name"] = extract_course_name(soup)
        print(f"üìñ Course Name: {record['Course_Name']}")

        record["Price"] = extract_price(soup)
        print(f"üí∞ Price: {record['Price']}")

        record["Language"] = extract_language(soup)
        print(f"üåê Language: {record['Language']}")

        record["Duration"] = extract_duration(soup)
        print(f"‚è±Ô∏è Duration: {record['Duration']}")

        record["About_Course"] = extract_about_course(soup)
        print(f"üìù About Course (first 150 chars): {record['About_Course'][:150]}...")

        record["Who_Should_Take_It"] = extract_who_should_take_it(soup)
        print(f"üéØ Target Audience (first 150 chars): {record['Who_Should_Take_It'][:150]}...")

        record["Syllabus_Content"] = extract_syllabus_content(soup)

        print("‚úÖ Extraction completed successfully!")
        return record

    except Exception as e:
        print(f"‚ùå Extraction failed: {str(e)}")
        return None
    finally:
        browser.quit()

def save_to_excel(course_data, filename="syllabus_output.xlsx"):
    """Write one course record to an Excel file and print a short preview.

    The record becomes a single-row DataFrame. In the preview the syllabus
    is shown only as a character count, and the two long text fields are cut
    to 300 characters.

    Returns:
        True on success, False if writing or previewing raised.
    """
    long_text_fields = ('About_Course', 'Who_Should_Take_It')
    try:
        pd.DataFrame([course_data]).to_excel(filename, index=False)
        print(f"üíæ Data saved to: {filename}")

        print("\nüìä Saved Data Preview:")
        print("=" * 50)
        for field, content in course_data.items():
            if field == 'Syllabus_Content':
                print(f"{field}: {len(content)} characters")
                continue
            if field not in long_text_fields:
                print(f"{field}: {content}")
                continue
            print(f"{field}:")
            if len(content) > 300:
                print(f"{content[:300]}...")
            else:
                print(content)
            print()

        return True
    except Exception as e:
        print(f"‚ùå Error saving to Excel: {e}")
        return False

def display_syllabus_info(course_data):
    """Print the extracted course record as a console report.

    Short fields are printed in full. The about and audience texts are cut
    to 500 characters and the syllabus to 1000, each followed by "..." when
    truncated.
    """
    banner = "=" * 80
    underline = "-" * 40

    def preview(text, limit):
        """Return *text*, shortened to *limit* characters plus an ellipsis."""
        if len(text) > limit:
            return text[:limit] + "..."
        return text

    print("\n" + banner)
    print("üìã EXTRACTED COURSE INFORMATION")
    print(banner)

    short_fields = (
        ("üîó URL", 'Course_URL'),
        ("üìñ Course Name", 'Course_Name'),
        ("üí∞ Price", 'Price'),
        ("üåê Language", 'Language'),
        ("‚è±Ô∏è Duration", 'Duration'),
    )
    for label, key in short_fields:
        print(f"{label}: {course_data[key]}")

    print("\nüìù About Course:")
    print(underline)
    print(preview(course_data['About_Course'], 500))

    print("\nüéØ Target Audience:")
    print(underline)
    print(preview(course_data['Who_Should_Take_It'], 500))

    print("\n" + banner)
    print("üìñ COURSE SYLLABUS CONTENT")
    print(banner)
    print(preview(course_data['Syllabus_Content'], 1000))

# -------------------- TEST SINGLE COURSE AND SAVE TO EXCEL --------------------
if __name__ == "__main__":
    # Course page used for the single-course run.
    test_url = "https://www.smartonlinecourse.co.in/courses/Regulations-of-Insurance-Business-IC14---III-Exam-Mock-Test-61cabb940cf2776e30388e3c"

    print("üöÄ TESTING SINGLE COURSE EXTRACTION")
    print("="*60)

    # Scrape the page; a falsy result means extraction failed.
    course_data = test_single_course(test_url)

    if not course_data:
        print("‚ùå Failed to extract course data")
    else:
        # Show the result on the console first.
        display_syllabus_info(course_data)

        # Then persist it to the machine-specific output path.
        output_filename = r"C:\Users\taslim.siddiqui\Downloads\course_syllabus_output.xlsx"
        save_to_excel(course_data, output_filename)

        print(f"\n‚úÖ Process completed! Check the Excel file: {output_filename}")

🚀 TESTING SINGLE COURSE EXTRACTION
🌐 Testing extraction from: https://www.smartonlinecourse.co.in/courses/Regulations-of-Insurance-Business-IC14---III-Exam-Mock-Test-61cabb940cf2776e30388e3c

🔍 DEBUGGING PAGE CONTENT:
1. Column full div found: YES
   Div text preview: ...
2. Div with id='1638793281030' found: YES
3. Skrollable paragraphs found: 0
4. Headings found: 17
5. 'Smart Online Course' text found in page
   Context: Smart Online Course Coins ADD TO CART Why this course? Description Sashi Publications is pleased to launch the Online Mock Test for paper IC14, Regula...
📖 Course Name: Regulations of Insurance Business (IC14) - III Exam Mock Test
💰 Price: ₹600
🌐 Language: ENGLISH
⏱️ Duration: 120 days
✅ Found div with specific ID
✅ Found column full div
📝 About Course (first 150 chars): About course information not found...
✅ Checking specific div for target audience
🎯 Target Audience (first 150 chars): Target audience information not found...
✅

In [1]:
import time
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import warnings
warnings.filterwarnings('ignore')

def get_driver(headless=True):
    """Create and return a Chrome WebDriver instance.

    The browser is launched with the ``AutomationControlled`` blink feature
    disabled, a 1280x720 window and log level 3. When ``headless`` is true
    the ``--headless=new`` flag is added as well.
    """
    chrome_flags = [
        "--disable-blink-features=AutomationControlled",
        "--window-size=1280,720",
        "--log-level=3",
    ]
    if headless:
        chrome_flags.append("--headless=new")

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    # webdriver_manager resolves (and installs if needed) the chromedriver binary.
    chromedriver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(chromedriver_path), options=chrome_options)

def clean_text(text):
    """Collapse every whitespace run to one space and trim both ends.

    Anything that is not a string yields an empty string.
    """
    if isinstance(text, str):
        return re.sub(r'\s+', ' ', text).strip()
    return ""

def extract_course_name(soup):
    """Return the course title read from the page's <h1>.

    An <h1> carrying the ``size-38`` class is preferred; otherwise the first
    <h1> in the document is used. A placeholder string is returned when no
    heading exists or when the lookup raises.
    """
    not_found = "Course name not found"
    try:
        # Most specific selector first, then any <h1> at all.
        for criteria in ({'class_': 'size-38'}, {}):
            heading = soup.find('h1', **criteria)
            if heading:
                return clean_text(heading.get_text())
    except Exception as e:
        print(f"‚ö†Ô∏è Error extracting course name: {e}")
    return not_found

def extract_price(soup):
    """Return the course price text, or a placeholder when none is found.

    Lookup order: a ``span.curr`` whose inline style contains
    ``line-through``, then any ``span.curr``, then the first text node that
    matches the currency pattern. Errors are reported on stdout and also
    produce the placeholder.
    """
    fallback = "Price not found"

    def is_struck_through(style):
        return style and 'line-through' in style

    try:
        struck = soup.find('span', class_='curr', style=is_struck_through)
        if struck:
            return clean_text(struck.get_text())

        any_price = soup.find('span', class_='curr')
        if any_price:
            return clean_text(any_price.get_text())

        currency_text = soup.find(string=re.compile(r'‚Çπ|INR|Rs\.', re.IGNORECASE))
        return clean_text(currency_text) if currency_text else fallback
    except Exception as e:
        print(f"‚ö†Ô∏è Error extracting price: {e}")
        return fallback

def extract_language(soup):
    """Return the course language with any leading "Language:" label removed.

    The ``<p id="languageBlock">`` element is tried first; failing that, the
    first text node containing "Language:" is used. A placeholder is returned
    when neither exists or an error occurs.
    """
    default = "Language not specified"
    label = re.compile(r'^Language:\s*', re.IGNORECASE)
    try:
        block = soup.find('p', id='languageBlock')
        if block:
            return label.sub('', clean_text(block.get_text()))

        mention = soup.find(string=re.compile(r'Language:', re.IGNORECASE))
        if mention:
            return label.sub('', clean_text(mention))

        return default
    except Exception as e:
        print(f"‚ö†Ô∏è Error extracting language: {e}")
        return default

def extract_duration(soup):
    """Return the course validity period text.

    The paragraph with the exact validity class string is tried first;
    failing that, the first text node mentioning "Validity Period",
    "Duration" or "Access" is used. A leading "Validity Period:" label is
    stripped from the result. A placeholder is returned when nothing matches
    or an error occurs.
    """
    unknown = "Duration not specified"
    label = re.compile(r'^Validity Period:\s*', re.IGNORECASE)
    try:
        paragraph = soup.find('p', class_='ml-0 mb-0 validityPeriod size-16')
        if paragraph:
            return label.sub('', clean_text(paragraph.get_text()))

        mention = soup.find(string=re.compile(r'Validity Period|Duration|Access', re.IGNORECASE))
        if mention:
            return label.sub('', clean_text(mention))

        return unknown
    except Exception as e:
        print(f"‚ö†Ô∏è Error extracting duration: {e}")
        return unknown

def extract_about_course(soup):
    """Return the "about this course" text (at most 2000 characters).

    Four strategies are tried in order:
      1. the div with id ``1638793281030``,
      2. the first ``column full`` div,
         (for both, the first two skrollable paragraphs are joined),
      3. an 800-character window of page text starting at
         "Smart Online Course", cut at the next known section marker,
      4. the first paragraph containing one of the key phrases, plus its
         next sibling paragraph when that is longer than 50 characters.

    A placeholder is returned when every strategy fails or an error occurs.
    """
    missing = "About course information not found"
    limit = 2000

    def first_two_paragraphs(container):
        """Join the first two skrollable paragraphs, or None if fewer exist."""
        found = container.find_all('p', class_='skrollable skrollable-between')
        if len(found) < 2:
            return None
        lead = clean_text(found[0].get_text()).replace('üåç', '').strip()
        follow = clean_text(found[1].get_text())
        return f"{lead} {follow}"[:limit]

    try:
        # Strategies 1 and 2 differ only in how the container is located.
        for criteria in ({'id': '1638793281030'}, {'class_': 'column full'}):
            container = soup.find('div', **criteria)
            if not container:
                continue
            joined = first_two_paragraphs(container)
            if joined is not None:
                return joined

        # Strategy 3: slice the flattened page text.
        page_text = clean_text(soup.get_text())
        anchor = 'Smart Online Course'
        if anchor in page_text and 'Risk Management Association of India' in page_text:
            begin = page_text.find(anchor)
            excerpt = page_text[begin:begin + 800]
            # Only the first marker present is used as the cut point.
            for stop_marker in ('What You Will Learn', 'üéØ'):
                if stop_marker in excerpt:
                    excerpt = excerpt.split(stop_marker)[0].strip()
                    break
            return excerpt[:limit]

        # Strategy 4: first paragraph carrying one of the key phrases.
        for para in soup.find_all('p'):
            body = clean_text(para.get_text())
            names_provider = 'Smart Online Course' in body and 'Risk Management Association' in body
            if not (names_provider or 'comprehensive overview of ISO 31000' in body):
                continue
            pieces = [body]
            sibling = para.find_next_sibling('p')
            if sibling:
                sibling_text = clean_text(sibling.get_text())
                if len(sibling_text) > 50:
                    pieces.append(sibling_text)
            return " ".join(pieces)[:limit]

        return missing
    except Exception as e:
        print(f"‚ö†Ô∏è Error extracting about course: {e}")
        return missing

def extract_who_should_take_it(soup):
    """Return the target-audience text of a course page.

    Four strategies are tried in order and the first hit wins:

    1. a paragraph next to (or sharing a parent with) a "who should" heading;
    2. any paragraph that looks like a ticked list of professionals/leaders;
    3. the paragraph after a "who should" heading inside one specific div id;
    4. the page text following the literal "Who Should Enroll?".

    Args:
        soup: Parsed page (BeautifulSoup-style object).

    Returns:
        The audience text, capped at 1000 characters, or a fixed
        "not found" message when nothing matches or an error occurs.
    """
    tick = '‚úîÔ∏è'
    not_found = "Target audience information not found"

    def to_bullets(raw, with_breaks=True):
        # Tick marks become bullets; literal entity / <br> text is normalised.
        converted = re.sub(r'‚úîÔ∏è\s*', '‚Ä¢ ', raw)
        converted = re.sub(r'&amp;', '&', converted)
        if with_breaks:
            converted = re.sub(r'<br\s*/?>', '\n', converted)
        return converted

    def is_audience_list(candidate):
        lowered = candidate.lower()
        return tick in candidate and ('professionals' in lowered or 'leaders' in lowered)

    try:
        # Strategy 1: look around every heading that mentions "who should".
        for heading in soup.find_all(['h3', 'h2', 'h4', 'strong']):
            if 'who should' not in clean_text(heading.get_text()).lower():
                continue

            sibling = heading.find_next_sibling('p')
            if sibling:
                sibling_text = clean_text(sibling.get_text())
                if sibling_text and (tick in sibling_text or 'professionals' in sibling_text.lower()):
                    return to_bullets(sibling_text)[:1000]

            container = heading.parent
            if container:
                for para in container.find_all('p'):
                    para_text = clean_text(para.get_text())
                    if is_audience_list(para_text):
                        return to_bullets(para_text)[:1000]

        # Strategy 2: any paragraph that reads like a ticked audience list.
        for para in soup.find_all('p'):
            para_text = clean_text(para.get_text())
            if not is_audience_list(para_text):
                continue
            stripped_pieces = (piece.strip() for piece in to_bullets(para_text).split('\n'))
            bullet_lines = [piece for piece in stripped_pieces if '‚Ä¢' in piece]
            if bullet_lines:
                return "\n".join(bullet_lines)[:1000]

        # Strategy 3: a known container id used on some pages.
        targeted = soup.find('div', id='1638793281030')
        if targeted:
            for heading in targeted.find_all(['h3', 'strong']):
                if 'who should' not in clean_text(heading.get_text()).lower():
                    continue
                sibling = heading.find_next_sibling('p')
                if sibling:
                    sibling_text = clean_text(sibling.get_text())
                    if sibling_text:
                        return to_bullets(sibling_text, with_breaks=False)[:1000]

        # Strategy 4: slice the flattened page text after the section title.
        page_text = clean_text(soup.get_text())
        marker = 'Who Should Enroll?'
        if marker in page_text:
            pieces = page_text.split(marker)
            if len(pieces) > 1:
                tail = pieces[1]
                # Cut at whichever following section markers are present.
                for stop in ('Course Highlights', 'What You Will Learn', 'üéì', 'üì¢'):
                    if stop in tail:
                        tail = tail.partition(stop)[0]

                tail = clean_text(tail)
                if tail:
                    return re.sub(r'‚úîÔ∏è\s*', '‚Ä¢ ', tail)[:1000]

        return not_found
    except Exception as e:
        print(f"‚ö†Ô∏è Error extracting target audience: {e}")
        return not_found

def extract_syllabus_content(soup):
    """Render the page's ``curriculum2`` block as a plain-text syllabus.

    Each module box with a title bar contributes a title line, a dashed
    rule, its header info (time details removed) and the text of each
    content body. Modules whose title mentions feedback, review, "enjoyed"
    or "final test" are left out.

    Args:
        soup: Parsed page (BeautifulSoup-style object).

    Returns:
        The syllabus as one newline-joined string, or a fixed message when
        the curriculum container is missing or yields nothing.
    """
    container = soup.find('div', class_='curriculum2')
    if not container:
        return "Syllabus content not available"

    skipped_title_words = ('feedback', 'review', 'enjoyed', 'final test')
    # Applied in this order; the first two remove "Time: ..." labels, the
    # last two remove bare "N hour N min N sec" / "N min N sec" durations.
    time_patterns = (
        r'\|\s*Time:\s*[\d\sminsechour]*',
        r'Time:\s*[\d\sminsechour]*',
        r'\d+\s*hour\s*\d*\s*min\s*\d*\s*sec',
        r'\d+\s*min\s*\d*\s*sec',
    )
    edge_pipe_patterns = (r'^\|\s*', r'\|\s*$')

    lines = []
    modules = container.find_all('div', class_='course-course-content-box')
    for index, module in enumerate(modules, 1):
        header = module.find('div', class_='course-content-blue-title')
        if not header:
            continue

        title_tag = header.find('h3')
        info_tag = header.find('p')
        title = clean_text(title_tag.get_text()) if title_tag else f"Module {index}"
        info = clean_text(info_tag.get_text()) if info_tag else ""
        # "expand_more" is the text of the expand icon inside the header.
        info = info.replace('expand_more', '').strip()

        lowered_title = title.lower()
        if any(word in lowered_title for word in skipped_title_words):
            continue

        lines.extend([f"üéØ {title}", "-" * 40])

        if info:
            for pattern in time_patterns:
                info = re.sub(pattern, '', info)
            info = clean_text(info)
            for pattern in edge_pipe_patterns:
                info = re.sub(pattern, '', info)
            info = clean_text(info)
            if info:
                lines.append(f"üìù {info}")
        lines.append("")

        for body in module.find_all('div', class_='course-content-body'):
            body_text = extract_module_content(body)
            if body_text:
                lines.extend([body_text, ""])

    return "\n".join(lines) if lines else "No syllabus content found"

def extract_module_content(content_body):
    """Extract the one-line description of a ``course-content-body`` div.

    The text is read from the first ``<p>`` inside
    ``desc1 > is-box-9 > boxa1`` and prefixed with an emoji chosen from the
    material-icon name found next to it.

    Entries that contain a duration expression (a number followed by
    hour/min or min/sec units, e.g. "1 hour 20 min" or "20 min 30 sec") are
    dropped. The check is a real pattern match: a plain substring test for
    "min"/"sec" would also discard ordinary titles such as
    "Administration Section".

    Args:
        content_body: The ``course-content-body`` element (BeautifulSoup tag).

    Returns:
        "<emoji> <text>", or "" when there is no usable text.
    """
    desc = content_body.find('div', class_='desc1')
    content_box = desc.find('div', class_='is-box-9') if desc else None
    box_content = content_box.find('div', class_='boxa1') if content_box else None
    if not box_content:
        return ""

    icon = box_content.find('span', class_='material-icons')
    icon_type = clean_text(icon.get_text()) if icon else ""

    content_text_elem = box_content.find('p')
    content_text = clean_text(content_text_elem.get_text()) if content_text_elem else ""
    if not content_text:
        return ""

    # Number + unit pairs only; bare words containing "min"/"sec" don't count.
    duration_re = re.compile(
        r'\d+\s*hours?\s*\d*\s*min(?:ute)?s?(?:\s*\d*\s*sec(?:ond)?s?)?\b'
        r'|\d+\s*min(?:ute)?s?\s*\d*\s*sec(?:ond)?s?\b',
        re.IGNORECASE,
    )
    if duration_re.search(content_text):
        return ""

    icon_map = {
        'label_important': 'üìå',
        'ondemand_video': 'üé•',
        'description': 'üìÑ',
        'picture_as_pdf': 'üìé'
    }
    emoji = icon_map.get(icon_type, '‚Ä¢')

    return f"{emoji} {content_text}"

def scrape_single_course(driver, url):
    """Load one course page and run every field extractor on it.

    Args:
        driver: Selenium WebDriver used to fetch the page.
        url: Course page URL.

    Returns:
        A dict with ``Course_URL`` plus one entry per extractor, or ``None``
        when the page title looks like a 404 or any step raises.
    """
    # Output column -> extractor, in the column order of the result.
    field_extractors = (
        ("Course_Name", extract_course_name),
        ("Price", extract_price),
        ("Language", extract_language),
        ("Duration", extract_duration),
        ("About_Course", extract_about_course),
        ("Who_Should_Take_It", extract_who_should_take_it),
        ("Syllabus_Content", extract_syllabus_content),
    )

    try:
        print(f"\nüåê Processing: {url}")
        driver.get(url)

        body_present = EC.presence_of_element_located((By.TAG_NAME, "body"))
        WebDriverWait(driver, 20).until(body_present)
        # Fixed pause after <body> appears, before reading the page source.
        time.sleep(3)

        if "404" in driver.title or "not found" in driver.title.lower():
            print(f"‚ùå Page not found (404): {url}")
            return None

        soup = BeautifulSoup(driver.page_source, "html.parser")

        course_data = {"Course_URL": url}
        for column, extractor in field_extractors:
            course_data[column] = extractor(soup)

        print(f"‚úÖ Successfully scraped: {course_data['Course_Name']}")
        return course_data

    except Exception as e:
        print(f"‚ùå Error scraping {url}: {str(e)}")
        return None

def main(
    input_excel=r"C:\Users\taslim.siddiqui\Downloads\smart_online_data.xlsx",
    output_filename=r"C:\Users\taslim.siddiqui\Downloads\all_courses_Scarped_data_Smart_online.xlsx",
):
    """Read course URLs from an Excel sheet, scrape each, and save the results to Excel.

    Args:
        input_excel: Path of the workbook that lists the course URLs. The
            first column whose label contains "url" or "link"
            (case-insensitive) is used.
        output_filename: Path of the workbook the scraped rows are written to.

    Returns:
        None. Progress, a summary and a sample row are printed to stdout.
        Errors are reported on stdout rather than raised.
    """
    print("üöÄ STARTING COURSE DATA SCRAPER")
    print("="*60)
    print(f"üì• Input Excel: {input_excel}")
    print(f"üì§ Output Excel: {output_filename}")
    print("="*60)

    try:
        print("\nüìñ Reading URLs from input Excel...")
        # Only a missing *input* file gets the "input file not found" message;
        # a FileNotFoundError while writing the output falls through to the
        # generic handler below.
        try:
            df_input = pd.read_excel(input_excel)
        except FileNotFoundError:
            print(f"‚ùå Input file not found: {input_excel}")
            return

        # Column labels may be non-strings (e.g. ints), so stringify before
        # the case-insensitive match.
        url_column = None
        for col in df_input.columns:
            label = str(col).lower()
            if 'url' in label or 'link' in label:
                url_column = col
                break

        if url_column is None:
            print("‚ùå No URL column found in input Excel")
            print("Available columns:", df_input.columns.tolist())
            return

        print(f"‚úÖ Found URL column: '{url_column}'")
        print(f"üìä Total URLs found: {len(df_input)}")

        urls = df_input[url_column].dropna().unique().tolist()
        print(f"üîó Unique URLs to process: {len(urls)}")

        if not urls:
            print("‚ùå No URLs found to process")
            return

        print("\nüöÄ Initializing Chrome driver...")
        driver = get_driver(headless=True)

        all_course_data = []
        failed_urls = []

        # The browser must be closed even if the loop is interrupted.
        try:
            for i, url in enumerate(urls, 1):
                print(f"\nüìù Processing URL {i}/{len(urls)}")

                if not isinstance(url, str) or not url.startswith('http'):
                    print(f"‚ö†Ô∏è Skipping invalid URL: {url}")
                    failed_urls.append({"url": url, "error": "Invalid URL format"})
                    continue

                course_data = scrape_single_course(driver, url)

                if course_data:
                    # scrape_single_course already prints the success line.
                    all_course_data.append(course_data)
                else:
                    failed_urls.append({"url": url, "error": "Failed to scrape"})
                    print(f"‚ùå Failed to scrape: {url}")

                # Small delay between requests
                if i < len(urls):
                    time.sleep(2)
        finally:
            driver.quit()

        if all_course_data:
            print(f"\nüíæ Saving {len(all_course_data)} scraped courses to Excel...")
            pd.DataFrame(all_course_data).to_excel(output_filename, index=False)
            print(f"‚úÖ Successfully saved {len(all_course_data)} courses to: {output_filename}")
        else:
            print("\n‚ùå No courses were successfully scraped")

        # The summary is shown in both cases so failures are never hidden.
        print("\nüìä SCRAPING SUMMARY")
        print("="*60)
        print(f"‚úÖ Successfully scraped: {len(all_course_data)} courses")
        print(f"‚ùå Failed to scrape: {len(failed_urls)} courses")

        if failed_urls:
            print("\n‚ùå Failed URLs:")
            for failed in failed_urls[:5]:  # Show first 5 failed URLs
                print(f"  - {failed['url']} ({failed['error']})")
            if len(failed_urls) > 5:
                print(f"  ... and {len(failed_urls) - 5} more")

        if all_course_data:
            print("\nüìã SAMPLE OF SCRAPED DATA:")
            print("="*60)
            sample = all_course_data[0]
            print(f"üîó URL: {sample['Course_URL']}")
            print(f"üìñ Course Name: {sample['Course_Name']}")
            print(f"üí∞ Price: {sample['Price']}")
            print(f"üåê Language: {sample['Language']}")
            print(f"‚è±Ô∏è Duration: {sample['Duration']}")
            print(f"üìù About Course (first 100 chars): {sample['About_Course'][:100]}...")
            print(f"üéØ Target Audience (first 100 chars): {sample['Who_Should_Take_It'][:100]}...")
            print(f"üìñ Syllabus Length: {len(sample['Syllabus_Content'])} characters")

    except Exception as e:
        print(f"‚ùå Error in main process: {str(e)}")
        import traceback
        traceback.print_exc()

# -------------------- RUN MAIN FUNCTION --------------------
# Start the batch scrape only when this code runs as the main module,
# not when it is imported.
if __name__ == "__main__":
    main()

üöÄ STARTING COURSE DATA SCRAPER
üì• Input Excel: C:\Users\taslim.siddiqui\Downloads\smart_online_data.xlsx
üì§ Output Excel: C:\Users\taslim.siddiqui\Downloads\all_courses_Scarped_data_Smart_online.xlsx

üìñ Reading URLs from input Excel...
‚úÖ Found URL column: 'course_link__c'
üìä Total URLs found: 121
üîó Unique URLs to process: 119

üöÄ Initializing Chrome driver...

üìù Processing URL 1/119

üåê Processing: https://www.smartonlinecourse.co.in/courses/Climate-Risk-and-Resilence-67cc36767d791402c1f575bd
‚úÖ Successfully scraped: Climate Risk and Resilience
‚úÖ Successfully scraped: Climate Risk and Resilience

üìù Processing URL 2/119

üåê Processing: https://www.smartonlinecourse.co.in/courses/Actuarial-Aspects-of-Product-Development-IC92---III-Exam-Mock-Test-61cad7dd0cf215d00ec35f3c
‚úÖ Successfully scraped: Actuarial Aspects of Product Development (IC92) - III Exam Mock Test
‚úÖ Successfully scraped: Actuarial Aspects of Product Development (IC92) - III Exam Mock Test

In [1]:
import time
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def get_driver(headless=True):
    """Create a Chrome WebDriver with the scraper's standard flags.

    Args:
        headless: When true, Chrome is started with ``--headless=new``.

    Returns:
        The started ``webdriver.Chrome`` instance.
    """
    flags = [
        "--disable-blink-features=AutomationControlled",
        "--window-size=1280,720",
        "--log-level=3",
    ]
    if headless:
        flags.append("--headless=new")

    chrome_options = webdriver.ChromeOptions()
    for flag in flags:
        chrome_options.add_argument(flag)

    # ChromeDriverManager().install() supplies the chromedriver path.
    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=chrome_options)

def clean_text(text):
    """Collapse whitespace runs to single spaces and trim the ends.

    Anything that is not a ``str`` yields an empty string.
    """
    if isinstance(text, str):
        # split() with no argument discards all whitespace runs, including
        # leading and trailing ones.
        return " ".join(text.split())
    return ""

def extract_syllabus_content(soup):
    """Build a plain-text syllabus from the page's ``curriculum2`` block.

    Progress is printed while parsing. Every module box with a title bar
    becomes a title line, a dashed rule, its header info with time details
    removed (session counts are kept), and one entry per content body.
    Feedback/review modules and "final test" modules are skipped.

    Args:
        soup: Parsed page (BeautifulSoup-style object).

    Returns:
        The syllabus as a newline-joined string, or a fixed message when
        the curriculum container is missing or nothing was collected.
    """
    def without_time(info):
        # Remove "Time: ..." labels and bare durations, then any pipe left
        # dangling at either end.
        for pattern in (
            r'\|\s*Time:\s*[\d\sminsechour]*',
            r'Time:\s*[\d\sminsechour]*',
            r'\d+\s*hour\s*\d*\s*min\s*\d*\s*sec',
            r'\d+\s*min\s*\d*\s*sec',
        ):
            info = re.sub(pattern, '', info)
        info = clean_text(info)
        info = re.sub(r'^\|\s*', '', info)
        info = re.sub(r'\|\s*$', '', info)
        return clean_text(info)

    root = soup.find('div', class_='curriculum2')
    if not root:
        print("‚ùå No curriculum container found")
        return "Syllabus content not available"
    print("‚úÖ Found curriculum container")

    modules = root.find_all('div', class_='course-course-content-box')
    print(f"üìö Found {len(modules)} course modules")

    out = []
    for position, module in enumerate(modules, 1):
        heading = module.find('div', class_='course-content-blue-title')
        if not heading:
            continue

        h3 = heading.find('h3')
        para = heading.find('p')
        title = clean_text(h3.get_text()) if h3 else f"Module {position}"
        # "expand_more" is the expand icon's text inside the header paragraph.
        info = (clean_text(para.get_text()) if para else "").replace('expand_more', '').strip()

        title_lc = title.lower()
        if 'feedback' in title_lc or 'review' in title_lc or 'enjoyed' in title_lc:
            print(f"‚è© Skipping feedback section: {title}")
            continue
        if 'final test' in title_lc:
            print(f"‚è© Skipping final test section: {title}")
            continue

        out += [f"üéØ {title}", "-" * 40]

        if info:
            info = without_time(info)
            if info:
                out.append(f"üìù {info}")
        out.append("")

        for body in module.find_all('div', class_='course-content-body'):
            rendered = extract_module_content(body)
            if rendered:
                out += [rendered, ""]

    return "\n".join(out) if out else "No syllabus content found"

def extract_module_content(content_body):
    """Extract the description line of a ``course-content-body`` div, without durations.

    Reads the first ``<p>`` under ``desc1 > is-box-9 > boxa1`` and prefixes
    it with an emoji picked from the neighbouring material-icon name.

    An entry is dropped only when it contains an actual duration
    expression, i.e. a number followed by hour/min or min/sec units
    ("1 hour 20 min", "20 min 30 sec"). Matching on the bare substrings
    "min"/"sec"/"hour" would also discard normal titles such as
    "Administration Section".

    Args:
        content_body: The ``course-content-body`` element (BeautifulSoup tag).

    Returns:
        "<emoji> <text>", or "" when nothing usable is found.
    """
    desc = content_body.find('div', class_='desc1')
    if not desc:
        return ""
    content_box = desc.find('div', class_='is-box-9')
    if not content_box:
        return ""
    box_content = content_box.find('div', class_='boxa1')
    if not box_content:
        return ""

    # Get icon type
    icon = box_content.find('span', class_='material-icons')
    icon_type = clean_text(icon.get_text()) if icon else ""

    # Get content text
    content_text_elem = box_content.find('p')
    content_text = clean_text(content_text_elem.get_text()) if content_text_elem else ""
    if not content_text:
        return ""

    # A duration needs digits plus units; words that merely contain
    # "min"/"sec" are not durations.
    duration_re = re.compile(
        r'\d+\s*hours?\s*\d*\s*min(?:ute)?s?(?:\s*\d*\s*sec(?:ond)?s?)?\b'
        r'|\d+\s*min(?:ute)?s?\s*\d*\s*sec(?:ond)?s?\b',
        re.IGNORECASE,
    )
    if duration_re.search(content_text):
        return ""

    # Map icons to emojis
    icon_map = {
        'label_important': 'üìå',
        'ondemand_video': 'üé•',
        'description': 'üìÑ',
        'picture_as_pdf': 'üìé'
    }
    emoji = icon_map.get(icon_type, '‚Ä¢')

    return f"{emoji} {content_text}"

def test_single_course(url):
    """Open one course page in a visible browser and extract its syllabus.

    Args:
        url: Course page URL.

    Returns:
        ``{"Course_URL": ..., "Syllabus_Content": ...}``, or ``None`` when
        the page title looks like a 404 or any step raises. The browser is
        always closed before returning.
    """
    print(f"üåê Testing extraction from: {url}")
    driver = get_driver(headless=False)
    try:
        driver.get(url)
        body_locator = (By.TAG_NAME, "body")
        WebDriverWait(driver, 20).until(EC.presence_of_element_located(body_locator))
        # Fixed pause after <body> appears, before reading the page source.
        time.sleep(5)

        page_ok = "404" not in driver.title and "not found" not in driver.title.lower()
        if not page_ok:
            print("‚ùå Page not found (404)")
            return None

        soup = BeautifulSoup(driver.page_source, "html.parser")
        course_data = {
            "Course_URL": url,
            "Syllabus_Content": extract_syllabus_content(soup),
        }

        print("‚úÖ Extraction completed successfully!")
        return course_data

    except Exception as e:
        print(f"‚ùå Extraction failed: {str(e)}")
        return None
    finally:
        driver.quit()

def save_to_excel(course_data, filename="syllabus_output.xlsx"):
    """Write one course record to an Excel file and print a short preview.

    Args:
        course_data: Mapping with ``Course_URL`` and ``Syllabus_Content``.
        filename: Destination workbook path.

    Returns:
        ``True`` when the file was written and the preview printed,
        ``False`` if any step raised (the error is printed).
    """
    try:
        # A single record becomes a one-row sheet.
        pd.DataFrame([course_data]).to_excel(filename, index=False)
        print(f"üíæ Data saved to: {filename}")

        print("\nüìä Saved Data Preview:")
        print("=" * 50)
        url_line = f"Course URL: {course_data['Course_URL']}"
        print(url_line)
        length_line = f"Syllabus Content Length: {len(course_data['Syllabus_Content'])} characters"
        print(length_line)
    except Exception as e:
        print(f"‚ùå Error saving to Excel: {e}")
        return False
    return True

def display_syllabus_info(course_data):
    """Print the course URL and the extracted syllabus under banner headings.

    Args:
        course_data: Mapping with ``Course_URL`` and ``Syllabus_Content``.
    """
    banner = "=" * 80
    # Each section body is looked up lazily, right before it is printed.
    sections = (
        ("üìã EXTRACTED SYLLABUS INFORMATION", lambda: f"üîó URL: {course_data['Course_URL']}"),
        ("üìñ COURSE SYLLABUS CONTENT", lambda: course_data['Syllabus_Content']),
    )
    for heading, body in sections:
        print("\n" + banner)
        print(heading)
        print(banner)
        print(body())

# -------------------- TEST SINGLE COURSE AND SAVE TO EXCEL --------------------
if __name__ == "__main__":
    # Course page used for this single-URL trial run.
    test_url = "https://www.smartonlinecourse.co.in/courses/ISO-31000-Risk-Management-Standard-67b98757a167595ed22480ca"

    print("üöÄ TESTING SINGLE COURSE EXTRACTION")
    print("="*60)

    course_data = test_single_course(test_url)

    if not course_data:
        print("‚ùå Failed to extract course data")
    else:
        # Show the result on screen, then persist it.
        display_syllabus_info(course_data)

        output_filename = r"C:\Users\taslim.siddiqui\Downloads\course_syllabus_output.xlsx"
        save_to_excel(course_data, output_filename)

        print(f"\n‚úÖ Process completed! Check the Excel file: {output_filename}")

üöÄ TESTING SINGLE COURSE EXTRACTION
üåê Testing extraction from: https://www.smartonlinecourse.co.in/courses/ISO-31000-Risk-Management-Standard-67b98757a167595ed22480ca
‚úÖ Found curriculum container
üìö Found 17 course modules
‚è© Skipping final test section: Final Test ISO 31000 Risk Management Standard
‚è© Skipping feedback section: We hope you enjoyed the course! Please take a moment to share your feedback on Google Review:
‚úÖ Extraction completed successfully!

üìã EXTRACTED SYLLABUS INFORMATION
üîó URL: https://www.smartonlinecourse.co.in/courses/ISO-31000-Risk-Management-Standard-67b98757a167595ed22480ca

üìñ COURSE SYLLABUS CONTENT
üéØ Lecture 1- Overview of ISO 31000 and Its Principles
----------------------------------------
üìù Sessions: 2

üé• Lecture 1

‚Ä¢ Multiple Choice Practice Questions - Lecture 1

üéØ Lecture 2- The Framework ‚Äì Integration and Design
----------------------------------------
üìù Sessions: 2

üé• Lecture 2

‚Ä¢ Multiple Choice Practice

In [2]:
import time
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def get_driver(headless=True):
    """Start Chrome through Selenium with the scraper's fixed options.

    Args:
        headless: Adds ``--headless=new`` when true.

    Returns:
        A running ``webdriver.Chrome``.
    """
    opts = webdriver.ChromeOptions()

    base_args = (
        "--disable-blink-features=AutomationControlled",
        "--window-size=1280,720",
        "--log-level=3",
    )
    headless_args = ("--headless=new",) if headless else ()
    for arg in base_args + headless_args:
        opts.add_argument(arg)

    driver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_path), options=opts)

def clean_text(text):
    """Return ``text`` trimmed, with every whitespace run squeezed to one space.

    Non-string input gives "".
    """
    # Trimming first leaves only interior runs for the substitution.
    return re.sub(r'\s+', ' ', text.strip()) if isinstance(text, str) else ""

def extract_course_name(soup):
    """Return the course title found on the page.

    CSS selectors are tried from most to least specific source; the first
    one whose cleaned text is longer than three characters wins.

    Args:
        soup: Parsed page (BeautifulSoup-style object).

    Returns:
        The title text, or "Course name not found".
    """
    candidates = (
        "h1",
        ".banner-heading h1",
        ".course-title",
        ".curriculum-heading h3",
        "title",
    )

    for css in candidates:
        node = soup.select_one(css)
        if not node:
            continue
        name = clean_text(node.get_text())
        # Very short matches are treated as noise.
        if len(name) > 3:
            return name

    return "Course name not found"

def extract_syllabus_content(soup):
    """Turn the page's ``curriculum2`` block into a plain-text syllabus.

    Progress is printed while parsing. For each module box with a title
    bar the output gets a title line, a dashed rule, the header info with
    time details removed (session counts are kept) and one entry per
    content body. Only feedback/review modules are skipped; final-test
    modules are included.

    Args:
        soup: Parsed page (BeautifulSoup-style object).

    Returns:
        The syllabus as a newline-joined string, or a fixed message when
        the curriculum container is missing or nothing was collected.
    """
    curriculum = soup.find('div', class_='curriculum2')
    if not curriculum:
        print("‚ùå No curriculum container found")
        return "Syllabus content not available"

    print("‚úÖ Found curriculum container")

    boxes = curriculum.find_all('div', class_='course-course-content-box')
    print(f"üìö Found {len(boxes)} course modules")

    feedback_markers = ('feedback', 'review', 'enjoyed')
    rendered = []

    for number, box in enumerate(boxes, start=1):
        title_bar = box.find('div', class_='course-content-blue-title')
        if not title_bar:
            continue

        title_node = title_bar.find('h3')
        info_node = title_bar.find('p')

        if title_node:
            module_title = clean_text(title_node.get_text())
        else:
            module_title = f"Module {number}"
        module_info = clean_text(info_node.get_text()) if info_node else ""
        # Drop the expand icon's text from the header info.
        module_info = module_info.replace('expand_more', '').strip()

        if any(marker in module_title.lower() for marker in feedback_markers):
            print(f"‚è© Skipping feedback section: {module_title}")
            continue

        # Final-test modules are deliberately kept in this version.

        rendered.append(f"üéØ {module_title}")
        rendered.append("-" * 40)

        if module_info:
            # Time labels first, then bare durations.
            module_info = re.sub(r'\|\s*Time:\s*[\d\sminsechour]*', '', module_info)
            module_info = re.sub(r'Time:\s*[\d\sminsechour]*', '', module_info)
            module_info = re.sub(r'\d+\s*hour\s*\d*\s*min\s*\d*\s*sec', '', module_info)
            module_info = re.sub(r'\d+\s*min\s*\d*\s*sec', '', module_info)
            module_info = clean_text(module_info)

            # Trim a pipe left dangling at either end.
            module_info = re.sub(r'\|\s*$', '', re.sub(r'^\|\s*', '', module_info))
            module_info = clean_text(module_info)

            if module_info:
                rendered.append(f"üìù {module_info}")
        rendered.append("")

        for content_body in box.find_all('div', class_='course-content-body'):
            entry = extract_module_content(content_body)
            if not entry:
                continue
            rendered.append(entry)
            rendered.append("")

    if not rendered:
        return "No syllabus content found"
    return "\n".join(rendered)

def extract_module_content(content_body):
    """Extract the description lines of a ``course-content-body`` div, without durations.

    Every ``is-box-9`` div under ``desc1`` contributes the text of the first
    ``<p>`` in its ``boxa1`` child, prefixed with an emoji chosen from the
    neighbouring material-icon name.

    Duration expressions (a number followed by hour/min or min/sec units,
    e.g. "1 hour 20 min", "20 min 30 sec") are removed from the text, and an
    entry is skipped only when nothing else remains. Titles that merely
    contain the letters "min"/"sec" (e.g. "Administration Section") are
    therefore kept.

    Args:
        content_body: The ``course-content-body`` element (BeautifulSoup tag).

    Returns:
        The entries joined with newlines; "" when there are none.
    """
    desc = content_body.find('div', class_='desc1')
    if not desc:
        return ""

    # Digits plus units only; ordinary words containing "min"/"sec" do not match.
    duration_re = re.compile(
        r'\d+\s*hours?\s*\d*\s*min(?:ute)?s?(?:\s*\d*\s*sec(?:ond)?s?)?\b'
        r'|\d+\s*min(?:ute)?s?\s*\d*\s*sec(?:ond)?s?\b',
        re.IGNORECASE,
    )

    # Map icons to emojis ('assignment' is used by final-test entries)
    icon_map = {
        'label_important': 'üìå',
        'ondemand_video': 'üé•',
        'description': 'üìÑ',
        'picture_as_pdf': 'üìé',
        'assignment': 'üìù'
    }

    content_parts = []
    # Matching on the class name selects exactly the is-box-9 content boxes.
    for box in desc.find_all('div', class_='is-box-9'):
        content_box = box.find('div', class_='boxa1')
        if not content_box:
            continue

        # Get icon type
        icon = content_box.find('span', class_='material-icons')
        icon_type = clean_text(icon.get_text()) if icon else ""

        # Get content text
        content_text_elem = content_box.find('p')
        content_text = clean_text(content_text_elem.get_text()) if content_text_elem else ""

        # Strip durations; an entry that was only a duration becomes empty.
        content_text = clean_text(duration_re.sub('', content_text))
        if not content_text:
            continue

        emoji = icon_map.get(icon_type, '‚Ä¢')
        content_parts.append(f"{emoji} {content_text}")

    return "\n".join(content_parts)

def extract_course_data(url):
    """Extract the course name and syllabus text from a single course page.

    Starts a new (non-headless) Chrome session, loads ``url``, waits for the
    ``<body>`` element to be present plus a fixed 5-second pause, and then
    parses the rendered page source with BeautifulSoup.

    Args:
        url: Address of the course page to load.

    Returns:
        A dict with the keys ``"Course_Name"``, ``"Course_URL"`` and
        ``"Syllabus_Content"``; or ``None`` when the browser cannot be
        started, the page title looks like a "not found" page, or any
        step of the extraction raises.
    """
    print(f"üåê Extracting data from: {url}")

    # Driver start-up is handled like every other failure (message + None)
    # so that a batch caller can move on to the next URL instead of aborting.
    try:
        driver = get_driver(headless=False)
    except Exception as e:
        print(f"‚ùå Extraction failed for {url}: {str(e)}")
        return None

    try:
        driver.get(url)
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        # Fixed pause after <body> appears — presumably to let client-side
        # rendering finish before the page source is captured.
        time.sleep(5)

        # Title-based heuristic for missing pages
        if "404" in driver.title or "not found" in driver.title.lower():
            print("‚ùå Page not found (404)")
            return None

        page_source = driver.page_source
        soup = BeautifulSoup(page_source, "html.parser")

        course_name = extract_course_name(soup)
        syllabus_content = extract_syllabus_content(soup)

        course_data = {
            "Course_Name": course_name,
            "Course_URL": url,
            "Syllabus_Content": syllabus_content
        }

        print(f"‚úÖ Extraction completed for: {course_name}")
        return course_data

    except Exception as e:
        print(f"‚ùå Extraction failed for {url}: {str(e)}")
        return None
    finally:
        # Always release the browser, including on the early 404 return
        driver.quit()

def save_to_excel(course_data_list, filename="multiple_courses_syllabus.xlsx"):
    """Write the collected course records to an Excel workbook.

    Args:
        course_data_list: Records to store; passed directly to
            ``pd.DataFrame``.
        filename: Destination path of the workbook.

    Returns:
        ``True`` when the file was written and the summary printed,
        ``False`` when any step raised (the error is printed, not re-raised).
    """
    try:
        # Build the table and write it in one go, without the index column
        pd.DataFrame(course_data_list).to_excel(filename, index=False)
        print(f"üíæ All course data saved to: {filename}")

        # Short run summary
        print(f"\nüìä SUMMARY:")
        course_count = len(course_data_list)
        print(f"   Total courses processed: {course_count}")
        print(f"   Output file: {filename}")
    except Exception as err:
        print(f"‚ùå Error saving to Excel: {err}")
        return False
    else:
        return True

def display_course_info(course_data):
    """Print a short console summary for one extracted course.

    Args:
        course_data: Mapping with the keys ``'Course_Name'``,
            ``'Course_URL'`` and ``'Syllabus_Content'``.
    """
    def _summary_lines():
        # Lines are produced lazily so each lookup happens right before
        # its own line is printed.
        yield f"\nüè∑Ô∏è  Course: {course_data['Course_Name']}"
        yield f"üîó URL: {course_data['Course_URL']}"
        yield f"üìñ Syllabus length: {len(course_data['Syllabus_Content'])} characters"
        yield "-" * 80

    for summary_line in _summary_lines():
        print(summary_line)

# -------------------- MAIN EXECUTION FOR MULTIPLE COURSES --------------------
if __name__ == "__main__":
    # Batch run: read course links from a spreadsheet, scrape each course
    # page, and write every successfully scraped record to one workbook.
    print("üöÄ STARTING MULTIPLE COURSE EXTRACTION")
    print("="*60)

    # Input Excel file path
    input_excel = r"C:\Users\taslim.siddiqui\Downloads\smart_online_data.xlsx"

    # Only the read itself is guarded by the "Error reading Excel file"
    # handler, so failures later in the run are not reported as read errors.
    try:
        df = pd.read_excel(input_excel)
    except Exception as e:
        print(f"‚ùå Error reading Excel file: {e}")
    else:
        print(f"‚úÖ Loaded Excel file: {input_excel}")

        if 'Course Link' not in df.columns:
            print("‚ùå 'Course Link' column not found in Excel file!")
            print("Available columns:", df.columns.tolist())
        else:
            # Get all course links (missing values dropped, duplicates removed)
            course_links = df['Course Link'].dropna().unique()
            print(f"üìã Found {len(course_links)} unique course links")

            all_course_data = []
            successful_extractions = 0

            for idx, url in enumerate(course_links, 1):
                print(f"\nüîπ [{idx}/{len(course_links)}] Processing: {url}")

                # An unexpected error on one URL must not discard the
                # records already collected from earlier URLs.
                try:
                    course_data = extract_course_data(url)
                except Exception as e:
                    print(f"Unexpected error while processing {url}: {e}")
                    course_data = None

                if course_data:
                    all_course_data.append(course_data)
                    successful_extractions += 1
                    display_course_info(course_data)
                else:
                    print(f"‚ùå Failed to extract: {url}")

                # Small delay between requests
                time.sleep(2)

            # Save all extracted data to Excel
            if all_course_data:
                output_filename = r"C:\Users\taslim.siddiqui\Downloads\multiple_courses_syllabus.xlsx"
                save_to_excel(all_course_data, output_filename)
                print(f"\n‚úÖ Process completed! {successful_extractions}/{len(course_links)} courses extracted successfully!")
                print(f"üíæ Check the Excel file: {output_filename}")
            else:
                print("‚ùå No course data was extracted successfully!")

üöÄ STARTING MULTIPLE COURSE EXTRACTION
‚úÖ Loaded Excel file: C:\Users\taslim.siddiqui\Downloads\smart_online_data.xlsx
üìã Found 119 unique course links

üîπ [1/119] Processing: https://www.smartonlinecourse.co.in/courses/Climate-Risk-and-Resilence-67cc36767d791402c1f575bd
üåê Extracting data from: https://www.smartonlinecourse.co.in/courses/Climate-Risk-and-Resilence-67cc36767d791402c1f575bd
‚úÖ Found curriculum container
üìö Found 14 course modules
‚è© Skipping feedback section: Lecture 1 - Impact of Climate Change on Business Operations Preview
‚è© Skipping feedback section: We hope you enjoyed the course! Please take a moment to share your feedback on Google Review
‚úÖ Extraction completed for: Climate Risk and Resilience

üè∑Ô∏è  Course: Climate Risk and Resilience
üîó URL: https://www.smartonlinecourse.co.in/courses/Climate-Risk-and-Resilence-67cc36767d791402c1f575bd
üìñ Syllabus length: 4076 characters
-------------------------------------------------------------------