In [2]:
import time
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def get_driver(headless=True):
    """Build and return a Chrome WebDriver instance.

    Chrome is always started with the ``AutomationControlled`` Blink feature
    disabled, a 1280x720 window and ``--log-level=3``. The chromedriver binary
    is resolved through ``ChromeDriverManager().install()``.

    Args:
        headless: When truthy, ``--headless=new`` is added to the switches.

    Returns:
        The ``webdriver.Chrome`` object created with those options.
    """
    chrome_switches = [
        "--disable-blink-features=AutomationControlled",
        "--window-size=1280,720",
        "--log-level=3",
    ]
    if headless:
        chrome_switches.append("--headless=new")

    chrome_options = webdriver.ChromeOptions()
    for switch in chrome_switches:
        chrome_options.add_argument(switch)

    chromedriver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(chromedriver_path), options=chrome_options)

def clean_text(text):
    """Collapse every whitespace run to one space and trim both ends.

    Args:
        text: Value to normalize.

    Returns:
        The normalized string, or ``""`` when ``text`` is not a ``str``.
    """
    if isinstance(text, str):
        # split() with no separator breaks on whitespace runs and drops
        # leading/trailing whitespace, so re-joining yields the trimmed form.
        return " ".join(text.split())
    return ""

def extract_syllabus_content(soup):
    """Render the curriculum section of a parsed course page as plain text.

    Every ``div.course-course-content-box`` inside ``div.curriculum2`` that
    has a ``div.course-content-blue-title`` header becomes one module: a
    title line, a 40-dash rule, an optional info line with the "Time: ..."
    part removed, then one entry per ``div.course-content-body`` as produced
    by ``extract_module_content``. Boxes without that header are ignored, and
    modules whose title contains "feedback", "review", "enjoyed" or
    "final test" (case-insensitive) are skipped. Progress is printed along
    the way.

    Args:
        soup: Parsed page (a BeautifulSoup tree in the visible callers).

    Returns:
        The newline-joined syllabus text; "Syllabus content not available"
        when the curriculum container is missing; "No syllabus content found"
        when nothing was collected.
    """
    container = soup.find('div', class_='curriculum2')
    if not container:
        print("❌ No curriculum container found")
        return "Syllabus content not available"

    print("✅ Found curriculum container")

    modules = container.find_all('div', class_='course-course-content-box')
    print(f"📚 Found {len(modules)} course modules")

    # Applied in this order to drop the time portion of the info line while
    # leaving the "Sessions" portion in place.
    time_patterns = (
        r'\|\s*Time:\s*[\d\sminsechour]*',
        r'Time:\s*[\d\sminsechour]*',
        r'\d+\s*hour\s*\d*\s*min\s*\d*\s*sec',
        r'\d+\s*min\s*\d*\s*sec',
    )
    # A pipe separator left dangling at either end after the time was removed.
    edge_pipe_patterns = (r'^\|\s*', r'\|\s*$')
    feedback_markers = ('feedback', 'review', 'enjoyed')

    lines = []
    for position, module in enumerate(modules, 1):
        heading = module.find('div', class_='course-content-blue-title')
        if not heading:
            continue

        title_tag = heading.find('h3')
        info_tag = heading.find('p')

        if title_tag:
            title = clean_text(title_tag.get_text())
        else:
            title = f"Module {position}"

        if info_tag:
            info = clean_text(info_tag.get_text())
        else:
            info = ""
        # The expand icon's text ("expand_more") is part of the paragraph text.
        info = info.replace('expand_more', '').strip()

        lowered_title = title.lower()
        if any(marker in lowered_title for marker in feedback_markers):
            print(f"⏩ Skipping feedback section: {title}")
            continue
        if 'final test' in lowered_title:
            print(f"⏩ Skipping final test section: {title}")
            continue

        lines.extend([f"🎯 {title}", "-" * 40])

        if info:
            for pattern in time_patterns:
                info = re.sub(pattern, '', info)
            info = clean_text(info)

            for pattern in edge_pipe_patterns:
                info = re.sub(pattern, '', info)
            info = clean_text(info)

            if info:
                lines.append(f"📝 {info}")
        lines.append("")

        for body in module.find_all('div', class_='course-content-body'):
            rendered = extract_module_content(body)
            if rendered:
                lines.extend([rendered, ""])

    if not lines:
        return "No syllabus content found"
    return "\n".join(lines)

def extract_module_content(content_body):
    """Format the entry held in a ``course-content-body`` div, without durations.

    The first ``div.desc1`` → ``div.is-box-9`` → ``div.boxa1`` chain is
    followed; inside it the ``span.material-icons`` text selects an emoji and
    the first ``<p>`` supplies the entry text.

    Entries whose text contains a duration (a number with an hour/minute unit
    followed by a number with a minute/second unit, e.g. "5 min 30 sec" or
    "1 hour 5 min") are skipped. The match requires digits next to the units,
    so titles that merely contain the letters "min"/"sec"/"hour" (for example
    "Determining Cybersecurity Exposure") are kept.

    Args:
        content_body: Element to search (a BeautifulSoup tag in the visible
            caller).

    Returns:
        ``"<emoji> <text>"`` for a usable entry, otherwise ``""``.
    """
    desc = content_body.find('div', class_='desc1')
    content_box = desc.find('div', class_='is-box-9') if desc else None
    box_content = content_box.find('div', class_='boxa1') if content_box else None
    if not box_content:
        return ""

    # Get icon type
    icon = box_content.find('span', class_='material-icons')
    icon_type = clean_text(icon.get_text()) if icon else ""

    # Get content text
    content_text_elem = box_content.find('p')
    content_text = clean_text(content_text_elem.get_text()) if content_text_elem else ""
    if not content_text:
        return ""

    # A real duration needs digits attached to the units; plain substring
    # checks would also discard ordinary titles containing those letters.
    duration_pattern = (
        r'\b\d+\s*(?:hours?|min(?:ute)?s?)[\s,]*'
        r'\d+\s*(?:min(?:ute)?s?|sec(?:ond)?s?)\b'
    )
    if re.search(duration_pattern, content_text, flags=re.IGNORECASE):
        return ""

    # Map icons to emojis; unknown or missing icons get a bullet.
    icon_map = {
        'label_important': '📌',
        'ondemand_video': '🎥',
        'description': '📄',
        'picture_as_pdf': '📎'
    }
    emoji = icon_map.get(icon_type, '•')

    return f"{emoji} {content_text}"

def test_single_course(url, headless=False, settle_seconds=5):
    """Load one course page in Chrome and extract its syllabus.

    Args:
        url: Course page to open.
        headless: Passed to ``get_driver``; defaults to a visible browser.
        settle_seconds: Fixed pause after ``<body>`` is present, before the
            page source is read.

    Returns:
        ``{"Course_URL": ..., "Syllabus_Content": ...}`` on success, or
        ``None`` when the page title looks like a 404 page or any exception
        is raised while loading/parsing. The driver is always quit.
    """
    print(f"🌐 Testing extraction from: {url}")
    driver = get_driver(headless=headless)
    try:
        driver.get(url)
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        time.sleep(settle_seconds)

        # Match "404" only as a standalone number so that titles such as
        # "ISO 14040 ..." are not mistaken for an error page.
        page_title = driver.title
        if re.search(r'\b404\b', page_title) or "not found" in page_title.lower():
            print("❌ Page not found (404)")
            return None

        page_source = driver.page_source
        soup = BeautifulSoup(page_source, "html.parser")

        syllabus_content = extract_syllabus_content(soup)

        course_data = {
            "Course_URL": url,
            "Syllabus_Content": syllabus_content
        }

        print("✅ Extraction completed successfully!")
        return course_data

    except Exception as e:
        # Best-effort: report the failure and let the caller see None.
        print(f"❌ Extraction failed: {str(e)}")
        return None
    finally:
        driver.quit()

def save_to_excel(course_data, filename="syllabus_output.xlsx"):
    """Write one course record to an Excel file and print a short preview.

    Args:
        course_data: Mapping with ``'Course_URL'`` and ``'Syllabus_Content'``
            keys; it becomes the single row of the sheet.
        filename: Destination workbook path.

    Returns:
        ``True`` when saving and the preview both succeed, ``False`` when any
        exception is raised (the error is printed, not propagated).
    """
    try:
        pd.DataFrame([course_data]).to_excel(filename, index=False)
        print(f"💾 Data saved to: {filename}")

        # Echo what was written so the run log shows it at a glance.
        print("\n📊 Saved Data Preview:")
        print("=" * 50)
        print(f"Course URL: {course_data['Course_URL']}")
        print(f"Syllabus Content Length: {len(course_data['Syllabus_Content'])} characters")
    except Exception as err:
        print(f"❌ Error saving to Excel: {err}")
        return False
    return True

def display_syllabus_info(course_data):
    """Print the course URL and the extracted syllabus under banner headings.

    Args:
        course_data: Mapping with ``'Course_URL'`` and ``'Syllabus_Content'``.
    """
    rule = "=" * 80
    spaced_rule = "\n" + rule

    print(spaced_rule)
    print("📋 EXTRACTED SYLLABUS INFORMATION")
    print(rule)
    print(f"🔗 URL: {course_data['Course_URL']}")

    print(spaced_rule)
    print("📖 COURSE SYLLABUS CONTENT")
    print(rule)
    print(course_data['Syllabus_Content'])

# -------------------- TEST SINGLE COURSE AND SAVE TO EXCEL --------------------
if __name__ == "__main__":
    # Single hard-coded course page used for this trial run.
    test_url = "https://www.smartonlinecourse.co.in/courses/5-Emerging-Risks-Every-Business-Must-Prepare-for-in-2025-681203660464a709b9696c04"

    print("🚀 TESTING SINGLE COURSE EXTRACTION")
    print("="*60)

    # Extract course data (None signals a failed extraction).
    course_data = test_single_course(test_url)

    if course_data:
        # Display the extracted content
        display_syllabus_info(course_data)

        # Save to Excel with your specific path
        output_filename = r"C:\Users\taslim.siddiqui\Downloads\course_syllabus_output.xlsx"

        # save_to_excel reports failure through its return value; only
        # announce success when the workbook was actually written.
        if save_to_excel(course_data, output_filename):
            print(f"\n✅ Process completed! Check the Excel file: {output_filename}")
        else:
            print(f"❌ Could not save the Excel file: {output_filename}")
    else:
        print("❌ Failed to extract course data")

🚀 TESTING SINGLE COURSE EXTRACTION
🌐 Testing extraction from: https://www.smartonlinecourse.co.in/courses/5-Emerging-Risks-Every-Business-Must-Prepare-for-in-2025-681203660464a709b9696c04
✅ Found curriculum container
📚 Found 7 course modules
⏩ Skipping final test section: Final Test
⏩ Skipping feedback section: We hope you enjoyed the course! Please take a moment to share your feedback on Google Review:
✅ Extraction completed successfully!

📋 EXTRACTED SYLLABUS INFORMATION
🔗 URL: https://www.smartonlinecourse.co.in/courses/5-Emerging-Risks-Every-Business-Must-Prepare-for-in-2025-681203660464a709b9696c04

📖 COURSE SYLLABUS CONTENT
🎯 Lecture 1 and 2
----------------------------------------
📝 Sessions: 3

📌 Lecture 1 : Introduction to Course

📌 Lecture 2 : What Is Emerging Risk?

🎥 Emerging Risks - Lecture - 1 and Lecture - 2

🎯 Lecture 3 and 4
----------------------------------------
📝 Sessions: 3

📌 Lecture 3: Cybersecurity Risk Landscape

📌 Lecture 4: Climate & Environmental Risk

🎥 Em

In [5]:
import time
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def get_driver(headless=True):
    """Create the Chrome WebDriver used for scraping.

    The fixed switches disable the ``AutomationControlled`` Blink feature,
    set a 1280x720 window and pass ``--log-level=3``. The chromedriver binary
    is located via ``ChromeDriverManager().install()``.

    Args:
        headless: When truthy, Chrome also receives ``--headless=new``.

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    switches = [
        "--disable-blink-features=AutomationControlled",
        "--window-size=1280,720",
        "--log-level=3",
    ]
    if headless:
        switches.append("--headless=new")

    opts = webdriver.ChromeOptions()
    for switch in switches:
        opts.add_argument(switch)

    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=opts)

def clean_text(text):
    """Normalize whitespace: runs become one space, ends are trimmed.

    Args:
        text: Value to normalize.

    Returns:
        The normalized string; ``""`` for any non-``str`` input.
    """
    if isinstance(text, str):
        # Whitespace-splitting then joining with one space both collapses
        # inner runs and removes leading/trailing whitespace.
        return " ".join(text.split())
    return ""

def extract_course_name(soup):
    """Pick the course title from the first usable heading in ``soup``.

    CSS selectors are tried in a fixed priority order; the first match whose
    cleaned text is longer than three characters is returned.

    Args:
        soup: Parsed page exposing ``select_one``.

    Returns:
        The cleaned title text, or "Course name not found" when no selector
        yields a usable value.
    """
    priority_selectors = (
        "h1",
        ".banner-heading h1",
        ".course-title",
        ".curriculum-heading h3",
        "title",
    )

    for css in priority_selectors:
        node = soup.select_one(css)
        if not node:
            continue
        candidate = clean_text(node.get_text())
        # Very short strings are treated as unusable and the next selector is tried.
        if len(candidate) > 3:
            return candidate

    return "Course name not found"

def extract_syllabus_content(soup):
    """Turn the page's curriculum section into a plain-text syllabus.

    Each ``div.course-course-content-box`` under ``div.curriculum2`` that
    carries a ``div.course-content-blue-title`` header is rendered as a title
    line, a 40-dash rule, an optional info line (with the "Time: ..." part
    stripped) and one entry per ``div.course-content-body`` as returned by
    ``extract_module_content``. Boxes without the header are ignored; modules
    titled like a feedback/review request or a final test are skipped.
    Progress messages are printed while scanning.

    Args:
        soup: Parsed page (a BeautifulSoup tree in the visible caller).

    Returns:
        The newline-joined syllabus; "Syllabus content not available" if the
        curriculum container is absent; "No syllabus content found" if no
        lines were collected.
    """
    container = soup.find('div', class_='curriculum2')
    if not container:
        print("❌ No curriculum container found")
        return "Syllabus content not available"

    print("✅ Found curriculum container")

    modules = container.find_all('div', class_='course-course-content-box')
    print(f"📚 Found {len(modules)} course modules")

    # Order matters: the piped "| Time: ..." form is removed before the bare
    # "Time: ..." form, then any stand-alone duration text.
    time_patterns = (
        r'\|\s*Time:\s*[\d\sminsechour]*',
        r'Time:\s*[\d\sminsechour]*',
        r'\d+\s*hour\s*\d*\s*min\s*\d*\s*sec',
        r'\d+\s*min\s*\d*\s*sec',
    )
    # Separator pipes left at the start or end once the time is gone.
    edge_pipe_patterns = (r'^\|\s*', r'\|\s*$')
    feedback_markers = ('feedback', 'review', 'enjoyed')

    lines = []
    for position, module in enumerate(modules, 1):
        heading = module.find('div', class_='course-content-blue-title')
        if not heading:
            continue

        title_tag = heading.find('h3')
        info_tag = heading.find('p')

        if title_tag:
            title = clean_text(title_tag.get_text())
        else:
            title = f"Module {position}"

        if info_tag:
            info = clean_text(info_tag.get_text())
        else:
            info = ""
        # Drop the expand icon's text, which is part of the paragraph text.
        info = info.replace('expand_more', '').strip()

        lowered_title = title.lower()
        if any(marker in lowered_title for marker in feedback_markers):
            print(f"⏩ Skipping feedback section: {title}")
            continue
        if 'final test' in lowered_title:
            print(f"⏩ Skipping final test section: {title}")
            continue

        lines.extend([f"🎯 {title}", "-" * 40])

        if info:
            for pattern in time_patterns:
                info = re.sub(pattern, '', info)
            info = clean_text(info)

            for pattern in edge_pipe_patterns:
                info = re.sub(pattern, '', info)
            info = clean_text(info)

            if info:
                lines.append(f"📝 {info}")
        lines.append("")

        for body in module.find_all('div', class_='course-content-body'):
            rendered = extract_module_content(body)
            if rendered:
                lines.extend([rendered, ""])

    if not lines:
        return "No syllabus content found"
    return "\n".join(lines)

def extract_module_content(content_body):
    """Format the entry held in a ``course-content-body`` div, without durations.

    The first ``div.desc1`` → ``div.is-box-9`` → ``div.boxa1`` chain is
    followed; inside it the ``span.material-icons`` text selects an emoji and
    the first ``<p>`` supplies the entry text.

    Entries whose text contains a duration (a number with an hour/minute unit
    followed by a number with a minute/second unit, e.g. "5 min 30 sec" or
    "1 hour 5 min") are skipped. The match requires digits next to the units,
    so titles that merely contain the letters "min"/"sec"/"hour" (for example
    "Determining Cybersecurity Exposure") are kept.

    Args:
        content_body: Element to search (a BeautifulSoup tag in the visible
            caller).

    Returns:
        ``"<emoji> <text>"`` for a usable entry, otherwise ``""``.
    """
    desc = content_body.find('div', class_='desc1')
    content_box = desc.find('div', class_='is-box-9') if desc else None
    box_content = content_box.find('div', class_='boxa1') if content_box else None
    if not box_content:
        return ""

    # Get icon type
    icon = box_content.find('span', class_='material-icons')
    icon_type = clean_text(icon.get_text()) if icon else ""

    # Get content text
    content_text_elem = box_content.find('p')
    content_text = clean_text(content_text_elem.get_text()) if content_text_elem else ""
    if not content_text:
        return ""

    # A real duration needs digits attached to the units; plain substring
    # checks would also discard ordinary titles containing those letters.
    duration_pattern = (
        r'\b\d+\s*(?:hours?|min(?:ute)?s?)[\s,]*'
        r'\d+\s*(?:min(?:ute)?s?|sec(?:ond)?s?)\b'
    )
    if re.search(duration_pattern, content_text, flags=re.IGNORECASE):
        return ""

    # Map icons to emojis; unknown or missing icons get a bullet.
    icon_map = {
        'label_important': '📌',
        'ondemand_video': '🎥',
        'description': '📄',
        'picture_as_pdf': '📎'
    }
    emoji = icon_map.get(icon_type, '•')

    return f"{emoji} {content_text}"

def extract_course_data(url, headless=False, settle_seconds=5):
    """Load one course page in Chrome and extract its name and syllabus.

    Args:
        url: Course page to open.
        headless: Passed to ``get_driver``; defaults to a visible browser.
        settle_seconds: Fixed pause after ``<body>`` is present, before the
            page source is read.

    Returns:
        ``{"Course_Name": ..., "Course_URL": ..., "Syllabus_Content": ...}``
        on success, or ``None`` when the page title looks like a 404 page or
        any exception is raised while loading/parsing. The driver is always
        quit.
    """
    print(f"🌐 Extracting data from: {url}")
    driver = get_driver(headless=headless)
    try:
        driver.get(url)
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        time.sleep(settle_seconds)

        # Match "404" only as a standalone number so that titles such as
        # "ISO 14040 ..." are not mistaken for an error page.
        page_title = driver.title
        if re.search(r'\b404\b', page_title) or "not found" in page_title.lower():
            print("❌ Page not found (404)")
            return None

        page_source = driver.page_source
        soup = BeautifulSoup(page_source, "html.parser")

        course_name = extract_course_name(soup)
        syllabus_content = extract_syllabus_content(soup)

        course_data = {
            "Course_Name": course_name,
            "Course_URL": url,
            "Syllabus_Content": syllabus_content
        }

        print(f"✅ Extraction completed for: {course_name}")
        return course_data

    except Exception as e:
        # Best-effort: report the failure and let the caller see None.
        print(f"❌ Extraction failed for {url}: {str(e)}")
        return None
    finally:
        driver.quit()

def save_to_excel(course_data_list, filename="multiple_courses_syllabus.xlsx"):
    """Write all collected course records to one Excel file and print a summary.

    Args:
        course_data_list: Sequence of per-course mappings; each becomes a row.
        filename: Destination workbook path.

    Returns:
        ``True`` when the file was written and the summary printed, ``False``
        when any exception is raised (the error is printed, not propagated).
    """
    try:
        pd.DataFrame(course_data_list).to_excel(filename, index=False)
        print(f"💾 All course data saved to: {filename}")

        # Short run summary for the log.
        print("\n📊 SUMMARY:")
        print(f"   Total courses processed: {len(course_data_list)}")
        print(f"   Output file: {filename}")
    except Exception as err:
        print(f"❌ Error saving to Excel: {err}")
        return False
    return True

def display_course_info(course_data):
    """Print a one-course summary: name, URL and syllabus length.

    Args:
        course_data: Mapping with ``'Course_Name'``, ``'Course_URL'`` and
            ``'Syllabus_Content'``.
    """
    divider = "-" * 80

    print(f"\n🏷️  Course: {course_data['Course_Name']}")
    print(f"🔗 URL: {course_data['Course_URL']}")
    print(f"📖 Syllabus length: {len(course_data['Syllabus_Content'])} characters")
    print(divider)

# -------------------- MAIN EXECUTION FOR MULTIPLE COURSES --------------------
if __name__ == "__main__":
    print("🚀 STARTING MULTIPLE COURSE EXTRACTION")
    print("="*60)

    # 📘 Input Excel file path
    input_excel = r"C:\Users\taslim.siddiqui\Downloads\Smart_online_syl.xlsx"

    # Guard only the read itself, so the "Error reading Excel file" message
    # cannot be produced by a failure later in the run.
    try:
        df = pd.read_excel(input_excel)
    except Exception as e:
        print(f"❌ Error reading Excel file: {e}")
        df = None

    if df is not None:
        print(f"✅ Loaded Excel file: {input_excel}")

        if 'Course Link' not in df.columns:
            print("❌ 'Course Link' column not found in Excel file!")
            print("Available columns:", df.columns.tolist())
        else:
            # Get all course links (blank cells dropped, duplicates removed)
            course_links = df['Course Link'].dropna().unique()
            print(f"📋 Found {len(course_links)} unique course links")

            all_course_data = []
            successful_extractions = 0

            for idx, url in enumerate(course_links, 1):
                print(f"\n🔹 [{idx}/{len(course_links)}] Processing: {url}")

                # extract_course_data starts the browser outside its own
                # try block, so a start-up failure can still raise here.
                # Treat it as a failed URL instead of aborting the batch.
                try:
                    course_data = extract_course_data(url)
                except Exception as e:
                    print(f"❌ Extraction failed for {url}: {e}")
                    course_data = None

                if course_data:
                    all_course_data.append(course_data)
                    successful_extractions += 1
                    display_course_info(course_data)
                else:
                    print(f"❌ Failed to extract: {url}")

                # Small delay between requests (not needed after the last one)
                if idx < len(course_links):
                    time.sleep(2)

            # 💾 Save all extracted data to Excel
            if all_course_data:
                output_filename = r"C:\Users\taslim.siddiqui\Downloads\multiple_courses_syllabus.xlsx"
                # save_to_excel signals failure through its return value;
                # only announce success when the workbook was written.
                if save_to_excel(all_course_data, output_filename):
                    print(f"\n✅ Process completed! {successful_extractions}/{len(course_links)} courses extracted successfully!")
                    print(f"💾 Check the Excel file: {output_filename}")
                else:
                    print(f"\n❌ {successful_extractions}/{len(course_links)} courses extracted, but saving to {output_filename} failed")
            else:
                print("❌ No course data was extracted successfully!")

🚀 STARTING MULTIPLE COURSE EXTRACTION
✅ Loaded Excel file: C:\Users\taslim.siddiqui\Downloads\Smart_online_syl.xlsx
📋 Found 63 unique course links

🔹 [1/63] Processing: https://www.smartonlinecourse.co.in/courses/Business-Continuity-Planning-BCP-67bd9e0bc8ad3036f62b528c
🌐 Extracting data from: https://www.smartonlinecourse.co.in/courses/Business-Continuity-Planning-BCP-67bd9e0bc8ad3036f62b528c
✅ Found curriculum container
📚 Found 11 course modules
⏩ Skipping final test section: Final Test Business Continuity Planning
⏩ Skipping feedback section: We hope you enjoyed the course! Please take a moment to share your feedback on Google Review:
✅ Extraction completed for: Business Continuity Planning (BCP)

🏷️  Course: Business Continuity Planning (BCP)
🔗 URL: https://www.smartonlinecourse.co.in/courses/Business-Continuity-Planning-BCP-67bd9e0bc8ad3036f62b528c
📖 Syllabus length: 1864 characters
--------------------------------------------------------------------------------

🔹 [2/63] Processi