In [None]:
import time
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# -------------------- FIXED SYLLABUS EXTRACTION FOR NESTED UL STRUCTURE --------------------
def extract_syllabus_fixed(soup):
    """Collect the curriculum modules from the accordion as title/content dicts.

    Looks up ``div#accordionExamplecurriculum`` and walks its ``div.card``
    children. When the accordion is absent a single placeholder entry is
    returned instead. Cards without a header button, without a body, or
    without a title/content are skipped.
    """
    container = soup.find('div', id='accordionExamplecurriculum')
    if not container:
        return [{"title": "Syllabus not available", "content": "Could not extract syllabus content"}]

    collected = []
    for panel in container.find_all('div', class_='card'):
        heading = panel.find('div', class_='card-header')
        if not heading:
            continue
        toggle = heading.find('button')
        if not toggle:
            continue

        module_title = clean_text(toggle.get_text())
        body = panel.find('div', class_='card-body')
        if not body:
            continue

        # Cards titled "Training Curriculum" go through their own extractor.
        if "Training Curriculum" in module_title:
            module_content = extract_curriculum_fixed(body)
        else:
            module_content = extract_module_fixed(body)

        if module_title and module_content:
            collected.append({"title": module_title, "content": module_content})

    return collected

def extract_module_fixed(card_body):
    """Join the topic text found in every ``<ul>`` below *card_body*.

    Each ``<ul>`` is handed to ``extract_topics_from_main_ul``; lists that
    yield no text are dropped and the rest are joined with newlines.
    """
    chunks = (extract_topics_from_main_ul(ul_tag) for ul_tag in card_body.find_all('ul'))
    return "\n".join(chunk for chunk in chunks if chunk)

def extract_topics_from_main_ul(main_ul):
    """Return topic headings and their bullets from the direct children of *main_ul*.

    Only direct ``<p>`` children carrying the ``text-align-first-with-icon``
    class count as topic headings; each heading is followed by the bullet
    lines gathered from the siblings after it.
    """
    lines = []
    for node in main_ul.children:
        # Non-tag children (text nodes) have no usable tag name and are skipped.
        if getattr(node, 'name', None) != 'p':
            continue
        if 'text-align-first-with-icon' not in node.get('class', []):
            continue
        heading = clean_text(node.get_text())
        if not heading:
            continue
        lines.append(heading)
        lines.extend(extract_all_bullet_points_from_topic(node))
    return "\n".join(lines)

def extract_all_bullet_points_from_topic(topic_element):
    """Gather the bullet lines that follow a topic heading.

    Walks the siblings after *topic_element* and, for every ``<ul>``
    encountered, collects its ``li.text-design.second`` items as
    ``"  • text"`` strings. The walk ends at the next ``<p>`` sibling.
    """
    bullets = []
    node = topic_element.next_sibling
    while node:
        tag_name = getattr(node, 'name', None)
        if tag_name == 'p':
            # The next paragraph ends this topic's run of siblings.
            break
        if tag_name == 'ul':
            for item in node.find_all('li', class_='text-design second'):
                item_text = clean_text(item.get_text())
                if item_text:
                    bullets.append(f"  • {item_text}")
        node = node.next_sibling
    return bullets

def extract_curriculum_fixed(card_body):
    """Extract the text of the "Training Curriculum" card.

    First collects every distinct ``p.text-align-first-with-icon`` paragraph.
    Then finds the first ``<ul>`` with a direct ``<p>`` child containing
    "In this program you will learn" and appends the bullet items of all
    ``<ul>`` elements nested inside it.
    """
    lines = []
    for paragraph in card_body.find_all('p', class_='text-align-first-with-icon'):
        text = clean_text(paragraph.get_text())
        if text and text not in lines:
            lines.append(text)

    for outer_list in card_body.find_all('ul'):
        intro = next(
            (
                node
                for node in outer_list.children
                if getattr(node, 'name', None) == 'p'
                and 'In this program you will learn' in node.get_text()
            ),
            None,
        )
        if intro is None:
            continue

        lines.append("In this program you will learn:")
        for inner_list in outer_list.find_all('ul'):
            for item in inner_list.find_all('li', class_='text-design second'):
                item_text = clean_text(item.get_text())
                if item_text:
                    lines.append(f"• {item_text}")
        # Only the first list holding the intro paragraph is used.
        break

    return "\n".join(lines)

# -------------------- ENHANCED DEBUGGING --------------------
def debug_html_structure_detailed(soup):
    """Print a short structural report for the "Module 1: ASP.NET and MVC" card.

    Always prints the report banner; for each accordion card whose header
    button text contains that module title it prints how many ``<ul>``
    elements the card body holds. Returns nothing.
    """
    print("\n🔍 DETAILED HTML STRUCTURE ANALYSIS:")
    print("=" * 60)

    container = soup.find('div', id='accordionExamplecurriculum')
    if not container:
        return

    for panel in container.find_all('div', class_='card'):
        heading = panel.find('div', class_='card-header')
        toggle = heading.find('button') if heading else None
        if not (toggle and 'Module 1: ASP.NET and MVC' in toggle.get_text()):
            continue

        print("\n📖 ANALYZING: Module 1: ASP.NET and MVC")
        body = panel.find('div', class_='card-body')
        if body:
            print(f"Main UL elements: {len(body.find_all('ul'))}")

# -------------------- FEE STRUCTURE --------------------
def get_fee_structure(price):
    """Build the fee-structure text for a course.

    Args:
        price: Price string scraped from the page, or a placeholder / ``None``
            when no price was found.

    Returns:
        The two standard fee notes, preceded by the price line when a real
        price is available.
    """
    notes = (
        "- All other fees remain unchanged\n"
        "- Education loans are available through leading banks and NBFCs."
    )
    # extract_price() signals a miss with "Price not found"; treat it like the
    # older "Not available" marker so the placeholder is never shown as a price.
    missing_markers = ("Not available", "Price not found")
    if price and price not in missing_markers:
        return f"{price} \n{notes}"
    return notes

# -------------------- CERTIFICATE --------------------
def extract_certificate(soup):
    """Return a certificate image URL for the page.

    Preference order: an ``<img>`` whose ``alt`` mentions "certificate"
    (``src`` first, then ``data-src``), then the first ``<img>`` whose ``src``
    mentions "certificate", and finally a fixed default URL.
    """
    by_alt = soup.find('img', alt=re.compile(r'certificate', re.IGNORECASE))
    if by_alt:
        primary = by_alt.get('src', '')
        if primary:
            return primary
        # An empty or missing src falls back to the data-src attribute.
        return by_alt.get('data-src', 'Certificate image found but URL not available')

    by_src = soup.find_all('img', src=re.compile(r'certificate', re.IGNORECASE))
    if by_src:
        return by_src[0].get('src', 'Certificate image URL found')

    return "https://www.cromacampus.com/public/img/Certificate-new-file.webp"

# -------------------- TEST FUNCTION --------------------
def test_single_course_final(url):
    """Scrape one course page and return its fields as a dict.

    Opens *url* in a headless Chrome session, waits for ``<body>``, parses the
    page source and runs every field extractor. Returns the course record, or
    ``None`` when any step raises. The browser is always closed.
    """
    print("🧪 FINAL SYLLABUS EXTRACTION TEST")
    print("=" * 80)

    browser = get_driver(headless=True)  # headless for batch runs
    try:
        print(f"🌐 Accessing URL: {url}")
        browser.get(url)
        body_present = EC.presence_of_element_located((By.TAG_NAME, "body"))
        WebDriverWait(browser, 20).until(body_present)
        # Fixed pause after <body> appears — presumably to let page scripts finish.
        time.sleep(3)

        soup = BeautifulSoup(browser.page_source, "html.parser")

        name = extract_course_name(soup)
        about = extract_about_course(soup)
        course_duration = extract_duration(soup)
        mode = extract_learning_mode(soup)
        course_price = extract_price(soup)
        certificate_url = extract_certificate(soup)
        syllabus_modules = extract_syllabus_fixed(soup)
        syllabus_text = format_syllabus_for_excel(syllabus_modules)
        fees = get_fee_structure(course_price)

        record = {
            "Course Name": name,
            "About Course": about,
            "Duration": course_duration,
            "Price": course_price,
            "Syllabus": syllabus_text,
            "Certificate": certificate_url,
            "Fee Structure": fees,
            "Learning Mode": mode,
            "Course URL": url,
        }

        print("✅ Extraction completed")
        return record

    except Exception as exc:
        # Batch boundary: report the failure and let the caller move on.
        print(f"❌ TEST FAILED: {str(exc)}")
        return None
    finally:
        browser.quit()



# -------------------- MAIN FUNCTION (EXCEL MULTIPLE URLS) --------------------
def main_test_from_excel(
    input_file=r"C:\Users\taslim.siddiqui\Downloads\Croma campus all course.xlsx",
    output_file=r"C:\Users\taslim.siddiqui\Downloads\multi_course_output_croma campus.xlsx",
):
    """Scrape every course URL listed in an Excel sheet and save the results.

    Args:
        input_file: Workbook containing the course links. The links are read
            from a 'Course URL' column, or from 'Course Link' when that is the
            header actually present.
        output_file: Path of the Excel file the extracted rows are written to.

    Returns:
        None. Progress and errors are reported on stdout.
    """
    try:
        df = pd.read_excel(input_file)
    except Exception as e:
        print(f"❌ Failed to read input Excel file: {e}")
        return

    # Accept either header so the same workbook works with both runners in this notebook.
    url_column = next(
        (name for name in ("Course URL", "Course Link") if name in df.columns),
        None,
    )
    if url_column is None:
        print("❌ Excel file must contain a column named 'Course URL' (or 'Course Link')")
        return

    total = len(df)
    all_results = []
    for position, raw_url in enumerate(df[url_column], start=1):
        url = str(raw_url).strip()
        if not url.startswith(("http://", "https://")):
            # position + 1 is the spreadsheet row, since row 1 holds the header.
            print(f"⚠️ Skipping invalid URL at row {position + 1}: {url}")
            continue

        print(f"\n🚀 Processing {position}/{total}: {url}")
        result = test_single_course_final(url)
        if result:
            all_results.append(result)

    if not all_results:
        print("❌ No results extracted")
        return

    try:
        pd.DataFrame(all_results).to_excel(output_file, index=False)
    except Exception as e:
        print(f"❌ Failed to save final Excel file: {e}")
        return

    print(f"\n✅ All courses extracted successfully!")
    print(f"📁 Output saved to: {output_file}")

# -------------------- HELPERS --------------------
def get_driver(headless=False):
    """Create a Chrome WebDriver with the scraper's standard options.

    Args:
        headless: When true, Chrome is started with ``--headless=new``.

    Returns:
        A ``webdriver.Chrome`` instance whose driver binary is resolved by
        ``ChromeDriverManager``.
    """
    chrome_flags = [
        "--disable-blink-features=AutomationControlled",
        "--window-size=1280,720",
        "--log-level=3",
    ]
    if headless:
        chrome_flags.append("--headless=new")

    options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        options.add_argument(flag)

    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)

def clean_text(text):
    """Collapse whitespace runs to single spaces and trim; non-strings give ""."""
    return re.sub(r'\s+', ' ', text).strip() if isinstance(text, str) else ""

def extract_course_name(soup):
    """Return the course title from the banner ``<h1>``, or else the first ``<h1>``.

    Falls back to the literal "Course name not found" when the page has no
    ``<h1>`` at all.
    """
    banner = soup.find("div", class_="banner-heading")
    heading = banner.find("h1") if banner else None
    if not heading:
        # No banner heading: take the first <h1> anywhere on the page.
        heading = soup.find("h1")
    if heading:
        return clean_text(heading.get_text())
    return "Course name not found"

def extract_about_course(soup):
    """Return the "about this course" bullet points as a dash-prefixed list.

    Collects ``<li style="font-size:13px">`` items found inside any ``<ul>``
    whose cleaned text is longer than 50 characters.

    Returns:
        The points joined with newlines, each prefixed by "- ", or
        "About course not found" when nothing qualifies.
    """
    about_points = []
    # find_all() is recursive, so an <li> inside nested lists is returned once
    # per enclosing <ul>; track element identity to report each item only once.
    seen_items = set()
    for ul in soup.find_all('ul'):
        for li in ul.find_all('li', style="font-size:13px"):
            if id(li) in seen_items:
                continue
            seen_items.add(id(li))
            text = clean_text(li.get_text())
            if len(text) > 50:
                about_points.append(text)

    if about_points:
        return "\n".join(f"- {point}" for point in about_points)
    return "About course not found"

def extract_duration(soup):
    """Return the text of the first ``<p>`` in the first ``div.first-section-duation``.

    Falls back to "Duration not found" when the div or its paragraph is missing.
    """
    container = soup.find('div', class_='first-section-duation')
    paragraphs = container.find_all('p') if container else []
    if paragraphs:
        return clean_text(paragraphs[0].get_text())
    return "Duration not found"

def extract_learning_mode(soup):
    """Return the first ``<p>`` text of the ``first-section-duation border-for-right`` div.

    Falls back to "Learning mode not found" when the div or its paragraph is
    missing.
    """
    container = soup.find('div', class_='first-section-duation border-for-right')
    if not container:
        return "Learning mode not found"
    paragraphs = container.find_all('p')
    if not paragraphs:
        return "Learning mode not found"
    return clean_text(paragraphs[0].get_text())

def extract_price(soup):
    """Return the discounted price shown in ``span.disc-amt``.

    Returns:
        The matched amount (optionally prefixed by "₹"), or "Price not found"
        when the span is missing or contains no digits.
    """
    price_span = soup.find('span', class_='disc-amt')
    if price_span:
        price_text = clean_text(price_span.get_text())
        # Require at least one digit: the previous character class could match
        # a lone "₹", "." or "," (e.g. "₹ 15,000" produced just "₹").
        price_match = re.search(r'₹?\s*\d[\d.,]*', price_text)
        if price_match:
            # Drop the optional leading space and any trailing punctuation.
            return price_match.group().strip().rstrip('.,')
    return "Price not found"

def save_single_course_to_excel(course_data, output_file_path):
    """Write one course record to *output_file_path* as a one-row Excel sheet.

    Returns:
        True when the file was written, False when any exception occurred
        (the error is printed rather than raised).
    """
    try:
        frame = pd.DataFrame([course_data])
        frame.to_excel(output_file_path, index=False)
        print(f"💾 Course data saved to: {output_file_path}")
    except Exception as exc:
        print(f"❌ Error saving to Excel: {exc}")
        return False
    return True

def format_syllabus_for_excel(modules):
    """Render extracted syllabus modules as one text block for an Excel cell.

    Args:
        modules: List of ``{"title": str, "content": str}`` dicts.

    Returns:
        "Syllabus not available" for an empty list or the single placeholder
        entry; otherwise each module as a heading, a "=" rule, its content and
        a "-" rule. Headings keep the number found in the title, and titles
        without one are numbered by their position among the non-curriculum
        modules.
    """
    if not modules or (len(modules) == 1 and "not available" in modules[0]["title"].lower()):
        return "Syllabus not available"

    heavy_rule = "=" * 60
    light_rule = "-" * 60
    sections = []
    module_counter = 0  # counts regular modules only, so numbering starts at 1

    for module in modules:
        title = module['title']
        content = module['content']
        # Strip any "Module N:" prefixes so the heading can be rebuilt uniformly.
        clean_title = re.sub(r'^(Module\s+\d+:\s*)+', '', title)
        clean_title = re.sub(r'Module\s+\d+\s*:\s*', '', clean_title)

        if "Training Curriculum" in title:
            heading = clean_title
        else:
            module_counter += 1
            module_match = re.search(r'Module\s+(\d+)', title)
            # Prefer the number from the title; otherwise use the running count
            # (the old enumerate-based fallback gave "Module 0" when no
            # curriculum card came first).
            number = module_match.group(1) if module_match else module_counter
            heading = f"Module {number}: {clean_title}"

        sections.append(f"{heading}\n{heavy_rule}\n{content}\n\n{light_rule}\n\n")

    return "".join(sections).strip()

# -------------------- ENTRY --------------------
if __name__ == "__main__":
    # Script entry point: run the batch extraction over the input Excel file.
    # NOTE(review): a single-URL runner `main_test()` used to be called here,
    # but no function of that name is defined in the visible code.
    main_test_from_excel()  # run for all links in Excel


Course link: https://www.cromacampus.com/master-program/professional-in-data-analytics-with-powerbi/

In [None]:
import time
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def get_driver(headless=True):
    """Start Chrome with the scraper's options; headless unless told otherwise."""
    arguments = [
        "--disable-blink-features=AutomationControlled",
        "--window-size=1280,720",
        "--log-level=3",
    ]
    if headless:
        arguments.append("--headless=new")

    options = webdriver.ChromeOptions()
    for argument in arguments:
        options.add_argument(argument)

    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=options)

def clean_text(text):
    """Normalize whitespace in *text*; anything that is not a string yields ""."""
    if isinstance(text, str):
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()
    return ""

def extract_course_name(soup):
    """Extract the course name from the page.

    Tries the ``<h3>`` inside ``div.curriculum-heading`` first, then the CSS
    selectors ``div.banner-heading h1``, ``h1`` and ``title`` in that order.

    Returns:
        The first non-empty cleaned text found, or "Course name not found".
    """
    curriculum_heading = soup.find('div', class_='curriculum-heading')
    h3 = curriculum_heading.find('h3') if curriculum_heading else None
    if h3:
        text = clean_text(h3.get_text())
        # An empty <h3> must not win over the fallbacks below; previously it
        # was returned as an empty course name.
        if text:
            return text

    for selector in ("div.banner-heading h1", "h1", "title"):
        element = soup.select_one(selector)
        if element:
            text = clean_text(element.get_text())
            if text:
                return text

    return "Course name not found"

def extract_syllabus_content(soup):
    """Return the syllabus text built from the curriculum accordion.

    Locates the accordion, its cards, and each card's header and body using
    several alternative class names. Every module contributes its title, a
    "=" rule, its content and a blank line. When no accordion is found, or
    no module yields content, the page-wide fallback extraction is used.
    """
    accordion = soup.find('div', id='accordionExamplecurriculum')
    if not accordion:
        accordion = soup.find('div', class_='curriculum-accordian')
    if not accordion:
        accordion = soup.find('div', class_='accordion')
    if not accordion:
        print("❌ No curriculum accordion found")
        return extract_fallback_syllabus(soup)

    print("✅ Found curriculum accordion")

    cards = accordion.find_all('div', class_='card')
    if not cards:
        cards = accordion.find_all('div', class_='accordion-item')
    if not cards:
        # Last resort: treat every direct child div as a card.
        cards = accordion.find_all('div', recursive=False)

    print(f"📚 Found {len(cards)} cards/modules")

    sections = []
    for card in cards:
        header = (
            card.find('div', class_='card-header')
            or card.find('div', class_='accordion-header')
            or card.find(['h2', 'h3', 'h4'])
        )
        if not header:
            continue

        # Prefer the toggle button's text; otherwise use the whole header.
        button = header.find('button')
        title_source = button if button else header
        module_title = clean_text(title_source.get_text())

        if not module_title or 'download curriculum' in module_title.lower():
            continue

        print(f"  Processing module: {module_title}")

        card_body = (
            card.find('div', class_='card-body')
            or card.find('div', class_='accordion-body')
            or card.find('div', class_='collapse')
            or card.find('div', class_='show1')
        )
        if not card_body:
            continue

        module_content = extract_module_content_improved(card_body)
        if module_content:
            sections.extend([f"{module_title}", "=" * 60, module_content, ""])

    if sections:
        return "\n".join(sections)
    return extract_fallback_syllabus(soup)

def extract_module_content_improved(card_body):
    """Return a module's topics and bullets as text.

    Note: this modifies *card_body* — any ``div``/``button`` whose class
    matches "download" or "syllabus" is removed from the tree first.

    Topic paragraphs (direct ``<p>`` children of a ``<ul>`` whose class list
    mentions "first") become "📚 topic" lines followed by their indented
    bullets. If no topic is found, every remaining text line longer than 10
    characters that does not mention download/curriculum/get full is listed.
    """
    noise_pattern = re.compile(r'download|syllabus', re.I)
    for junk in card_body.find_all(['div', 'button'], class_=noise_pattern):
        junk.decompose()

    lines = []
    for outer_list in card_body.find_all('ul'):
        for node in outer_list.children:
            if getattr(node, 'name', None) != 'p':
                continue
            css_classes = node.get('class', [])
            is_topic = (
                any('text-align-first-with-icon' in cls for cls in css_classes)
                or 'first' in str(css_classes)
            )
            if not is_topic:
                continue

            heading = clean_text(node.get_text())
            if not heading or len(heading) <= 3:
                continue

            lines.append(f"📚 {heading}")
            bullets = extract_bullet_points_improved(node)
            lines.extend(f"    • {bullet}" for bullet in bullets)
            if bullets:
                # Blank line separates one topic's bullets from the next topic.
                lines.append("")

    if not lines:
        # No structured topics: fall back to the body's plain text lines.
        blocked_words = ['download', 'curriculum', 'get full']
        raw_text = card_body.get_text(separator='\n', strip=True)
        for piece in raw_text.split('\n'):
            line = piece.strip()
            lowered = line.lower()
            if len(line) > 10 and not any(word in lowered for word in blocked_words):
                lines.append(f"• {line}")

    if lines:
        return "\n".join(lines)
    return "No content available for this module"

def extract_bullet_points_improved(topic_element):
    """Collect the ``<li>`` texts that follow a topic paragraph.

    Walks the siblings after *topic_element*; every ``<ul>`` contributes its
    ``<li>`` texts longer than 3 characters. The walk stops at the next
    ``<p>`` whose class list mentions "first" (the next topic heading).
    """
    collected = []
    sibling = topic_element.next_sibling
    while sibling:
        tag = getattr(sibling, 'name', None)
        if tag == 'ul':
            item_texts = (clean_text(item.get_text()) for item in sibling.find_all('li'))
            collected.extend(text for text in item_texts if len(text) > 3)
        elif tag == 'p':
            classes = sibling.get('class', [])
            if any('text-align-first-with-icon' in cls for cls in classes) or 'first' in str(classes):
                break
        sibling = sibling.next_sibling
    return collected

def extract_fallback_syllabus(soup):
    """Best-effort syllabus extraction used when no accordion is usable.

    Scans every ``div``/``section`` whose class mentions curriculum, syllabus,
    module or course-content, and keeps text lines longer than 20 characters
    that do not mention download/enroll/contact/fee/price.

    Returns:
        The kept lines as "• line" entries joined by newlines, each distinct
        line once, or "Syllabus content not available on page" when none
        qualify.
    """
    print("🔄 Using fallback syllabus extraction")
    section_pattern = re.compile(r'curriculum|syllabus|module|course-content', re.I)
    excluded_words = ('download', 'enroll', 'contact', 'fee', 'price')

    syllabus_parts = []
    # Matching containers are often nested, so the same text is returned by
    # both the outer and the inner element; keep the first occurrence only.
    seen_lines = set()
    for section in soup.find_all(['div', 'section'], class_=section_pattern):
        text_content = section.get_text(separator='\n', strip=True)
        for raw_line in text_content.split('\n'):
            line = raw_line.strip()
            if len(line) <= 20 or line in seen_lines:
                continue
            lowered = line.lower()
            if any(word in lowered for word in excluded_words):
                continue
            seen_lines.add(line)
            syllabus_parts.append(f"• {line}")

    if syllabus_parts:
        return "\n".join(syllabus_parts)
    return "Syllabus content not available on page"

def extract_single_course_data(url):
    """Scrape the course name and syllabus from *url*.

    Opens the page in a visible (non-headless) Chrome session, waits for
    ``<body>``, and returns a dict with "Course Name", "Course Syllabus" and
    "Course URL". Returns ``None`` when the page title looks like a 404 or any
    step raises. The browser is always closed.
    """
    print(f"🌐 Extracting data from: {url}")
    browser = get_driver(headless=False)
    try:
        browser.get(url)
        body_present = EC.presence_of_element_located((By.TAG_NAME, "body"))
        WebDriverWait(browser, 20).until(body_present)
        # Fixed pause after <body> appears — presumably to let page scripts finish.
        time.sleep(5)

        if "404" in browser.title or "not found" in browser.title.lower():
            print("❌ Page not found (404)")
            return None

        soup = BeautifulSoup(browser.page_source, "html.parser")
        name = extract_course_name(soup)
        print(f"✅ Course Name: {name}")
        syllabus = extract_syllabus_content(soup)

        print("✅ Extraction completed successfully!")
        return {
            "Course Name": name,
            "Course Syllabus": syllabus,
            "Course URL": url,
        }
    except Exception as exc:
        # Batch boundary: report the failure and let the caller continue.
        print(f"❌ Extraction failed: {str(exc)}")
        return None
    finally:
        browser.quit()

def save_to_excel(course_data_list, filename="multiple_courses_output.xlsx"):
    """Write the list of course records to *filename*; return True on success.

    Any exception is printed and reported as ``False`` instead of being raised.
    """
    try:
        frame = pd.DataFrame(course_data_list)
        frame.to_excel(filename, index=False)
        print(f"💾 All course data saved to: {filename}")
    except Exception as exc:
        print(f"❌ Error saving to Excel: {exc}")
        return False
    return True

def display_course_info(course_data):
    """Print a course record; the syllabus is cut to 1000 characters plus "..."."""
    rule = "=" * 80

    print("\n" + rule)
    print("📋 EXTRACTED COURSE INFORMATION")
    print(rule)
    print(f"🏷️  COURSE NAME: {course_data['Course Name']}")
    print(f"🔗 URL: {course_data['Course URL']}")
    print("\n" + rule)
    print("📖 COURSE SYLLABUS")
    print(rule)

    syllabus = course_data['Course Syllabus']
    if len(syllabus) > 1000:
        preview = syllabus[:1000] + "..."
    else:
        preview = syllabus
    print(preview)

# -------------------- MAIN EXECUTION --------------------
if __name__ == "__main__":
    # Batch runner: read course links from Excel, scrape each one, save the results.
    print("🚀 STARTING MULTIPLE COURSE EXTRACTION")
    print("="*60)

    # 📘 Input / output Excel file paths
    input_excel = r"C:\Users\taslim.siddiqui\Downloads\Croma campus all course.xlsx"  # needs a 'Course Link' (or 'Course URL') column
    output_file = r"C:\Users\taslim.siddiqui\Downloads\multiple_courses_output_pending.xlsx"

    # Report an unreadable input file instead of ending with a traceback.
    try:
        df = pd.read_excel(input_excel)
    except Exception as e:
        print(f"❌ Failed to read input Excel file: {e}")
        df = None

    if df is not None:
        # The other runner in this notebook reads the same workbook through a
        # 'Course URL' column, so accept either header.
        link_column = next(
            (name for name in ('Course Link', 'Course URL') if name in df.columns),
            None,
        )
        if link_column is None:
            print("❌ 'Course Link' column not found in Excel file!")
        else:
            all_data = []
            for idx, url in enumerate(df[link_column].dropna(), 1):
                url = str(url).strip()
                if not url:
                    continue
                print(f"\n🔹 [{idx}] Processing URL: {url}")
                course_data = extract_single_course_data(url)
                if course_data:
                    all_data.append(course_data)
                    display_course_info(course_data)

            # 💾 Save all extracted data; only claim success when the save worked.
            if not all_data:
                print("❌ No course data extracted; nothing to save")
            elif save_to_excel(all_data, output_file):
                print(f"\n✅ Process completed for {len(all_data)} courses! Check Excel file: {output_file}")



🚀 Processing 1/2: https://www.cromacampus.com/courses/ethical-hacking-training-in-noida

🚀 Processing 2/2: https://www.cromacampus.com/courses/cicd-jenkins-certification-training/

✅ All courses extracted successfully!
📁 Output saved to: C:\Users\taslim.siddiqui\Downloads\test_croma campus.xlsx
