NULEARN CCDC course extraction test for one course

In [2]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup
import time
import os
import re

# -------------------- DRIVER SETUP --------------------
def get_driver(headless=False, version_main=138):
    """Create an undetected-chromedriver Chrome instance for scraping.

    Args:
        headless: When True, add the ``--headless`` Chrome flag so no browser
            window is opened. Defaults to False (visible window), which is
            what this function always did before the parameter existed.
        version_main: Major version of the locally installed Chrome, passed
            straight to ``uc.Chrome``. Defaults to 138. Override it when the
            installed Chrome is a different major version instead of editing
            this function.

    Returns:
        The driver object returned by ``uc.Chrome``. The caller is
        responsible for calling ``quit()`` on it.
    """
    options = uc.ChromeOptions()
    for flag in (
        "--disable-blink-features=AutomationControlled",
        "--window-size=1280,720",
        "--log-level=3",
    ):
        options.add_argument(flag)
    if headless:
        # NOTE(review): headless Chrome may be easier for sites to detect;
        # verify against the target site before enabling it by default.
        options.add_argument("--headless")
    return uc.Chrome(options=options, version_main=version_main)

# -------------------- SYLLABUS EXTRACTION --------------------
def extract_syllabus(soup):
    """Build a plain-text syllabus outline from the parsed course page.

    Every ``<h4 class="que">`` is treated as a module heading. The next
    ``<ul class="syllabus-list">`` after that heading supplies the lessons,
    taken from its ``<li class="syllabus-list-iitem">`` children.

    Returns:
        The outline with surrounding whitespace stripped,
        ``"Syllabus not available"`` when no module heading is found, or a
        ``"Syllabus extraction failed: ..."`` message if anything raises.
    """
    try:
        fragments = []
        for heading in soup.find_all("h4", class_="que"):
            fragments.append(f"\n📘 {heading.get_text(strip=True)}\n")

            lesson_list = heading.find_next("ul", class_="syllabus-list")
            if not lesson_list:
                continue
            fragments.extend(
                f"   - {entry.get_text(strip=True)}\n"
                for entry in lesson_list.find_all("li", class_="syllabus-list-iitem")
            )

        outline = "".join(fragments)
        if not outline:
            return "Syllabus not available"
        return outline.strip()
    except Exception as err:
        # Best-effort: the failure text is returned as the syllabus value.
        return f"Syllabus extraction failed: {str(err)}"

# -------------------- HELPERS --------------------
def extract_who_should_take_it(soup):
    """Return the target-audience bullet points as one '; '-separated string.

    Three lookups are tried in order; the first one that yields list items
    wins:

    1. The first ``<ul>`` following an h2/h3/h4 heading whose text matches a
       "who should take/attend", "who is this" or "who can apply" phrase.
    2. The first ``<ul>`` whose inline ``style`` mentions "Poppins" and whose
       class filter accepts it (anything other than ``course-elig-list``).
    3. The first ``<ul>`` without the ``course-elig-list`` class that holds
       at least three ``<li>`` elements.

    Returns ``"Not available"`` when none of the lookups produce items.
    """

    def joined_items(list_tag):
        # None means "no <li> found"; an empty string is still a valid result.
        texts = [entry.get_text(strip=True) for entry in list_tag.find_all("li")]
        return "; ".join(texts) if texts else None

    def is_audience_heading(tag):
        return tag.name in ["h2", "h3", "h4"] and re.search(
            r"(who\s+should\s+(take|attend)|who\s+is\s+this|who\s+can\s+apply)",
            tag.get_text(strip=True),
            re.I,
        )

    def not_eligibility_class(css_class):
        return css_class != "course-elig-list"

    def mentions_poppins(style_value):
        return style_value and "Poppins" in style_value

    # Strategy 1: list that follows a recognisable heading.
    audience_heading = soup.find(is_audience_heading)
    if audience_heading:
        following_list = audience_heading.find_next("ul")
        if following_list:
            summary = joined_items(following_list)
            if summary is not None:
                return summary

    # Strategy 2: list identified by its inline font styling.
    styled_list = soup.find(
        "ul", class_=not_eligibility_class, attrs={"style": mentions_poppins}
    )
    if styled_list:
        summary = joined_items(styled_list)
        if summary is not None:
            return summary

    # Strategy 3: first sufficiently long non-eligibility list on the page.
    for list_tag in soup.find_all("ul"):
        if "course-elig-list" in (list_tag.get("class") or []):
            continue
        if len(list_tag.find_all("li")) >= 3:
            summary = joined_items(list_tag)
            if summary is not None:
                return summary
            break

    return "Not available"

# -------------------- SCRAPER --------------------
def _extract_duration_and_mode(soup):
    """Return ``(duration, learning_mode)`` read from the banner icon blocks."""
    duration = "Duration not available"
    learning_mode = "Learning mode not available"

    for block in soup.find_all("div", class_="course-icon-block"):
        title_tag = block.find("h4", class_="course-banner-sub-lael")
        value = title_tag.get_text(strip=True) if title_tag else ""

        # The first <div> inside the block is used as the label text.
        label_tag = block.find("div")
        label = label_tag.get_text(strip=True).upper() if label_tag else ""

        if "DURATION" in label:
            duration = value
        elif "SESSION" in label or "CAMPUS" in label or "LEARNING" in value.upper():
            learning_mode = value

    return duration, learning_mode


def _extract_certificate_url(soup):
    """Return the certificate image URL, or a placeholder when it is missing."""
    link = soup.find("a", class_="certificate-image")
    image = link.find("img") if link else None
    # .get() instead of ["src"]: an <img> without a src attribute must not
    # raise KeyError and turn the whole course into an "Error" row.
    source = image.get("src") if image else None
    return source or "Certificate not available"


def _extract_eligibility(soup):
    """Return eligibility text gathered from both eligibility containers."""
    parts = []

    elig_list = soup.find("ul", class_="course-elig-list")
    if elig_list:
        parts.append(elig_list.get_text(" ", strip=True))

    elig_block = soup.find("div", class_="eilg-block-container")
    if elig_block:
        parts.extend(li.get_text(strip=True) for li in elig_block.find_all("li"))

    return " \n ".join(parts) if parts else "Not available"


def scrape_course_data(url):
    """Load one course page in a browser and extract its details.

    Opens ``url`` with a fresh driver from ``get_driver()``, waits up to 15
    seconds for ``h1.course-page-name`` to be present, scrolls three
    viewport heights (one second apart) so lazily rendered content can load,
    then parses the page source with BeautifulSoup. Progress is printed as
    each field is extracted.

    Args:
        url: Address of the course page.

    Returns:
        On success, a 9-tuple of strings in this order: course name, about
        text, syllabus, price, duration, learning mode, certificate image
        URL, who-should-take-it, eligibility. A field that cannot be found
        is replaced by a "not available"-style placeholder.
        On any exception, a list of nine ``"Error"`` strings.

    The browser is always closed before returning.
    """
    driver = get_driver()
    try:
        print(f"🌐 Accessing URL: {url}")
        driver.get(url)

        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "h1.course-page-name"))
        )

        # Scroll down step by step so content rendered on scroll gets loaded.
        for _ in range(3):
            driver.execute_script("window.scrollBy(0, window.innerHeight)")
            time.sleep(1)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # 1. Course Name
        course_name_tag = soup.find("h1", class_="course-page-name")
        course_name = (
            course_name_tag.get_text(strip=True)
            if course_name_tag
            else "Course name not found"
        )
        print(f"📛 Course Name: {course_name}")

        # 2. About Course -- taken from the first <p> element on the page.
        about_section = soup.find("p")
        about_course = (
            about_section.get_text(strip=True)
            if about_section
            else "About course not found"
        )
        print(f"📝 About Course: {about_course[:80]}...")

        # 3. Duration & Learning Mode
        duration, learning_mode = _extract_duration_and_mode(soup)
        print(f"⏳ Duration: {duration}")
        print(f"🎓 Learning Mode: {learning_mode}")

        # 4. Syllabus
        syllabus = extract_syllabus(soup)
        print(f"📚 Syllabus extracted ({len(syllabus.splitlines())} lines)")

        # 5. Price
        price_tag = soup.find("h2", class_="program-fee-amount")
        price = price_tag.get_text(" ", strip=True) if price_tag else "Price not available"
        print(f"💰 Price: {price}")

        # 6. Certificate image
        cert_img = _extract_certificate_url(soup)
        print(f"📜 Certificate: {cert_img}")

        # 7. Who Should Take It
        who_should_take_it = extract_who_should_take_it(soup)
        print(f"🙋 Who Should Take It: {who_should_take_it[:80]}...")

        # 8. Eligibility
        eligibility = _extract_eligibility(soup)

        return (
            course_name,
            about_course,
            syllabus,
            price,
            duration,
            learning_mode,
            cert_img,
            who_should_take_it,
            eligibility,
        )

    except Exception as e:
        # Top-level boundary: report the failure and hand back sentinel values
        # so the caller can skip this course without crashing the whole run.
        print(f"🔥 Scraping failed for {url}: {str(e)}")
        return ["Error"] * 9
    finally:
        driver.quit()
        print("🚪 Browser closed")

# -------------------- SAVE TO EXCEL --------------------
def save_to_excel(data, file_path, url):
    """Append one scraped course to an Excel workbook, skipping duplicates.

    Args:
        data: Sequence of exactly nine field values, in the order of the
            first nine entries of ``columns`` below (course name through
            eligibility).
        file_path: Path of the ``.xlsx`` file. It is created when missing;
            when present, its existing rows are kept and any of the expected
            columns it lacks are added.
        url: Course URL. Stored in the "Course URL" column and used as the
            duplicate key: if it is already present, nothing is written.

    Returns:
        None. Every failure (wrong number of fields, unreadable file, write
        error, ...) is caught and reported on stdout instead of raised.
    """
    columns = [
        "Course Name",
        "About Course",
        "Syllabus",
        "Price",
        "Duration",
        "Learning Mode",
        "Certificate",
        "Who Should Take It",
        "Eligibility",
        "Course URL",
    ]

    try:
        row_values = [*data, url]
        # zip() would silently truncate or misalign a row of the wrong
        # length (e.g. put the URL into the "Syllabus" column), so refuse it.
        if len(row_values) != len(columns):
            raise ValueError(
                f"expected {len(columns) - 1} data fields, got {len(row_values) - 1}"
            )

        if os.path.exists(file_path):
            df = pd.read_excel(file_path)
            for col in columns:
                if col not in df.columns:
                    df[col] = None
        else:
            df = pd.DataFrame(columns=columns)

        # "Course URL" is guaranteed to exist at this point in both branches.
        if url in df["Course URL"].tolist():
            print(f"🔄 Course already exists: {data[0]}")
            return

        new_row = pd.DataFrame([dict(zip(columns, row_values))])
        df = pd.concat([df, new_row], ignore_index=True)

        df.to_excel(file_path, index=False)
        print(f"💾 Saved data for: {data[0]}")

    except Exception as e:
        # Deliberate best-effort: a failed save must not abort the scrape run.
        print(f"❌ Excel save error: {e}")

# -------------------- MAIN --------------------
if __name__ == "__main__":
    # Workbook that collects the scraped rows.
    file_path = r"C:\Users\taslim.siddiqui\Downloads\course00.xlsx"

    # Course pages to scrape in this run.
    course_urls = [
        "https://www.nulearn.in/courses/executive-development-program-in-strategic-hr-analytics"
    ]

    print("🚀 Starting scraping process...")

    for course_url in course_urls:
        print(f"\n🔍 Processing: {course_url}")
        course_data = scrape_course_data(course_url)

        # Any "Error" field marks the scrape as failed; skip saving that course.
        if "Error" in course_data:
            print(f"❌ Failed to scrape complete data for {course_url}")
            continue

        save_to_excel(course_data, file_path, course_url)

    print("\n✅ Process completed")


🚀 Starting scraping process...

🔍 Processing: https://www.nulearn.in/courses/executive-development-program-in-strategic-hr-analytics
🌐 Accessing URL: https://www.nulearn.in/courses/executive-development-program-in-strategic-hr-analytics
📛 Course Name: Executive Development Program in Strategic HR Analytics
📝 About Course: The Strategic HR Analytics Course by IIM Kashipur will specifically focus on app...
⏳ Duration: 6 Months
🎓 Learning Mode: Blended Learning
📚 Syllabus extracted (68 lines)
💰 Price: 1,00,000 Rs. 80,000 + GST* (Limited Time Offer)
📜 Certificate: https://www.nulearn.in/uploads/images/SHRA-CERTIFICATE.jpg
🙋 Who Should Take It: This program will be useful for managers in multinationals, Indian business firm...
🚪 Browser closed
💾 Saved data for: Executive Development Program in Strategic HR Analytics

✅ Process completed
