# MKCL course scraper: single course URL

In [1]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
import os

# -------------------- DRIVER SETUP --------------------
def get_driver():
    """Create and return an undetected-chromedriver Chrome instance.

    The browser is launched with the Blink ``AutomationControlled`` feature
    disabled, a 1280x720 window and ``--log-level=3``. Headless mode is left
    switched off.
    """
    chrome_options = uc.ChromeOptions()
    for flag in (
        "--disable-blink-features=AutomationControlled",
        "--window-size=1280,720",
        "--log-level=3",
        # "--headless=new",  # Uncomment for silent scraping
    ):
        chrome_options.add_argument(flag)
    return uc.Chrome(options=chrome_options)

# -------------------- CLEAN TEXT --------------------
def clean_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is dropped. Falsy input (``None`` or an
    empty string) is returned unchanged.
    """
    if not text:
        return text
    words = text.split()
    return " ".join(words)

# -------------------- SCRAPER --------------------
def scrape_course_data(url):
    """Scrape a single MKCL KLiC course page into a flat list of fields.

    Opens ``url`` in a fresh Chrome driver, scrolls to the bottom and back to
    the top, then parses the rendered page source with BeautifulSoup. The
    driver is always closed before returning.

    Args:
        url: Course page URL to load.

    Returns:
        A 12-item list in this order: course name, about course, syllabus,
        duration, level, language, learning mode, price, certificate,
        eligibility, fee structure, url. A field that cannot be located is
        the string "Not Found". If any step raises, the traceback is printed
        and a list of twelve "Error" strings is returned instead.
    """
    driver = get_driver()
    try:
        print(f"🌐 Accessing URL: {url}")
        driver.get(url)
        WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        time.sleep(3)

        # Scroll down and back up before reading the page source
        # (presumably to trigger lazily loaded content).
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        driver.execute_script("window.scrollTo(0, 0);")
        time.sleep(1)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Course Name: selectors are tried in order; the first match wins.
        course_name = "Not Found"
        for selector in ["div.page-header h1", "h1.course-title", "h1.title", "h1", "title"]:
            element = soup.select_one(selector)
            if element:
                course_name = clean_text(element.get_text(strip=True))
                break
        print(f"📛 Course Name: {course_name}")

        # About Course (FULL DIV with bullet formatting)
        about_course = "Not Found"
        about_div = soup.find("div", class_="ci-text")
        if about_div:
            parts = []
            for tag in about_div.find_all(["p", "li"]):
                text = clean_text(tag.get_text(" ", strip=True))
                if tag.name == "li":
                    parts.append(f"- {text}")
                else:
                    parts.append(text)
            about_course = "\n".join(parts)
        print(f"📝 About Course: {about_course[:100]}...")

        # ---------- Syllabus (accordion) ----------
        syllabus = "Not Found"
        # Prefer the accordion that also carries the syllabus class. The
        # earlier lookup passed both class_="accordion" and
        # attrs={"class": "course-sc-syllabus"}; the keyword filter overrides
        # the attrs entry, so only "accordion" was ever matched. The plain
        # lookup is kept as a fallback so pages that worked before still do.
        accordion = soup.select_one("div.accordion.course-sc-syllabus")
        if accordion is None:
            accordion = soup.find("div", class_="accordion")
        if accordion is not None:
            modules = []
            for card in accordion.find_all("div", class_="card"):
                # module title
                header = card.find("button")
                module_title = header.get_text(strip=True) if header else "Untitled Module"

                # module content: list items first, then paragraphs
                content_div = card.find("div", class_="course-sc-syllabus-content")
                lessons = []
                if content_div:
                    for li in content_div.find_all("li"):
                        lessons.append("- " + li.get_text(strip=True))
                    for p in content_div.find_all("p"):
                        lessons.append("- " + p.get_text(" ", strip=True))

                if lessons:
                    modules.append(f"{module_title}\n" + "\n".join(lessons))
                else:
                    modules.append(module_title)

            if modules:
                syllabus = "\n\n".join(modules)

        print(f"📚 Syllabus extracted ({len(syllabus.splitlines())} lines)")

        # -------------------- Helper to extract normalized info --------------------
        def extract_field(keyword):
            """Return the cleaned parent text of the first text node containing ``keyword``.

            The match is case-insensitive. Label words (duration, level,
            language, mode, learning), colons and the "layers-outline" icon
            text are stripped from the result.
            """
            # NOTE(review): the first text node that mentions the keyword wins,
            # so a keyword that also appears in body copy (e.g. "certificate"
            # inside the course description) returns that paragraph instead of
            # the labelled value. Confirm against the page markup.
            elems = soup.find_all(string=lambda t: t and keyword in t.lower())
            for elem in elems:
                parent = elem.parent
                if parent:
                    raw = clean_text(parent.get_text(" ", strip=True))
                    value = raw
                    for k in ["duration", "level", "language", "mode", "learning"]:
                        value = value.replace(k, "", 1)
                        value = value.replace(k.capitalize(), "", 1)
                    value = value.replace(":", "").replace("layers-outline", "").strip()
                    return value
            return "Not Found"

        duration = extract_field("duration")
        level = extract_field("level")
        language = extract_field("language")
        # Look up "mode" once and only fall back to "learning" when it is missing.
        learning_mode = extract_field("mode")
        if learning_mode == "Not Found":
            learning_mode = extract_field("learning")

        print(f"⏳ Duration: {duration}")
        print(f"📊 Course Level: {level}")
        print(f"🗣️ Language: {language}")
        print(f"🎓 Learning Mode: {learning_mode}")

        # Price: first <td> whose inline style contains "text-align:center".
        price = "Not Found"
        try:
            price_td = soup.find("td", style=lambda s: s and "text-align:center" in s)
            if price_td:
                price = clean_text(price_td.get_text(" ", strip=True))
        except Exception as e:
            # Best effort: keep "Not Found" but report the problem instead of hiding it.
            print(f"⚠️ Price extraction issue: {e}")
        print(f"💰 Price: {price}")

        # Certificate
        certificate = extract_field("certificate")
        print(f"🏆 Certificate: {certificate}")

        # -------------------- Eligibility (Combine both sources) --------------------
        eligibility_parts = []
        try:
            # Source 1: the first .cs-textbox whose title mentions "eligibility".
            for box in soup.find_all("div", class_="cs-textbox"):
                title_div = box.find("div", class_="cs-textbox-title")
                info_div = box.find("div", class_="cs-textbox-info")
                if title_div and "eligibility" in title_div.get_text(strip=True).lower() and info_div:
                    list_items = info_div.find_all("li")
                    if list_items:
                        eligibility_parts.append(
                            "\n".join([f"- {li.get_text(strip=True)}" for li in list_items])
                        )
                    else:
                        eligibility_parts.append(clean_text(info_div.get_text(" ", strip=True)))
                    break

            # Source 2: the first <ul> inside the first .course-section-content.
            section = soup.find("div", class_="course-section-content")
            if section:
                ul = section.find("ul")
                if ul:
                    lis = ul.find_all("li")
                    if lis:
                        eligibility_parts.append(
                            "\n".join([f"- {li.get_text(strip=True)}" for li in lis])
                        )
                    else:
                        eligibility_parts.append(clean_text(section.get_text(" ", strip=True)))

        except Exception as e:
            print(f"⚠️ Eligibility extraction issue: {e}")

        eligibility = "\n".join([part for part in eligibility_parts if part]) or "Not Found"
        print(f"👥 Eligibility:\n{eligibility}")

        # -------------------- Fee Structure --------------------
        # Fixed boilerplate, prefixed with the scraped price when one was found.
        fee_notes = (
            "- All other fees remain unchanged\n"
            "- Education loans are available through leading banks and NBFCs."
        )
        if price and price != "Not Found":
            fee_structure = f"{price}\n{fee_notes}"
        else:
            fee_structure = fee_notes
        print(f"🏦 Fee Structure:\n{fee_structure}")

        return [
            course_name, about_course, syllabus, duration, level, language,
            learning_mode, price, certificate, eligibility, fee_structure, url
        ]

    except Exception as e:
        print(f"🔥 Scraping failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return ["Error"] * 12
    finally:
        driver.quit()
        print("🚪 Browser closed")

# -------------------- SAVE TO EXCEL --------------------
def save_to_excel(data, file_path):
    """Append one scraped course row to the Excel workbook at ``file_path``.

    The workbook is created when it does not exist; when it does, any of the
    expected columns it lacks are added. A row whose URL (the last item of
    ``data``) is already present in "Course URL" is not written again. Any
    exception is reported with a printed message rather than raised.

    Args:
        data: Field values in the same order as the column list below.
        file_path: Path of the workbook to create or extend.
    """
    columns = [
        "Course Name", "About Course", "Syllabus", "Duration", "Course Level",
        "Language", "Learning Mode", "Price", "Certificate", "Eligibility",
        "Fee Structure", "Course URL"
    ]
    try:
        if not os.path.exists(file_path):
            df = pd.DataFrame(columns=columns)
        else:
            df = pd.read_excel(file_path)
            absent = [name for name in columns if name not in df.columns]
            for name in absent:
                df[name] = None

        course_url = data[-1]
        if course_url in df["Course URL"].values:
            print(f"🔄 Course already exists: {data[0]}")
            return

        record = dict(zip(columns, data))
        df = pd.concat([df, pd.DataFrame([record])], ignore_index=True)
        df.to_excel(file_path, index=False)
        print(f"💾 Saved data for: {data[0]}")

    except Exception as e:
        print(f"❌ Excel save error: {e}")

# -------------------- MAIN EXECUTION --------------------
if __name__ == "__main__":
    # Single-course run: scrape every listed URL and append it to the workbook.
    file_path = r"C:\Users\taslim.siddiqui\Downloads\MKCL.xlsx"
    course_urls = ["https://klic.mkcl.org/klic-courses/c-sharp"]
    print("🚀 Starting scraping process...")

    for course_url in course_urls:
        print(f"\n📖 Processing: {course_url}")
        course_data = scrape_course_data(course_url)
        # A failed scrape comes back as a list of "Error" markers; skip saving it.
        if "Error" in course_data:
            print(f"❌ Failed to scrape complete data for {course_url}")
            continue
        save_to_excel(course_data, file_path)

    print("✅ Process completed")


🚀 Starting scraping process...

📖 Processing: https://klic.mkcl.org/klic-courses/c-sharp
🌐 Accessing URL: https://klic.mkcl.org/klic-courses/c-sharp
📛 Course Name: C Sharp (C#)
📝 About Course: KLiC Certificate in C# course is designed to provide a comprehensive understanding of C# programming...
📚 Syllabus extracted (190 lines)
⏳ Duration: 120 hours
📊 Course Level: Foundation
🗣️ Language: English
🎓 Learning Mode: Learn at ALC or Learn at Home
💰 Price: 6000/-
🏆 Certificate: KLiC Certificate in C# course is designed to provide a comprehensive understanding of C# programming. Covering fundamental to advanced topics, this course includes data types, control structures, object-oriented programming concepts, and the use of C# in web and mobile applications. With a blend of theoretical knowledge and practical skills, students will learn to construct, debug, and optimize C# programs, preparing them for real-world development challenges.
👥 Eligibility:
- Learner should preferably a std. 10th Pa

# MKCL course scraper: course URLs read from an input Excel file

In [2]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
import os

# -------------------- DRIVER SETUP --------------------
def get_driver():
    """Build the Chrome driver used for scraping.

    Uses undetected-chromedriver with the Blink ``AutomationControlled``
    feature disabled, a 1280x720 window and ``--log-level=3``. Headless mode
    is left switched off.
    """
    launch_flags = [
        "--disable-blink-features=AutomationControlled",
        "--window-size=1280,720",
        "--log-level=3",
        # "--headless=new",  # Uncomment for silent scraping
    ]
    chrome_options = uc.ChromeOptions()
    for flag in launch_flags:
        chrome_options.add_argument(flag)
    return uc.Chrome(options=chrome_options)

# -------------------- CLEAN TEXT --------------------
def clean_text(text):
    """Normalize whitespace: split on any whitespace and rejoin with single spaces.

    ``None`` and the empty string are passed through untouched.
    """
    if text:
        return " ".join(text.split())
    return text

# -------------------- SCRAPER --------------------
def scrape_course_data(url):
    """Scrape a single MKCL KLiC course page into a flat list of fields.

    Opens ``url`` in a fresh Chrome driver, scrolls to the bottom and back to
    the top, then parses the rendered page source with BeautifulSoup. The
    driver is always closed before returning.

    Args:
        url: Course page URL to load.

    Returns:
        A 12-item list in this order: course name, about course, syllabus,
        duration, level, language, learning mode, price, certificate,
        eligibility, url, fee structure. A field that cannot be located is
        the string "Not Found". If any step raises, the traceback is printed
        and a list of twelve "Error" strings is returned instead, so both
        outcomes have the same length.
    """
    driver = get_driver()
    try:
        print(f"🌐 Accessing URL: {url}")
        driver.get(url)
        WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        time.sleep(3)

        # Scroll down and back up before reading the page source
        # (presumably to trigger lazily loaded content).
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        driver.execute_script("window.scrollTo(0, 0);")
        time.sleep(1)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Course Name: selectors are tried in order; the first match wins.
        course_name = "Not Found"
        for selector in ["div.page-header h1", "h1.course-title", "h1.title", "h1", "title"]:
            element = soup.select_one(selector)
            if element:
                course_name = clean_text(element.get_text(strip=True))
                break
        print(f"📛 Course Name: {course_name}")

        # About Course (FULL DIV with bullet formatting)
        about_course = "Not Found"
        about_div = soup.find("div", class_="ci-text")
        if about_div:
            parts = []
            for tag in about_div.find_all(["p", "li"]):
                text = clean_text(tag.get_text(" ", strip=True))
                if tag.name == "li":
                    parts.append(f"- {text}")
                else:
                    parts.append(text)
            about_course = "\n".join(parts)
        print(f"📝 About Course: {about_course[:100]}...")

        # ---------- Syllabus (accordion) ----------
        syllabus = "Not Found"
        # Prefer the accordion that also carries the syllabus class. The
        # earlier lookup passed both class_="accordion" and
        # attrs={"class": "course-sc-syllabus"}; the keyword filter overrides
        # the attrs entry, so only "accordion" was ever matched. The plain
        # lookup is kept as a fallback so pages that worked before still do.
        accordion = soup.select_one("div.accordion.course-sc-syllabus")
        if accordion is None:
            accordion = soup.find("div", class_="accordion")
        if accordion is not None:
            modules = []
            for card in accordion.find_all("div", class_="card"):
                # module title
                header = card.find("button")
                module_title = header.get_text(strip=True) if header else "Untitled Module"

                # module content: list items first, then paragraphs
                content_div = card.find("div", class_="course-sc-syllabus-content")
                lessons = []
                if content_div:
                    for li in content_div.find_all("li"):
                        lessons.append("- " + li.get_text(strip=True))
                    for p in content_div.find_all("p"):
                        lessons.append("- " + p.get_text(" ", strip=True))

                # join module
                if lessons:
                    modules.append(f"{module_title}\n" + "\n".join(lessons))
                else:
                    modules.append(module_title)

            if modules:
                syllabus = "\n\n".join(modules)

        print(f"📚 Syllabus extracted ({len(syllabus.splitlines())} lines)")

        # -------------------- Helper to extract normalized info --------------------
        def extract_field(keyword):
            """Return the cleaned parent text of the first text node containing ``keyword``.

            The match is case-insensitive. Label words (duration, level,
            language, mode, learning), colons and the "layers-outline" icon
            text are stripped from the result.
            """
            # NOTE(review): the first text node that mentions the keyword wins,
            # so a keyword that also appears in body copy (e.g. "certificate"
            # inside the course description) returns that paragraph instead of
            # the labelled value. Confirm against the page markup.
            elems = soup.find_all(string=lambda t: t and keyword in t.lower())
            for elem in elems:
                parent = elem.parent
                if parent:
                    raw = clean_text(parent.get_text(" ", strip=True))
                    # remove keyword and extra symbols
                    value = raw
                    for k in ["duration", "level", "language", "mode", "learning"]:
                        value = value.replace(k, "", 1)
                        value = value.replace(k.capitalize(), "", 1)
                    value = value.replace(":", "").replace("layers-outline", "").strip()
                    return value
            return "Not Found"

        duration = extract_field("duration")
        level = extract_field("level")
        language = extract_field("language")
        # Look up "mode" once and only fall back to "learning" when it is missing.
        learning_mode = extract_field("mode")
        if learning_mode == "Not Found":
            learning_mode = extract_field("learning")

        print(f"⏳ Duration: {duration}")
        print(f"📊 Course Level: {level}")
        print(f"🗣️ Language: {language}")
        print(f"🎓 Learning Mode: {learning_mode}")

        # Price: first <td> whose inline style contains "text-align:center".
        price = "Not Found"
        try:
            price_td = soup.find("td", style=lambda s: s and "text-align:center" in s)
            if price_td:
                price = clean_text(price_td.get_text(" ", strip=True))
        except Exception as e:
            # Best effort: keep "Not Found" but report the problem instead of hiding it.
            print(f"⚠️ Price extraction issue: {e}")
        print(f"💰 Price: {price}")

        # Certificate
        certificate = extract_field("certificate")
        print(f"🏆 Certificate: {certificate}")

        # -------------------- Eligibility (Combine both sources) --------------------
        eligibility_parts = []
        try:
            # --- From .cs-textbox: first box whose title mentions "eligibility" ---
            for box in soup.find_all("div", class_="cs-textbox"):
                title_div = box.find("div", class_="cs-textbox-title")
                info_div = box.find("div", class_="cs-textbox-info")
                if title_div and "eligibility" in title_div.get_text(strip=True).lower() and info_div:
                    list_items = info_div.find_all("li")
                    if list_items:
                        eligibility_parts.append(
                            "\n".join([f"- {li.get_text(strip=True)}" for li in list_items])
                        )
                    else:
                        eligibility_parts.append(clean_text(info_div.get_text(" ", strip=True)))
                    break

            # --- From .course-section-content: first <ul> in the first section ---
            section = soup.find("div", class_="course-section-content")
            if section:
                ul = section.find("ul")
                if ul:
                    lis = ul.find_all("li")
                    if lis:
                        eligibility_parts.append(
                            "\n".join([f"- {li.get_text(strip=True)}" for li in lis])
                        )
                    else:
                        eligibility_parts.append(clean_text(section.get_text(" ", strip=True)))

        except Exception as e:
            print(f"⚠️ Eligibility extraction issue: {e}")

        # Merge all parts
        eligibility = "\n".join([part for part in eligibility_parts if part]) or "Not Found"
        print(f"👥 Eligibility:\n{eligibility}")

        # -------------------- Fee Structure --------------------
        # Fixed boilerplate, prefixed with the scraped price when one was found.
        fee_notes = (
            "- All other fees remain unchanged\n"
            "- Education loans are available through leading banks and NBFCs."
        )
        if price and price != "Not Found":
            fee_structure = f"{price}\n{fee_notes}"
        else:
            fee_structure = fee_notes
        print(f"🏦 Fee Structure:\n{fee_structure}")

        # Order matters: the URL comes before the fee structure in this cell.
        return [
            course_name, about_course, syllabus, duration, level, language,
            learning_mode, price, certificate, eligibility, url, fee_structure
        ]

    except Exception as e:
        print(f"🔥 Scraping failed: {str(e)}")
        import traceback
        traceback.print_exc()
        # Same length as the success result (12 fields).
        return ["Error"] * 12
    finally:
        driver.quit()
        print("🚪 Browser closed")

# -------------------- SAVE TO EXCEL --------------------
def save_to_excel(data, file_path):
    """Append one scraped course row to the Excel workbook at ``file_path``.

    The workbook is created when it does not exist; when it does, any of the
    expected columns it lacks are added. A row whose URL is already present
    in the "Course URL" column is not written again. Any exception is
    reported with a printed message rather than raised.

    Args:
        data: Field values in the same order as the column list below.
        file_path: Path of the workbook to create or extend.
    """
    columns = [
        "Course Name", "About Course", "Syllabus", "Duration", "Course Level",
        "Language", "Learning Mode", "Price", "Certificate", "Eligibility", "Course URL","Fee Structure"
    ]
    try:
        if os.path.exists(file_path):
            df = pd.read_excel(file_path)
            for col in columns:
                if col not in df.columns:
                    df[col] = None
        else:
            df = pd.DataFrame(columns=columns)

        # Take the URL from its column position. In this layout the last item
        # is "Fee Structure", so data[-1] compared the fee text against the
        # URL column and duplicates were never detected.
        course_url = data[columns.index("Course URL")]
        if course_url in df["Course URL"].values:
            print(f"🔄 Course already exists: {data[0]}")
            return

        new_row = pd.DataFrame([dict(zip(columns, data))])
        df = pd.concat([df, new_row], ignore_index=True)
        df.to_excel(file_path, index=False)
        print(f"💾 Saved data for: {data[0]}")

    except Exception as e:
        print(f"❌ Excel save error: {e}")

# -------------------- READ URLS FROM EXCEL --------------------
def read_urls_from_excel(file_path, sheet_name=0, column_name="Course URL"):
    """Read the course URLs listed in one column of an Excel sheet.

    Args:
        file_path: Path of the workbook to read.
        sheet_name: Sheet to load, passed straight to ``pd.read_excel``.
        column_name: Header of the column holding the URLs.

    Returns:
        The non-null values of ``column_name`` as a list. An empty list is
        returned when the column is missing or the file cannot be read; in
        both cases a message is printed.
    """
    try:
        sheet = pd.read_excel(file_path, sheet_name=sheet_name)
        if column_name in sheet.columns:
            urls = sheet[column_name].dropna().tolist()
            print(f"📖 Found {len(urls)} URLs in the Excel file")
            return urls

        print(f"❌ Column '{column_name}' not found in the Excel file.")
        return []
    except Exception as e:
        print(f"❌ Error reading Excel file: {e}")
        return []

# -------------------- MAIN EXECUTION --------------------
if __name__ == "__main__":
    # Input workbook with the URLs to scrape (update this path) and the
    # workbook the scraped rows are appended to.
    input_file_path = r"C:\Users\taslim.siddiqui\Downloads\MCKL course all input ccdc.xlsx"
    output_file_path = r"C:\Users\taslim.siddiqui\Downloads\MKCL_all course link_output2new.xlsx"

    course_urls = read_urls_from_excel(input_file_path)

    if course_urls:
        print("🚀 Starting scraping process...")

        for course_url in course_urls:
            print(f"\n📖 Processing: {course_url}")
            course_data = scrape_course_data(course_url)
            # A failed scrape comes back as a list of "Error" markers; skip saving it.
            if "Error" in course_data:
                print(f"❌ Failed to scrape complete data for {course_url}")
                continue
            save_to_excel(course_data, output_file_path)
    else:
        print("❌ No URLs found. Please check your input file.")

    print("✅ Process completed")

📖 Found 244 URLs in the Excel file
🚀 Starting scraping process...

📖 Processing: https://klic.mkcl.org/klic-courses/advanced-excel
🌐 Accessing URL: https://klic.mkcl.org/klic-courses/advanced-excel
📛 Course Name: Advanced Excel
📝 About Course: Excel skills are as important as the subject knowledge. Those who know Excel can find a better payin...
📚 Syllabus extracted (102 lines)
⏳ Duration: 120 hours
📊 Course Level: Advanced
🗣️ Language: English, Marathi, Hindi
🎓 Learning Mode: Learn at ALC or Learn at Home
💰 Price: 6000/-
🏆 Certificate: Certificate of Completion
👥 Eligibility:
- Learner should preferably a std. 10th Pass student (Not Compulsory)
- It is desirable that Learner should have done MS-CIT Course (Not Compulsory)
- Accounting and Finance Students – To gain practical skills in managing data, preparing reports, and performing financial analysis.
- Business and Management Students – Essential for understanding data-driven decision-making, budgeting, and inventory management.
- D

KeyboardInterrupt: 

# Megasoft course scraper: single course URL

In [9]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
import os

# -------------------- DRIVER SETUP --------------------
def get_driver():
    """Create and return an undetected-chromedriver Chrome instance.

    The browser is launched with the Blink ``AutomationControlled`` feature
    disabled, a 1280x720 window and ``--log-level=3``. Headless mode is left
    switched off.
    """
    chrome_options = uc.ChromeOptions()
    for flag in (
        "--disable-blink-features=AutomationControlled",
        "--window-size=1280,720",
        "--log-level=3",
        # "--headless=new",  # Uncomment for silent scraping
    ):
        chrome_options.add_argument(flag)
    return uc.Chrome(options=chrome_options)

# -------------------- CLEAN TEXT --------------------
def clean_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is dropped. Falsy input (``None`` or an
    empty string) is returned unchanged.
    """
    if not text:
        return text
    tokens = text.split()
    return " ".join(tokens)

# -------------------- EXTRACT COURSE NAME --------------------
def extract_course_name(soup):
    """Return the course title found in ``soup``, or "Not Found".

    CSS selectors are tried from most to least specific; the text of the
    first element that matches is returned after whitespace cleanup.
    """
    candidate_selectors = (
        "div.edublink-course-title h1.entry-title",
        "h1.course-title",
        "h1.title",
        "h1",
        "title",
    )

    # Lazy generator: selectors after the first hit are never evaluated.
    hits = (soup.select_one(css) for css in candidate_selectors)
    heading = next((node for node in hits if node), None)
    if heading is None:
        return "Not Found"
    return clean_text(heading.get_text(strip=True))

# -------------------- EXTRACT ABOUT COURSE --------------------
def extract_about_course(soup):
    """Return the text of the "Course Description" section, or "Not Found".

    The section is located from its ``<h2>`` heading: the heading's enclosing
    ``div.elementor-element`` is found, then the next sibling
    ``div.elementor-element``, then that sibling's
    ``div.elementor-widget-container``. If the container has a ``<ul>``, each
    ``<li>`` becomes a "- " bullet line; otherwise the container's whole text
    is used.
    """
    heading = soup.find("h2", string=lambda text: text and "Course Description" in text)
    if not heading:
        return "Not Found"

    heading_block = heading.find_parent("div", class_="elementor-element")
    if not heading_block:
        return "Not Found"

    body_block = heading_block.find_next_sibling("div", class_="elementor-element")
    if not body_block:
        return "Not Found"

    widget = body_block.find("div", class_="elementor-widget-container")
    if not widget:
        return "Not Found"

    if widget.find("ul"):
        lines = [f"- {clean_text(item.get_text())}" for item in widget.find_all("li")]
    else:
        lines = [clean_text(widget.get_text())]

    if not lines:
        return "Not Found"
    return "\n".join(lines)

# -------------------- EXTRACT WHO_SHOULD_TAKE --------------------
def extract_who_should_take(soup):
    """Return the text of the "Who Can Take This Course" section, or "Not Found".

    The section is located from its ``<h2>`` heading: the heading's enclosing
    ``div.elementor-element`` is found, then the next sibling
    ``div.elementor-element``, then that sibling's
    ``div.elementor-widget-container``. If the container has a ``<ul>``, each
    ``<li>`` becomes a "- " bullet line; otherwise the container's whole text
    is used.
    """
    heading = soup.find("h2", string=lambda text: text and "Who Can Take This Course" in text)
    if not heading:
        return "Not Found"

    heading_block = heading.find_parent("div", class_="elementor-element")
    if not heading_block:
        return "Not Found"

    body_block = heading_block.find_next_sibling("div", class_="elementor-element")
    if not body_block:
        return "Not Found"

    widget = body_block.find("div", class_="elementor-widget-container")
    if not widget:
        return "Not Found"

    if widget.find("ul"):
        lines = [f"- {clean_text(item.get_text())}" for item in widget.find_all("li")]
    else:
        lines = [clean_text(widget.get_text())]

    if not lines:
        return "Not Found"
    return "\n".join(lines)

# -------------------- EXTRACT SYLLABUS --------------------
def extract_syllabus(soup):
    """Return the "Course Curriculum" modules as text, or "Not Found".

    Starting from the ``div.elementor-element`` that encloses the curriculum
    ``<h2>``, every following sibling ``div.elementor-element`` is inspected.
    A sibling counts as a module when it has an ``<h3>``/``<h4>`` title and a
    text-editor widget with a widget container. Each module is rendered as
    its title followed by its content ("- " bullets when the content has a
    ``<ul>``, plain text otherwise); modules are separated by a blank line.
    """
    heading = soup.find("h2", string=lambda text: text and "Course Curriculum" in text)
    container = heading.find_parent("div", class_="elementor-element") if heading else None
    if not container:
        return "Not Found"

    modules = []
    # All later siblings are walked, not just the ones directly after the heading.
    for sibling in container.find_next_siblings("div", class_="elementor-element"):
        title_tag = sibling.find(["h3", "h4"])
        if not title_tag:
            continue

        editor = sibling.find("div", class_="elementor-widget-text-editor")
        if not editor:
            continue

        body = editor.find("div", class_="elementor-widget-container")
        if not body:
            continue

        title = clean_text(title_tag.get_text())
        if body.find("ul"):
            content = "\n".join(
                f"- {clean_text(item.get_text())}" for item in body.find_all("li")
            )
        else:
            content = clean_text(body.get_text())

        modules.append(f"{title}\n{content}")

    if not modules:
        return "Not Found"
    return "\n\n".join(modules)

# -------------------- EXTRACT FACULTY --------------------
def extract_faculty(soup):
    """Return the instructor entries from the course header, or "Not Found".

    Every ``li.instructor`` under ``ul.eb-course-header-meta-items`` gives
    one line of whitespace-cleaned text.
    """
    instructors = [
        clean_text(item.get_text())
        for item in soup.select("ul.eb-course-header-meta-items li.instructor")
    ]
    if not instructors:
        return "Not Found"
    return "\n".join(instructors)

# -------------------- SCRAPER --------------------
def scrape_course_data(url):
    """Scrape one Megasoft course page and return its fields as a list.

    Opens ``url`` in a fresh Chrome driver, scrolls to the bottom and back to
    the top, parses the page source with BeautifulSoup and runs the
    ``extract_*`` helpers on it. The driver is always closed before returning.

    Args:
        url: Course page URL to load.

    Returns:
        ``[course_name, about_course, who_should_take, syllabus, faculty,
        url]``. If any step raises, the traceback is printed and a list of
        six "Error" strings is returned instead.
    """
    driver = get_driver()
    try:
        print(f"🌐 Accessing URL: {url}")
        driver.get(url)
        body_present = EC.presence_of_element_located((By.TAG_NAME, "body"))
        WebDriverWait(driver, 15).until(body_present)
        time.sleep(3)

        # Scroll to load all content: bottom first, then back to the top,
        # pausing after each step.
        scroll_steps = (
            ("window.scrollTo(0, document.body.scrollHeight);", 2),
            ("window.scrollTo(0, 0);", 1),
        )
        for script, pause_seconds in scroll_steps:
            driver.execute_script(script)
            time.sleep(pause_seconds)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Extract only the required course information, logging each field.
        course_name = extract_course_name(soup)
        print(f"📛 Course Name: {course_name}")

        about_course = extract_about_course(soup)
        print(f"📝 About Course: {about_course[:100]}...")

        who_should_take = extract_who_should_take(soup)
        print(f"👥 Who Should Take: {who_should_take[:100]}...")

        syllabus = extract_syllabus(soup)
        syllabus_line_count = len(syllabus.splitlines())
        print(f"📚 Syllabus extracted ({syllabus_line_count} lines)")

        faculty = extract_faculty(soup)
        print(f"👨‍🏫 Faculty: {faculty}")

        return [course_name, about_course, who_should_take, syllabus, faculty, url]

    except Exception as e:
        print(f"🔥 Scraping failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return ["Error"] * 6
    finally:
        driver.quit()
        print("🚪 Browser closed")

# -------------------- SAVE TO EXCEL --------------------
def save_to_excel(data, file_path):
    """Append one scraped course row to the Excel workbook at ``file_path``.

    The workbook is created when it does not exist; when it does, any of the
    expected columns it lacks are added. A row whose URL (the last item of
    ``data``) is already present in "Course URL" is not written again. Any
    exception is reported with a printed message rather than raised.

    Args:
        data: Field values in the same order as the column list below.
        file_path: Path of the workbook to create or extend.
    """
    columns = [
        "Course Name", "About Course", "Who Should Take", "Syllabus", "Faculty", "Course URL"
    ]
    try:
        if not os.path.exists(file_path):
            df = pd.DataFrame(columns=columns)
        else:
            df = pd.read_excel(file_path)
            absent = [name for name in columns if name not in df.columns]
            for name in absent:
                df[name] = None

        course_url = data[-1]
        if course_url in df["Course URL"].values:
            print(f"🔄 Course already exists: {data[0]}")
            return

        record = dict(zip(columns, data))
        df = pd.concat([df, pd.DataFrame([record])], ignore_index=True)
        df.to_excel(file_path, index=False)
        print(f"💾 Saved data for: {data[0]}")

    except Exception as e:
        print(f"❌ Excel save error: {e}")

# -------------------- MAIN EXECUTION --------------------
if __name__ == "__main__":
    # Single-course run: scrape every listed URL and append it to the workbook.
    # NOTE(review): this Megasoft run writes to "MKCL.xlsx", the same file name
    # the MKCL single-course cell uses with a different column set — confirm
    # the output path is intended.
    file_path = r"C:\Users\taslim.siddiqui\Downloads\MKCL.xlsx"
    course_urls = [
        "https://megasofttech.in/courses/data-science-analytics-courses/data-engineering/",
    ]
    print("🚀 Starting scraping process...")

    for course_url in course_urls:
        print(f"\n📖 Processing: {course_url}")
        course_data = scrape_course_data(course_url)
        # A failed scrape comes back as a list of "Error" markers; skip saving it.
        if "Error" in course_data:
            print(f"❌ Failed to scrape complete data for {course_url}")
            continue
        save_to_excel(course_data, file_path)

    print("✅ Process completed")

🚀 Starting scraping process...

📖 Processing: https://megasofttech.in/courses/data-science-analytics-courses/data-engineering/
🌐 Accessing URL: https://megasofttech.in/courses/data-science-analytics-courses/data-engineering/
📛 Course Name: Data Engineering
📝 About Course: - Focused training on building and managing data pipelines and infrastructure.
- Learn to design, co...
👥 Who Should Take: - IT professionals transitioning to data-focused roles.
- Software developers expanding into data en...
📚 Syllabus extracted (26 lines)
👨‍🏫 Faculty: By Pavan K
🚪 Browser closed
💾 Saved data for: Data Engineering
✅ Process completed


# Megasoft course scraper: course URLs read from an input Excel file

In [1]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
import os

# -------------------- DRIVER SETUP --------------------
def get_driver():
    """Create and return a configured undetected-chromedriver Chrome instance.

    The browser is started with the automation-controlled Blink feature
    disabled, a 1280x720 window and Chrome log level 3.
    """
    chrome_flags = (
        "--disable-blink-features=AutomationControlled",
        "--window-size=1280,720",
        "--log-level=3",
    )

    options = uc.ChromeOptions()
    for flag in chrome_flags:
        options.add_argument(flag)
    # options.add_argument("--headless=new")  # Uncomment for silent scraping

    return uc.Chrome(options=options)

# -------------------- CLEAN TEXT --------------------
def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is removed as a side effect of the
    split/join. Falsy input (``None`` or an empty string) is returned
    unchanged.
    """
    if not text:
        return text
    words = text.split()
    return " ".join(words)

# -------------------- EXTRACT COURSE NAME --------------------
def extract_course_name(soup):
    """Return the course title found in *soup*, or ``"Not Found"``.

    A fixed list of CSS selectors is tried in order; the text of the first
    element that matches is whitespace-normalised and returned.
    """
    # Tried in this order; the first selector that matches wins.
    candidate_selectors = (
        "div.edublink-course-title h1.entry-title",
        "h1.course-title",
        "h1.title",
        "h1",
        "title",
    )

    # Lazy generator: select_one() stops being called after the first hit.
    matches = (soup.select_one(css) for css in candidate_selectors)
    title_node = next((node for node in matches if node), None)

    if title_node is None:
        return "Not Found"
    return clean_text(title_node.get_text(strip=True))

# -------------------- EXTRACT ABOUT COURSE --------------------
def _extract_section_after_heading(soup, heading_text):
    """Return the text of the widget that follows an ``<h2>`` section heading.

    The lookup walks this structure:
    ``<h2>`` containing *heading_text* -> closest ancestor
    ``div.elementor-element`` -> next sibling ``div.elementor-element`` ->
    first ``div.elementor-widget-container`` inside that sibling.

    If the widget contains a ``<ul>``, every ``<li>`` in the widget becomes
    one ``"- item"`` line; otherwise the widget's whole text is returned as a
    single whitespace-normalised line.

    Returns ``"Not Found"`` when any step of the walk fails or when no text
    lines were collected.
    """
    # NOTE(review): `string=` matches on the heading's `.string`; headings
    # with mixed nested markup may not match - confirm against the live pages.
    heading = soup.find("h2", string=lambda text: text and heading_text in text)
    if heading is None:
        return "Not Found"

    heading_block = heading.find_parent("div", class_="elementor-element")
    if heading_block is None:
        return "Not Found"

    # The section body sits in the element right after the heading's element.
    body_block = heading_block.find_next_sibling("div", class_="elementor-element")
    if body_block is None:
        return "Not Found"

    widget = body_block.find("div", class_="elementor-widget-container")
    if widget is None:
        return "Not Found"

    if widget.find("ul"):
        lines = [f"- {clean_text(li.get_text())}" for li in widget.find_all("li")]
    else:
        lines = [clean_text(widget.get_text())]

    return "\n".join(lines) if lines else "Not Found"


def extract_about_course(soup):
    """Return the "Course Description" section text, or ``"Not Found"``.

    List items are returned as ``"- item"`` lines joined by newlines; a
    section without a list is returned as one whitespace-normalised string.
    """
    return _extract_section_after_heading(soup, "Course Description")

# -------------------- EXTRACT WHO_SHOULD_TAKE --------------------
def extract_who_should_take(soup):
    """Return the "Who Can Take This Course" section text, or ``"Not Found"``.

    List items are returned as ``"- item"`` lines joined by newlines; a
    section without a list is returned as one whitespace-normalised string.
    """
    return _extract_section_after_heading(soup, "Who Can Take This Course")

# -------------------- EXTRACT SYLLABUS --------------------
def extract_syllabus(soup):
    """Return the course curriculum as text, or ``"Not Found"``.

    Starting from the ``<h2>`` whose text contains "Course Curriculum", every
    following sibling ``div.elementor-element`` of the heading's own
    ``div.elementor-element`` is inspected. A sibling counts as a module when
    it has an ``<h3>``/``<h4>`` title and a text-editor widget with a widget
    container. Each module is rendered as ``"<title>\\n<content>"`` and the
    modules are joined by blank lines.
    """
    heading = soup.find("h2", string=lambda text: text and "Course Curriculum" in text)
    if not heading:
        return "Not Found"

    heading_block = heading.find_parent("div", class_="elementor-element")
    if not heading_block:
        return "Not Found"

    modules = []
    for sibling in heading_block.find_next_siblings("div", class_="elementor-element"):
        title_node = sibling.find(["h3", "h4"])
        if not title_node:
            continue  # no module heading in this sibling

        editor_widget = sibling.find("div", class_="elementor-widget-text-editor")
        if not editor_widget:
            continue

        body_node = editor_widget.find("div", class_="elementor-widget-container")
        if not body_node:
            continue

        title = clean_text(title_node.get_text())

        # Bulleted modules become "- item" lines; anything else is flattened
        # into a single whitespace-normalised string.
        if body_node.find("ul"):
            body = "\n".join(
                f"- {clean_text(li.get_text())}" for li in body_node.find_all("li")
            )
        else:
            body = clean_text(body_node.get_text())

        modules.append(f"{title}\n{body}")

    if not modules:
        return "Not Found"
    return "\n\n".join(modules)

# -------------------- EXTRACT FACULTY --------------------
def extract_faculty(soup):
    """Return instructor names from the course header, or ``"Not Found"``.

    Every ``li.instructor`` inside ``ul.eb-course-header-meta-items``
    contributes one whitespace-normalised line to the result.
    """
    instructor_lines = [
        clean_text(item.get_text())
        for item in soup.select("ul.eb-course-header-meta-items li.instructor")
    ]

    if not instructor_lines:
        return "Not Found"
    return "\n".join(instructor_lines)

# -------------------- SCRAPER --------------------
def scrape_course_data(url):
    """Open *url* in a fresh browser and extract the course fields.

    Returns a six-item list:
    ``[course name, about course, who should take, syllabus, faculty, url]``.
    If anything raises while loading or parsing, the error and traceback are
    printed and ``["Error"] * 6`` is returned instead. The browser is always
    closed before returning.
    """
    driver = get_driver()
    try:
        print(f"🌐 Accessing URL: {url}")
        driver.get(url)

        body_present = EC.presence_of_element_located((By.TAG_NAME, "body"))
        WebDriverWait(driver, 15).until(body_present)
        time.sleep(3)

        # Scroll to load all content: jump to the bottom, then back to the
        # top, pausing after each step.
        scroll_steps = (
            ("window.scrollTo(0, document.body.scrollHeight);", 2),
            ("window.scrollTo(0, 0);", 1),
        )
        for script, pause_seconds in scroll_steps:
            driver.execute_script(script)
            time.sleep(pause_seconds)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Extract only the required course information, logging each field
        # as soon as it is available.
        course_name = extract_course_name(soup)
        print(f"📛 Course Name: {course_name}")

        about_course = extract_about_course(soup)
        print(f"📝 About Course: {about_course[:100]}...")

        who_should_take = extract_who_should_take(soup)
        print(f"👥 Who Should Take: {who_should_take[:100]}...")

        syllabus = extract_syllabus(soup)
        syllabus_line_count = len(syllabus.splitlines())
        print(f"📚 Syllabus extracted ({syllabus_line_count} lines)")

        faculty = extract_faculty(soup)
        print(f"👨‍🏫 Faculty: {faculty}")

        return [course_name, about_course, who_should_take, syllabus, faculty, url]

    except Exception as exc:
        print(f"🔥 Scraping failed: {exc!s}")
        import traceback
        traceback.print_exc()
        # One "Error" marker per output column so callers can detect failure.
        return ["Error"] * 6
    finally:
        driver.quit()
        print("🚪 Browser closed")

# -------------------- SAVE TO EXCEL --------------------
def save_to_excel(data, file_path):
    """Append one scraped course row to the workbook at *file_path*.

    *data* is a sequence ordered like the columns listed below; its first
    item is the course name and its last item is the course URL. An existing
    workbook is loaded (missing columns are added as empty), otherwise a new
    sheet is started. Nothing is written when the URL is already present in
    the "Course URL" column.

    Any exception is caught and reported with a printed message; the function
    always returns ``None``.
    """
    columns = [
        "Course Name", "About Course", "Who Should Take", "Syllabus", "Faculty", "Course URL"
    ]
    try:
        if not os.path.exists(file_path):
            df = pd.DataFrame(columns=columns)
        else:
            df = pd.read_excel(file_path)
            # Make sure every expected column exists in an older workbook.
            absent = [name for name in columns if name not in df.columns]
            for name in absent:
                df[name] = None

        course_url = data[-1]
        if course_url in df["Course URL"].values:
            print(f"🔄 Course already exists: {data[0]}")
            return

        record = dict(zip(columns, data))
        df = pd.concat([df, pd.DataFrame([record])], ignore_index=True)
        df.to_excel(file_path, index=False)
        print(f"💾 Saved data for: {data[0]}")

    except Exception as e:
        print(f"❌ Excel save error: {e}")

# -------------------- READ URLS FROM EXCEL --------------------
def read_urls_from_excel(file_path, sheet_name=0, url_column="Course URL"):
    """Return the non-empty values of *url_column* from an Excel sheet.

    Args:
        file_path: Path of the workbook to read.
        sheet_name: Sheet selector passed to ``pandas.read_excel``.
        url_column: Name of the column holding the URLs.

    Returns:
        A list of the column's values with missing cells dropped, or an empty
        list if reading fails for any reason (the error is printed).
    """
    try:
        sheet = pd.read_excel(file_path, sheet_name=sheet_name)
        url_series = sheet[url_column].dropna()
        return url_series.tolist()
    except Exception as exc:
        print(f"❌ Error reading Excel file: {exc}")
        return []

# -------------------- MAIN EXECUTION --------------------
if __name__ == "__main__":
    # Batch run: course URLs come from the input workbook and every
    # successfully scraped row is appended to the output workbook.
    input_file_path = r"C:\Users\taslim.siddiqui\Downloads\mega soft CCDC.xlsx"  # Update this path
    output_file_path = r"C:\Users\taslim.siddiqui\Downloads\megasoft.xlsx"

    print("🚀 Starting scraping process...")

    course_urls = read_urls_from_excel(input_file_path)

    if course_urls:
        print(f"📖 Found {len(course_urls)} URLs to process")

        for course_url in course_urls:
            print(f"\n📖 Processing: {course_url}")
            course_data = scrape_course_data(course_url)

            # A failed scrape comes back as a list of "Error" markers.
            if "Error" in course_data:
                print(f"❌ Failed to scrape complete data for {course_url}")
                continue

            save_to_excel(course_data, output_file_path)
    else:
        print("❌ No URLs found in the input file")

    print("✅ Process completed")

🚀 Starting scraping process...
📖 Found 14 URLs to process

📖 Processing: https://megasofttech.in/courses/data-science-analytics-courses/data-engineering/
🌐 Accessing URL: https://megasofttech.in/courses/data-science-analytics-courses/data-engineering/
📛 Course Name: Data Engineering
📝 About Course: - Focused training on building and managing data pipelines and infrastructure.
- Learn to design, co...
👥 Who Should Take: - IT professionals transitioning to data-focused roles.
- Software developers expanding into data en...
📚 Syllabus extracted (26 lines)
👨‍🏫 Faculty: By Pavan K
🚪 Browser closed
💾 Saved data for: Data Engineering

📖 Processing: https://megasofttech.in/courses/data-science-analytics-courses/data-science-course/
🌐 Accessing URL: https://megasofttech.in/courses/data-science-analytics-courses/data-science-course/
📛 Course Name: Data Science Course
📝 About Course: - Comprehensive training in analytics, machine learning, and AI.
- Designed for beginners and profes...
👥 Who Shou