Coding Block Web Scrapping

Web Scrapping for one link 

In [9]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup
import time
import os

# -------------------- DRIVER SETUP --------------------
def get_driver():
    options = uc.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1280,720")
    options.add_argument("--log-level=3")
    # options.add_argument("--headless")  # Uncomment for headless mode
    driver = uc.Chrome(options=options, version_main=138)
    return driver

# -------------------- SYLLABUS EXTRACTION --------------------
def extract_syllabus(soup):
    try:
        syllabus_div = soup.find("div", class_=lambda c: c and "course-content" in c)
        if not syllabus_div:
            print("⚠️ Syllabus div not found")
            return "Syllabus not available"

        syllabus_text = ""
        for module in syllabus_div.find_all(
            "div", class_="row no-gutters align-items-center justify-content-between pointer p-30p py-4 hover-grey"
        ):
            module_title_tag = module.find("h6", class_=lambda c: c and "bold white" in c)
            module_title = module_title_tag.get_text(strip=True) if module_title_tag else "Unnamed Module"

            duration_info = module.find_next("div", class_="grey-2 word-spaced")
            duration = duration_info.get_text(strip=True) if duration_info else ""

            syllabus_text += f"\n🔹 {module_title} ({duration})\n"

            lessons_container = module.find_next_sibling("div", class_="body")
            if lessons_container:
                lesson_items = lessons_container.find_all("div", class_="row no-gutters justify-content-between my-4")
                for lesson in lesson_items:
                    lesson_text_tag = lesson.find("div", class_="markdown-body")
                    lesson_text = lesson_text_tag.get_text(strip=True) if lesson_text_tag else ""

                    duration_tag = lesson.find("div", class_="card-md font-normal t-align-r")
                    lesson_duration = duration_tag.get_text(strip=True) if duration_tag else ""

                    if lesson_text:
                        syllabus_text += f"    • {lesson_text} ({lesson_duration})\n"

        return syllabus_text.strip()
    except Exception as e:
        return f"Syllabus extraction failed: {str(e)}"

# -------------------- SCRAPER --------------------
def scrape_course_data(url):
    driver = get_driver()
    try:
        print(f"🌐 Accessing URL: {url}")
        driver.get(url)

        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "span.bold.font-xl"))
        )

        for _ in range(3):
            driver.execute_script("window.scrollBy(0, window.innerHeight)")
            time.sleep(1)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # 1. Course Name
        course_name = soup.find("span", class_="bold font-xl").get_text(strip=True)
        print(f"📛 Course Name: {course_name}")

        # 2. About Course
        about_section = soup.find("p")
        about_course = about_section.get_text(strip=True) if about_section else "Not Found"
        print(f"📝 About Course: {about_course[:50]}...")

        # 3. Syllabus
        syllabus = extract_syllabus(soup)
        print(f"📚 Syllabus extracted ({len(syllabus.splitlines())} lines)")

        # 4. Original Price(s)
        pairs = []
        for tier in soup.find_all("h4", class_="tier-name"):
            price_tag = tier.find_next("del")
            if price_tag:
                tier_name = " ".join(tier.get_text().split())
                price_text = " ".join(price_tag.get_text().split())
                pairs.append(f"{tier_name} - {price_text}")

        if pairs:
            original_price = "🏷️ Original Price(s):\n" + "\n".join(pairs)
        else:
            original_price = "Original price not available"
        print(f"{original_price}")

        # 5. Language
        language = "Language not available"
        for info_item in soup.find_all("div", class_="info-item"):
            label = info_item.find("div")
            if label and "Course Language" in label.get_text(strip=True):
                value_divs = info_item.find_all("div")
                if len(value_divs) > 1:
                    language = value_divs[1].get_text(strip=True)
                break
        print(f"🗣️ Language: {language}")

        # 6. Duration
        duration = "Duration not available"
        duration_tag = soup.find("div", class_="card-md mt-1 grey-2")
        if duration_tag:
            duration_text = " ".join(duration_tag.get_text().split())
            duration = duration_text.replace("Valid for ", "")
        print(f"⏳ Duration: {duration}")

        # 7. Faculty
        faculty = "Faculty not available"
        for info_item in soup.find_all("div", class_="info-item"):
            label = info_item.find("div")
            if label and "Instructors" in label.get_text(strip=True):
                value_divs = info_item.find_all("div")
                if len(value_divs) > 1:
                    faculty = value_divs[1].get_text(strip=True)
                break
        print(f"👨‍🏫 Faculty: {faculty}")

        return course_name, about_course, syllabus, original_price, language, duration, faculty

    except Exception as e:
        print(f"🔥 Scraping failed for {url}: {str(e)}")
        return ["Error"] * 7
    finally:
        driver.quit()
        print("🚪 Browser closed")

# -------------------- SAVE TO EXCEL --------------------
def save_to_excel(data, file_path, url):
    columns = [
        "Course Name", 
        "About Course", 
        "Syllabus", 
        "Original Price",
        "Language",
        "Duration",
        "Course Faculty",
        "Course URL"
    ]

    try:
        if os.path.exists(file_path):
            df = pd.read_excel(file_path)
            for col in columns:
                if col not in df.columns:
                    df[col] = None
        else:
            df = pd.DataFrame(columns=columns)

        existing_urls = df["Course URL"].tolist() if "Course URL" in df.columns else []
        if url in existing_urls:
            print(f"🔄 Course already exists: {data[0]}")
            return

        new_row = pd.DataFrame([dict(zip(columns, [*data, url]))])
        df = pd.concat([df, new_row], ignore_index=True)
        
        df.to_excel(file_path, index=False)
        print(f"💾 Saved data for: {data[0]}")

    except Exception as e:
        print(f"❌ Excel save error: {e}")

# -------------------- MAIN --------------------
if __name__ == "__main__":
    course_urls = [
        "https://online.codingblocks.com/courses/web-development-online-course"
        "",
    ]

    print("🚀 Starting scraping process...")
    file_path = r"C:\Users\taslim.siddiqui\Downloads\course00.xlsx"

    for course_url in course_urls:
        print(f"\n🔍 Processing: {course_url}")
        course_data = scrape_course_data(course_url)
        if all(item != "Error" for item in course_data):
            save_to_excel(course_data, file_path, course_url)
        else:
            print(f"❌ Failed to scrape complete data for {course_url}")

    print("\n✅ Process completed")


🚀 Starting scraping process...

🔍 Processing: https://online.codingblocks.com/courses/web-development-online-course
🌐 Accessing URL: https://online.codingblocks.com/courses/web-development-online-course
📛 Course Name: Full Stack Web Development with NodeJS Master Course
📝 About Course: The online course for Web Development by Coding Bl...
📚 Syllabus extracted (261 lines)
🏷️ Original Price(s):
LITE - ₹ 7999
PREMIUM - ₹ 17999
CLASSROOM - ₹ 27999
🗣️ Language: English
⏳ Duration: 6 Months
👨‍🏫 Faculty: Arnav Gupta
🚪 Browser closed
🔄 Course already exists: Full Stack Web Development with NodeJS Master Course

✅ Process completed


Web Scrapping for entire excel file 

In [16]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup
import time
import os

# -------------------- DRIVER SETUP --------------------
def get_driver():
    options = uc.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1280,720")
    options.add_argument("--log-level=3")
    driver = uc.Chrome(options=options, version_main=138)
    return driver

# -------------------- SYLLABUS EXTRACTION (ONLINE PORTAL) --------------------
def extract_syllabus_online(soup):
    try:
        syllabus_div = soup.find("div", class_=lambda c: c and "course-content" in c)
        if not syllabus_div:
            return "Syllabus not available"

        syllabus_text = ""
        for module in syllabus_div.find_all(
            "div", class_="row no-gutters align-items-center justify-content-between pointer p-30p py-4 hover-grey"
        ):
            module_title_tag = module.find("h6", class_=lambda c: c and "bold white" in c)
            module_title = module_title_tag.get_text(strip=True) if module_title_tag else "Unnamed Module"

            duration_info = module.find_next("div", class_="grey-2 word-spaced")
            duration = duration_info.get_text(strip=True) if duration_info else ""

            syllabus_text += f"\n🔹 {module_title} ({duration})\n"

            lessons_container = module.find_next_sibling("div", class_="body")
            if lessons_container:
                lesson_items = lessons_container.find_all("div", class_="row no-gutters justify-content-between my-4")
                for lesson in lesson_items:
                    lesson_text_tag = lesson.find("div", class_="markdown-body")
                    lesson_text = lesson_text_tag.get_text(strip=True) if lesson_text_tag else ""

                    duration_tag = lesson.find("div", class_="card-md font-normal t-align-r")
                    lesson_duration = duration_tag.get_text(strip=True) if duration_tag else ""

                    if lesson_text:
                        syllabus_text += f"    • {lesson_text} ({lesson_duration})\n"

        return syllabus_text.strip()
    except Exception as e:
        return f"Syllabus extraction failed: {str(e)}"

# -------------------- SCRAPER --------------------
def scrape_course_data(url):
    driver = get_driver()
    try:
        print(f"🌐 Accessing URL: {url}")
        driver.get(url)

        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        time.sleep(2)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # ---------------- SITE TYPE CHECK ----------------
        if "online.codingblocks.com" in url:
            # ---------- Online portal ----------
            course_name = soup.find("span", class_="bold font-xl")
            course_name = course_name.get_text(strip=True) if course_name else "Not Found"

            about_section = soup.find("p")
            about_course = about_section.get_text(strip=True) if about_section else "Not Found"

            syllabus = extract_syllabus_online(soup)

        else:
            # ---------- Main codingblocks.com ----------
            course_name = soup.find("h1").get_text(strip=True) if soup.find("h1") else "Not Found"

            about_section = soup.find("div", {"class": "course-overview"})
            about_course = about_section.get_text(" ", strip=True) if about_section else "Not Found"

            # syllabus tab content
            syllabus_divs = soup.select("div[id^=syllabustabcontent]")
            syllabus_parts = []
            for div in syllabus_divs:
                syllabus_parts.append(div.get_text(" ", strip=True))
            syllabus = "\n".join(syllabus_parts) if syllabus_parts else "Syllabus not available"

        # ---------- Original Price(s) ----------
        pairs = []
        for tier in soup.find_all("h4", class_="tier-name"):
            price_tag = tier.find_next("del")
            if price_tag:
                tier_name = " ".join(tier.get_text().split())
                price_text = " ".join(price_tag.get_text().split())
                pairs.append(f"{tier_name} - {price_text}")

        if pairs:
            original_price = "🏷️ Original Price(s):\n" + "\n".join(pairs)
        else:
            original_price = "Original price not available"

        # ---------- Language ----------
        language = "Language not available"
        for info_item in soup.find_all("div", class_="info-item"):
            label = info_item.find("div")
            if label and "Course Language" in label.get_text(strip=True):
                value_divs = info_item.find_all("div")
                if len(value_divs) > 1:
                    language = value_divs[1].get_text(strip=True)
                break

        # ---------- Duration ----------
        duration = "Duration not available"
        duration_tag = soup.find("div", class_="card-md mt-1 grey-2")
        if duration_tag:
            duration_text = " ".join(duration_tag.get_text().split())
            duration = duration_text.replace("Valid for ", "")

        # ---------- Faculty ----------
        faculty = "Faculty not available"
        for info_item in soup.find_all("div", class_="info-item"):
            label = info_item.find("div")
            if label and "Instructors" in label.get_text(strip=True):
                value_divs = info_item.find_all("div")
                if len(value_divs) > 1:
                    faculty = value_divs[1].get_text(strip=True)
                break

        return course_name, about_course, syllabus, original_price, language, duration, faculty

    except Exception as e:
        print(f"🔥 Scraping failed for {url}: {str(e)}")
        return ["Error"] * 7
    finally:
        driver.quit()
        print("🚪 Browser closed")

# -------------------- SAVE TO EXCEL --------------------
def save_to_excel(data, file_path, url):
    columns = [
        "Course Name", 
        "About Course", 
        "Syllabus", 
        "Original Price",
        "Language",
        "Duration",
        "Course Faculty",
        "Course URL"
    ]

    if os.path.exists(file_path):
        df = pd.read_excel(file_path)
        for col in columns:
            if col not in df.columns:
                df[col] = None
    else:
        df = pd.DataFrame(columns=columns)

    # ✅ Fix: check against "Course URL" not "Course Links"
    if "Course URL" in df.columns and url in df["Course URL"].tolist():
        print(f"🔄 Already exists: {url}")
        return

    new_row = pd.DataFrame([dict(zip(columns, [*data, url]))])
    df = pd.concat([df, new_row], ignore_index=True)
    df.to_excel(file_path, index=False)
    print(f"💾 Saved: {data[0]}")

# -------------------- MAIN --------------------
if __name__ == "__main__":
    input_file = "C:\\Users\\taslim.siddiqui\\Downloads\\course_links.xlsx"
    file_path = "C:\\Users\\taslim.siddiqui\\Downloads\\course_data12.xlsx"

    # Load Excel
    urls_df = pd.read_excel(input_file)

    # Auto-detect column (take first column if 'Course Links' not present)
    if "Course Links" in urls_df.columns:
        course_urls = urls_df["Course Links"].dropna().tolist()
    else:
        first_col = urls_df.columns[0]
        print(f"⚠️ Column 'Course Links' not found. Using first column: {first_col}")
        course_urls = urls_df[first_col].dropna().tolist()

    print(f"🚀 Found {len(course_urls)} course URLs to process")

    for course_url in course_urls:
        print(f"\n🔍 Processing: {course_url}")
        course_data = scrape_course_data(course_url)
        if all(item != "Error" for item in course_data):
            save_to_excel(course_data, file_path, course_url)
        else:
            print(f"❌ Failed: {course_url}")

    print("\n✅ Process completed")


🚀 Found 39 course URLs to process

🔍 Processing: https://online.codingblocks.com/courses/backend-web-development
🌐 Accessing URL: https://online.codingblocks.com/courses/backend-web-development
🚪 Browser closed
💾 Saved: Backend Web Development using node.JS

🔍 Processing: https://codingblocks.com/devops.html
🌐 Accessing URL: https://codingblocks.com/devops.html
🚪 Browser closed
💾 Saved: 45x - 200x

🔍 Processing: https://online.codingblocks.com/courses/learn-dynamic-programming-online-course
🌐 Accessing URL: https://online.codingblocks.com/courses/learn-dynamic-programming-online-course
🚪 Browser closed
💾 Saved: Dynamic Programming

🔍 Processing: https://online.codingblocks.com/courses/web-development-online-course
🌐 Accessing URL: https://online.codingblocks.com/courses/web-development-online-course
🚪 Browser closed
💾 Saved: Full Stack Web Development with NodeJS Master Course

🔍 Processing: https://online.codingblocks.com/courses/reinforcement-learning-and-artificial-intelligence
🌐 Ac

For do input one course link that having end .html

In [31]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

# -------------------- DRIVER SETUP --------------------
def get_driver():
    options = uc.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1280,720")
    options.add_argument("--log-level=3")
    driver = uc.Chrome(options=options, version_main=138)
    return driver

# -------------------- SYLLABUS EXTRACTION --------------------
def extract_syllabus_tabpan(soup):
    syllabus_div = soup.find("div", class_="syllabuscontent tabpan")
    if not syllabus_div:
        return "Syllabus not available"

    syllabus_text = ""
    left_tabs = syllabus_div.find("ul", {"id": "syllabusTab"})
    if not left_tabs:
        return "Syllabus not available"

    for li in left_tabs.find_all("a", class_="nav-link"):
        module_title = li.get_text(strip=True)
        target_id = li.get("href", "").replace("#", "")
        content_div = syllabus_div.find("div", {"id": target_id})

        syllabus_text += f"\n📘 {module_title}\n"
        if content_div:
            for li_content in content_div.find_all("li"):
                strong_tag = li_content.find("strong")
                subheading = strong_tag.get_text(strip=True) if strong_tag else ""
                text = li_content.get_text(" ", strip=True)
                if subheading:
                    syllabus_text += f"   • {subheading}: {text.replace(subheading, '').strip()}\n"
                else:
                    syllabus_text += f"   • {text}\n"

    return syllabus_text.strip()

# -------------------- SCRAPER --------------------
def scrape_course_data(url):
    driver = get_driver()
    try:
        print(f"🌐 Accessing URL: {url}")
        driver.get(url)

        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        time.sleep(2)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # -------- Course Name --------
        course_name = "Not Found"
        banner_div = soup.find("div", class_="bannerimg")
        if banner_div and banner_div.find("h2"):
            course_name = banner_div.find("h2").get_text(strip=True)

        # -------- About Course --------
        about_section = soup.find("div", class_="subtitle")
        about_course = about_section.get_text(" ", strip=True) if about_section else soup.find("p").get_text(" ", strip=True)

        # -------- Syllabus --------
        syllabus = extract_syllabus_tabpan(soup)

        # -------- Price --------
        price_h4 = soup.find("h4")
        mode_p = soup.find("p", class_="last")
        price_text = "Price not available"
        if price_h4 and mode_p:
            price_val = price_h4.get_text(" ", strip=True).replace("Starting from", "").strip()
            mode_val = mode_p.get_text(" ", strip=True).replace("Mode of Delivery", "").strip()
            price_text = f"{price_val} - {mode_val}"

        # -------- Language --------
        language = "Language not available"
        for info_item in soup.find_all("div", class_="info-item"):
            label = info_item.find("div")
            if label and "Course Language" in label.get_text(strip=True):
                value_divs = info_item.find_all("div")
                if len(value_divs) > 1:
                    language = value_divs[1].get_text(strip=True)
                break

        # -------- Duration --------
        duration = "Duration not available"
        cousecontent_div = soup.find("div", class_="cousecontent")
        if cousecontent_div:
            for p_tag in cousecontent_div.find_all("p"):
                text = p_tag.get_text(" ", strip=True)
                if "month" in text.lower():  # matches month or months
                    span = p_tag.find("span")
                    if span:
                        span.extract()  # remove <span>Duration</span>
                    duration = p_tag.get_text(" ", strip=True)
                    break

        # -------- Faculty --------
        faculty = "Faculty not available"
        for info_item in soup.find_all("div", class_="info-item"):
            label = info_item.find("div")
            if label and "Instructors" in label.get_text(strip=True):
                value_divs = info_item.find_all("div")
                if len(value_divs) > 1:
                    faculty = value_divs[1].get_text(strip=True)
                break

        return {
            "Course Name": course_name,
            "About Course": about_course,
            "Syllabus": syllabus,
            "Price": price_text,
            "Language": language,
            "Duration": duration,
            "Faculty": faculty,
            "URL": url
        }

    except Exception as e:
        print(f"🔥 Scraping failed for {url}: {str(e)}")
        return None
    finally:
        driver.quit()
        print("🚪 Browser closed")

# -------------------- MAIN --------------------
if __name__ == "__main__":
    course_url = "https://codingblocks.com/full-stack-web-development-node-js.html"
    print("🚀 Starting scraping process...")
    data = scrape_course_data(course_url)
    if data:
        print("\n✅ Course Data Extracted:\n")
        for key, value in data.items():
            print(f"{key}: {value}\n")


🚀 Starting scraping process...
🌐 Accessing URL: https://codingblocks.com/full-stack-web-development-node-js.html
🚪 Browser closed

✅ Course Data Extracted:

Course Name: Master Full stack Web development with node.JS (MERN)

About Course: Master Full Stack Web Development with Node.JS (MERN) at Coding
              Blocks. This course covers front-end technologies (HTML, CSS,
              JavaScript) and back-end development (Node.js, MongoDB). Gain
              hands-on experience with real-world projects and mentorship to
              prepare for high-demand roles in web development.

Syllabus: 📘 Getting started with Web Development
   • Master HTML: In this you will learn how to create the
                                structure and the template of the webpages and
                                websites.
   • Mastering CSS: In this you will learn how to add styles and
                                make your webpages look stylish and responsive.
   • Basics of Programming: F

For do input Entire Excel file course link that having end .html

In [30]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
import os

# -------------------- DRIVER SETUP --------------------
def get_driver():
    options = uc.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1280,720")
    options.add_argument("--log-level=3")
    driver = uc.Chrome(options=options, version_main=138)
    return driver

# -------------------- SYLLABUS EXTRACTION --------------------
def extract_syllabus_tabpan(soup):
    syllabus_div = soup.find("div", class_="syllabuscontent tabpan")
    if not syllabus_div:
        return "Syllabus not available"

    syllabus_text = ""
    left_tabs = syllabus_div.find("ul", {"id": "syllabusTab"})
    if not left_tabs:
        return "Syllabus not available"

    for li in left_tabs.find_all("a", class_="nav-link"):
        module_title = li.get_text(strip=True)
        target_id = li.get("href", "").replace("#", "")
        content_div = syllabus_div.find("div", {"id": target_id})

        syllabus_text += f"\n📘 {module_title}\n"
        if content_div:
            for li_content in content_div.find_all("li"):
                strong_tag = li_content.find("strong")
                subheading = strong_tag.get_text(strip=True) if strong_tag else ""
                text = li_content.get_text(" ", strip=True)
                if subheading:
                    syllabus_text += f"   • {subheading}: {text.replace(subheading, '').strip()}\n"
                else:
                    syllabus_text += f"   • {text}\n"

    return syllabus_text.strip()

# -------------------- SCRAPER --------------------
def scrape_course_data(url):
    driver = get_driver()
    try:
        print(f"🌐 Accessing URL: {url}")
        driver.get(url)

        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        time.sleep(2)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # -------- Course Name --------
        course_name = "Not Found"
        banner_div = soup.find("div", class_="bannerimg")
        if banner_div and banner_div.find("h2"):
            course_name = banner_div.find("h2").get_text(strip=True)

        # -------- About Course --------
        about_section = soup.find("div", class_="subtitle")
        about_course = about_section.get_text(" ", strip=True) if about_section else soup.find("p").get_text(" ", strip=True)

        # -------- Syllabus --------
        syllabus = extract_syllabus_tabpan(soup)

        # -------- Price --------
        price_h4 = soup.find("h4")
        mode_p = soup.find("p", class_="last")
        price_text = "Price not available"
        if price_h4 and mode_p:
            price_val = price_h4.get_text(" ", strip=True).replace("Starting from", "").strip()
            mode_val = mode_p.get_text(" ", strip=True).replace("Mode of Delivery", "").strip()
            price_text = f"{price_val} - {mode_val}"

        # -------- Language --------
        language = "Language not available"
        for info_item in soup.find_all("div", class_="info-item"):
            label = info_item.find("div")
            if label and "Course Language" in label.get_text(strip=True):
                value_divs = info_item.find_all("div")
                if len(value_divs) > 1:
                    language = value_divs[1].get_text(strip=True)
                break

        # -------- Duration --------
        duration = "Duration not available"
        cousecontent_div = soup.find("div", class_="cousecontent")
        if cousecontent_div:
            for p_tag in cousecontent_div.find_all("p"):
                text = p_tag.get_text(" ", strip=True)
                if "month" in text.lower():  # matches month or months
                    span = p_tag.find("span")
                    if span:
                        span.extract()  # remove <span>Duration</span>
                    duration = p_tag.get_text(" ", strip=True)
                    break

        # -------- Faculty --------
        faculty = "Faculty not available"
        for info_item in soup.find_all("div", class_="info-item"):
            label = info_item.find("div")
            if label and "Instructors" in label.get_text(strip=True):
                value_divs = info_item.find_all("div")
                if len(value_divs) > 1:
                    faculty = value_divs[1].get_text(strip=True)
                break

        return {
            "Course Name": course_name,
            "About Course": about_course,
            "Syllabus": syllabus,
            "Price": price_text,
            "Language": language,
            "Duration": duration,
            "Faculty": faculty,
            "URL": url
        }

    except Exception as e:
        print(f"🔥 Scraping failed for {url}: {str(e)}")
        return None
    finally:
        driver.quit()
        print("🚪 Browser closed")

# -------------------- SAVE TO EXCEL --------------------
def save_to_excel(data_list, file_path):
    columns = [
        "Course Name",
        "About Course",
        "Syllabus",
        "Price",
        "Language",
        "Duration",
        "Faculty",
        "URL"
    ]

    if os.path.exists(file_path):
        df = pd.read_excel(file_path)
        for col in columns:
            if col not in df.columns:
                df[col] = None
    else:
        df = pd.DataFrame(columns=columns)

    for data in data_list:
        if data and data["URL"] not in df["URL"].tolist():
            df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)

    df.to_excel(file_path, index=False)
    print(f"\n💾 All data saved to {file_path}")

# -------------------- MAIN --------------------
if __name__ == "__main__":
    input_file = r"C:\Users\taslim.siddiqui\Downloads\Remain data.xlsx"
    output_file = "C:\\Users\\taslim.siddiqui\\Downloads\\course_data_coding.xlsx"

    urls_df = pd.read_excel(input_file)

    if "Course Links" in urls_df.columns:
        course_urls = urls_df["Course Links"].dropna().tolist()
    else:
        first_col = urls_df.columns[0]
        print(f"⚠️ Column 'Course Links' not found. Using first column: {first_col}")
        course_urls = urls_df[first_col].dropna().tolist()

    print(f"🚀 Found {len(course_urls)} course URLs to process")

    all_data = []
    for url in course_urls:
        print(f"\n🔍 Processing: {url}")
        course_data = scrape_course_data(url)
        if course_data:
            all_data.append(course_data)
        else:
            print(f"❌ Failed to scrape: {url}")

    save_to_excel(all_data, output_file)
    print("\n✅ Process completed for all courses")


🚀 Found 18 course URLs to process

🔍 Processing: https://codingblocks.com/devops.html
🌐 Accessing URL: https://codingblocks.com/devops.html
🚪 Browser closed

🔍 Processing: https://codingblocks.com/sampoorna.html#syllabustabcontent2
🌐 Accessing URL: https://codingblocks.com/sampoorna.html#syllabustabcontent2
🚪 Browser closed

🔍 Processing: https://codingblocks.com/competitive-programming.html
🌐 Accessing URL: https://codingblocks.com/competitive-programming.html
🚪 Browser closed

🔍 Processing: https://www.codingblocks.com/interview-preparation-using-c-plus-plus.html#syllabustabcontent8
🌐 Accessing URL: https://www.codingblocks.com/interview-preparation-using-c-plus-plus.html#syllabustabcontent8
🚪 Browser closed

🔍 Processing: https://codingblocks.com/data-structures-and-algorithms-using-python.html
🌐 Accessing URL: https://codingblocks.com/data-structures-and-algorithms-using-python.html
🚪 Browser closed

🔍 Processing: https://codingblocks.com/system-design.html
🌐 Accessing URL: https:/