Do for One link

In [6]:
import time
import os
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# -------------------- DRIVER SETUP --------------------
def get_driver(headless=False):
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1280,720")
    options.add_argument("--log-level=3")
    if headless:
        options.add_argument("--headless=new")  # modern headless
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver

# -------------------- TEXT CLEANING --------------------
def clean_text(text):
    """Remove specified words from text"""
    words_to_remove = [
        "Objective:", "Objective :",
        "Eligibility:", "Eligibility :",
        "Duration:", "Duration :",
        "Professional Skills"
    ]
    for word in words_to_remove:
        text = text.replace(word, "")
    return text.strip()

# -------------------- BULLET FORMATTING --------------------
def format_bullets(text):
    """
    Ensure there is a newline between each bullet point.
    Example: "- Topic 1... .- Topic 2..." → "- Topic 1...\n\n- Topic 2..."
    """
    if not text or text == "Not available":
        return text
    text = text.strip()
    text = re.sub(r'\.\s*-\s*', r'.\n\n- ', text)
    return text

# -------------------- SYLLABUS EXTRACTION --------------------
def extract_syllabus(soup):
    """Extract syllabus content up to Job Opportunities or Career"""
    headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    
    for heading in headings:
        heading_text = heading.get_text(strip=True).lower()
        if 'syllabus' in heading_text:
            syllabus_content = []
            elements_to_process = []
            current_elem = heading
            while current_elem:
                current_elem = current_elem.find_next_sibling()
                if not current_elem:
                    break
                if current_elem.name in ['h1','h2','h3','h4','h5','h6']:
                    heading_text = current_elem.get_text(strip=True).lower()
                    if 'job opportunit' in heading_text or 'career' in heading_text:
                        break
                elements_to_process.append(current_elem)

            for elem in elements_to_process:
                elem_text = elem.get_text(strip=True)
                if 'job opportunities' in elem_text.lower() and len(elem_text) < 100:
                    break
                if elem.name in ['p', 'ul', 'ol', 'div']:
                    if elem.name in ['ul','ol']:
                        for item in elem.find_all('li'):
                            item_text = clean_text(item.get_text(strip=True))
                            if item_text and 'job opportunit' not in item_text.lower():
                                syllabus_content.append(f"- {item_text}")
                    else:
                        paragraph_text = clean_text(elem.get_text(strip=True))
                        if paragraph_text and 'job opportunit' not in paragraph_text.lower():
                            syllabus_content.append(paragraph_text)
            if syllabus_content:
                return format_bullets("\n".join(syllabus_content))
    return "Not available"

def extract_old_syllabus(soup):
    """Old method for syllabus extraction"""
    syllabus_content = []
    stop_markers = [
        "After completing", "Graduates",
        "After successful", "After ",
        "Job Opportunities", "job opportunities"
    ]

    syllabus_section = soup.find(string=re.compile("Syllabus", re.IGNORECASE))
    if syllabus_section:
        next_elem = syllabus_section.find_next()
        while next_elem and next_elem.name in ["p", "ul", "ol"]:
            text_content = next_elem.get_text(strip=True)

            if any(marker.lower() in text_content.lower() for marker in stop_markers):
                break

            if next_elem.name in ["ul", "ol"]:
                for item in next_elem.find_all("li"):
                    item_text = clean_text(item.get_text(strip=True))
                    if not any(marker.lower() in item_text.lower() for marker in stop_markers):
                        syllabus_content.append(f"- {item_text}")
                    else:
                        break
            else:
                if text_content:
                    syllabus_content.append(f"- {clean_text(text_content)}")

            next_elem = next_elem.find_next_sibling()

    return format_bullets("\n".join(syllabus_content)) if syllabus_content else "Not available"

# -------------------- SCRAPER --------------------
def scrape_course_data(url):
    driver = get_driver()
    try:
        print(f"🌐 Accessing URL: {url}")
        driver.get(url)

        # Wait for page to load
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "h1")))
        time.sleep(3)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # 1. Course Name
        course_name_tag = soup.find("h1", class_="product_title entry-title")
        course_name = course_name_tag.get_text(strip=True) if course_name_tag else "Course name not found"
        print(f"📛 Course Name: {course_name}")

        # 2. About Course
        about_course = "About course not found"
        about_sections = soup.find_all("p")
        for section in about_sections:
            strong_tag = section.find("strong")
            if strong_tag and "objective" in strong_tag.get_text(strip=True).lower():
                about_course = clean_text(section.get_text(" ", strip=True))
                break
        print(f"📝 About Course: {about_course[:80]}...")

        # 3. Eligibility
        eligibility = "Not available"
        for section in about_sections:
            strong_tag = section.find("strong")
            if strong_tag and "eligibility" in strong_tag.get_text(strip=True).lower():
                eligibility = clean_text(section.get_text(strip=True))
                break
        print(f"✅ Eligibility: {eligibility}")

        # 4. Duration
        duration = "Not available"
        for section in about_sections:
            strong_tag = section.find("strong")
            if strong_tag and "duration" in strong_tag.get_text(strip=True).lower():
                duration = clean_text(section.get_text(strip=True))
                break
        print(f"⏱️ Duration: {duration}")

        # 5. Price
        price = "Not available"
        price_tag = soup.find("bdi")
        if price_tag:
            price = price_tag.get_text(strip=True)
        print(f"💰 Price: {price}")

        # 6. Who Should Take It
        who_content = []
        who_strong = soup.find("strong", string=re.compile(r"Who Should Enroll|Who Should Take It", re.IGNORECASE))
        if who_strong:
            next_ul = who_strong.find_next("ul")
            if next_ul:
                for li in next_ul.find_all("li"):
                    who_content.append(clean_text(li.get_text(strip=True)))
        who_should_take = "\n".join([f"- {item}" for item in who_content]) if who_content else "Not available"
        print(f"👥 Who Should Take It: {who_should_take}")

        # 7A. Old syllabus extraction
        syllabus_old = extract_old_syllabus(soup)
        # 7B. New syllabus extraction
        syllabus_new = extract_syllabus(soup)

        print(f"📚 Syllabus (Old): {syllabus_old[:80]}...")
        print(f"📚 Syllabus (New): {syllabus_new[:80]}...")

        # 8. Certificate
        cert_link = "Certificate not available"
        cert_tag = soup.find("a", href=re.compile(r"\.pdf$"))
        if cert_tag and cert_tag.has_attr("href"):
            cert_link = cert_tag["href"]
        else:
            cert_img = soup.find("img", src=re.compile(r"certificate", re.IGNORECASE))
            if cert_img and cert_img.has_attr("src"):
                cert_link = cert_img["src"]
        print(f"📜 Certificate: {cert_link}")

        return course_name, about_course, eligibility, duration, price, who_should_take, syllabus_old, syllabus_new, cert_link

    except Exception as e:
        print(f"🔥 Scraping failed for {url}: {str(e)}")
        import traceback
        traceback.print_exc()
        return ["Error"] * 9
    finally:
        driver.quit()
        print("🚪 Browser closed")

# -------------------- SAVE TO EXCEL --------------------
def save_to_excel(data, file_path, url):
    columns = [
        "Course Name",
        "About Course",
        "Eligibility",
        "Duration",
        "Price",
        "Who Should Take It",
        "Syllabus (Old)",
        "Syllabus (New)",
        "Certificate",
        "Course URL"
    ]
    (course_name, about_course, eligibility, duration, price,
     who_should_take, syllabus_old, syllabus_new, cert_link) = data
    try:
        if os.path.exists(file_path):
            df = pd.read_excel(file_path)
        else:
            df = pd.DataFrame(columns=columns)

        # ✅ Avoid duplicate entries
        if url not in df["Course URL"].values:
            row = {
                "Course Name": course_name,
                "About Course": about_course,
                "Eligibility": eligibility,
                "Duration": duration,
                "Price": price,
                "Who Should Take It": who_should_take,
                "Syllabus (Old)": syllabus_old,
                "Syllabus (New)": syllabus_new,
                "Certificate": cert_link,
                "Course URL": url
            }
            df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
            df.to_excel(file_path, index=False)
            print(f"💾 Saved data for: {course_name}")
        else:
            print(f"⚠️ Skipped duplicate: {url}")

    except Exception as e:
        print(f"❌ Excel save error: {e}")

# -------------------- MAIN --------------------
if __name__ == "__main__":
    course_urls = [
        "https://surewinindia.com/product/certificate-in-yoga-and-naturopathy/"
    ]

    print("🚀 Starting scraping process...")
    file_path = r"C:\Users\taslim.siddiqui\Downloads\course.xlsx"

    for course_url in course_urls:
        print(f"\n🔍 Processing: {course_url}")
        course_data = scrape_course_data(course_url)
        if all(item != "Error" for item in course_data):
            save_to_excel(course_data, file_path, course_url)
        else:
            print(f"❌ Failed to scrape complete data for {course_url}")

    print("\n✅ Process completed")


🚀 Starting scraping process...

🔍 Processing: https://surewinindia.com/product/certificate-in-yoga-and-naturopathy/
🌐 Accessing URL: https://surewinindia.com/product/certificate-in-yoga-and-naturopathy/
📛 Course Name: Certificate in Yoga and Naturopathy
📝 About Course: A Certificate in Yoga and Naturopathy opens doors to a rewarding career in the w...
✅ Eligibility: Completion of10+2 (higher Secondary)or equivalent.
⏱️ Duration: Three Months.
💰 Price: Rs.7,000.00
👥 Who Should Take It: Not available
📚 Syllabus (Old): - Introduction to Yoga and Naturopathy:History and philosophy of yoga, Principle...
📚 Syllabus (New): Introduction to Yoga and Naturopathy:History and philosophy of yoga, Principles ...
📜 Certificate: https://iisdt.in/wp-content/uploads/2025/06/Sample-Diploma.pdf
🚪 Browser closed
💾 Saved data for: Certificate in Yoga and Naturopathy

✅ Process completed


Do for Entire Excel file

In [5]:
import time
import os
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# -------------------- DRIVER SETUP --------------------
def get_driver(headless=False):
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1280,720")
    options.add_argument("--log-level=3")
    if headless:
        options.add_argument("--headless=new")  # modern headless
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver

# -------------------- TEXT CLEANING --------------------
def clean_text(text):
    """Remove specified words from text"""
    words_to_remove = [
        "Objective:", "Objective :",
        "Eligibility:", "Eligibility :",
        "Duration:", "Duration :",
        "Professional Skills"
    ]
    for word in words_to_remove:
        text = text.replace(word, "")
    return text.strip()

# -------------------- BULLET FORMATTING --------------------
def format_bullets(text):
    """
    Ensure there is a newline between each bullet point.
    Example: "- Topic 1... .- Topic 2..." → "- Topic 1...\n\n- Topic 2..."
    """
    if not text or text == "Not available":
        return text
    text = text.strip()
    text = re.sub(r'\.\s*-\s*', r'.\n\n- ', text)
    return text

# -------------------- SYLLABUS EXTRACTION --------------------
def extract_syllabus(soup):
    """Extract syllabus content up to Job Opportunities or Career"""
    headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    
    for heading in headings:
        heading_text = heading.get_text(strip=True).lower()
        if 'syllabus' in heading_text:
            syllabus_content = []
            elements_to_process = []
            current_elem = heading
            while current_elem:
                current_elem = current_elem.find_next_sibling()
                if not current_elem:
                    break
                if current_elem.name in ['h1','h2','h3','h4','h5','h6']:
                    heading_text = current_elem.get_text(strip=True).lower()
                    if 'job opportunit' in heading_text or 'career' in heading_text:
                        break
                elements_to_process.append(current_elem)

            for elem in elements_to_process:
                elem_text = elem.get_text(strip=True)
                if 'job opportunities' in elem_text.lower() and len(elem_text) < 100:
                    break
                if elem.name in ['p', 'ul', 'ol', 'div']:
                    if elem.name in ['ul','ol']:
                        for item in elem.find_all('li'):
                            item_text = clean_text(item.get_text(strip=True))
                            if item_text and 'job opportunit' not in item_text.lower():
                                syllabus_content.append(f"- {item_text}")
                    else:
                        paragraph_text = clean_text(elem.get_text(strip=True))
                        if paragraph_text and 'job opportunit' not in paragraph_text.lower():
                            syllabus_content.append(paragraph_text)
            if syllabus_content:
                return format_bullets("\n".join(syllabus_content))
    return "Not available"

def extract_old_syllabus(soup):
    """Old method for syllabus extraction"""
    syllabus_content = []
    stop_markers = [
        "After completing", "Graduates",
        "After successful", "After ",
        "Job Opportunities", "job opportunities"
    ]

    syllabus_section = soup.find(string=re.compile("Syllabus", re.IGNORECASE))
    if syllabus_section:
        next_elem = syllabus_section.find_next()
        while next_elem and next_elem.name in ["p", "ul", "ol"]:
            text_content = next_elem.get_text(strip=True)

            if any(marker.lower() in text_content.lower() for marker in stop_markers):
                break

            if next_elem.name in ["ul", "ol"]:
                for item in next_elem.find_all("li"):
                    item_text = clean_text(item.get_text(strip=True))
                    if not any(marker.lower() in item_text.lower() for marker in stop_markers):
                        syllabus_content.append(f"- {item_text}")
                    else:
                        break
            else:
                if text_content:
                    syllabus_content.append(f"- {clean_text(text_content)}")

            next_elem = next_elem.find_next_sibling()

    return format_bullets("\n".join(syllabus_content)) if syllabus_content else "Not available"

# -------------------- SCRAPER --------------------
def scrape_course_data(url):
    driver = get_driver()
    try:
        print(f"🌐 Accessing URL: {url}")
        driver.get(url)

        # Wait for page to load
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "h1")))
        time.sleep(3)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # 1. Course Name
        course_name_tag = soup.find("h1", class_="product_title entry-title")
        course_name = course_name_tag.get_text(strip=True) if course_name_tag else "Course name not found"
        print(f"📛 Course Name: {course_name}")

        # 2. About Course
        about_course = "About course not found"
        about_sections = soup.find_all("p")
        for section in about_sections:
            strong_tag = section.find("strong")
            if strong_tag and "objective" in strong_tag.get_text(strip=True).lower():
                about_course = clean_text(section.get_text(" ", strip=True))
                break
        print(f"📝 About Course: {about_course[:80]}...")

        # 3. Eligibility
        eligibility = "Not available"
        for section in about_sections:
            strong_tag = section.find("strong")
            if strong_tag and "eligibility" in strong_tag.get_text(strip=True).lower():
                eligibility = clean_text(section.get_text(strip=True))
                break
        print(f"✅ Eligibility: {eligibility}")

        # 4. Duration
        duration = "Not available"
        for section in about_sections:
            strong_tag = section.find("strong")
            if strong_tag and "duration" in strong_tag.get_text(strip=True).lower():
                duration = clean_text(section.get_text(strip=True))
                break
        print(f"⏱️ Duration: {duration}")

        # 5. Price
        price = "Not available"
        price_tag = soup.find("bdi")
        if price_tag:
            price = price_tag.get_text(strip=True)
        print(f"💰 Price: {price}")

        # 6. Who Should Take It
        who_content = []
        who_strong = soup.find("strong", string=re.compile(r"Who Should Enroll|Who Should Take It", re.IGNORECASE))
        if who_strong:
            next_ul = who_strong.find_next("ul")
            if next_ul:
                for li in next_ul.find_all("li"):
                    who_content.append(clean_text(li.get_text(strip=True)))
        who_should_take = "\n".join([f"- {item}" for item in who_content]) if who_content else "Not available"
        print(f"👥 Who Should Take It: {who_should_take}")

        # 7A. Old syllabus extraction
        syllabus_old = extract_old_syllabus(soup)
        # 7B. New syllabus extraction
        syllabus_new = extract_syllabus(soup)

        print(f"📚 Syllabus (Old): {syllabus_old[:80]}...")
        print(f"📚 Syllabus (New): {syllabus_new[:80]}...")

        # 8. Certificate
        cert_link = "Certificate not available"
        cert_tag = soup.find("a", href=re.compile(r"\.pdf$"))
        if cert_tag and cert_tag.has_attr("href"):
            cert_link = cert_tag["href"]
        else:
            cert_img = soup.find("img", src=re.compile(r"certificate", re.IGNORECASE))
            if cert_img and cert_img.has_attr("src"):
                cert_link = cert_img["src"]
        print(f"📜 Certificate: {cert_link}")

        return course_name, about_course, eligibility, duration, price, who_should_take, syllabus_old, syllabus_new, cert_link

    except Exception as e:
        print(f"🔥 Scraping failed for {url}: {str(e)}")
        import traceback
        traceback.print_exc()
        return ["Error"] * 9
    finally:
        driver.quit()
        print("🚪 Browser closed")

# -------------------- SAVE TO EXCEL --------------------
def save_to_excel(data, file_path, url):
    columns = [
        "Course Name",
        "About Course",
        "Eligibility",
        "Duration",
        "Price",
        "Who Should Take It",
        "Syllabus (Old)",
        "Syllabus (New)",
        "Certificate",
        "Course URL"
    ]
    (course_name, about_course, eligibility, duration, price,
     who_should_take, syllabus_old, syllabus_new, cert_link) = data
    try:
        if os.path.exists(file_path):
            df = pd.read_excel(file_path)
        else:
            df = pd.DataFrame(columns=columns)

        # ✅ Avoid duplicate entries
        if url not in df["Course URL"].values:
            row = {
                "Course Name": course_name,
                "About Course": about_course,
                "Eligibility": eligibility,
                "Duration": duration,
                "Price": price,
                "Who Should Take It": who_should_take,
                "Syllabus (Old)": syllabus_old,
                "Syllabus (New)": syllabus_new,
                "Certificate": cert_link,
                "Course URL": url
            }
            df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
            df.to_excel(file_path, index=False)
            print(f"💾 Saved data for: {course_name}")
        else:
            print(f"⚠️ Skipped duplicate: {url}")

    except Exception as e:
        print(f"❌ Excel save error: {e}")

# -------------------- MAIN --------------------
if __name__ == "__main__":
    input_excel = r"C:\Users\taslim.siddiqui\Downloads\test for one.xlsx"
    df_urls = pd.read_excel(input_excel)
    
    if "Course URL" not in df_urls.columns:
        raise ValueError("Excel file must contain a column named 'Course URL'")

    course_urls = df_urls["Course URL"].dropna().tolist()
    output_file = r"C:\Users\taslim.siddiqui\Downloads\Surewin__.xlsx"

    print("🚀 Starting scraping process...")

    for course_url in course_urls:
        print(f"\n🔍 Processing: {course_url}")
        course_data = scrape_course_data(course_url)
        if all(item != "Error" for item in course_data):
            # Print all columns before saving
            columns = [
                "Course Name",
                "About Course",
                "Eligibility",
                "Duration",
                "Price",
                "Who Should Take It",
                "Syllabus (Old)",
                "Syllabus (New)",
                "Certificate"
            ]
            print("\n📄 Course Data:")
            for col_name, value in zip(columns, course_data[:-1]):  # exclude last URL for printing
                print(f"{col_name}: {value}\n")
            
            save_to_excel(course_data, output_file, course_url)
        else:
            print(f"❌ Failed to scrape complete data for {course_url}")

    print("\n✅ Process completed")


🚀 Starting scraping process...

🔍 Processing: https://surewinindia.com/product/professional-course-in-travel-tour-management/
🌐 Accessing URL: https://surewinindia.com/product/professional-course-in-travel-tour-management/
📛 Course Name: Course name not found
📝 About Course: A Professional Course in Travel & Tour Management is an ideal program for indivi...
✅ Eligibility: Completion of10th Grade (high School)or equivalent.
⏱️ Duration: Three Months.
💰 Price: Rs.7,000.00
👥 Who Should Take It: Not available
📚 Syllabus (Old): - Introduction to Travel and Tourism:Concept and Scope of Travel & Tourism, Hist...
📚 Syllabus (New): Introduction to Travel and Tourism:Concept and Scope of Travel & Tourism, Histor...
📜 Certificate: https://iisdt.in/wp-content/uploads/2025/06/Sample-Diploma.pdf
🚪 Browser closed

📄 Course Data:
Course Name: Course name not found

About Course: A Professional Course in Travel & Tour Management is an ideal program for individuals passionate about travel, culture, and 

Courses link surewin 325

In [21]:
import time
import os
import re
import math
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# -------------------- DRIVER SETUP --------------------
def get_driver(headless=False):
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1280,720")
    options.add_argument("--log-level=3")
    if headless:
        options.add_argument("--headless=new")  # modern headless
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver

# -------------------- TEXT CLEANING --------------------
def clean_text(text):
    """Remove specified words from text"""
    words_to_remove = [
        "Objective:", "Objective :",
        "Eligibility:", "Eligibility :",
        "Duration:", "Duration :",
        "Professional Skills"
    ]
    for word in words_to_remove:
        text = text.replace(word, "")
    return text.strip()

# -------------------- ROUND HOURS FUNCTION --------------------
def roundup_hours_simple(time_value):
    """Simple function to handle all time formats"""
    if pd.isna(time_value) or time_value == "Not available" or not time_value:
        return 0
    
    try:
        # Handle string formats
        if isinstance(time_value, str):
            # Remove any non-numeric characters except colons and decimal points
            time_value = re.sub(r'[^\d:.]', '', time_value)
            
            # Handle HH:MM:SS format (like "0:53:00")
            if ':' in time_value:
                parts = time_value.split(':')
                h = int(parts[0]) if len(parts) > 0 and parts[0] else 0
                m = int(parts[1]) if len(parts) > 1 and parts[1] else 0
                s = int(parts[2]) if len(parts) > 2 and parts[2] else 0
                total_hours = h + m/60 + s/3600
            
            # Handle decimal format
            elif '.' in time_value:
                total_hours = float(time_value)
            
            # Handle pure number (assume hours)
            else:
                total_hours = float(time_value)
        
        # Handle numeric formats
        else:
            total_hours = float(time_value)
        
        # Round up to nearest whole hour
        return math.ceil(total_hours)
    
    except Exception as e:
        print(f"Error rounding hours for '{time_value}': {e}")
        return 0

# -------------------- EXTRACT ABOUT COURSE --------------------
def extract_about_course(soup):
    """Extract about course from course description sections"""
    # Method 1: Look for course description in tab-pane overview
    overview_tab = soup.find('div', class_='tab-pane overview-content active')
    if overview_tab:
        # Find the course description section
        description_heading = overview_tab.find('h4', string=re.compile('Course description', re.IGNORECASE))
        if description_heading:
            # Get all paragraphs after the description heading
            description_content = []
            next_elem = description_heading.find_next_sibling()
            while next_elem and next_elem.name != 'h4':
                if next_elem.name == 'p':
                    paragraph_text = next_elem.get_text(strip=True)
                    if paragraph_text and paragraph_text not in ['', 'Tags']:
                        description_content.append(paragraph_text)
                next_elem = next_elem.find_next_sibling()
            
            if description_content:
                return " ".join(description_content)
    
    # Method 2: Look for general description in the content
    description_keywords = ['course description', 'about this course', 'overview', 'introduction']
    for keyword in description_keywords:
        element = soup.find(string=re.compile(keyword, re.IGNORECASE))
        if element:
            parent = element.find_parent()
            if parent:
                # Get content after the heading
                content_parts = []
                next_sibling = parent.find_next_sibling()
                while next_sibling and next_sibling.name in ['p', 'div']:
                    text_content = next_sibling.get_text(strip=True)
                    if text_content:
                        content_parts.append(text_content)
                    next_sibling = next_sibling.find_next_sibling()
                
                if content_parts:
                    return " ".join(content_parts)
    
    return "About course not found"

# -------------------- EXTRACT PRICE --------------------
def extract_price(soup):
    """Extract price from various price-related elements"""
    # Method 1: Look for price-current class
    price_current = soup.find('span', class_='price-current')
    if price_current:
        price_text = price_current.get_text(strip=True)
        if price_text:
            return price_text
    
    # Method 2: Look for bdi element (WordPress price)
    price_bdi = soup.find('bdi')
    if price_bdi:
        price_text = price_bdi.get_text(strip=True)
        if price_text:
            return price_text
    
    # Method 3: Look for common price patterns
    price_patterns = [
        r'₹\s*\d+[,\d]*\.?\d*',
        r'$\s*\d+[,\d]*\.?\d*',
        r'€\s*\d+[,\d]*\.?\d*',
        r'£\s*\d+[,\d]*\.?\d*',
        r'\d+[,\d]*\.?\d*\s*₹',
        r'\d+[,\d]*\.?\d*\s*$',
    ]
    
    for pattern in price_patterns:
        price_match = soup.find(string=re.compile(pattern))
        if price_match:
            return price_match.strip()
    
    return "Not available"

# -------------------- EXTRACT DURATION --------------------
def extract_duration(soup):
    """Extract duration from course features format"""
    # Method 1: Look for course-features-custom with duration
    course_features = soup.find_all('div', class_='course-features-custom')
    for feature in course_features:
        duration_number = feature.find('div', class_='feature-custom-number')
        feature_text = feature.find('div', class_='feature-custom-text')
        
        if duration_number and feature_text:
            feature_text_text = feature_text.get_text(strip=True)
            if feature_text_text and 'duration' in feature_text_text.lower():
                duration_num_text = duration_number.get_text(strip=True)
                if duration_num_text:
                    return duration_num_text
    
    # Method 2: Look for duration in traditional format
    about_sections = soup.find_all("p")
    for section in about_sections:
        strong_tag = section.find("strong")
        if strong_tag and "duration" in strong_tag.get_text(strip=True).lower():
            duration_text = clean_text(section.get_text(strip=True))
            if duration_text:
                return duration_text
    
    # Method 3: Look for common duration patterns
    duration_patterns = [
        r'\d+\s*(months?|weeks?|days?|hours?|minutes?)',
        r'\d+:\d+:\d+',  # HH:MM:SS format
        r'\d+:\d+',      # HH:MM format
    ]
    
    for pattern in duration_patterns:
        duration_match = soup.find(string=re.compile(pattern, re.IGNORECASE))
        if duration_match:
            return duration_match.strip()
    
    return "Not available"

# -------------------- EXTRACT LANGUAGE --------------------
def extract_language(soup):
    """Extract language information from the page"""
    # Method 1: Look for language text with language icon
    language_icon = soup.find('i', class_='fa-language')
    if language_icon:
        language_parent = language_icon.find_parent('p')
        if language_parent:
            language_text = language_parent.get_text(strip=True)
            # Remove the icon text and keep only the language info
            language_text = re.sub(r'.*Language', '', language_text).strip()
            if language_text:
                return language_text
    
    # Method 2: Look for common languages in the text
    languages = ['Hindi', 'English', 'Spanish', 'French', 'German', 'Chinese', 'Japanese']
    for lang in languages:
        lang_match = soup.find(string=re.compile(lang, re.IGNORECASE))
        if lang_match:
            return lang
    
    return "Not available"

# -------------------- EXTRACT SYLLABUS FROM BUTTONS --------------------
def extract_syllabus_from_buttons(soup):
    """Extract syllabus content from button elements with video classes"""
    syllabus_content = []
    
    # Find all buttons with video-related classes
    syllabus_buttons = soup.find_all('a', class_=re.compile(r'button.*video|video.*button', re.IGNORECASE))
    
    for button in syllabus_buttons:
        button_text = button.get_text(strip=True)
        if button_text and button_text not in ['', 'Play', 'Watch']:
            # Clean the text and add to syllabus
            clean_button_text = clean_text(button_text)
            if clean_button_text:
                syllabus_content.append(f"- {clean_button_text}")
    
    # Also look for elements with play icons
    play_icons = soup.find_all('i', class_=re.compile(r'fa-play|fa-video', re.IGNORECASE))
    for icon in play_icons:
        parent_text = icon.find_parent().get_text(strip=True)
        if parent_text:
            clean_parent_text = clean_text(parent_text.replace('Play', '').replace('Watch', '').strip())
            if clean_parent_text and clean_parent_text not in [item.replace('- ', '') for item in syllabus_content]:
                syllabus_content.append(f"- {clean_parent_text}")
    
    return "\n".join(syllabus_content) if syllabus_content else "Not available"

# -------------------- SCRAPER --------------------
def scrape_course_data(url):
    driver = get_driver()
    try:
        print(f"🌐 Accessing URL: {url}")
        driver.get(url)

        # Wait for page to load
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "h1")))
        time.sleep(3)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # 1. Course Name - UPDATED to target h5 with mb-2 class
        course_name_tag = soup.find("h5", class_="mb-2")
        if not course_name_tag:
            # Fallback methods if h5.mb-2 not found
            course_name_tag = soup.find("h1", class_=re.compile(r"product_title|entry-title|title", re.IGNORECASE))
            if not course_name_tag:
                course_name_tag = soup.find("h1")
        course_name = course_name_tag.get_text(strip=True) if course_name_tag else "Course name not found"
        print(f"📛 Course Name: {course_name}")

        # 2. Language
        language = extract_language(soup)
        print(f"🗣️ Language: {language}")

        # 3. About Course - UPDATED
        about_course = extract_about_course(soup)
        print(f"📝 About Course: {about_course[:80]}...")

        # 4. Eligibility
        eligibility = "Not available"
        about_sections = soup.find_all("p")
        for section in about_sections:
            strong_tag = section.find("strong")
            if strong_tag and "eligibility" in strong_tag.get_text(strip=True).lower():
                eligibility = clean_text(section.get_text(strip=True))
                break
        print(f"✅ Eligibility: {eligibility}")

        # 5. Duration - UPDATED to use new extraction method
        duration = extract_duration(soup)
        print(f"⏱️ Duration: {duration}")

        # 6. Price - UPDATED
        price = extract_price(soup)
        print(f"💰 Price: {price}")

        # 7. Who Should Take It
        who_content = []
        who_strong = soup.find("strong", string=re.compile(r"Who Should Enroll|Who Should Take It", re.IGNORECASE))
        if who_strong:
            next_ul = who_strong.find_next("ul")
            if next_ul:
                for li in next_ul.find_all("li"):
                    who_content.append(clean_text(li.get_text(strip=True)))
        who_should_take = "\n".join([f"- {item}" for item in who_content]) if who_content else "Not available"
        print(f"👥 Who Should Take It: {who_should_take}")

        # 8. Syllabus extraction - From buttons only
        syllabus_buttons = extract_syllabus_from_buttons(soup)
        print(f"📚 Syllabus: {syllabus_buttons[:80]}...")

        # 9. Certificate
        cert_link = "Certificate not available"
        # Method 1: Look for certificate images
        cert_imgs = soup.find_all('img', src=re.compile(r'certificate', re.IGNORECASE))
        for cert_img in cert_imgs:
            if cert_img.has_attr('src'):
                cert_link = cert_img['src']
                break
        
        # Method 2: Look for PDF links
        if cert_link == "Certificate not available":
            cert_tag = soup.find("a", href=re.compile(r"\.pdf$"))
            if cert_tag and cert_tag.has_attr("href"):
                cert_link = cert_tag["href"]
        
        print(f"📜 Certificate: {cert_link}")

        return (course_name, language, about_course, eligibility, 
                duration, price, who_should_take, syllabus_buttons, cert_link)

    except Exception as e:
        print(f"🔥 Scraping failed for {url}: {str(e)}")
        import traceback
        traceback.print_exc()
        return ["Error"] * 9
    finally:
        driver.quit()
        print("🚪 Browser closed")

# -------------------- SAVE TO EXCEL --------------------
def save_to_excel(data, file_path, url):
    columns = [
        "Course Name",
        "Language", 
        "About Course",
        "Eligibility",
        "Duration", 
        "Price",
        "Who Should Take It",
        "Syllabus",
        "Certificate",
        "Round Hours",
        "Course URL"
    ]
    
    (course_name, language, about_course, eligibility, 
     duration, price, who_should_take, syllabus_buttons, cert_link) = data
     
    try:
        if os.path.exists(file_path):
            df = pd.read_excel(file_path)
        else:
            df = pd.DataFrame(columns=columns)

        # Calculate rounded hours
        rounded_hours = roundup_hours_simple(duration)
        print(f"⏰ Rounded Hours: {rounded_hours}")

        # ✅ Avoid duplicate entries
        if url not in df["Course URL"].values:
            row = {
                "Course Name": course_name,
                "Language": language,
                "About Course": about_course,
                "Eligibility": eligibility,
                "Duration": duration,
                "Price": price,
                "Who Should Take It": who_should_take,
                "Syllabus": syllabus_buttons,
                "Certificate": cert_link,
                "Round Hours": rounded_hours,
                "Course URL": url
            }
            df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
            df.to_excel(file_path, index=False)
            print(f"💾 Saved data for: {course_name}")
            
            # Print results for verification
            print("\n📊 Results:")
            print(df[['Course Name', 'Duration', 'Round Hours']].tail())
        else:
            print(f"⚠️ Skipped duplicate: {url}")

    except Exception as e:
        print(f"❌ Excel save error: {e}")

# -------------------- MAIN --------------------
if __name__ == "__main__":
    course_urls = [
        "https://iisdt.com/course/big-data-analytics-hadoop-college"
    ]

    print("🚀 Starting scraping process...")
    file_path = r"C:\Users\taslim.siddiqui\Downloads\cours1.xlsx"

    for course_url in course_urls:
        print(f"\n🔍 Processing: {course_url}")
        course_data = scrape_course_data(course_url)
        if all(item != "Error" for item in course_data):
            save_to_excel(course_data, file_path, course_url)
        else:
            print(f"❌ Failed to scrape complete data for {course_url}")

    print("\n✅ Process completed")

🚀 Starting scraping process...

🔍 Processing: https://iisdt.com/course/big-data-analytics-hadoop-college
🌐 Accessing URL: https://iisdt.com/course/big-data-analytics-hadoop-college
📛 Course Name: Big Data Analytics (Hadoop)
🗣️ Language: English
📝 About Course: Big data analytics is the application of cutting-edge methods to various types o...
✅ Eligibility: Not available
⏱️ Duration: 4:14:04
💰 Price: ₹2,500.00
👥 Who Should Take It: Not available
📚 Syllabus: - Introduction and outline of big data analytics
- What is big data analytics in...
📜 Certificate: https://iisdt.com/public/uploads/b2b_certificate/demo_certificate/27.png
🚪 Browser closed
⏰ Rounded Hours: 5
💾 Saved data for: Big Data Analytics (Hadoop)

📊 Results:
                   Course Name Duration  Round Hours
0           Verify Certificate  6:44:06            7
1  Big Data Analytics (Hadoop)  4:14:04            5

✅ Process completed


Entire excel file  https://iisdt.com/course/starting-e-commerce-business-from-scratch

In [23]:
import time
import os
import re
import math
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# -------------------- DRIVER SETUP --------------------
def get_driver(headless=False):
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1280,720")
    options.add_argument("--log-level=3")
    if headless:
        options.add_argument("--headless=new")  # modern headless
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver

# -------------------- TEXT CLEANING --------------------
def clean_text(text):
    """Remove specified words from text"""
    words_to_remove = [
        "Objective:", "Objective :",
        "Eligibility:", "Eligibility :",
        "Duration:", "Duration :",
        "Professional Skills"
    ]
    for word in words_to_remove:
        text = text.replace(word, "")
    return text.strip()

# -------------------- ROUND HOURS FUNCTION --------------------
def roundup_hours_simple(time_value):
    """Simple function to handle all time formats"""
    if pd.isna(time_value) or time_value == "Not available" or not time_value:
        return 0
    
    try:
        # Handle string formats
        if isinstance(time_value, str):
            # Remove any non-numeric characters except colons and decimal points
            time_value = re.sub(r'[^\d:.]', '', time_value)
            
            # Handle HH:MM:SS format (like "0:53:00")
            if ':' in time_value:
                parts = time_value.split(':')
                h = int(parts[0]) if len(parts) > 0 and parts[0] else 0
                m = int(parts[1]) if len(parts) > 1 and parts[1] else 0
                s = int(parts[2]) if len(parts) > 2 and parts[2] else 0
                total_hours = h + m/60 + s/3600
            
            # Handle decimal format
            elif '.' in time_value:
                total_hours = float(time_value)
            
            # Handle pure number (assume hours)
            else:
                total_hours = float(time_value)
        
        # Handle numeric formats
        else:
            total_hours = float(time_value)
        
        # Round up to nearest whole hour
        return math.ceil(total_hours)
    
    except Exception as e:
        print(f"Error rounding hours for '{time_value}': {e}")
        return 0

# -------------------- EXTRACT ABOUT COURSE --------------------
def extract_about_course(soup):
    """Extract about course from course description sections"""
    # Method 1: Look for course description in tab-pane overview
    overview_tab = soup.find('div', class_='tab-pane overview-content active')
    if overview_tab:
        # Find the course description section
        description_heading = overview_tab.find('h4', string=re.compile('Course description', re.IGNORECASE))
        if description_heading:
            # Get all paragraphs after the description heading
            description_content = []
            next_elem = description_heading.find_next_sibling()
            while next_elem and next_elem.name != 'h4':
                if next_elem.name == 'p':
                    paragraph_text = next_elem.get_text(strip=True)
                    if paragraph_text and paragraph_text not in ['', 'Tags']:
                        description_content.append(paragraph_text)
                next_elem = next_elem.find_next_sibling()
            
            if description_content:
                return " ".join(description_content)
    
    # Method 2: Look for general description in the content
    description_keywords = ['course description', 'about this course', 'overview', 'introduction']
    for keyword in description_keywords:
        element = soup.find(string=re.compile(keyword, re.IGNORECASE))
        if element:
            parent = element.find_parent()
            if parent:
                # Get content after the heading
                content_parts = []
                next_sibling = parent.find_next_sibling()
                while next_sibling and next_sibling.name in ['p', 'div']:
                    text_content = next_sibling.get_text(strip=True)
                    if text_content:
                        content_parts.append(text_content)
                    next_sibling = next_sibling.find_next_sibling()
                
                if content_parts:
                    return " ".join(content_parts)
    
    return "About course not found"

# -------------------- EXTRACT PRICE --------------------
def extract_price(soup):
    """Extract price from various price-related elements"""
    # Method 1: Look for price-current class
    price_current = soup.find('span', class_='price-current')
    if price_current:
        price_text = price_current.get_text(strip=True)
        if price_text:
            return price_text
    
    # Method 2: Look for bdi element (WordPress price)
    price_bdi = soup.find('bdi')
    if price_bdi:
        price_text = price_bdi.get_text(strip=True)
        if price_text:
            return price_text
    
    # Method 3: Look for common price patterns
    price_patterns = [
        r'₹\s*\d+[,\d]*\.?\d*',
        r'$\s*\d+[,\d]*\.?\d*',
        r'€\s*\d+[,\d]*\.?\d*',
        r'£\s*\d+[,\d]*\.?\d*',
        r'\d+[,\d]*\.?\d*\s*₹',
        r'\d+[,\d]*\.?\d*\s*$',
    ]
    
    for pattern in price_patterns:
        price_match = soup.find(string=re.compile(pattern))
        if price_match:
            return price_match.strip()
    
    return "Not available"

# -------------------- EXTRACT DURATION --------------------
def extract_duration(soup):
    """Extract duration from course features format"""
    # Method 1: Look for course-features-custom with duration
    course_features = soup.find_all('div', class_='course-features-custom')
    for feature in course_features:
        duration_number = feature.find('div', class_='feature-custom-number')
        feature_text = feature.find('div', class_='feature-custom-text')
        
        if duration_number and feature_text:
            feature_text_text = feature_text.get_text(strip=True)
            if feature_text_text and 'duration' in feature_text_text.lower():
                duration_num_text = duration_number.get_text(strip=True)
                if duration_num_text:
                    return duration_num_text
    
    # Method 2: Look for duration in traditional format
    about_sections = soup.find_all("p")
    for section in about_sections:
        strong_tag = section.find("strong")
        if strong_tag and "duration" in strong_tag.get_text(strip=True).lower():
            duration_text = clean_text(section.get_text(strip=True))
            if duration_text:
                return duration_text
    
    # Method 3: Look for common duration patterns
    duration_patterns = [
        r'\d+\s*(months?|weeks?|days?|hours?|minutes?)',
        r'\d+:\d+:\d+',  # HH:MM:SS format
        r'\d+:\d+',      # HH:MM format
    ]
    
    for pattern in duration_patterns:
        duration_match = soup.find(string=re.compile(pattern, re.IGNORECASE))
        if duration_match:
            return duration_match.strip()
    
    return "Not available"

# -------------------- EXTRACT LANGUAGE --------------------
def extract_language(soup):
    """Extract language information from the page"""
    # Method 1: Look for language text with language icon
    language_icon = soup.find('i', class_='fa-language')
    if language_icon:
        language_parent = language_icon.find_parent('p')
        if language_parent:
            language_text = language_parent.get_text(strip=True)
            # Remove the icon text and keep only the language info
            language_text = re.sub(r'.*Language', '', language_text).strip()
            if language_text:
                return language_text
    
    # Method 2: Look for common languages in the text
    languages = ['Hindi', 'English', 'Spanish', 'French', 'German', 'Chinese', 'Japanese']
    for lang in languages:
        lang_match = soup.find(string=re.compile(lang, re.IGNORECASE))
        if lang_match:
            return lang
    
    return "Not available"

# -------------------- EXTRACT SYLLABUS FROM BUTTONS --------------------
def extract_syllabus_from_buttons(soup):
    """Extract syllabus content from button elements with video classes"""
    syllabus_content = []
    
    # Find all buttons with video-related classes
    syllabus_buttons = soup.find_all('a', class_=re.compile(r'button.*video|video.*button', re.IGNORECASE))
    
    for button in syllabus_buttons:
        button_text = button.get_text(strip=True)
        if button_text and button_text not in ['', 'Play', 'Watch']:
            # Clean the text and add to syllabus
            clean_button_text = clean_text(button_text)
            if clean_button_text:
                syllabus_content.append(f"- {clean_button_text}")
    
    # Also look for elements with play icons
    play_icons = soup.find_all('i', class_=re.compile(r'fa-play|fa-video', re.IGNORECASE))
    for icon in play_icons:
        parent_text = icon.find_parent().get_text(strip=True)
        if parent_text:
            clean_parent_text = clean_text(parent_text.replace('Play', '').replace('Watch', '').strip())
            if clean_parent_text and clean_parent_text not in [item.replace('- ', '') for item in syllabus_content]:
                syllabus_content.append(f"- {clean_parent_text}")
    
    return "\n".join(syllabus_content) if syllabus_content else "Not available"

# -------------------- SCRAPER --------------------
def scrape_course_data(url):
    driver = get_driver()
    try:
        print(f"🌐 Accessing URL: {url}")
        driver.get(url)

        # Wait for page to load
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "h1")))
        time.sleep(3)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # 1. Course Name - UPDATED to target h5 with mb-2 class
        course_name_tag = soup.find("h5", class_="mb-2")
        if not course_name_tag:
            # Fallback methods if h5.mb-2 not found
            course_name_tag = soup.find("h1", class_=re.compile(r"product_title|entry-title|title", re.IGNORECASE))
            if not course_name_tag:
                course_name_tag = soup.find("h1")
        course_name = course_name_tag.get_text(strip=True) if course_name_tag else "Course name not found"
        print(f"📛 Course Name: {course_name}")

        # 2. Language
        language = extract_language(soup)
        print(f"🗣️ Language: {language}")

        # 3. About Course - UPDATED
        about_course = extract_about_course(soup)
        print(f"📝 About Course: {about_course[:80]}...")

        # 4. Eligibility
        eligibility = "Not available"
        about_sections = soup.find_all("p")
        for section in about_sections:
            strong_tag = section.find("strong")
            if strong_tag and "eligibility" in strong_tag.get_text(strip=True).lower():
                eligibility = clean_text(section.get_text(strip=True))
                break
        print(f"✅ Eligibility: {eligibility}")

        # 5. Duration - UPDATED to use new extraction method
        duration = extract_duration(soup)
        print(f"⏱️ Duration: {duration}")

        # 6. Price - UPDATED
        price = extract_price(soup)
        print(f"💰 Price: {price}")

        # 7. Who Should Take It
        who_content = []
        who_strong = soup.find("strong", string=re.compile(r"Who Should Enroll|Who Should Take It", re.IGNORECASE))
        if who_strong:
            next_ul = who_strong.find_next("ul")
            if next_ul:
                for li in next_ul.find_all("li"):
                    who_content.append(clean_text(li.get_text(strip=True)))
        who_should_take = "\n".join([f"- {item}" for item in who_content]) if who_content else "Not available"
        print(f"👥 Who Should Take It: {who_should_take}")

        # 8. Syllabus extraction - From buttons only
        syllabus_buttons = extract_syllabus_from_buttons(soup)
        print(f"📚 Syllabus: {syllabus_buttons[:80]}...")

        # 9. Certificate
        cert_link = "Certificate not available"
        # Method 1: Look for certificate images
        cert_imgs = soup.find_all('img', src=re.compile(r'certificate', re.IGNORECASE))
        for cert_img in cert_imgs:
            if cert_img.has_attr('src'):
                cert_link = cert_img['src']
                break
        
        # Method 2: Look for PDF links
        if cert_link == "Certificate not available":
            cert_tag = soup.find("a", href=re.compile(r"\.pdf$"))
            if cert_tag and cert_tag.has_attr("href"):
                cert_link = cert_tag["href"]
        
        print(f"📜 Certificate: {cert_link}")

        return (course_name, language, about_course, eligibility, 
                duration, price, who_should_take, syllabus_buttons, cert_link)

    except Exception as e:
        print(f"🔥 Scraping failed for {url}: {str(e)}")
        import traceback
        traceback.print_exc()
        return ["Error"] * 9
    finally:
        driver.quit()
        print("🚪 Browser closed")

# -------------------- SAVE TO EXCEL --------------------
def save_to_excel(data, file_path, url):
    columns = [
        "Course Name",
        "Language", 
        "About Course",
        "Eligibility",
        "Duration", 
        "Price",
        "Who Should Take It",
        "Syllabus",
        "Certificate",
        "Round Hours",
        "Course URL"
    ]
    
    (course_name, language, about_course, eligibility, 
     duration, price, who_should_take, syllabus_buttons, cert_link) = data
     
    try:
        if os.path.exists(file_path):
            df = pd.read_excel(file_path)
        else:
            df = pd.DataFrame(columns=columns)

        # Calculate rounded hours
        rounded_hours = roundup_hours_simple(duration)
        print(f"⏰ Rounded Hours: {rounded_hours}")

        # ✅ Avoid duplicate entries
        if url not in df["Course URL"].values:
            row = {
                "Course Name": course_name,
                "Language": language,
                "About Course": about_course,
                "Eligibility": eligibility,
                "Duration": duration,
                "Price": price,
                "Who Should Take It": who_should_take,
                "Syllabus": syllabus_buttons,
                "Certificate": cert_link,
                "Round Hours": rounded_hours,
                "Course URL": url
            }
            df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
            df.to_excel(file_path, index=False)
            print(f"💾 Saved data for: {course_name}")
            
            # Print results for verification
            print("\n📊 Results:")
            print(df[['Course Name', 'Duration', 'Round Hours']].tail())
        else:
            print(f"⚠️ Skipped duplicate: {url}")

    except Exception as e:
        print(f"❌ Excel save error: {e}")

# -------------------- MAIN --------------------
if __name__ == "__main__":
    print("🚀 Starting scraping process...")
    input_file = r"C:\Users\taslim.siddiqui\Downloads\course_duration_data.xlsx"
    output_file = r"C:\Users\taslim.siddiqui\Downloads\surewin_output.xlsx"
    try:
        df = pd.read_excel(input_file)
        if "Course Link" not in df.columns:
            raise ValueError("❌ Excel must contain a 'Course Link' column")
        course_urls = df["Course Link"].dropna().tolist()
    except Exception as e:
        print(f"❌ Failed to read input file: {e}")
        course_urls = []
    for course_url in course_urls:
        print(f"\n🔍 Processing: {course_url}")
        course_data = scrape_course_data(course_url)
        if all(item != "Error" for item in course_data):
            save_to_excel(course_data, output_file, course_url)
        else:
            print(f"❌ Failed to scrape complete data for {course_url}")
    print("\n✅ Process completed")


🚀 Starting scraping process...

🔍 Processing: https://iisdt.com/course/starting-e-commerce-business-from-scratch
🌐 Accessing URL: https://iisdt.com/course/starting-e-commerce-business-from-scratch
📛 Course Name: Starting E-Commerce Business From Scratch
🗣️ Language: Hindi
📝 About Course: If you think of an item and type the word into your web browser, you will mostly...
✅ Eligibility: Not available
⏱️ Duration: 6:44:06
💰 Price: ₹2,500.00
👥 Who Should Take It: Not available
📚 Syllabus: - Introduction to the Program | Nishkarsh Sharma
- Introduction to e-Commerce
- ...
📜 Certificate: https://iisdt.com/public/uploads/b2b_certificate/demo_certificate/27.png
🚪 Browser closed
⏰ Rounded Hours: 7
💾 Saved data for: Starting E-Commerce Business From Scratch

📊 Results:
                                 Course Name Duration Round Hours
0  Starting E-Commerce Business From Scratch  6:44:06           7

🔍 Processing: https://iisdt.com/course/jewellery-crafting-handmade-2
🌐 Accessing URL: https://iis