Test for one course 

In [3]:
import time
import os
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# -------------------- DRIVER SETUP --------------------
def get_driver(headless=False):
    """Create a Chrome WebDriver configured for scraping.

    Args:
        headless: When true, Chrome is started with ``--headless=new``.

    Returns:
        A ``webdriver.Chrome`` instance whose chromedriver binary is
        resolved through ``ChromeDriverManager``.
    """
    chrome_flags = [
        "--disable-blink-features=AutomationControlled",
        "--window-size=1280,720",
        "--log-level=3",
    ]
    if headless:
        chrome_flags.append("--headless=new")

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

# -------------------- TEXT CLEANING --------------------
def clean_text(text):
    """Strip known field labels out of ``text`` and trim the result.

    Every occurrence of each label is removed (case-sensitive), in the
    order listed, and surrounding whitespace is stripped afterwards.
    """
    labels = (
        "Objective:", "Objective :",
        "Eligibility:", "Eligibility :",
        "Duration:", "Duration :",
        "Professional Skills",
    )
    cleaned = text
    for label in labels:
        cleaned = "".join(cleaned.split(label)) if label in cleaned else cleaned
    return cleaned.strip()

# -------------------- IMPROVED SYLLABUS EXTRACTION --------------------
def extract_syllabus(soup):
    """Extract syllabus modules from a parsed course page.

    Three strategies are tried in order; the first one that yields at least
    one *accepted* module wins:

    1. ``<p>`` tags carrying both ``data-start`` and ``data-end`` attributes,
       using their first ``<strong>`` as the title.
    2. ``<span>`` tags styled with ``font-size: 12pt`` whose ``<strong>``
       mentions "module".
    3. Any ``<strong>`` mentioning "module", paired with its nearest
       ``<p>``/``<div>`` ancestor.

    A module is accepted only when its cleaned title contains "module" and
    none of the job/career/placement keywords. The filter is applied inside
    each strategy so that a strategy which only finds non-module paragraphs
    does not block the fallbacks.

    Args:
        soup: BeautifulSoup document for the course page.

    Returns:
        A list of ``{"title": ..., "content": ...}`` dicts, or a single
        placeholder entry titled "Syllabus not available" when nothing
        could be extracted.
    """
    excluded_words = ('job', 'career', 'opportunit', 'placement')

    def build_module(raw_title, container):
        """Return a module dict for ``raw_title`` in ``container``, or None."""
        # The container text includes the title itself; drop it and any
        # leading colon so only the description remains.
        body = container.get_text(strip=True).replace(raw_title, '').strip()
        body = re.sub(r'^:\s*', '', body)
        if not (raw_title and body):
            return None
        title = clean_text(raw_title)
        lowered = title.lower()
        if 'module' not in lowered or any(word in lowered for word in excluded_words):
            return None
        return {"title": title, "content": clean_text(body)}

    modules = []

    print("🔍 Searching for syllabus modules...")

    # Strategy 1: paragraphs with data-start and data-end attributes
    for p_tag in soup.find_all('p', attrs={'data-start': True, 'data-end': True}):
        strong_tag = p_tag.find('strong')
        if not strong_tag:
            continue
        module_title = strong_tag.get_text(strip=True)
        module = build_module(module_title, p_tag)
        if module:
            modules.append(module)
            print(f"✅ Found module: {module_title[:50]}...")

    # Strategy 2: span with font-size 12pt
    if not modules:
        for span in soup.find_all('span', style=re.compile(r'font-size:\s*12pt')):
            strong_tag = span.find('strong')
            if not strong_tag:
                continue
            module_title = strong_tag.get_text(strip=True)
            if 'module' not in module_title.lower():
                continue
            module = build_module(module_title, span)
            if module:
                modules.append(module)

    # Strategy 3: any strong tag containing "Module"
    if not modules:
        for strong in soup.find_all('strong'):
            text = strong.get_text(strip=True)
            if 'module' not in text.lower():
                continue
            parent = strong.find_parent(['p', 'div'])
            if not parent:
                continue
            module = build_module(text, parent)
            if module:
                modules.append(module)

    print(f"📊 Total modules found: {len(modules)}")

    if modules:
        return modules
    return [{"title": "Syllabus not available", "content": "Could not extract syllabus content"}]

# -------------------- CLEAN MODULE TITLES --------------------
def clean_module_title(title):
    """Remove leading 'Module N:' prefix(es) from a module title.

    The match is case-insensitive and tolerates optional whitespace around
    the number and before the colon, so "MODULE 1: X", "Module 1 : X" and
    repeated prefixes such as "Module 1: Module 1: X" all reduce to "X".
    Titles without such a prefix are returned trimmed but otherwise intact.
    """
    stripped = re.sub(
        r'^(?:Module\s*\d+\s*:\s*)+', '', title.strip(), flags=re.IGNORECASE
    )
    return stripped.strip()

# -------------------- FORMAT SYLLABUS FOR EXCEL --------------------
def format_syllabus_for_excel(modules):
    """Render extracted modules as one text blob for a spreadsheet cell.

    Modules are renumbered from 1 and emitted as
    ``Module N: <title>: <content>``, separated by blank lines. An empty
    list, or a lone placeholder entry whose title contains "not available",
    yields the string "Syllabus not available".
    """
    if not modules:
        return "Syllabus not available"
    if len(modules) == 1 and "not available" in modules[0]["title"].lower():
        return "Syllabus not available"

    entries = []
    for number, module in enumerate(modules, 1):
        heading = clean_module_title(module['title'])
        body = module['content']
        if not body or body == "Content not available":
            body = "Content details to be provided"
        entries.append(f"Module {number}: {heading}: {body}")

    return "\n\n".join(entries).strip()

# -------------------- SYLLABUS TRANSFORM --------------------
def transform_syllabus(text: str) -> str:
    """Replace '::' after module headers with a newline, keeping data intact.

    A header is ``Module <n>: <title>`` (case-insensitive); the ``::`` that
    follows it, plus any whitespace, is replaced by a newline and one space.

    Non-string input (``None``, NaN, lists, ...) is returned unchanged. Only
    ``isinstance`` is checked: NaN is never a ``str``, and calling
    ``pd.isna`` on a list yields an array whose truth value is ambiguous.
    """
    if not isinstance(text, str):
        return text

    module_header_re = re.compile(
        r"(Module\s*\d+\s*:\s*[^:]+?)::\s*", flags=re.IGNORECASE
    )
    return module_header_re.sub(lambda m: m.group(1) + "\n ", text)

# -------------------- IMPROVED SCRAPER --------------------
def scrape_course_data(url):
    """Scrape one course page and return its fields.

    Opens ``url`` in a fresh Chrome session, clicks the first tab whose
    label mentions description/syllabus/curriculum (if any), then parses
    the rendered HTML with BeautifulSoup. Progress is printed as it goes
    and the browser is always closed before returning.

    Args:
        url: Address of the course page to load.

    Returns:
        A 9-tuple ``(course_name, about_course, eligibility, duration,
        price, who_should_take, syllabus_content, cert_link,
        fee_structure)``. Fields that cannot be located hold a placeholder
        string. If anything raises while scraping, a list of nine
        ``"Error"`` strings is returned instead.
    """
    driver = get_driver()
    try:
        print(f"🌐 Accessing URL: {url}")
        driver.get(url)
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        time.sleep(5)

        # Click description/syllabus tab if available (best effort).
        try:
            tabs = driver.find_elements(By.CSS_SELECTOR, ".woocommerce-tabs li a, .tabs li a, .wc-tabs li a")
            for tab in tabs:
                label = tab.text
                if any(key in label.lower() for key in ('description', 'syllabus', 'curriculum')):
                    print(f"📑 Clicking on tab: {label}")
                    # JS click avoids failures when the tab is covered by an overlay.
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(3)
                    break
        except Exception as e:
            print(f"ℹ️ No tabs found or clickable: {e}")

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Course Name
        course_name_tag = soup.find("h1", class_="product_title entry-title")
        course_name = course_name_tag.get_text(strip=True) if course_name_tag else "Course name not found"
        print(f"📛 Course Name: {course_name}")

        # About Course: first heading/label matching a pattern, in priority order.
        about_course = "About course not found"
        about_patterns = [
            ("strong", re.compile(r"objective|about|course overview", re.IGNORECASE)),
            ("h2", re.compile(r"course objective|about", re.IGNORECASE)),
            ("h3", re.compile(r"objective|overview", re.IGNORECASE))
        ]
        for tag_name, pattern in about_patterns:
            elements = soup.find_all(tag_name, string=pattern)
            for element in elements:
                parent = element.find_parent(['p', 'div'])
                if parent:
                    about_course = clean_text(parent.get_text(" ", strip=True))
                    break
            if about_course != "About course not found":
                break
        print(f"📝 About Course: {about_course[:100]}...")

        # Eligibility
        eligibility = "Not available"
        eligibility_elements = soup.find_all(['strong', 'h3', 'h4'], string=re.compile(r"eligibility", re.IGNORECASE))
        for element in eligibility_elements:
            parent = element.find_parent(['p', 'div'])
            if parent:
                eligibility = clean_text(parent.get_text(strip=True))
                break
        print(f"✅ Eligibility: {eligibility}")

        # Duration
        duration = "Not available"
        duration_elements = soup.find_all(['strong', 'h3', 'h4'], string=re.compile(r"duration", re.IGNORECASE))
        for element in duration_elements:
            parent = element.find_parent(['p', 'div'])
            if parent:
                duration = clean_text(parent.get_text(strip=True))
                break
        print(f"⏱️ Duration: {duration}")

        # Price: take the first selector that yields non-empty text. An empty
        # match must not overwrite the "Not available" placeholder.
        price = "Not available"
        price_selectors = [".price .amount", ".woocommerce-Price-amount", "bdi", ".product_price", ".course-price"]
        for selector in price_selectors:
            price_tag = soup.select_one(selector)
            price_text = price_tag.get_text(strip=True) if price_tag else ""
            if price_text and price_text != "Not available":
                price = price_text
                break
        print(f"💰 Price: {price}")

        fee_structure = (
            f"{price} \n- All other fees remain unchanged\n"
            "- Education loans are available through leading banks and NBFCs."
        ) if price and price != "Not available" else (
            "- All other fees remain unchanged\n"
            "- Education loans are available through leading banks and NBFCs."
        )
        print(f"💳 Fee Structure: {fee_structure}")

        # Who Should Take It
        who_content = []
        who_patterns = ["strong", "h3", "h4"]
        for pattern in who_patterns:
            who_elements = soup.find_all(pattern, string=re.compile(r"who should|target audience|audience", re.IGNORECASE))
            for element in who_elements:
                next_ul = element.find_next("ul")
                if next_ul:
                    for li in next_ul.find_all("li"):
                        who_content.append(clean_text(li.get_text(strip=True)))
                else:
                    parent = element.find_parent(['p', 'div'])
                    if parent:
                        text = parent.get_text(strip=True).replace(element.get_text(strip=True), "").strip()
                        if text:
                            who_content.append(text)
        # A <strong> nested in a matching <h3>/<h4> makes both tags match and
        # resolve to the same list; drop repeats while keeping order.
        who_content = list(dict.fromkeys(who_content))
        who_should_take = "\n".join([f"- {item}" for item in who_content]) if who_content else "Not available"
        print(f"👥 Who Should Take It: {who_should_take}")

        # Syllabus
        modules = extract_syllabus(soup)
        syllabus_content = format_syllabus_for_excel(modules)
        syllabus_content = transform_syllabus(syllabus_content)
        print(f"📚 Found {len(modules)} modules in syllabus")

        # Certificate: first element matching any selector, href preferred over src.
        cert_link = "Certificate not available"
        cert_selectors = ["a[href*='certificate']", "a[href*='.pdf']", "img[src*='certificate']", "a[href*='certif']"]
        for selector in cert_selectors:
            cert_element = soup.select_one(selector)
            if cert_element:
                if cert_element.has_attr('href'):
                    cert_link = cert_element['href']
                elif cert_element.has_attr('src'):
                    cert_link = cert_element['src']
                break
        print(f"📜 Certificate: {cert_link}")

        return course_name, about_course, eligibility, duration, price, who_should_take, syllabus_content, cert_link, fee_structure

    except Exception as e:
        print(f"🔥 Scraping failed for {url}: {str(e)}")
        import traceback
        traceback.print_exc()
        return ["Error"] * 9
    finally:
        driver.quit()
        print("🚪 Browser closed")

# -------------------- SAVE TO EXCEL --------------------
def save_to_excel(data, file_path, url):
    """Write one scraped course as a single-row Excel workbook.

    Any existing file at ``file_path`` is removed first, so the workbook
    only ever contains the course passed in. Errors while removing or
    writing are caught and printed rather than raised.

    Args:
        data: Nine values in the order returned by ``scrape_course_data``.
        file_path: Destination ``.xlsx`` path.
        url: Course page address, stored in the "Course URL" column.

    Raises:
        ValueError: If ``data`` does not contain exactly nine items.
    """
    columns = [
        "Course Name", "About Course", "Eligibility", "Duration",
        "Price", "Who Should Take It", "syllabus", "Certificate",
        "Fee Structure", "Course URL"
    ]

    (course_name, about_course, eligibility, duration, price,
     who_should_take, syllabus_content, cert_link, fee_structure) = data

    try:
        if os.path.exists(file_path):
            os.remove(file_path)
            print("🗑️ Removed existing file to avoid duplicates")

        row = {
            "Course Name": course_name,
            "About Course": about_course,
            "Eligibility": eligibility,
            "Duration": duration,
            "Price": price,
            "Who Should Take It": who_should_take,
            "syllabus": syllabus_content,
            "Certificate": cert_link,
            "Fee Structure": fee_structure,
            "Course URL": url
        }
        # Build the frame directly: concatenating onto an empty DataFrame is
        # deprecated in recent pandas and adds nothing for a single row.
        df = pd.DataFrame([row], columns=columns)
        df.to_excel(file_path, index=False)
        print(f"💾 Saved data for: {course_name}")
        print(f"📊 Syllabus column contains: {len(syllabus_content)} characters")

    except Exception as e:
        print(f"❌ Excel save error: {e}")

# -------------------- MAIN --------------------
if __name__ == "__main__":
    # Single-course smoke test: scrape each URL and write it to file_path.
    # save_to_excel removes any existing workbook first, so with several
    # URLs only the last successfully scraped course would remain.
    file_path = r"C:\Users\taslim.siddiqui\Downloads\course_test.xlsx"
    course_urls = [
        "https://skillcouncil.in/product/certified-certified-forklift-operator/"
    ]

    print("🚀 Starting scraping process...")

    for course_url in course_urls:
        print(f"\n🔍 Processing: {course_url}")
        course_data = scrape_course_data(course_url)
        # A failed scrape comes back as a list of "Error" markers.
        if "Error" in course_data:
            print(f"❌ Failed to scrape complete data for {course_url}")
            continue
        save_to_excel(course_data, file_path, course_url)

    print("\n✅ Process completed")


🚀 Starting scraping process...

🔍 Processing: https://skillcouncil.in/product/certified-certified-forklift-operator/
🌐 Accessing URL: https://skillcouncil.in/product/certified-certified-forklift-operator/
📑 Clicking on tab: Description
📛 Course Name: Certified Certified Forklift Operator
📝 About Course: The Certified Forklift Operator course is designed to equip participants with the knowledge and hand...
✅ Eligibility: Graduation or Equivalent is required.
⏱️ Duration: One Month.
💰 Price: ₹6,000
💳 Fee Structure: ₹6,000 
- All other fees remain unchanged
- Education loans are available through leading banks and NBFCs.
👥 Who Should Take It: Not available
🔍 Searching for syllabus modules...
✅ Found module: Module 1: Introduction to Forklift Operations and ...
✅ Found module: Module 2: Pre-Operational Checks and Safety Inspec...
✅ Found module: Module 3: Safe Driving and Maneuvering Techniques:...
✅ Found module: Module 4: Load Handling, Stacking, and Unloading P...
✅ Found module: Module

Entire Excel file

In [6]:
import time
import os
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# -------------------- DRIVER SETUP --------------------
def get_driver(headless=False):
    """Build and return a Chrome WebDriver for the scraper.

    Args:
        headless: Add ``--headless=new`` to the Chrome arguments when true.

    Returns:
        A ``webdriver.Chrome`` instance backed by the chromedriver that
        ``ChromeDriverManager`` installs.
    """
    arguments = (
        "--disable-blink-features=AutomationControlled",
        "--window-size=1280,720",
        "--log-level=3",
    ) + (("--headless=new",) if headless else ())

    chrome_options = webdriver.ChromeOptions()
    for argument in arguments:
        chrome_options.add_argument(argument)

    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options,
    )

# -------------------- TEXT CLEANING --------------------
def clean_text(text):
    """Return ``text`` with known field labels removed and whitespace trimmed.

    Labels are matched case-sensitively and every occurrence is dropped,
    in the order listed.
    """
    result = text
    for label in (
        "Objective:", "Objective :",
        "Eligibility:", "Eligibility :",
        "Duration:", "Duration :",
        "Professional Skills",
    ):
        result = result.replace(label, "")
    result = result.strip()
    return result

# -------------------- SYLLABUS EXTRACTION --------------------
def extract_syllabus(soup):
    """Extract syllabus modules from a parsed course page.

    Strategies are tried in order and the first one producing at least one
    *accepted* module is used:

    1. ``<p>`` tags with both ``data-start`` and ``data-end`` attributes,
       titled by their first ``<strong>``.
    2. ``<span>`` tags styled ``font-size: 12pt`` whose ``<strong>``
       mentions "module".
    3. Any ``<strong>`` mentioning "module", with its nearest
       ``<p>``/``<div>`` ancestor as the container.

    A module is accepted when its cleaned title contains "module" and none
    of the job/career/placement keywords. Filtering happens per strategy so
    that non-module paragraphs found by an early strategy cannot suppress
    the later fallbacks.

    Args:
        soup: BeautifulSoup document for the course page.

    Returns:
        A list of ``{"title": ..., "content": ...}`` dicts, or a single
        placeholder entry titled "Syllabus not available".
    """
    excluded_words = ('job', 'career', 'opportunit', 'placement')

    def build_module(raw_title, container):
        """Return a module dict for ``raw_title`` in ``container``, or None."""
        # Container text includes the title; remove it and a leading colon.
        body = container.get_text(strip=True).replace(raw_title, '').strip()
        body = re.sub(r'^:\s*', '', body)
        if not (raw_title and body):
            return None
        title = clean_text(raw_title)
        lowered = title.lower()
        if 'module' not in lowered or any(word in lowered for word in excluded_words):
            return None
        return {"title": title, "content": clean_text(body)}

    modules = []

    # Strategy 1: paragraphs with data-start and data-end attributes
    for p_tag in soup.find_all('p', attrs={'data-start': True, 'data-end': True}):
        strong_tag = p_tag.find('strong')
        if not strong_tag:
            continue
        module = build_module(strong_tag.get_text(strip=True), p_tag)
        if module:
            modules.append(module)

    # Strategy 2: span with font-size 12pt
    if not modules:
        for span in soup.find_all('span', style=re.compile(r'font-size:\s*12pt')):
            strong_tag = span.find('strong')
            if not strong_tag:
                continue
            module_title = strong_tag.get_text(strip=True)
            if 'module' not in module_title.lower():
                continue
            module = build_module(module_title, span)
            if module:
                modules.append(module)

    # Strategy 3: any strong tag containing "Module"
    if not modules:
        for strong in soup.find_all('strong'):
            text = strong.get_text(strip=True)
            if 'module' not in text.lower():
                continue
            parent = strong.find_parent(['p', 'div'])
            if not parent:
                continue
            module = build_module(text, parent)
            if module:
                modules.append(module)

    if modules:
        return modules
    return [{"title": "Syllabus not available", "content": "Could not extract syllabus content"}]

# -------------------- CLEAN MODULE TITLES --------------------
def clean_module_title(title):
    """Strip leading 'Module N:' prefix(es) from a module title.

    Matching ignores case and allows optional whitespace around the number
    and before the colon, so "MODULE 1: X", "Module 1 : X" and repeated
    prefixes all reduce to "X". Titles without a prefix are only trimmed.
    """
    without_prefix = re.sub(
        r'^(?:Module\s*\d+\s*:\s*)+', '', title.strip(), flags=re.IGNORECASE
    )
    return without_prefix.strip()

# -------------------- FORMAT SYLLABUS FOR EXCEL --------------------
def format_syllabus_for_excel(modules):
    """Join extracted modules into one spreadsheet-ready string.

    Each module becomes ``Module N: <title>: <content>`` with N counted
    from 1; entries are separated by a blank line. Returns
    "Syllabus not available" for an empty list or a lone placeholder entry
    whose title contains "not available".
    """
    unavailable = "Syllabus not available"
    if not modules:
        return unavailable
    if len(modules) == 1 and "not available" in modules[0]["title"].lower():
        return unavailable

    def render(position, module):
        heading = clean_module_title(module['title'])
        details = module['content']
        if details and details != "Content not available":
            return f"Module {position}: {heading}: {details}"
        return f"Module {position}: {heading}: Content details to be provided"

    rendered = [render(position, module) for position, module in enumerate(modules, 1)]
    return "\n\n".join(rendered).strip()

# -------------------- SYLLABUS TRANSFORM --------------------
def transform_syllabus(text: str) -> str:
    """Turn the '::' that follows each module header into a line break.

    A header is ``Module <n>: <title>`` (case-insensitive); the trailing
    ``::`` and any whitespace after it are replaced by a newline and one
    space, leaving all other text untouched.

    Non-string input (``None``, NaN, lists, ...) is returned unchanged. Only
    ``isinstance`` is checked: NaN is never a ``str``, and ``pd.isna`` on a
    list returns an array whose truth value is ambiguous.
    """
    if not isinstance(text, str):
        return text

    module_header_re = re.compile(
        r"(Module\s*\d+\s*:\s*[^:]+?)::\s*", flags=re.IGNORECASE
    )

    def _repl(m: re.Match) -> str:
        return m.group(1) + "\n "

    return module_header_re.sub(_repl, text)

# -------------------- SCRAPER --------------------
def scrape_course_data(url):
    """Scrape one course page and return its fields.

    Opens ``url`` in a fresh Chrome session, clicks the first tab whose
    label mentions description/syllabus/curriculum (if any), then parses
    the rendered HTML with BeautifulSoup. The browser is always closed
    before returning.

    Args:
        url: Address of the course page to load.

    Returns:
        A 9-tuple ``(course_name, about_course, eligibility, duration,
        price, who_should_take, syllabus_content, cert_link,
        fee_structure)``. Fields that cannot be located hold a placeholder
        string. If anything raises while scraping, a list of nine
        ``"Error"`` strings is returned instead.
    """
    driver = get_driver()
    try:
        driver.get(url)
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        time.sleep(5)
        # Click syllabus/description tab if exists (best effort). Catch only
        # Exception so Ctrl+C / SystemExit still propagate, and report the
        # failure instead of hiding it.
        try:
            tabs = driver.find_elements(By.CSS_SELECTOR, ".woocommerce-tabs li a, .tabs li a, .wc-tabs li a")
            for tab in tabs:
                label = tab.text.lower()
                if any(key in label for key in ('description', 'syllabus', 'curriculum')):
                    # JS click avoids failures when the tab is covered by an overlay.
                    driver.execute_script("arguments[0].click();", tab)
                    time.sleep(3)
                    break
        except Exception as e:
            print(f"ℹ️ No tabs found or clickable: {e}")
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Course Name
        course_name_tag = soup.find("h1", class_="product_title entry-title")
        course_name = course_name_tag.get_text(strip=True) if course_name_tag else "Course name not found"

        # About Course: first heading/label matching a pattern, in priority order.
        about_course = "About course not found"
        about_patterns = [
            ("strong", re.compile(r"objective|about|course overview", re.IGNORECASE)),
            ("h2", re.compile(r"course objective|about", re.IGNORECASE)),
            ("h3", re.compile(r"objective|overview", re.IGNORECASE))
        ]
        for tag_name, pattern in about_patterns:
            elements = soup.find_all(tag_name, string=pattern)
            for element in elements:
                parent = element.find_parent(['p', 'div'])
                if parent:
                    about_course = clean_text(parent.get_text(" ", strip=True))
                    break
            if about_course != "About course not found":
                break

        # Eligibility
        eligibility = "Not available"
        eligibility_elements = soup.find_all(['strong', 'h3', 'h4'], string=re.compile(r"eligibility", re.IGNORECASE))
        for element in eligibility_elements:
            parent = element.find_parent(['p', 'div'])
            if parent:
                eligibility = clean_text(parent.get_text(strip=True))
                break

        # Duration
        duration = "Not available"
        duration_elements = soup.find_all(['strong', 'h3', 'h4'], string=re.compile(r"duration", re.IGNORECASE))
        for element in duration_elements:
            parent = element.find_parent(['p', 'div'])
            if parent:
                duration = clean_text(parent.get_text(strip=True))
                break

        # Price: take the first selector that yields non-empty text. An empty
        # match must not overwrite the "Not available" placeholder.
        price = "Not available"
        price_selectors = [".price .amount", ".woocommerce-Price-amount", "bdi", ".product_price", ".course-price"]
        for selector in price_selectors:
            price_tag = soup.select_one(selector)
            price_text = price_tag.get_text(strip=True) if price_tag else ""
            if price_text and price_text != "Not available":
                price = price_text
                break

        fee_structure = (
            f"{price} \n- All other fees remain unchanged\n- Education loans are available through leading banks and NBFCs."
        ) if price and price != "Not available" else "- All other fees remain unchanged\n- Education loans are available through leading banks and NBFCs."

        # Who Should Take It
        who_content = []
        who_patterns = ["strong", "h3", "h4"]
        for pattern in who_patterns:
            who_elements = soup.find_all(pattern, string=re.compile(r"who should|target audience|audience", re.IGNORECASE))
            for element in who_elements:
                next_ul = element.find_next("ul")
                if next_ul:
                    for li in next_ul.find_all("li"):
                        who_content.append(clean_text(li.get_text(strip=True)))
                else:
                    parent = element.find_parent(['p', 'div'])
                    if parent:
                        text = parent.get_text(strip=True).replace(element.get_text(strip=True), "").strip()
                        if text:
                            who_content.append(text)
        # A <strong> nested in a matching <h3>/<h4> makes both tags match and
        # resolve to the same list; drop repeats while keeping order.
        who_content = list(dict.fromkeys(who_content))
        who_should_take = "\n".join([f"- {item}" for item in who_content]) if who_content else "Not available"

        # Syllabus
        modules = extract_syllabus(soup)
        syllabus_content = format_syllabus_for_excel(modules)
        syllabus_content = transform_syllabus(syllabus_content)

        # Certificate: first element matching any selector, href preferred over src.
        cert_link = "Certificate not available"
        cert_selectors = ["a[href*='certificate']", "a[href*='.pdf']", "img[src*='certificate']", "a[href*='certif']"]
        for selector in cert_selectors:
            cert_element = soup.select_one(selector)
            if cert_element:
                if cert_element.has_attr('href'):
                    cert_link = cert_element['href']
                elif cert_element.has_attr('src'):
                    cert_link = cert_element['src']
                break

        return course_name, about_course, eligibility, duration, price, who_should_take, syllabus_content, cert_link, fee_structure

    except Exception as e:
        print(f"🔥 Scraping failed for {url}: {e}")
        import traceback
        traceback.print_exc()
        return ["Error"] * 9
    finally:
        driver.quit()

# -------------------- READ URLS --------------------
def read_course_urls(input_file_path):
    """Load course URLs from the 'Course URL' column of an Excel file.

    Args:
        input_file_path: Path to the input workbook.

    Returns:
        The non-null values of the 'Course URL' column as a list, or an
        empty list (after printing a message) when the file does not exist
        or the column is missing.
    """
    if os.path.exists(input_file_path):
        sheet = pd.read_excel(input_file_path)
        if 'Course URL' in sheet.columns:
            url_series = sheet['Course URL'].dropna()
            return url_series.tolist()
        print("❌ 'Course URL' column missing in input Excel")
        return []

    print(f"❌ Input file not found: {input_file_path}")
    return []

# -------------------- SAVE BATCH TO EXCEL --------------------
def save_batch_to_excel(all_course_data, output_file_path):
    """Write all scraped rows to one Excel workbook.

    Args:
        all_course_data: Sequence of rows, each holding ten values in the
            column order defined below.
        output_file_path: Destination ``.xlsx`` path.
    """
    header = [
        "Course Name", "About Course", "Eligibility", "Duration",
        "Price", "Who Should Take It", "syllabus", "Certificate",
        "Fee Structure", "Course URL",
    ]
    frame = pd.DataFrame(all_course_data, columns=header)
    frame.to_excel(output_file_path, index=False)
    print(f"💾 Saved batch to: {output_file_path}")

# -------------------- OUTPUT PREVIEW --------------------
def print_output_preview(all_course_data, n=3):
    """Print the name and the first 200 syllabus characters of the first ``n`` rows.

    Each row is indexed positionally: item 0 is printed as the course name
    and item 6 as the syllabus text.
    """
    print("\n📌 Preview of first courses:")
    for position, record in enumerate(all_course_data[:n], start=1):
        heading = f"\n[{position}] Course Name: {record[0]}"
        print(heading)
        snippet = record[6][:200]
        print(f"Syllabus (first 200 chars): {snippet}...")

# -------------------- MAIN --------------------
def main(
    input_file_path=r"C:\Users\taslim.siddiqui\Downloads\excel.xlsx",
    output_file_path=r"C:\Users\taslim.siddiqui\Downloads\lot.xlsx",
):
    """Scrape every course URL listed in the input workbook into one output file.

    Args:
        input_file_path: Excel file with a 'Course URL' column.
        output_file_path: Excel file the scraped rows are written to.

    Rows scraped so far are written even if the loop is interrupted (for
    example by Ctrl+C or a browser start-up failure); the original
    exception is then re-raised after saving.
    """
    print("🚀 Starting batch course scraping...")
    course_urls = read_course_urls(input_file_path)
    if not course_urls:
        return

    all_course_data = []
    total = len(course_urls)
    try:
        for i, url in enumerate(course_urls, 1):
            print(f"\n🔍 [{i}/{total}] Scraping: {url}")
            data = scrape_course_data(url)
            # Append the source URL as the tenth column of the row.
            all_course_data.append(list(data) + [url])
            # Pause between pages only; no need to wait after the last one.
            if i < total:
                time.sleep(2)
    finally:
        # Persist whatever was collected so a late failure does not discard
        # the whole batch.
        if all_course_data:
            save_batch_to_excel(all_course_data, output_file_path)

    print_output_preview(all_course_data)
    print("✅ Process completed!")


if __name__ == "__main__":
    main()


🚀 Starting batch course scraping...

🔍 [1/1] Scraping: https://skillcouncil.in/product/certified-industrial-designer/
💾 Saved batch to: C:\Users\taslim.siddiqui\Downloads\lot.xlsx

📌 Preview of first courses:

[1] Course Name: Certified Industrial Designer
Syllabus (first 200 chars): Module 1: Fundamentals of Industrial Design
 Design principles, Elements of design, Design thinking process, Ergonomics and human factors, Materials and manufacturing basics, History of industrial des...
✅ Process completed!
