Scrape a single course link

In [None]:
import time
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# -------------------- DRIVER SETUP --------------------
def get_driver(headless=True):
    """Create a Chrome WebDriver with the command-line flags this scraper uses.

    Args:
        headless: When true, ``--headless=new`` is added to the Chrome flags.

    Returns:
        A ``webdriver.Chrome`` instance whose driver binary path comes from
        ``ChromeDriverManager().install()``.
    """
    flags = [
        "--disable-blink-features=AutomationControlled",
        "--window-size=1280,720",
        "--log-level=3",
    ]
    if headless:
        flags.append("--headless=new")

    chrome_options = webdriver.ChromeOptions()
    for flag in flags:
        chrome_options.add_argument(flag)

    chromedriver_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chromedriver_service, options=chrome_options)

# -------------------- COURSE EXTRACTION FUNCTION --------------------
def extract_course_info(url, driver):
    """Scrape one course page and return its details as a flat dict.

    The page is loaded through ``driver``, every ``.card-header.collapsed``
    element is clicked via JavaScript, and the resulting page source is
    parsed with BeautifulSoup.

    Args:
        url: Address of the course page to load.
        driver: Selenium WebDriver used to load and interact with the page.

    Returns:
        A dict with the keys ``course_link``, ``course_name``,
        ``course_description``, ``learning_format``, ``course_duration``,
        ``course_price``, ``certificate_image_url``, ``course_faculty``,
        ``who_should_take`` and ``syllabus``. A field that cannot be located
        holds a "not found" placeholder string. If an ``Exception`` is raised
        while scraping, every field except ``course_link`` holds an
        "Error extracting ..." placeholder and ``syllabus`` also carries the
        exception message.
    """
    try:
        driver.get(url)
        WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        # Fixed pause between <body> being present and reading page_source.
        time.sleep(5)

        # Expand collapsible sections (best effort). Only Exception is
        # swallowed so that KeyboardInterrupt / SystemExit still propagate.
        try:
            expand_buttons = driver.find_elements(By.CSS_SELECTOR, ".card-header.collapsed")
            for btn in expand_buttons:
                try:
                    driver.execute_script("arguments[0].click();", btn)
                    time.sleep(0.5)
                except Exception:
                    # One header failing to click must not stop the others.
                    continue
        except Exception:
            # Expansion is optional; continue with whatever the page shows.
            pass

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # -------------------- COURSE NAME --------------------
        h1 = soup.find('h1')
        course_name = h1.get_text(strip=True) if h1 else "Course name not found"

        # -------------------- COURSE DESCRIPTION --------------------
        # Primary selector first, then the alternative description class.
        course_description = "Course description not found"
        desc_el = soup.find('p', class_='pg-about-description')
        if desc_el:
            course_description = desc_el.get_text(strip=True)
        else:
            alt_desc = soup.find('p', class_='master-project-work-carousel-description')
            if alt_desc:
                course_description = alt_desc.get_text(strip=True)

        # -------------------- LEARNING FORMAT & DURATION --------------------
        learning_format = "Not found"
        course_duration = "Not found"
        live_classes_duration = "Not found"

        for el in soup.find_all('div', class_='pg-key-feature-items'):
            head = el.find('p', class_='pg-key-feature-head')
            txt = el.find('p', class_='pg-key-feature-text')
            if not head or not txt:
                continue
            h = head.get_text(strip=True)
            t = txt.get_text(strip=True)
            if 'Learning Format' in h:
                learning_format = t
            elif 'Duration' in h:
                course_duration = t
            elif 'Live Classes' in h:
                live_classes_duration = t

        # Fold the live-classes value into the single duration field.
        if live_classes_duration != "Not found":
            if course_duration != "Not found":
                course_duration = f"{course_duration} (Live Classes: {live_classes_duration})"
            else:
                course_duration = f"Live Classes: {live_classes_duration}"

        # -------------------- COURSE PRICE --------------------
        # Three attempts in order; each later one runs only if the price is
        # still missing.
        course_price = "Price not found"
        el = soup.find('h3', class_='pg-programe-fee-price')
        if el:
            course_price = el.get_text(strip=True)
        if course_price == "Price not found":
            cont = soup.find('span', class_='OnClassEnrLP')
            if cont:
                amt = cont.find('span', class_='woocommerce-Price-amount')
                if amt:
                    course_price = amt.get_text(strip=True)
        if course_price == "Price not found":
            for sel in [
                '.course-regular-price',
                '.master-fee-online-class-main-price',
                '.woocommerce-Price-amount',
                'div.price',
                'span.price',
                'h3.price'
            ]:
                el = soup.select_one(sel)
                if el and el.get_text(strip=True):
                    course_price = el.get_text(strip=True)
                    break
        if course_price != "Price not found":
            # Collapse whitespace runs and drop stray 'Â' characters.
            course_price = re.sub(r'\s+', ' ', course_price).replace('Â', '').strip()

        # -------------------- CERTIFICATE IMAGE --------------------
        certificate_image_url = "Not found"
        cert_sec = soup.find('div', class_='master-certification-one')
        if cert_sec:
            img = cert_sec.find('img')
            if img:
                certificate_image_url = img.get('src', '') or img.get('data-src', '') or "Not found"

        # -------------------- FACULTY --------------------
        course_faculty = "Not found"
        mentor_carousel = soup.find('div', class_='master-mentor-carousel')
        if mentor_carousel:
            names = []
            for h3 in mentor_carousel.find_all('h3'):
                name = h3.get_text(strip=True)
                # Keep only headings that pass the name filter: longer than
                # three characters, containing a space, not all upper case and
                # free of the listed section keywords.
                if (name and len(name) > 3 and ' ' in name and
                    not any(w in name.lower() for w in ['about','course','program','fee','duration','certificate']) and
                    not name.isupper()):
                    names.append(name)
            if names:
                course_faculty = ", ".join(names)
        if course_faculty == "Not found":
            # Fallback: one <h3> per carousel item, with a looser filter.
            names = []
            for item in soup.find_all('div', class_='master-mentor-carousel-item'):
                h3 = item.find('h3')
                if h3:
                    name = h3.get_text(strip=True)
                    if name and len(name) > 3 and ' ' in name:
                        names.append(name)
            if names:
                course_faculty = ", ".join(names)

        # -------------------- WHO SHOULD TAKE --------------------
        who_should_take = "Not found"
        wsec = soup.find('div', class_='who-can-apply-content')
        if wsec:
            lis = wsec.find_all('li')
            if lis:
                who_should_take = "\n".join([f"• {li.get_text(strip=True)}" for li in lis]).strip()

        # -------------------- SYLLABUS --------------------
        syllabus_text = "not found"
        syllabus_section = soup.find('div', id='master-curriculum-accordion')
        if syllabus_section:
            syllabus_data = []
            current_section = None
            # Walk direct children only: an <h3> starts a new section and each
            # 'card' div that follows is a module of that section.
            for child in syllabus_section.children:
                cname = getattr(child, "name", None)
                if cname == 'h3':
                    current_section = child.get_text(strip=True)
                    continue
                if cname == 'div' and 'card' in (child.get('class') or []):
                    mt = child.find('h3', class_='master-curriculum-accordion-heading')
                    module_title = mt.get_text(strip=True) if mt else "Untitled Module"
                    module_title = re.sub(r'<[^>]+>', '', module_title)
                    module_lines = []
                    body = child.find('div', class_='card-body')
                    if body:
                        for node in body.find_all(recursive=False):
                            if node.name == 'p':
                                txt = node.get_text(separator=" ", strip=True)
                                if txt and not txt.startswith('Download Brochure'):
                                    module_lines.append(txt)
                            elif node.name == 'ul':
                                for li in node.find_all('li', recursive=False):
                                    li_txt = li.get_text(separator=" ", strip=True)
                                    if li_txt:
                                        module_lines.append(f"• {li_txt}")
                            elif node.name == 'div':
                                # Skip wrappers whose class names mention a
                                # download or brochure.
                                cn = " ".join(node.get('class', []))
                                if 'download' in cn.lower() or 'brochure' in cn.lower():
                                    continue
                                raw = node.get_text(separator=" ", strip=True)
                                if raw and not raw.startswith('Download Brochure'):
                                    module_lines.append(raw)
                    module_content = "\n".join(module_lines).strip()
                    syllabus_data.append({
                        'section': (current_section or "").strip(),
                        'module_title': module_title,
                        'module_content': module_content
                    })
            if syllabus_data:
                out = []
                last_section = None
                for it in syllabus_data:
                    sec = it['section']
                    # Emit each section heading once, upper-cased.
                    if sec and sec != last_section:
                        out.append(f"\n\n{sec.upper()}\n")
                        last_section = sec
                    out.append(f"{it['module_title']}\n")
                    if it['module_content']:
                        out.append(f"{it['module_content']}\n\n")
                syllabus_text = "".join(out).strip()

        return {
            "course_link": url,
            "course_name": course_name,
            "course_description": course_description,
            "learning_format": learning_format,
            "course_duration": course_duration,
            "course_price": course_price,
            "certificate_image_url": certificate_image_url,
            "course_faculty": course_faculty,
            "who_should_take": who_should_take,
            "syllabus": syllabus_text
        }

    except Exception as e:
        # Return a row of placeholders so a caller still gets one record per
        # URL; the exception text is kept in the syllabus field.
        return {
            "course_link": url,
            "course_name": "Error extracting name",
            "course_description": "Error extracting description",
            "learning_format": "Error extracting learning format",
            "course_duration": "Error extracting duration",
            "course_price": "Error extracting price",
            "certificate_image_url": "Error extracting certificate image URL",
            "course_faculty": "Error extracting course faculty",
            "who_should_take": "Error extracting who should take",
            "syllabus": f"Error extracting syllabus: {str(e)}"
        }

# -------------------- ONE COURSE TEST --------------------
if __name__ == "__main__":
    # Single-page smoke test: scrape one course, print every field, then
    # write the one-row result to an Excel file.
    TEST_URL = "https://intellipaat.com/epgc-ui-ux-ihub-iit-roorkee/"
    save_path = r"C:\Users\taslim.siddiqui\Downloads\Intellipaat_test_output.xlsx"

    driver = get_driver(headless=False)
    try:
        course_record = extract_course_info(TEST_URL, driver)

        print("\n================= COURSE DATA =================")
        for field in course_record:
            print(f"{field}: {course_record[field]}\n")
        print("================================================\n")

        single_row = pd.DataFrame([course_record])
        single_row.to_excel(save_path, index=False)
        print(f"Course data saved to {save_path}")

    finally:
        # Close the browser whether or not scraping or saving succeeded.
        driver.quit()


Scrape multiple course links read from an Excel file

In [20]:
import time
import os
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# -------------------- DRIVER SETUP --------------------
def get_driver(headless=True):
    """Return a Chrome WebDriver set up with this scraper's launch flags.

    Args:
        headless: If true, Chrome is also given ``--headless=new``.

    Returns:
        A ``webdriver.Chrome`` instance; the driver binary path is obtained
        from ``ChromeDriverManager().install()``.
    """
    base_flags = (
        "--disable-blink-features=AutomationControlled",
        "--window-size=1280,720",
        "--log-level=3",
    )
    launch_flags = base_flags + ("--headless=new",) if headless else base_flags

    opts = webdriver.ChromeOptions()
    for launch_flag in launch_flags:
        opts.add_argument(launch_flag)

    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=opts,
    )

# -------------------- COURSE EXTRACTION FUNCTION --------------------
def extract_course_info(url, driver):
    """Scrape one course page and return its details as a flat dict.

    The page is loaded through ``driver``, every ``.card-header.collapsed``
    element is clicked via JavaScript, and the resulting page source is
    parsed with BeautifulSoup.

    Args:
        url: Address of the course page to load.
        driver: Selenium WebDriver used to load and interact with the page.

    Returns:
        A dict with the keys ``course_link``, ``course_name``,
        ``course_description``, ``learning_format``, ``course_duration``,
        ``course_price``, ``certificate``, ``certificate_image_url``,
        ``course_faculty``, ``who_should_take`` and ``syllabus``. A field that
        cannot be located holds a "not found" placeholder string. If an
        ``Exception`` is raised while scraping, the error is printed and every
        field except ``course_link`` holds an "Error extracting ..."
        placeholder; ``syllabus`` also carries the exception message.
    """
    try:
        driver.get(url)

        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        # Fixed pause between <body> being present and reading page_source.
        time.sleep(5)

        # Expand all sections if collapsed (best effort). Only Exception is
        # swallowed so that KeyboardInterrupt / SystemExit still propagate.
        try:
            expand_buttons = driver.find_elements(By.CSS_SELECTOR, ".card-header.collapsed")
            for button in expand_buttons:
                try:
                    driver.execute_script("arguments[0].click();", button)
                    time.sleep(0.5)
                except Exception:
                    # One header failing to click must not stop the others.
                    continue
        except Exception:
            print("Could not expand all sections automatically")

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # -------------------- COURSE NAME --------------------
        course_name_element = soup.find('h1')
        course_name = course_name_element.text.strip() if course_name_element else "Course name not found"

        # -------------------- COURSE DESCRIPTION --------------------
        # Primary selector first, then the alternative description class.
        course_description_element = soup.find('p', class_='pg-about-description')
        course_description = course_description_element.text.strip() if course_description_element else "Course description not found"

        if course_description == "Course description not found":
            alt_description_element = soup.find('p', class_='master-project-work-carousel-description')
            if alt_description_element:
                course_description = alt_description_element.text.strip()

        # -------------------- LEARNING FORMAT & DURATION --------------------
        learning_format = "Not found"
        course_duration = "Not found"
        live_classes_duration = "Not found"

        learning_mode_elements = soup.find_all('div', class_='pg-key-feature-items')
        for element in learning_mode_elements:
            heading = element.find('p', class_='pg-key-feature-head')
            text_element = element.find('p', class_='pg-key-feature-text')
            if heading and text_element:
                heading_text = heading.text.strip()
                text_text = text_element.get_text(strip=True)

                if 'Learning Format' in heading_text:
                    learning_format = text_text
                elif 'Duration' in heading_text:
                    course_duration = text_text
                elif 'Live Classes' in heading_text:
                    live_classes_duration = text_text

        # Merge Live Classes duration into course_duration
        if live_classes_duration != "Not found":
            if course_duration != "Not found":
                course_duration = f"{course_duration} (Live Classes: {live_classes_duration})"
            else:
                course_duration = f"Live Classes: {live_classes_duration}"

        # -------------------- COURSE PRICE --------------------
        course_price = "Price not found"

        # 1) Original selector
        price_element = soup.find('h3', class_='pg-programe-fee-price')
        if price_element:
            course_price = price_element.get_text(strip=True)

        # 2) New HTML structure (OnClassEnrLP -> woocommerce-Price-amount)
        if course_price == "Price not found":
            price_container = soup.find('span', class_='OnClassEnrLP')
            if price_container:
                price_amount = price_container.find('span', class_='woocommerce-Price-amount')
                if price_amount:
                    course_price = price_amount.get_text(strip=True)

        # 3) Alternative selectors (try a few likely candidates)
        if course_price == "Price not found":
            price_selectors = [
                '.course-regular-price',
                '.master-fee-online-class-main-price',
                '.woocommerce-Price-amount',
                'div.price',
                'span.price',
                'h3.price'
            ]
            for sel in price_selectors:
                el = soup.select_one(sel)
                if el and el.get_text(strip=True):
                    course_price = el.get_text(strip=True)
                    break

        # 4) Clean up odd characters/whitespace
        if course_price != "Price not found":
            course_price = re.sub(r'\s+', ' ', course_price)
            course_price = course_price.replace('Â', '').strip()

        # -------------------- CERTIFICATE --------------------
        certificate_name = "Certificate not found"
        certificate_image_url = "Not found"

        certification_section = soup.find('div', class_='master-certification-one')
        if certification_section:
            certificate_img = certification_section.find('img')
            if certificate_img:
                # Fall back to the placeholders when the <img> has none of the
                # attributes, instead of storing an empty string.
                certificate_name = (
                    certificate_img.get('alt', '')
                    or certificate_img.get('title', '')
                    or "Certificate not found"
                )
                certificate_image_url = (
                    certificate_img.get('src', '')
                    or certificate_img.get('data-src', '')
                    or "Not found"
                )

        # -------------------- FACULTY --------------------
        course_faculty = "Not found"
        mentor_carousel = soup.find('div', class_='master-mentor-carousel')
        if mentor_carousel:
            faculty_headings = mentor_carousel.find_all('h3')
            faculty_names = []
            for heading in faculty_headings:
                faculty_name = heading.get_text(strip=True)
                # Keep only headings that pass the name filter: longer than
                # three characters, containing a space, not all upper case and
                # free of the listed section keywords.
                if (faculty_name and len(faculty_name) > 3 and
                    not any(word in faculty_name.lower() for word in ['about', 'course', 'program', 'fee', 'duration', 'certificate']) and
                    not faculty_name.isupper() and ' ' in faculty_name):
                    faculty_names.append(faculty_name)
            if faculty_names:
                course_faculty = ", ".join(faculty_names)

        if course_faculty == "Not found":
            # Fallback: one <h3> per carousel item, with a looser filter.
            mentor_items = soup.find_all('div', class_='master-mentor-carousel-item')
            faculty_names = []
            for item in mentor_items:
                h3_element = item.find('h3')
                if h3_element:
                    faculty_name = h3_element.get_text(strip=True)
                    if faculty_name and len(faculty_name) > 3 and ' ' in faculty_name:
                        faculty_names.append(faculty_name)
            if faculty_names:
                course_faculty = ", ".join(faculty_names)

        # -------------------- WHO SHOULD TAKE --------------------
        who_should_take = "Not found"
        who_should_take_section = soup.find('div', class_='who-can-apply-content')
        if who_should_take_section:
            list_items = who_should_take_section.find_all('li')
            if list_items:
                who_should_take = "\n".join(
                    f"• {li.get_text(strip=True)}" for li in list_items
                ).strip()

        # -------------------- SYLLABUS --------------------
        syllabus_text = "not found."
        syllabus_section = soup.find('div', id='master-curriculum-accordion')
        if syllabus_section:
            syllabus_data = []
            current_section = None
            # Walk direct children only: an <h3> starts a new section and each
            # 'card' div that follows is a module of that section.
            for child in syllabus_section.children:
                cname = getattr(child, "name", None)
                if cname == 'h3':
                    current_section = child.get_text(strip=True)
                    continue
                if cname == 'div' and 'card' in (child.get('class') or []):
                    mt = child.find('h3', class_='master-curriculum-accordion-heading')
                    module_title = mt.get_text(strip=True) if mt else "Untitled Module"
                    module_title = re.sub(r'<[^>]+>', '', module_title)
                    module_lines = []
                    body = child.find('div', class_='card-body')
                    if body:
                        for node in body.find_all(recursive=False):
                            if node.name == 'p':
                                txt = node.get_text(separator=" ", strip=True)
                                if txt and not txt.startswith('Download Brochure'):
                                    module_lines.append(txt)
                            elif node.name == 'ul':
                                for li in node.find_all('li', recursive=False):
                                    li_txt = li.get_text(separator=" ", strip=True)
                                    if li_txt:
                                        module_lines.append(f"• {li_txt}")
                            elif node.name == 'div':
                                # Skip wrappers whose class names mention a
                                # download or brochure.
                                cn = " ".join(node.get('class', []))
                                if 'download' in cn.lower() or 'brochure' in cn.lower():
                                    continue
                                raw = node.get_text(separator=" ", strip=True)
                                if raw and not raw.startswith('Download Brochure'):
                                    module_lines.append(raw)
                    module_content = "\n".join(module_lines).strip()
                    syllabus_data.append({
                        'section': (current_section or "").strip(),
                        'module_title': module_title,
                        'module_content': module_content
                    })
            if syllabus_data:
                out = []
                last_section = None
                for it in syllabus_data:
                    sec = it['section']
                    # Emit each section heading once, upper-cased.
                    if sec and sec != last_section:
                        out.append(f"\n\n{sec.upper()}\n")
                        last_section = sec
                    out.append(f"{it['module_title']}\n")
                    if it['module_content']:
                        out.append(f"{it['module_content']}\n\n")
                syllabus_text = "".join(out).strip()

        return {
            "course_link": url,
            "course_name": course_name,
            "course_description": course_description,
            "learning_format": learning_format,
            "course_duration": course_duration,
            "course_price": course_price,
            "certificate": certificate_name,
            "certificate_image_url": certificate_image_url,
            "course_faculty": course_faculty,
            "who_should_take": who_should_take,
            "syllabus": syllabus_text
        }

    except Exception as e:
        # Report the failure and return a row of placeholders so the caller
        # still gets one record per URL.
        print(f"Error occurred for {url}: {str(e)}")
        return {
            "course_link": url,
            "course_name": "Error extracting name",
            "course_description": "Error extracting description",
            "learning_format": "Error extracting learning format",
            "course_duration": "Error extracting duration",
            "course_price": "Error extracting price",
            "certificate": "Error extracting certificate",
            "certificate_image_url": "Error extracting certificate image URL",
            "course_faculty": "Error extracting course faculty",
            "who_should_take": "Error extracting who should take",
            "syllabus": f"Error extracting syllabus: {str(e)}"
        }

# -------------------- MAIN --------------------
if __name__ == "__main__":
    # Batch run: read course URLs from an Excel sheet, scrape each one with a
    # single shared browser, print every record, then write all records to a
    # new Excel file.
    excel_path = r"C:\Users\taslim.siddiqui\Downloads\intellipaat_course links.xlsx"

    if not os.path.exists(excel_path):
        print(f"Excel file not found at {excel_path}")
        # `exit()` is the interactive-shell helper from `site`; raise
        # SystemExit directly and report failure with a non-zero status.
        raise SystemExit(1)

    df_urls = pd.read_excel(excel_path)

    if "Course URL" not in df_urls.columns:
        print("Excel file must contain a column named 'Course URL'")
        raise SystemExit(1)

    all_course_info = []
    total_courses = len(df_urls)

    # Open driver once
    driver = get_driver(headless=False)
    try:
        # Count positions with enumerate so the progress line does not depend
        # on the DataFrame's index labels.
        for position, url in enumerate(df_urls["Course URL"], start=1):
            print(f"\nExtracting course {position}/{total_courses}: {url}")
            course_info = extract_course_info(url, driver)
            all_course_info.append(course_info)

            # -------- PRINT ALL COLUMNS IN CONSOLE --------
            print("\n================= COURSE DATA =================")
            for key, value in course_info.items():
                print(f"{key}: {value}\n")
            print("================================================\n")
    finally:
        # Always close the browser, even if the loop is interrupted.
        driver.quit()

    save_path = r"C:\Users\taslim.siddiqui\Downloads\Intellipaat_output_NEWONE.xlsx"
    pd.DataFrame(all_course_info).to_excel(save_path, index=False)
    print(f"\nAll course data saved to {save_path}")



Extracting course 1/61: https://intellipaat.com/salesforce-apex-integration-training/

course_link: https://intellipaat.com/salesforce-apex-integration-training/

course_name: Salesforce Course

course_description: In this Salesforce online course, top Salesforce-certified industry professionals will train you in managing and deploying Salesforce instances, platforms, databases, application design, platform configuration, automation processes, security controls, wave analytics, application development using Salesforce Lightning, and more.

learning_format: Not found

course_duration: Not found

course_price: ₹26,049

certificate: certificateimage

certificate_image_url: https://intellipaat.com/wp-content/themes/intellipaat/images/certificate.jpg

course_faculty: Not found

who_should_take: Not found

syllabus: Getting Started with the Fundamentals of Salesforce
• Get started with the basics of Salesforce, its different types of editions, and their features.
• Customise the Salesforce 