Futurense Cpp data Extraction  for one course link

In [1]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup
import time
import os
import re

# -------------------- DRIVER SETUP --------------------
def get_driver(headless=False):
    options = uc.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1280,720")
    options.add_argument("--log-level=3")
    if headless:
        options.add_argument("--headless")
    driver = uc.Chrome(options=options, version_main=138)
    return driver

# -------------------- HELPERS --------------------
def normalize_name(name: str) -> str:
    """Clean institute name for matching"""
    name = name.lower()
    name = name.replace("&", "and")
    name = re.sub(r"program\s+structure", "", name)
    name = re.sub(r"[-–]", " ", name)
    name = re.sub(r"[^\w\s]", "", name)
    name = re.sub(r"\s+", " ", name)
    return name.strip()

# -------------------- SCRAPER --------------------
def scrape_course_data(url):
    driver = get_driver()
    try:
        print(f"🌐 Accessing URL: {url}")
        driver.get(url)

        # Wait for page to load
        WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, "h1")))
        time.sleep(2)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # 1. Course Name
        course_name_tag = soup.find("h1", class_="heading-h2 is-2rem is-white") or soup.find("h1")
        course_name = course_name_tag.get_text(strip=True) if course_name_tag else "Course name not found"
        print(f"📛 Course Name: {course_name}")

        # 2. About Course
        about_main = soup.find("div", class_="text-block-39") \
                     or soup.find("div", class_="course-description") \
                     or soup.find("p")
        about_main_text = about_main.get_text(strip=True) if about_main else ""

        bullet_texts = []
        for ul in soup.find_all("ul"):
            for li in ul.find_all("li"):
                bullet_texts.append(li.get_text(strip=True))

        if about_main_text and bullet_texts:
            about_course = about_main_text + "\n- " + "\n- ".join(bullet_texts)
        elif bullet_texts:
            about_course = "- " + "\n- ".join(bullet_texts)
        else:
            about_course = about_main_text

        print(f"📝 About Course:\n{about_course}")

        # 3. Institute-wise Prices
        price_map = {}
        price_blocks = soup.find_all("div", class_="div-block-446")
        for block in price_blocks:
            inst_tag = block.find("div", class_=re.compile(r"text-block-14[0-9]+"))
            fee_tag = block.find("div", class_="text-block-78")
            if inst_tag and fee_tag:
                inst_name = inst_tag.get_text(strip=True)
                inst_fee = fee_tag.get_text(strip=True)
                price_map[inst_name] = inst_fee  # Keep original name

        institute_prices_str = " ; ".join([f"{k}: {v}" for k, v in price_map.items()]) if price_map else "Not available"
        print(f"🏫 Institute Prices: {institute_prices_str}")

        # 4. Institute-wise Details (Duration + Learning Mode)
        institute_details = []
        inst_blocks = soup.find_all("div", class_="div-block-382")
        for block in inst_blocks:
            inst_name_tag = block.find("h3", class_="heading-h3")
            inst_name = inst_name_tag.get_text(strip=True) if inst_name_tag else "Not available"

            duration_i, learning_mode_i = "Not available", "Not available"
            info_blocks = block.find_all("div", class_="div-block-384")
            for ib in info_blocks:
                label = ib.find("div", class_="text-block-40")
                value = ib.find("div", class_="text-block-41")
                if label and value:
                    label_text = label.get_text(strip=True).lower()
                    if "duration" in label_text:
                        duration_i = value.get_text(strip=True)
                    elif "learning" in label_text:
                        learning_mode_i = value.get_text(strip=True)

            institute_details.append((inst_name, duration_i, learning_mode_i))

        print(f"🏫 Institute Details: {institute_details if institute_details else 'Not available'}")

        # 5. Certificate
        cert_tag = soup.find("a", class_="certificate-image")
        cert_img = (
            cert_tag.find("img")["src"]
            if cert_tag and cert_tag.find("img")
            else "Certificate not available"
        )
        print(f"📜 Certificate: {cert_img}")

        # 6. Eligibility
        eligibility_content = []
        elig_cards = soup.find_all("div", class_="eligibility-card")
        for card in elig_cards:
            heading = card.find("div", class_="e-card-heading")
            subheading = card.find("div", class_="e-card-subheading")
            if heading and subheading:
                eligibility_content.append(f"{heading.get_text(strip=True)} ({subheading.get_text(strip=True)})")

        eligibility = " ; ".join(eligibility_content) if eligibility_content else "Not available"
        print(f"✅ Eligibility: {eligibility}")

        return course_name, about_course, cert_img, eligibility, institute_details, institute_prices_str

    except Exception as e:
        print(f"🔥 Scraping failed for {url}: {str(e)}")
        return ["Error"] * 6
    finally:
        driver.quit()
        print("🚪 Browser closed")

# -------------------- SAVE TO EXCEL --------------------
def save_to_excel(data, file_path, url):
    columns = [
        "Course Name",
        "About Course",
        "Certificate",
        "Eligibility",
        "Institute Name",
        "Institute Duration",
        "Institute Learning Mode",
        "Course URL",
        "Institute Prices"
    ]

    course_name, about_course, cert_img, eligibility, institute_details, institute_prices_str = data

    if os.path.exists(file_path):
        df = pd.read_excel(file_path)
    else:
        df = pd.DataFrame(columns=columns)

    rows = []
    if institute_details:
        for inst_name, duration_i, learning_mode_i in institute_details:
            rows.append({
                "Course Name": course_name,
                "About Course": about_course,
                "Certificate": cert_img,
                "Eligibility": eligibility,
                "Institute Name": inst_name,
                "Institute Duration": duration_i,
                "Institute Learning Mode": learning_mode_i,
                "Course URL": url,
                "Institute Prices": institute_prices_str,
            })
    else:
        rows.append({
            "Course Name": course_name,
            "About Course": about_course,
            "Certificate": cert_img,
            "Eligibility": eligibility,
            "Institute Name": "Not available",
            "Institute Duration": "Not available",
            "Institute Learning Mode": "Not available",
            "Course URL": url,
            "Institute Prices": institute_prices_str,
        })

    df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)
    df.to_excel(file_path, index=False)
    print(f"💾 Saved data for: {course_name}")

# -------------------- MAIN --------------------
if __name__ == "__main__":
    course_urls = [
        "https://futurense.com/usp/masters-manufacturing-and-mechanical-systems-integration"
    ]

    print("🚀 Starting scraping process...")
    file_path = r"C:\Users\taslim.siddiqui\Downloads\course00.xlsx"

    for course_url in course_urls:
        print(f"\n🔍 Processing: {course_url}")
        course_data = scrape_course_data(course_url)
        if all(item != "Error" for item in course_data):
            save_to_excel(course_data, file_path, course_url)
        else:
            print(f"❌ Failed to scrape complete data for {course_url}")

    print("\n✅ Process completed")


🚀 Starting scraping process...

🔍 Processing: https://futurense.com/usp/masters-manufacturing-and-mechanical-systems-integration
🌐 Accessing URL: https://futurense.com/usp/masters-manufacturing-and-mechanical-systems-integration
📛 Course Name: Master's in Manufacturing and Mechanical Systems Integration
📝 About Course:
Advanced Certification Program in Manufacturing and Mechanical Systems Integration
- IIT Advanced Certificate Program
- Begin Your US Chapter
- Combines Academic Lectures by top faculty at IIT Jodhpur with Hands-On Practical Sessions by Leading Industry Experts
- Assignments and Capstone Projects for an Immersive Learning
- Ranked 41st among "Most Innovative Schools" for curriculum upgrade
- Assignments and Capstone Projects for an Immersive Learning
🏫 Institute Prices: IIT Jodhpur: ₹3,54,000 ; Rochester Institute of Technology: ₹21,70,050
🏫 Institute Details: [('IIT - JODHPUR Program Stucture', '3 months', 'Online'), ('Rochester Institute of Technology', '12 Months', 'O

Do for Entire Excel file

In [3]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup
import time
import os
import re

# -------------------- DRIVER SETUP --------------------
def get_driver(headless=False):
    options = uc.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1280,720")
    options.add_argument("--log-level=3")
    if headless:
        options.add_argument("--headless")
    driver = uc.Chrome(options=options, version_main=138)
    return driver

# -------------------- HELPERS --------------------
def normalize_name(name: str) -> str:
    """Clean institute name for matching"""
    name = name.lower()
    name = name.replace("&", "and")
    name = re.sub(r"program\s+structure", "", name)
    name = re.sub(r"[-–]", " ", name)
    name = re.sub(r"[^\w\s]", "", name)
    name = re.sub(r"\s+", " ", name)
    return name.strip()

# -------------------- SCRAPER --------------------
def scrape_course_data(url):
    driver = get_driver()
    try:
        print(f"🌐 Accessing URL: {url}")
        driver.get(url)
        WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, "h1")))
        time.sleep(2)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Course Name
        course_name_tag = soup.find("h1", class_="heading-h2 is-2rem is-white") or soup.find("h1")
        course_name = course_name_tag.get_text(strip=True) if course_name_tag else "Course name not found"

        # About Course
        about_main = soup.find("div", class_="text-block-39") \
                     or soup.find("div", class_="course-description") \
                     or soup.find("p")
        about_main_text = about_main.get_text(strip=True) if about_main else ""
        bullet_texts = []
        for ul in soup.find_all("ul"):
            for li in ul.find_all("li"):
                bullet_texts.append(li.get_text(strip=True))

        if about_main_text and bullet_texts:
            about_course = about_main_text + "\n- " + "\n- ".join(bullet_texts)
        elif bullet_texts:
            about_course = "- " + "\n- ".join(bullet_texts)
        else:
            about_course = about_main_text

        # Institute-wise Prices
        price_map = {}
        price_blocks = soup.find_all("div", class_="div-block-446")
        for block in price_blocks:
            inst_tag = block.find("div", class_=re.compile(r"text-block-14[0-9]+"))
            fee_tag = block.find("div", class_="text-block-78")
            if inst_tag and fee_tag:
                inst_name = inst_tag.get_text(strip=True)
                inst_fee = fee_tag.get_text(strip=True)
                price_map[inst_name] = inst_fee
        institute_prices_str = " ; ".join([f"{k}: {v}" for k, v in price_map.items()]) if price_map else "Not available"

        # Institute Details (Duration + Learning Mode)
        institute_details = []
        inst_blocks = soup.find_all("div", class_="div-block-382")
        for block in inst_blocks:
            inst_name_tag = block.find("h3", class_="heading-h3")
            inst_name = inst_name_tag.get_text(strip=True) if inst_name_tag else "Not available"

            duration_i, learning_mode_i = "Not available", "Not available"
            info_blocks = block.find_all("div", class_="div-block-384")
            for ib in info_blocks:
                label = ib.find("div", class_="text-block-40")
                value = ib.find("div", class_="text-block-41")
                if label and value:
                    label_text = label.get_text(strip=True).lower()
                    if "duration" in label_text:
                        duration_i = value.get_text(strip=True)
                    elif "learning" in label_text:
                        learning_mode_i = value.get_text(strip=True)
            institute_details.append((inst_name, duration_i, learning_mode_i))

        # Certificate
        cert_tag = soup.find("a", class_="certificate-image")
        cert_img = cert_tag.find("img")["src"] if cert_tag and cert_tag.find("img") else "Certificate not available"

        # Eligibility
        eligibility_content = []
        elig_cards = soup.find_all("div", class_="eligibility-card")
        for card in elig_cards:
            heading = card.find("div", class_="e-card-heading")
            subheading = card.find("div", class_="e-card-subheading")
            if heading and subheading:
                eligibility_content.append(f"{heading.get_text(strip=True)} ({subheading.get_text(strip=True)})")
        eligibility = " ; ".join(eligibility_content) if eligibility_content else "Not available"

        return course_name, about_course, cert_img, eligibility, institute_details, institute_prices_str

    except Exception as e:
        print(f"🔥 Scraping failed for {url}: {str(e)}")
        return ["Error"] * 6
    finally:
        driver.quit()

# -------------------- SAVE TO EXCEL --------------------
def save_to_excel(data, file_path, url):
    columns = [
        "Course Name",
        "About Course",
        "Certificate",
        "Eligibility",
        "Institute Name",
        "Institute Duration",
        "Institute Learning Mode",
        "Course URL",
        "Institute Prices"
    ]

    course_name, about_course, cert_img, eligibility, institute_details, institute_prices_str = data

    if os.path.exists(file_path):
        df = pd.read_excel(file_path)
    else:
        df = pd.DataFrame(columns=columns)

    rows = []
    if institute_details:
        for inst_name, duration_i, learning_mode_i in institute_details:
            rows.append({
                "Course Name": course_name,
                "About Course": about_course,
                "Certificate": cert_img,
                "Eligibility": eligibility,
                "Institute Name": inst_name,
                "Institute Duration": duration_i,
                "Institute Learning Mode": learning_mode_i,
                "Course URL": url,
                "Institute Prices": institute_prices_str,
            })
    else:
        rows.append({
            "Course Name": course_name,
            "About Course": about_course,
            "Certificate": cert_img,
            "Eligibility": eligibility,
            "Institute Name": "Not available",
            "Institute Duration": "Not available",
            "Institute Learning Mode": "Not available",
            "Course URL": url,
            "Institute Prices": institute_prices_str,
        })

    df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)
    df.to_excel(file_path, index=False)
    print(f"💾 Saved data for: {course_name}")

# -------------------- MAIN --------------------
if __name__ == "__main__":
    input_file = r"C:\Users\taslim.siddiqui\Downloads\course_links.xlsx"
    file_path = r"C:\Users\taslim.siddiqui\Downloads\course_Futurense1.xlsx"

    # Read URLs from Excel column "Course Link"
    df_input = pd.read_excel(input_file)
    course_urls = df_input["Course Link"].dropna().tolist()

    print("🚀 Starting scraping process...")

    for course_url in course_urls:
        print(f"\n🔍 Processing: {course_url}")
        course_data = scrape_course_data(course_url)
        if all(item != "Error" for item in course_data):
            save_to_excel(course_data, file_path, course_url)
        else:
            print(f"❌ Failed to scrape complete data for {course_url}")

    print("\n✅ Process completed")


🚀 Starting scraping process...

🔍 Processing: https://futurense.com/usp/masters-in-computer-information-systems-business-analytics
🌐 Accessing URL: https://futurense.com/usp/masters-in-computer-information-systems-business-analytics
💾 Saved data for: MS in Computer Information Systems & Business Analytics

🔍 Processing: https://futurense.com/usp/masters-in-business-analytics-2
🌐 Accessing URL: https://futurense.com/usp/masters-in-business-analytics-2
💾 Saved data for: Master's in Business Analytics

🔍 Processing: https://futurense.com/usp/masters-in-robotics-and-autonomy
🌐 Accessing URL: https://futurense.com/usp/masters-in-robotics-and-autonomy
💾 Saved data for: Master's in Robotics and Autonomy

🔍 Processing: https://futurense.com/usp/masters-in-artificial-intelligence-and-machine-learning
🌐 Accessing URL: https://futurense.com/usp/masters-in-artificial-intelligence-and-machine-learning
💾 Saved data for: Master's in Artificial Intelligence and Machine Learning

🔍 Processing: https://