Guvi Geek input file is in excel extract course name , course syllabus , price , Duration , learning mode ,Certificate , language

In [4]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup
import time
import os
from urllib.parse import urljoin

# -------------------- CONFIGURATION --------------------
OUTPUT_FILE = r"C:\Users\taslim.siddiqui\Downloads\Guvi Geek Data scrape.xlsx"
MAX_RETRIES = 2
WAIT_TIME = 3

# -------------------- DRIVER SETUP --------------------
def get_driver():
    options = uc.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1280,720")
    options.add_argument("--log-level=3")
    # options.add_argument("--headless")  # Enable for silent scraping
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    driver = uc.Chrome(options=options, version_main=138)
    return driver

# -------------------- SYLLABUS EXTRACTION --------------------
def extract_syllabus(soup):
    try:
        syllabus_div = soup.find("div", class_="topic-you-learn course-flow")
        if not syllabus_div:
            print("⚠️ Syllabus div not found")
            return "Syllabus not available"

        syllabus_text = ""
        modules = syllabus_div.find_all("div", class_="card")

        for module in modules:
            module_title_tag = module.find("h3", class_="mb-0")
            module_title = module_title_tag.get_text(strip=True) if module_title_tag else "Unnamed Module"
            syllabus_text += f"\n🔹 {module_title}\n"

            lessons = module.find_all("li")
            for lesson in lessons:
                lesson_text = lesson.get_text(strip=True)
                if lesson_text:
                    syllabus_text += f"    • {lesson_text}\n"

        return syllabus_text.strip()
    except Exception as e:
        print(f"❌ Syllabus extraction error: {str(e)}")
        return "Syllabus extraction failed"

# -------------------- SCRAPER --------------------
def scrape_course_data(url, driver):
    try:
        print(f"\n🌐 Accessing URL: {url}")
        driver.get(url)
        time.sleep(WAIT_TIME)  # Initial page load wait

        # Wait for course title
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "h1.main-heading.mb-2.courseTitle"))
        )

        # Scroll to load dynamic content
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(WAIT_TIME)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # 1. Course Name
        course_name = soup.find("h1", class_="main-heading mb-2 courseTitle").get_text(strip=True)
        print(f"📛 Course Name: {course_name}")

        # 2. About Course
        about_section = soup.find("div", id="about_description")
        about_course = about_section.find("p", class_="whats_in_it pb-3").get_text(strip=True) if about_section else "Not Found"
        print(f"📝 About Course: {about_course[:50]}...")

        # 3. Syllabus
        syllabus = extract_syllabus(soup)
        print(f"📚 Syllabus extracted ({len(syllabus.splitlines())} lines)")

        # 4. Certificate
        cert_img = soup.find("img", class_="certificateTemplate")
        if cert_img:
            certificate_url = cert_img["src"]
            certificate_url = urljoin("https://www.guvi.in", certificate_url)
        else:
            certificate_url = "No Certificate Found"
        print(f"🏆 Certificate URL: {certificate_url}")

        # 5. Price
        price_tag = soup.find("p", class_="mr-2 actual-price")
        price = price_tag.find("del").get_text(strip=True) if price_tag and price_tag.find("del") else "Price not available"
        print(f"💰 Price: {price}")

        # 6. Duration
        duration_div = soup.find("div", class_="border-right border-none")
        if duration_div:
            duration_tag = duration_div.find("p", class_="course-info-head")
            duration = duration_tag.find("span", class_="course_duration").get_text(strip=True) + " Hours" if duration_tag and duration_tag.find("span", class_="course_duration") else "Duration not available"
        else:
            duration = "Duration not available"
        print(f"⏳ Duration: {duration}")

        # 7. Language
        language_div = soup.find("div", class_="col-2")
        if language_div:
            language_tag = language_div.find("p", class_="course-info-head")
            language = language_tag.find("span", class_="course_language").get_text(strip=True) if language_tag and language_tag.find("span", class_="course_language") else "Language not available"
        else:
            language = "Language not available"
        print(f"🗣️ Language: {language}")

        # 8. Learning Mode
        learning_mode_tag = soup.find(lambda tag: tag.name == "p" and "of Recorded Content" in tag.text)
        learning_mode = learning_mode_tag.get_text(strip=True) if learning_mode_tag else "Learning mode not available"
        print(f"🎓 Learning Mode: {learning_mode}")

        return course_name, about_course, syllabus, certificate_url, price, duration, language, learning_mode

    except Exception as e:
        print(f"🔥 Scraping failed for {url}: {str(e)}")
        return ["Error"] * 8  # Return 8 error values for all columns

# -------------------- SAVE TO EXCEL --------------------
def save_to_excel(data, file_path):
    columns = [
        "Course Name", 
        "About Course", 
        "Syllabus", 
        "Certificate URL", 
        "Price",
        "Duration",
        "Language",
        "Learning Mode",
        "Course URL"
    ]

    try:
        if os.path.exists(file_path):
            df = pd.read_excel(file_path)
            for col in columns:
                if col not in df.columns:
                    df[col] = None
        else:
            df = pd.DataFrame(columns=columns)

        # Skip if course already exists
        if data[-1] in df["Course URL"].values:
            print(f"🔄 Course already exists: {data[0]}")
            return False

        # Add new row with matching columns
        new_row = pd.DataFrame([dict(zip(columns, [*data[:8], data[-1]]))])
        df = pd.concat([df, new_row], ignore_index=True)
        df.to_excel(file_path, index=False)
        print(f"💾 Saved data for: {data[0]}")
        return True

    except Exception as e:
        print(f"❌ Excel save error: {e}")
        return False

# -------------------- MAIN EXECUTION --------------------
if __name__ == "__main__":
    # Path to the Excel file containing course URLs
    INPUT_FILE = r"C:\Users\taslim.siddiqui\Downloads\Guvi geek course link.xlsx"
    URL_COLUMN = "Course URL"  # The column name in Excel

    # Read URLs from Excel
    try:
        df_urls = pd.read_excel(INPUT_FILE)
        if URL_COLUMN not in df_urls.columns:
            raise ValueError(f"❌ Column '{URL_COLUMN}' not found in {INPUT_FILE}")
        course_urls = df_urls[URL_COLUMN].dropna().tolist()
    except Exception as e:
        print(f"❌ Failed to read course URLs: {e}")
        course_urls = []

    print("🚀 Starting scraping process...")
    driver = get_driver()
    success_count = 0

    try:
        for course_url in course_urls:
            for attempt in range(MAX_RETRIES):
                try:
                    course_data = scrape_course_data(course_url, driver)
                    if all(item != "Error" for item in course_data):
                        if save_to_excel((*course_data, course_url), OUTPUT_FILE):
                            success_count += 1
                        break
                    else:
                        print(f"⚠️ Attempt {attempt + 1} failed for {course_url}")
                        if attempt == MAX_RETRIES - 1:
                            print(f"❌ Max retries reached for {course_url}")
                except Exception as e:
                    print(f"❌ Error during scraping {course_url}: {str(e)}")
                    if attempt == MAX_RETRIES - 1:
                        print(f"❌ Max retries reached for {course_url}")

        print(f"\n✅ Process completed. Successfully scraped {success_count} of {len(course_urls)} courses")
    finally:
        driver.quit()
        print("🚪 Browser closed")


🚀 Starting scraping process...

🌐 Accessing URL: https://www.guvi.in/courses/database-and-cloud-computing/aws/
📛 Course Name: 
📝 About Course: ...
⚠️ Syllabus div not found
📚 Syllabus extracted (1 lines)
🏆 Certificate URL: https://www.guvi.in/web-build/images/guvi-certificate.1437cfe91d767c2a24ea3afebde76dc7.png
💰 Price: Price not available
⏳ Duration:  Hours
🗣️ Language: 
🎓 Learning Mode: of Recorded Content
💾 Saved data for: 

🌐 Accessing URL: https://www.guvi.in/zen-class/devops-course/
🔥 Scraping failed for https://www.guvi.in/zen-class/devops-course/: Message: 
Stacktrace:
	GetHandleVerifier [0x0x57ba83+63395]
	GetHandleVerifier [0x0x57bac4+63460]
	(No symbol) [0x0x3c2113]
	(No symbol) [0x0x40a85e]
	(No symbol) [0x0x40abfb]
	(No symbol) [0x0x452f92]
	(No symbol) [0x0x42f3f4]
	(No symbol) [0x0x4507ba]
	(No symbol) [0x0x42f1a6]
	(No symbol) [0x0x3fe7b2]
	(No symbol) [0x0x3ff654]
	GetHandleVerifier [0x0x7f8883+2672035]
	GetHandleVerifier [0x0x7f3cba+2652634]
	GetHandleVerifier [0x0x5

Correct code for Guvi Greek  for url input

In [None]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup
import time
import os
from urllib.parse import urljoin

# -------------------- CONFIGURATION --------------------
OUTPUT_FILE = r"C:\Users\taslim.siddiqui\Downloads\course001.xlsx"
MAX_RETRIES = 2
WAIT_TIME = 3

# -------------------- DRIVER SETUP --------------------
def get_driver():
    options = uc.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1280,720")
    options.add_argument("--log-level=3")
    # options.add_argument("--headless")  # Enable for silent scraping
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    driver = uc.Chrome(options=options, version_main=138)
    return driver

# -------------------- SYLLABUS EXTRACTION --------------------
def extract_syllabus(soup):
    try:
        syllabus_div = soup.find("div", class_="topic-you-learn course-flow")
        if not syllabus_div:
            print("⚠️ Syllabus div not found")
            return "Syllabus not available"

        syllabus_text = ""
        modules = syllabus_div.find_all("div", class_="card")

        for module in modules:
            module_title_tag = module.find("h3", class_="mb-0")
            module_title = module_title_tag.get_text(strip=True) if module_title_tag else "Unnamed Module"
            syllabus_text += f"\n🔹 {module_title}\n"

            lessons = module.find_all("li")
            for lesson in lessons:
                lesson_text = lesson.get_text(strip=True)
                if lesson_text:
                    syllabus_text += f"    • {lesson_text}\n"

        return syllabus_text.strip()
    except Exception as e:
        print(f"❌ Syllabus extraction error: {str(e)}")
        return "Syllabus extraction failed"

# -------------------- SCRAPER --------------------
def scrape_course_data(url, driver):
    try:
        print(f"\n🌐 Accessing URL: {url}")
        driver.get(url)
        time.sleep(WAIT_TIME)  # Initial page load wait

        # Wait for course title
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "h1.main-heading.mb-2.courseTitle"))
        )

        # Scroll to load dynamic content
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(WAIT_TIME)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # 1. Course Name
        course_name = soup.find("h1", class_="main-heading mb-2 courseTitle").get_text(strip=True)
        print(f"📛 Course Name: {course_name}")

        # 2. About Course
        about_section = soup.find("div", id="about_description")
        about_course = about_section.find("p", class_="whats_in_it pb-3").get_text(strip=True) if about_section else "Not Found"
        print(f"📝 About Course: {about_course[:50]}...")

        # 3. Syllabus
        syllabus = extract_syllabus(soup)
        print(f"📚 Syllabus extracted ({len(syllabus.splitlines())} lines)")

        # 4. Certificate
        cert_img = soup.find("img", class_="certificateTemplate")
        if cert_img:
            certificate_url = cert_img["src"]
            certificate_url = urljoin("https://www.guvi.in", certificate_url)
        else:
            certificate_url = "No Certificate Found"
        print(f"🏆 Certificate URL: {certificate_url}")

        # 5. Price
        price_tag = soup.find("p", class_="mr-2 actual-price")
        price = price_tag.find("del").get_text(strip=True) if price_tag and price_tag.find("del") else "Price not available"
        print(f"💰 Price: {price}")

        # 6. Duration
        duration_div = soup.find("div", class_="border-right border-none")
        if duration_div:
            duration_tag = duration_div.find("p", class_="course-info-head")
            duration = duration_tag.find("span", class_="course_duration").get_text(strip=True) + " Hours" if duration_tag and duration_tag.find("span", class_="course_duration") else "Duration not available"
        else:
            duration = "Duration not available"
        print(f"⏳ Duration: {duration}")

        # 7. Language
        language_div = soup.find("div", class_="col-2")
        if language_div:
            language_tag = language_div.find("p", class_="course-info-head")
            language = language_tag.find("span", class_="course_language").get_text(strip=True) if language_tag and language_tag.find("span", class_="course_language") else "Language not available"
        else:
            language = "Language not available"
        print(f"🗣️ Language: {language}")

        # 8. Learning Mode
        learning_mode_tag = soup.find(lambda tag: tag.name == "p" and "of Recorded Content" in tag.text)
        learning_mode = learning_mode_tag.get_text(strip=True) if learning_mode_tag else "Learning mode not available"
        print(f"🎓 Learning Mode: {learning_mode}")

        return course_name, about_course, syllabus, certificate_url, price, duration, language, learning_mode

    except Exception as e:
        print(f"🔥 Scraping failed for {url}: {str(e)}")
        return ["Error"] * 8  # Return 8 error values for all columns

# -------------------- SAVE TO EXCEL --------------------
def save_to_excel(data, file_path):
    columns = [
        "Course Name", 
        "About Course", 
        "Syllabus", 
        "Certificate URL", 
        "Price",
        "Duration",
        "Language",
        "Learning Mode",
        "Course URL"
    ]

    try:
        if os.path.exists(file_path):
            df = pd.read_excel(file_path)
            for col in columns:
                if col not in df.columns:
                    df[col] = None
        else:
            df = pd.DataFrame(columns=columns)

        # Skip if course already exists
        if data[-1] in df["Course URL"].values:
            print(f"🔄 Course already exists: {data[0]}")
            return False

        # Add new row with matching columns
        new_row = pd.DataFrame([dict(zip(columns, [*data[:8], data[-1]]))])
        df = pd.concat([df, new_row], ignore_index=True)
        df.to_excel(file_path, index=False)
        print(f"💾 Saved data for: {data[0]}")
        return True

    except Exception as e:
        print(f"❌ Excel save error: {e}")
        return False

# -------------------- MAIN EXECUTION --------------------
if __name__ == "__main__":
    course_urls = [
        "https://www.guvi.in/courses/machine-learning-and-ai/chatbot-web-application/",
        "https://www.guvi.in/courses/software-testing-and-automation/mastering-puppet",
        "https://www.guvi.in/courses/machine-learning-and-ai/nlg-and-nlu/",
        # Add more URLs here
    ]

    print("🚀 Starting scraping process...")
    driver = get_driver()
    success_count = 0

    try:
        for course_url in course_urls:
            for attempt in range(MAX_RETRIES):
                try:
                    course_data = scrape_course_data(course_url, driver)
                    if all(item != "Error" for item in course_data):
                        if save_to_excel((*course_data, course_url), OUTPUT_FILE):
                            success_count += 1
                        break
                    else:
                        print(f"⚠️ Attempt {attempt + 1} failed for {course_url}")
                        if attempt == MAX_RETRIES - 1:
                            print(f"❌ Max retries reached for {course_url}")
                except Exception as e:
                    print(f"❌ Error during scraping {course_url}: {str(e)}")
                    if attempt == MAX_RETRIES - 1:
                        print(f"❌ Max retries reached for {course_url}")

        print(f"\n✅ Process completed. Successfully scraped {success_count} of {len(course_urls)} courses")
    finally:
        driver.quit()
        print("🚪 Browser closed")