INPUT FILE IS AN EXCEL FILE, WITHOUT FAQ

In [None]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
import pandas as pd
from bs4 import BeautifulSoup
import time
import os


# ================= Driver Setup =================
def get_driver(version_main=138):
    """Start an undetected-chromedriver Chrome session for scraping.

    Args:
        version_main: Chrome major version forwarded to ``uc.Chrome``.
            Defaults to 138 (the value previously hard-coded here) so
            existing callers are unaffected; pass a different number when
            the installed browser is on another major version.

    Returns:
        The ``uc.Chrome`` driver instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    options = uc.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1280,720")
    options.add_argument("--log-level=3")
    return uc.Chrome(options=options, version_main=version_main)


def clean_text(text):
    """Return *text* with whitespace collapsed and double quotes made single.

    Falsy input (``None``, ``""``) yields an empty string.
    """
    if not text:
        return ""
    collapsed = ' '.join(text.split())  # runs of whitespace -> one space
    normalized = collapsed.replace('\xa0', ' ').replace('"', "'")
    return normalized.strip()


# ================= Certificate Extraction =================
def extract_certificate_image(soup):
    """Return the ``src`` of the first image inside the certificate section.

    The section is the first ``<div>`` whose id contains "cerificate" or
    "certificate" (case-insensitive). When no such div exists, or its first
    ``<img>`` has no ``src`` attribute, the placeholder string
    "Certificate image not found" is returned instead.
    """
    def _is_certificate_id(value):
        # Divs without an id arrive here as None and must not match.
        return value and ("cerificate" in value.lower() or "certificate" in value.lower())

    section = soup.find("div", id=_is_certificate_id)
    if not section:
        return "Certificate image not found"

    image = section.find("img")
    if not image or "src" not in image.attrs:
        return "Certificate image not found"
    return image["src"]


# ================= Syllabus Extraction =================
def extract_syllabus(soup):
    """Build a plain-text syllabus from the page's curriculum section.

    Looks up ``<div id="curriculum">`` and walks every
    ``<li class="bg-white my-2 shadow-lg">`` module inside it. For each
    module the first ``<span>`` becomes a "🔹" heading, and the direct
    children of its ``<div class="mx-6 pl-6">`` are rendered as indented
    lines: paragraphs (the first ``<strong>`` becomes a "📌" sub-heading),
    ``<ul>`` items as "- " bullets, and any other node with non-blank
    ``.string`` as plain text.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        The formatted syllabus, or "Syllabus not available" when the
        curriculum section is missing or yields no text.
    """
    curriculum_div = soup.find("div", id="curriculum")
    if not curriculum_div:
        return "Syllabus not available"

    syllabus = []
    for module in curriculum_div.find_all("li", class_="bg-white my-2 shadow-lg"):
        # Module title
        title_span = module.find("span")
        if title_span:
            module_title = clean_text(title_span.get_text())
            syllabus.append(f"\n🔹 {module_title}\n")

        # Module content
        content_div = module.find("div", class_="mx-6 pl-6")
        if content_div:
            for elem in content_div.children:
                if elem.name == "p":  # Paragraphs
                    strong = elem.find("strong")
                    if strong:
                        strong_text = clean_text(strong.get_text())
                        syllabus.append(f"\n📌 {strong_text}\n")
                        # Whatever follows the bold lead-in is body text.
                        remaining_text = clean_text(elem.text.replace(strong.text, ""))
                        if remaining_text:
                            syllabus.append(f"   {remaining_text}\n")
                    else:
                        text = clean_text(elem.get_text())
                        if text:
                            syllabus.append(f"   {text}\n")

                elif elem.name == "ul":  # Bullet lists
                    for li in elem.find_all("li"):
                        li_text = clean_text(li.get_text())
                        if li_text:
                            syllabus.append(f"   - {li_text}\n")

                elif elem.string and elem.string.strip():  # Plain text nodes
                    text = clean_text(elem.string)
                    if text:
                        syllabus.append(f"   {text}\n")

        syllabus.append("\n")  # Extra line between modules

    # The per-module "\n" separators make the list non-empty even when no
    # text was extracted, so test the joined result rather than the list;
    # otherwise an empty string would be returned instead of the placeholder.
    return ''.join(syllabus).strip() or "Syllabus not available"


# ================= Course Data Extraction =================
def extract_course_data(url):
    """Scrape one course page and return its fields as a dict.

    A new browser is started for the URL and is always shut down again,
    whether scraping succeeds or fails.

    Args:
        url: Address of the course page to load.

    Returns:
        A dict with the keys 'Course Name', 'About Course', 'Syllabus',
        'Certificate Image URL' and 'URL'. Name and description fall back
        to "Not Found" when their element is missing. If anything raises
        (driver start-up, navigation, parsing), the error is printed and
        every field except 'URL' is set to 'Error'.
    """
    driver = None
    try:
        driver = get_driver()
        driver.get(url)

        # Fixed pause to give the page time to load
        time.sleep(5)

        # Scroll in steps to trigger content loading
        for _ in range(3):
            driver.execute_script("window.scrollBy(0, 500)")
            time.sleep(1)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Course name: first <h1>. An explicit None check replaces the old
        # bare ``except: pass``, which also swallowed KeyboardInterrupt.
        heading = soup.find("h1")
        course_name = clean_text(heading.get_text()) if heading is not None else "Not Found"

        # About course: all paragraphs of the first div.font-ad, joined
        about_course = "Not Found"
        about_div = soup.find("div", class_="font-ad")
        if about_div is not None:
            paragraphs = [clean_text(p.get_text()) for p in about_div.find_all("p")]
            about_course = " ".join(p for p in paragraphs if p)

        syllabus = extract_syllabus(soup)
        certificate_image = extract_certificate_image(soup)

        return {
            'Course Name': course_name,
            'About Course': about_course,
            'Syllabus': syllabus,
            'Certificate Image URL': certificate_image,
            'URL': url
        }

    except Exception as e:
        print(f"Error occurred while scraping {url}: {str(e)}")
        return {
            'Course Name': 'Error',
            'About Course': 'Error',
            'Syllabus': 'Error',
            'Certificate Image URL': 'Error',
            'URL': url
        }
    finally:
        if driver:
            driver.quit()


# ================= Save to Excel =================
def save_to_excel(data, filename="course_data.xlsx"):
    """Append one scraped record to an Excel workbook, skipping known URLs.

    The workbook is read in full; if ``data['URL']`` is already present in
    its 'URL' column nothing is written, otherwise the record is added as a
    new last row and the file is rewritten. A missing workbook is created
    with the record as its only row. A status line is printed in each case.
    """
    new_row = pd.DataFrame([data])
    try:
        stored = pd.read_excel(filename)
        if data['URL'] in stored['URL'].values:
            print(f"Data for {data['URL']} already exists in {filename}")
        else:
            combined = pd.concat([stored, new_row], ignore_index=True)
            combined.to_excel(filename, index=False)
            print(f"Data appended to {filename}")
    except FileNotFoundError:
        new_row.to_excel(filename, index=False)
        print(f"New file {filename} created")


# ================= Read URLs from Excel =================
def read_urls_from_excel(file_path, url_column="URL"):
    """Load the distinct, non-empty values of one column of a workbook.

    Args:
        file_path: Path of the Excel file to read.
        url_column: Header of the column holding the URLs.

    Returns:
        The column's unique non-null values as a list, in order of first
        appearance. An empty list is returned (after printing a message)
        when the column is missing or the file cannot be read.
    """
    try:
        sheet = pd.read_excel(file_path)
        if url_column not in sheet.columns:
            print(f"Error: Column '{url_column}' not found in the Excel file")
            return []
        return sheet[url_column].dropna().unique().tolist()
    except Exception as err:
        print(f"Error reading Excel file: {str(err)}")
        return []


# ================= Main =================
def main(
    input_file=r"C:\Users\taslim.siddiqui\Downloads\Pending.xlsx",
    output_file=r"C:\Users\taslim.siddiqui\Downloads\HENRY pending data 6.xlsx",
):
    """Scrape every URL listed in *input_file* and store rows in *output_file*.

    Args:
        input_file: Excel workbook whose "URL" column lists the pages to
            scrape. Defaults to the path that used to be hard-coded here.
        output_file: Excel workbook that receives one row per scraped URL.
            It is written after every page, so rows gathered before a
            failure are kept.
    """
    urls = read_urls_from_excel(input_file)

    if not urls:
        print("No URLs found in the input file. Please check your Excel file.")
        return

    print(f"Found {len(urls)} URLs to scrape")

    for url in urls:
        print(f"\nScraping: {url}")
        course_data = extract_course_data(url)
        print(f"Course Name: {course_data['Course Name']}")
        save_to_excel(course_data, filename=output_file)
        print("---")

    print("\nScraping completed!")


# Run the scraper only when this code is executed as the main program
# (not when it is imported as a module).
if __name__ == "__main__":
    main()


Found 6 URLs to scrape

Scraping: https://www.henryharvin.com/medical-billing-course
Course Name: Medical Billing Course Training
New file C:\Users\taslim.siddiqui\Downloads\HENRY pending data 6.xlsx created
---

Scraping: https://www.henryharvin.com/french-language-course
Course Name: French Language Course
Data appended to C:\Users\taslim.siddiqui\Downloads\HENRY pending data 6.xlsx
---

Scraping: https://www.henryharvin.com/blockchain-certification-training
Course Name: Blockchain Certification Training Course
Data appended to C:\Users\taslim.siddiqui\Downloads\HENRY pending data 6.xlsx
---

Scraping: https://www.henryharvin.com/patent-drafting-course
Course Name: Patent Drafting Course
Data appended to C:\Users\taslim.siddiqui\Downloads\HENRY pending data 6.xlsx
---

Scraping: Certificate Course on Property Law (henryharvin.com)
Error occurred while scraping Certificate Course on Property Law (henryharvin.com): Message: invalid argument
  (Session info: chrome=139.0.7258.67)
Stackt

INPUT FILE IS AN EXCEL FILE, WITH FAQ

In [1]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
import pandas as pd
from bs4 import BeautifulSoup
import time
import os

# ================= Driver Setup =================
def get_driver(version_main=138):
    """Start an undetected-chromedriver Chrome session for scraping.

    Args:
        version_main: Chrome major version forwarded to ``uc.Chrome``.
            Defaults to 138 (the value previously hard-coded here) so
            existing callers are unaffected; pass a different number when
            the installed browser is on another major version.

    Returns:
        The ``uc.Chrome`` driver instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    options = uc.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1280,720")
    options.add_argument("--log-level=3")
    return uc.Chrome(options=options, version_main=version_main)

def clean_text(text):
    """Normalize *text*: single spaces only, double quotes made single.

    Returns an empty string for falsy input such as ``None`` or ``""``.
    """
    if not text:
        return ""
    single_spaced = ' '.join(text.split())  # collapse all whitespace runs
    without_nbsp = single_spaced.replace('\xa0', ' ')
    return without_nbsp.replace('"', "'").strip()

# ================= Certificate Extraction =================
def extract_certificate_image(soup):
    """Extract the certificate image URL from the page.

    The certificate section is the first ``<div>`` whose id contains the
    misspelling "cerificate" (case-insensitive). If no such div exists,
    the correctly spelled "certificate" is tried as a fallback, so a page
    that uses the correct spelling is no longer missed while pages that
    matched before give the same result.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        The ``src`` of the first ``<img>`` in that section, or the string
        "Certificate image not found" when the section, the image or its
        ``src`` attribute is missing.
    """
    certificate_div = None
    for fragment in ("cerificate", "certificate"):
        # ``fragment=fragment`` binds the current value into the predicate.
        certificate_div = soup.find(
            "div", id=lambda x, fragment=fragment: x and fragment in x.lower()
        )
        if certificate_div:
            break

    if certificate_div:
        img_tag = certificate_div.find("img")
        if img_tag and "src" in img_tag.attrs:
            return img_tag["src"]
    return "Certificate image not found"

# ================= Syllabus Extraction =================
def extract_syllabus(soup):
    """Build a plain-text syllabus from the page's curriculum section.

    Looks up ``<div id="curriculum">`` and walks every
    ``<li class="bg-white my-2 shadow-lg">`` module inside it. For each
    module the first ``<span>`` becomes a "🔹" heading, and the direct
    children of its ``<div class="mx-6 pl-6">`` are rendered as indented
    lines: paragraphs (the first ``<strong>`` becomes a "📌" sub-heading),
    ``<ul>`` items as "- " bullets, and any other node with non-blank
    ``.string`` as plain text.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        The formatted syllabus, or "Syllabus not available" when the
        curriculum section is missing or yields no text.
    """
    curriculum_div = soup.find("div", id="curriculum")
    if not curriculum_div:
        return "Syllabus not available"

    syllabus = []
    for module in curriculum_div.find_all("li", class_="bg-white my-2 shadow-lg"):
        # Module title
        title_span = module.find("span")
        if title_span:
            module_title = clean_text(title_span.get_text())
            syllabus.append(f"\n🔹 {module_title}\n")

        # Module content
        content_div = module.find("div", class_="mx-6 pl-6")
        if content_div:
            for elem in content_div.children:
                if elem.name == "p":  # Paragraphs
                    strong = elem.find("strong")
                    if strong:
                        strong_text = clean_text(strong.get_text())
                        syllabus.append(f"\n📌 {strong_text}\n")
                        # Whatever follows the bold lead-in is body text.
                        remaining_text = clean_text(elem.text.replace(strong.text, ""))
                        if remaining_text:
                            syllabus.append(f"   {remaining_text}\n")
                    else:
                        text = clean_text(elem.get_text())
                        if text:
                            syllabus.append(f"   {text}\n")

                elif elem.name == "ul":  # Bullet lists
                    for li in elem.find_all("li"):
                        li_text = clean_text(li.get_text())
                        if li_text:
                            syllabus.append(f"   - {li_text}\n")

                elif elem.string and elem.string.strip():  # Plain text nodes
                    text = clean_text(elem.string)
                    if text:
                        syllabus.append(f"   {text}\n")

        syllabus.append("\n")  # Extra line between modules

    # The per-module "\n" separators make the list non-empty even when no
    # text was extracted, so test the joined result rather than the list;
    # otherwise an empty string would be returned instead of the placeholder.
    return ''.join(syllabus).strip() or "Syllabus not available"

# ================= FAQ Extraction =================
def extract_faq_text(soup):
    """Collect the page's FAQ entries as "Q: ... / A: ..." text.

    Every ``<li class="bg-white my-2">`` is treated as one entry: its first
    ``<h2>`` is the question and the ``<p>`` elements inside its
    ``<div x-ref="tab">`` form the answer. Paragraphs starting with
    "looking for as an it security analyst" (case-insensitive) are dropped.

    Text is taken with whitespace collapsed to single spaces.
    ``get_text(strip=True)`` was used before, but it joins the stripped
    pieces with no separator, so words on either side of an inline tag
    (``<b>``, ``<a>``, ...) were fused together.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        The entries separated by blank lines, or "FAQ not available" when
        no entry has a question or an answer.
    """
    def _text(tag):
        # Keep the original spacing between text pieces, then collapse runs.
        return " ".join(tag.get_text().split())

    faq_text_list = []
    for li in soup.find_all("li", class_="bg-white my-2"):
        question_tag = li.find("h2")
        question = _text(question_tag) if question_tag else ""

        answer_parts = []
        answer_div = li.find("div", {"x-ref": "tab"})
        if answer_div:
            for p in answer_div.find_all("p"):
                text = _text(p)
                if text and not text.lower().startswith("looking for as an it security analyst"):
                    answer_parts.append(text)
        answer = " ".join(answer_parts)

        if question or answer:
            faq_text_list.append(f"Q: {question}\nA: {answer}")
    return "\n\n".join(faq_text_list) if faq_text_list else "FAQ not available"

# ================= Course Data Extraction =================
def extract_course_data(url):
    """Scrape one course page, including its FAQ, and return a dict.

    A new browser is started for the URL and is always shut down again,
    whether scraping succeeds or fails.

    Args:
        url: Address of the course page to load.

    Returns:
        A dict with the keys 'Course Name', 'About Course', 'Syllabus',
        'Certificate Image URL', 'FAQ' and 'URL'. Name and description
        fall back to "Not Found" when their element is missing. If
        anything raises (driver start-up, navigation, parsing), the error
        is printed and every field except 'URL' is set to 'Error'.
    """
    driver = None
    try:
        driver = get_driver()
        driver.get(url)

        # Fixed pause to give the page time to load
        time.sleep(5)

        # Scroll in steps to trigger content loading
        for _ in range(3):
            driver.execute_script("window.scrollBy(0, 500)")
            time.sleep(1)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Course name: first <h1>. An explicit None check replaces the old
        # bare ``except:``, which also swallowed KeyboardInterrupt.
        heading = soup.find("h1")
        course_name = clean_text(heading.get_text()) if heading is not None else "Not Found"

        # About course: first paragraph of the first div.font-ad
        about_course = "Not Found"
        about_div = soup.find("div", class_="font-ad")
        if about_div is not None:
            about_paragraph = about_div.find("p")
            if about_paragraph is not None:
                about_course = clean_text(about_paragraph.get_text())

        syllabus = extract_syllabus(soup)
        certificate_image = extract_certificate_image(soup)
        faq_text = extract_faq_text(soup)

        return {
            'Course Name': course_name,
            'About Course': about_course,
            'Syllabus': syllabus,
            'Certificate Image URL': certificate_image,
            'FAQ': faq_text,
            'URL': url
        }

    except Exception as e:
        print(f"Error occurred while scraping {url}: {str(e)}")
        return {
            'Course Name': 'Error',
            'About Course': 'Error',
            'Syllabus': 'Error',
            'Certificate Image URL': 'Error',
            'FAQ': 'Error',
            'URL': url
        }
    finally:
        if driver:
            driver.quit()

# ================= Process Input File =================
def process_input_file(input_file, output_file):
    """Scrape every URL in *input_file* and write all rows to *output_file*.

    Args:
        input_file: Excel workbook with a 'URL' column.
        output_file: Excel workbook to (over)write with one row per URL.

    Raises:
        KeyError: If the input workbook has no 'URL' column.

    Blank URL cells are skipped. The output is written once, after the
    loop; if the run is interrupted part-way (for example with Ctrl-C),
    the rows collected so far are still saved before the exception
    propagates.
    """
    df_input = pd.read_excel(input_file)
    # Empty cells are read as NaN, which is not something a browser can open.
    urls = df_input['URL'].dropna()

    all_data = []
    completed = False
    try:
        for url in urls:
            course_data = extract_course_data(url)

            # Progress output: name, link and syllabus of the scraped page
            print(f"Course Name: {course_data['Course Name']}")
            print(f"Course Link: {course_data['URL']}\n")
            print(f"Syllabus:\n{course_data['Syllabus']}")

            all_data.append(course_data)
        completed = True
    finally:
        # On an aborted run, write only if something was collected, so an
        # existing output file is not replaced by an empty one.
        if completed or all_data:
            df_output = pd.DataFrame(all_data)
            df_output.to_excel(output_file, index=False)
            print(f"Data saved to {output_file}")

# ================= Run Script =================
if __name__ == "__main__":
    # Workbook listing the course pages to scrape (read by process_input_file,
    # which takes the addresses from its 'URL' column).
    input_file = r"C:\Users\taslim.siddiqui\Downloads\HenryHarvin_All_Courses (2).xlsx"
    # Workbook that receives the scraped rows.
    output_file = r"C:\Users\taslim.siddiqui\Downloads\HENRY_Raghu.xlsx"
    process_input_file(input_file, output_file)


Course Name: WHY CHOOSE HENRY HARVIN® ANIMATION ACADEMY?
Course Link: https://www.henryharvin.com/3d-vfx-course

Course Name: WHY CHOOSE HENRY HARVIN® ANIMATION ACADEMY?
Syllabus:
Syllabus not available
Course Name: ASTB Test Prep Course
Course Link: https://www.henryharvin.com/astb-test-prep-course

Course Name: ASTB Test Prep Course
Syllabus:
🔹 Module 1: MST (Math Skills Test)
   This includes elementary probability questions as well as some simple word problems involving time and distance. Also contain some basic geometry, algebra, angles, and perimeter.


🔹 Module 2: RCT (Reading Comprehension Test)
   This section tests the ability to read and comprehend passages. Identifying the passage's main idea or determining whether it supports a particular claim will frequently be asked of learners.


🔹 Module 3: MCT (Mechanical Comprehension Test)
   Understanding of fundamental physics and mechanics principles will be evaluated by this subtest. Gasses and liquids, pressure, volume, veloci

EXTRACT COURSE NAME AND CERTIFICATE IMAGE DIRECTLY FROM THE COURSE LINK

In [None]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
import pandas as pd
from bs4 import BeautifulSoup
import time

def get_driver(version_main=138):
    """Start an undetected-chromedriver Chrome session for scraping.

    Args:
        version_main: Chrome major version forwarded to ``uc.Chrome``.
            Defaults to 138 (the value previously hard-coded here) so
            existing callers are unaffected; pass a different number when
            the installed browser is on another major version.

    Returns:
        The ``uc.Chrome`` driver instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    options = uc.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1280,720")
    options.add_argument("--log-level=3")
    return uc.Chrome(options=options, version_main=version_main)

def clean_text(text):
    """Tidy a text fragment for output.

    Whitespace runs become single spaces and double quotes become single
    quotes. ``None`` and other falsy values give ``""``.
    """
    if not text:
        return ""
    tidy = ' '.join(text.split())  # also trims both ends
    for old, new in (('\xa0', ' '), ('"', "'")):
        tidy = tidy.replace(old, new)
    return tidy.strip()

def extract_certificate_image(soup):
    """Extract the certificate image URL from the page.

    The certificate section is the first ``<div>`` whose id contains the
    misspelling "cerificate" (case-insensitive). If no such div exists,
    the correctly spelled "certificate" is tried as a fallback, so a page
    that uses the correct spelling is no longer missed while pages that
    matched before give the same result.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        The ``src`` of the first ``<img>`` in that section, or the string
        "Certificate image not found" when the section, the image or its
        ``src`` attribute is missing.
    """
    certificate_div = None
    for fragment in ("cerificate", "certificate"):
        # ``fragment=fragment`` binds the current value into the predicate.
        certificate_div = soup.find(
            "div", id=lambda x, fragment=fragment: x and fragment in x.lower()
        )
        if certificate_div:
            break

    if certificate_div:
        img_tag = certificate_div.find("img")
        if img_tag and "src" in img_tag.attrs:
            return img_tag["src"]
    return "Certificate image not found"

def extract_syllabus(soup):
    """Build a plain-text syllabus from the page's curriculum section.

    Looks up ``<div id="curriculum">`` and walks every
    ``<li class="bg-white my-2 shadow-lg">`` module inside it. For each
    module the first ``<span>`` becomes a "🔹" heading, and the direct
    children of its ``<div class="mx-6 pl-6">`` are rendered as indented
    lines: paragraphs (the first ``<strong>`` becomes a "📌" sub-heading),
    ``<ul>`` items as "- " bullets, and any other node with non-blank
    ``.string`` as plain text.

    The content div is walked through ``.children``. The previous
    ``find_next()`` / ``find_next_sibling()`` walk started from the next
    tag in document order, which lies outside the module when the content
    div has no child tags, and it never visited bare text nodes.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        The formatted syllabus, or "Syllabus not available" when the
        curriculum section is missing or yields no text.
    """
    curriculum_div = soup.find("div", id="curriculum")
    if not curriculum_div:
        return "Syllabus not available"

    syllabus = []
    for module in curriculum_div.find_all("li", class_="bg-white my-2 shadow-lg"):
        # Module title
        title_span = module.find("span")
        if title_span:
            module_title = clean_text(title_span.get_text())
            syllabus.append(f"\n🔹 {module_title}\n")

        # Module content: direct children only, so the walk cannot leave
        # this module's content div.
        content_div = module.find("div", class_="mx-6 pl-6")
        if content_div:
            for elem in content_div.children:
                if elem.name == "p":  # Paragraphs
                    strong = elem.find("strong")
                    if strong:
                        strong_text = clean_text(strong.get_text())
                        syllabus.append(f"\n📌 {strong_text}\n")
                        # Whatever follows the bold lead-in is body text.
                        remaining_text = clean_text(elem.text.replace(strong.text, ""))
                        if remaining_text:
                            syllabus.append(f"   {remaining_text}\n")
                    else:
                        text = clean_text(elem.get_text())
                        if text:
                            syllabus.append(f"   {text}\n")

                elif elem.name == "ul":  # Bullet lists
                    for li in elem.find_all("li"):
                        li_text = clean_text(li.get_text())
                        if li_text:
                            syllabus.append(f"   - {li_text}\n")

                elif elem.string and elem.string.strip():  # Plain text nodes
                    text = clean_text(elem.string)
                    if text:
                        syllabus.append(f"   {text}\n")

        syllabus.append("\n")  # Extra line between modules

    # The per-module "\n" separators make the list non-empty even when no
    # text was extracted, so test the joined result rather than the list;
    # otherwise an empty string would be returned instead of the placeholder.
    return ''.join(syllabus).strip() or "Syllabus not available"

def extract_course_data(url):
    """Scrape one course page and return its fields as a dict.

    A new browser is started for the URL and is always shut down again,
    whether scraping succeeds or fails.

    Args:
        url: Address of the course page to load.

    Returns:
        A dict with the keys 'Course Name', 'About Course', 'Syllabus',
        'Certificate Image URL' and 'URL'. Name and description fall back
        to "Not Found" when their element is missing. If anything raises
        (driver start-up, navigation, parsing), the error is printed and
        every field except 'URL' is set to 'Error'.
    """
    driver = None
    try:
        driver = get_driver()
        driver.get(url)

        # Fixed pause to give the page time to load
        time.sleep(5)

        # Scroll in steps to trigger content loading
        for _ in range(3):
            driver.execute_script("window.scrollBy(0, 500)")
            time.sleep(1)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Course name: first <h1>. An explicit None check replaces the old
        # bare ``except: pass``, which also swallowed KeyboardInterrupt.
        heading = soup.find("h1")
        course_name = clean_text(heading.get_text()) if heading is not None else "Not Found"

        # About course: first paragraph of the first div.font-ad
        about_course = "Not Found"
        about_div = soup.find("div", class_="font-ad")
        if about_div is not None:
            about_paragraph = about_div.find("p")
            if about_paragraph is not None:
                about_course = clean_text(about_paragraph.get_text())

        syllabus = extract_syllabus(soup)
        certificate_image = extract_certificate_image(soup)

        return {
            'Course Name': course_name,
            'About Course': about_course,
            'Syllabus': syllabus,
            'Certificate Image URL': certificate_image,
            'URL': url
        }

    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return {
            'Course Name': 'Error',
            'About Course': 'Error',
            'Syllabus': 'Error',
            'Certificate Image URL': 'Error',
            'URL': url
        }
    finally:
        if driver:
            driver.quit()

def save_to_excel(data, filename="course_data1.xlsx"):
    """Append one scraped record to an Excel workbook, skipping known URLs.

    The workbook is read in full; if ``data['URL']`` is already present in
    its 'URL' column nothing is written, otherwise the record is added as a
    new last row and the file is rewritten. A missing workbook is created
    with the record as its only row. A status line is printed in each case.
    """
    record = pd.DataFrame([data])

    try:
        on_disk = pd.read_excel(filename)
        already_saved = data['URL'] in on_disk['URL'].values
        if already_saved:
            # Same URL scraped before: leave the file untouched.
            print(f"Data for {data['URL']} already exists in {filename}")
        else:
            merged = pd.concat([on_disk, record], ignore_index=True)
            merged.to_excel(filename, index=False)
            print(f"Data appended to {filename}")
    except FileNotFoundError:
        # First record: start a fresh workbook.
        record.to_excel(filename, index=False)
        print(f"New file {filename} created")

# List of course URLs to scrape
urls = [
    "https://www.henryharvin.com/banking-law-course",
    "https://www.henryharvin.com/french-language-course",
    # Add more URLs here
]

# Scrape each page in turn and store its row straight away. save_to_excel
# is called without a filename, so its default workbook is used.
for url in urls:
    print(f"\nScraping: {url}")
    course_data = extract_course_data(url)
    print(f"Course Name: {course_data['Course Name']}")
    save_to_excel(course_data)
    print("---")



Scraping: https://www.henryharvin.com/banking-law-course
Course Name: Banking Law Course
New file course_data1.xlsx created
---

Scraping: https://www.henryharvin.com/french-language-course
Course Name: French Language Course
Data appended to course_data1.xlsx
---
