WEB SCRAPING DATA FOR HENRY HARVIN

In [None]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import pandas as pd
import time

# ================= Driver Setup =================
def get_driver():
    """Create the Chrome driver used by the scraper.

    Builds ``uc.ChromeOptions`` with three command-line flags and returns
    ``uc.Chrome`` constructed with those options and ``version_main=138``.

    NOTE(review): ``version_main`` presumably has to match the major version
    of the locally installed Chrome -- confirm before running elsewhere.
    """
    chrome_flags = (
        "--disable-blink-features=AutomationControlled",
        "--window-size=1280,720",
        "--log-level=3",
    )
    opts = uc.ChromeOptions()
    for flag in chrome_flags:
        opts.add_argument(flag)
    return uc.Chrome(options=opts, version_main=138)

def clean_text(text):
    """Return *text* with whitespace collapsed and double quotes normalised.

    Falsy input (``None`` or an empty string) yields ``""``. Otherwise every
    run of whitespace becomes a single space, non-breaking spaces become
    regular spaces, and ``"`` is replaced by ``'``.
    """
    if not text:
        return ""
    collapsed = " ".join(text.split())
    normalised = collapsed.replace("\xa0", " ").replace('"', "'")
    return normalised.strip()

# ================= Certificate Extraction =================
def extract_certificate_image(soup):
    """Return the ``src`` of the first image inside the certificate block.

    The block is the first ``<div>`` whose ``id`` contains "cerificate"
    (case-insensitive).
    NOTE(review): "cerificate" looks like a misspelling of "certificate";
    presumably it mirrors the id used in the page markup -- confirm.

    Returns "Certificate image not found" when the div, the ``<img>`` or its
    ``src`` attribute is missing.
    """
    not_found = "Certificate image not found"

    def _id_matches(value):
        return value and "cerificate" in value.lower()

    container = soup.find("div", id=_id_matches)
    if not container:
        return not_found

    image = container.find("img")
    if not (image and "src" in image.attrs):
        return not_found
    return image["src"]

# ================= Syllabus Extraction =================
def extract_syllabus(soup):
    """Render the curriculum section of a course page as indented plain text.

    Looks up ``<div id="curriculum">`` and, for each module ``<li>`` whose
    class attribute is "bg-white my-2 shadow-lg", emits the module title
    (first ``<span>``) followed by the direct children of the module's
    ``<div class="mx-6 pl-6">``: paragraphs, bullet lists and bare text.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        The formatted syllabus, or "Syllabus not available" when the section
        is missing or nothing could be extracted from it.
    """
    curriculum_div = soup.find("div", id="curriculum")
    if curriculum_div is None:
        return "Syllabus not available"

    syllabus = []
    for module in curriculum_div.find_all("li", class_="bg-white my-2 shadow-lg"):
        title_span = module.find("span")
        if title_span:
            module_title = clean_text(title_span.get_text())
            syllabus.append(f"\n🔹 {module_title}\n")

        content_div = module.find("div", class_="mx-6 pl-6")
        if content_div:
            for elem in content_div.children:
                if elem.name == "p":
                    strong = elem.find("strong")
                    if strong:
                        # A <strong> inside a paragraph is rendered as a sub-heading,
                        # with the rest of the paragraph as its body line.
                        strong_text = clean_text(strong.get_text())
                        syllabus.append(f"\n📌 {strong_text}\n")
                        remaining_text = clean_text(elem.text.replace(strong.text, ""))
                        if remaining_text:
                            syllabus.append(f"   {remaining_text}\n")
                    else:
                        text = clean_text(elem.get_text())
                        if text:
                            syllabus.append(f"   {text}\n")
                elif elem.name == "ul":
                    for li in elem.find_all("li"):
                        li_text = clean_text(li.get_text())
                        if li_text:
                            syllabus.append(f"   - {li_text}\n")
                elif elem.string and elem.string.strip():
                    # Bare text nodes, or tags wrapping a single string.
                    text = clean_text(elem.string)
                    if text:
                        syllabus.append(f"   {text}\n")
        syllabus.append("\n")  # blank line between modules

    # The per-module separator makes the list non-empty even when no text was
    # extracted, so choose the fallback from the rendered text, not the list.
    rendered = ''.join(syllabus).strip()
    return rendered or "Syllabus not available"

# ================= FAQ Extraction =================
def extract_faq_text(soup):
    """Collect the FAQ accordion of a course page as "Q: ... / A: ..." text.

    Each ``<li>`` whose class attribute is "bg-white my-2" is treated as one
    FAQ entry: the question is its first ``<h2>`` and the answer is the text
    of every ``<p>`` inside its ``<div x-ref="tab">``, joined with spaces.
    Paragraphs starting with "looking for as an it security analyst"
    (case-insensitive) are dropped.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        Entries separated by blank lines, or "FAQ not available" when no
        entry has a question or an answer.
    """

    def _text_of(tag):
        # get_text(strip=True) joins fragments with "" and glues words that are
        # separated only by inline tags; normalise whitespace on the full text.
        return " ".join(tag.get_text().split())

    faq_text_list = []
    for li in soup.find_all("li", class_="bg-white my-2"):
        question_tag = li.find("h2")
        question = _text_of(question_tag) if question_tag else ""

        answer = ""
        answer_div = li.find("div", {"x-ref": "tab"})
        if answer_div:
            answer_parts = []
            for p in answer_div.find_all("p"):
                text = _text_of(p)
                if text and not text.lower().startswith("looking for as an it security analyst"):
                    answer_parts.append(text)
            answer = " ".join(answer_parts)

        if question or answer:
            faq_text_list.append(f"Q: {question}\nA: {answer}")
    return "\n\n".join(faq_text_list) if faq_text_list else "FAQ not available"

# ================= Course Data Extraction =================
def extract_course_data(url):
    """Scrape one course page and return its fields as a dict.

    Opens *url* in a fresh driver, waits for the ``<h1>`` (20 s) and, on a
    best-effort basis, for the element with id "curriculum" (25 s), then
    parses the rendered page source.

    Args:
        url: Address of the course page.

    Returns:
        Dict with the keys 'Course Name', 'About Course', 'Syllabus',
        'Certificate Image URL', 'FAQ' and 'URL'. If the page load or the
        ``<h1>`` wait times out, every field except 'URL' is "Timeout Error".

    The driver is always quit before returning. Exceptions other than
    ``TimeoutException`` propagate to the caller.
    """
    driver = get_driver()
    try:
        driver.set_page_load_timeout(180)
        driver.get(url)

        # The course title must be present before parsing.
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )

        # The syllabus is optional: warn and carry on if it never appears.
        try:
            WebDriverWait(driver, 25).until(
                EC.presence_of_element_located((By.ID, "curriculum"))
            )
        except TimeoutException:
            print("⚠️ Warning: Syllabus section not fully loaded, continuing...")

        time.sleep(3)  # allow JS rendering

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Explicit None checks instead of bare ``except:``, which would also
        # swallow KeyboardInterrupt and hide unrelated bugs.
        heading = soup.find("h1")
        course_name = clean_text(heading.get_text()) if heading is not None else "Not Found"

        about_course = "Not Found"
        about_div = soup.find("div", class_="font-ad") or soup.find("div", class_="tutor-course-content-content")
        if about_div is not None:
            about_paragraph = about_div.find("p")
            if about_paragraph is not None:
                about_course = clean_text(about_paragraph.get_text())

        return {
            'Course Name': course_name,
            'About Course': about_course,
            'Syllabus': extract_syllabus(soup),
            'Certificate Image URL': extract_certificate_image(soup),
            'FAQ': extract_faq_text(soup),
            'URL': url
        }

    except TimeoutException:
        print(f"⏳ Timeout while loading {url}")
        return {
            'Course Name': "Timeout Error",
            'About Course': "Timeout Error",
            'Syllabus': "Timeout Error",
            'Certificate Image URL': "Timeout Error",
            'FAQ': "Timeout Error",
            'URL': url
        }

    finally:
        driver.quit()

# ================= Run for Single Link =================
if __name__ == "__main__":
    # Single-page run: scrape one course, echo it, then store it as one row.
    url = "https://www.henryharvin.com/personality-development-course"
    data = extract_course_data(url)

    # (label, dict key) pairs in the order they are printed.
    report = (
        ("Course Name:", "Course Name"),
        ("About Course:", "About Course"),
        ("\nSyllabus:\n", "Syllabus"),
        ("\nCertificate Image URL:", "Certificate Image URL"),
        ("\nFAQ:\n", "FAQ"),
    )
    print("\n======================")
    for label, key in report:
        print(label, data[key])

    # Write the single result row to an Excel workbook.
    output_file = r"C:\Users\taslim.siddiqui\Downloads\HenryHarvin_Personality_Development.xlsx"
    df = pd.DataFrame([data])
    df.to_excel(output_file, index=False)
    print(f"\n✅ Data saved to {output_file}")


In [None]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
import pandas as pd
from bs4 import BeautifulSoup
import time
import os

# ================= Driver Setup =================
def get_driver():
    """Create the Chrome driver used by the scraper.

    Builds ``uc.ChromeOptions`` with three command-line flags and returns
    ``uc.Chrome`` constructed with those options and ``version_main=138``.

    NOTE(review): ``version_main`` presumably has to match the major version
    of the locally installed Chrome -- confirm before running elsewhere.
    """
    chrome_flags = (
        "--disable-blink-features=AutomationControlled",
        "--window-size=1280,720",
        "--log-level=3",
    )
    opts = uc.ChromeOptions()
    for flag in chrome_flags:
        opts.add_argument(flag)
    return uc.Chrome(options=opts, version_main=138)

def clean_text(text):
    """Return *text* with whitespace collapsed and double quotes normalised.

    Falsy input (``None`` or an empty string) yields ``""``. Otherwise every
    run of whitespace becomes a single space, non-breaking spaces become
    regular spaces, and ``"`` is replaced by ``'``.
    """
    if not text:
        return ""
    collapsed = " ".join(text.split())
    normalised = collapsed.replace("\xa0", " ").replace('"', "'")
    return normalised.strip()

# ================= Certificate Extraction =================
def extract_certificate_image(soup):
    """Return the ``src`` of the first image inside the certificate block.

    The block is the first ``<div>`` whose ``id`` contains "cerificate"
    (case-insensitive).
    NOTE(review): "cerificate" looks like a misspelling of "certificate";
    presumably it mirrors the id used in the page markup -- confirm.

    Returns "Certificate image not found" when the div, the ``<img>`` or its
    ``src`` attribute is missing.
    """
    not_found = "Certificate image not found"

    def _id_matches(value):
        return value and "cerificate" in value.lower()

    container = soup.find("div", id=_id_matches)
    if not container:
        return not_found

    image = container.find("img")
    if not (image and "src" in image.attrs):
        return not_found
    return image["src"]

# ================= Syllabus Extraction =================
def extract_syllabus(soup):
    """Render the curriculum section of a course page as indented plain text.

    Looks up ``<div id="curriculum">`` and, for each module ``<li>`` whose
    class attribute is "bg-white my-2 shadow-lg", emits the module title
    (first ``<span>``) followed by the direct children of the module's
    ``<div class="mx-6 pl-6">``: paragraphs, bullet lists and bare text.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        The formatted syllabus, or "Syllabus not available" when the section
        is missing or nothing could be extracted from it.
    """
    curriculum_div = soup.find("div", id="curriculum")
    if curriculum_div is None:
        return "Syllabus not available"

    syllabus = []
    for module in curriculum_div.find_all("li", class_="bg-white my-2 shadow-lg"):
        # Module title
        title_span = module.find("span")
        if title_span:
            module_title = clean_text(title_span.get_text())
            syllabus.append(f"\n🔹 {module_title}\n")

        # Module content
        content_div = module.find("div", class_="mx-6 pl-6")
        if content_div:
            for elem in content_div.children:
                if elem.name == "p":  # Paragraphs
                    strong = elem.find("strong")
                    if strong:
                        # A <strong> inside a paragraph is rendered as a sub-heading,
                        # with the rest of the paragraph as its body line.
                        strong_text = clean_text(strong.get_text())
                        syllabus.append(f"\n📌 {strong_text}\n")
                        remaining_text = clean_text(elem.text.replace(strong.text, ""))
                        if remaining_text:
                            syllabus.append(f"   {remaining_text}\n")
                    else:
                        text = clean_text(elem.get_text())
                        if text:
                            syllabus.append(f"   {text}\n")
                elif elem.name == "ul":  # Bullet lists
                    for li in elem.find_all("li"):
                        li_text = clean_text(li.get_text())
                        if li_text:
                            syllabus.append(f"   - {li_text}\n")
                elif elem.string and elem.string.strip():
                    # Bare text nodes, or tags wrapping a single string.
                    text = clean_text(elem.string)
                    if text:
                        syllabus.append(f"   {text}\n")
        syllabus.append("\n")  # blank line between modules

    # The per-module separator makes the list non-empty even when no text was
    # extracted, so choose the fallback from the rendered text, not the list.
    rendered = ''.join(syllabus).strip()
    return rendered or "Syllabus not available"

# ================= FAQ Extraction =================
def extract_faq_text(soup):
    """Collect the FAQ accordion of a course page as "Q: ... / A: ..." text.

    Each ``<li>`` whose class attribute is "bg-white my-2" is treated as one
    FAQ entry: the question is its first ``<h2>`` and the answer is the text
    of every ``<p>`` inside its ``<div x-ref="tab">``, joined with spaces.
    Paragraphs starting with "looking for as an it security analyst"
    (case-insensitive) are dropped.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        Entries separated by blank lines, or "FAQ not available" when no
        entry has a question or an answer.
    """

    def _text_of(tag):
        # get_text(strip=True) joins fragments with "" and glues words that are
        # separated only by inline tags; normalise whitespace on the full text.
        return " ".join(tag.get_text().split())

    faq_text_list = []
    for li in soup.find_all("li", class_="bg-white my-2"):
        question_tag = li.find("h2")
        question = _text_of(question_tag) if question_tag else ""

        answer = ""
        answer_div = li.find("div", {"x-ref": "tab"})
        if answer_div:
            answer_parts = []
            for p in answer_div.find_all("p"):
                text = _text_of(p)
                if text and not text.lower().startswith("looking for as an it security analyst"):
                    answer_parts.append(text)
            answer = " ".join(answer_parts)

        if question or answer:
            faq_text_list.append(f"Q: {question}\nA: {answer}")
    return "\n\n".join(faq_text_list) if faq_text_list else "FAQ not available"

# ================= Course Data Extraction =================
def extract_course_data(url):
    """Scrape one course page and return its fields as a dict.

    Opens *url* in a fresh driver, sleeps 5 s, scrolls down in three 500 px
    steps (1 s apart) and parses the resulting page source.

    Args:
        url: Address of the course page.

    Returns:
        Dict with the keys 'Course Name', 'About Course', 'Syllabus',
        'Certificate Image URL', 'FAQ' and 'URL'. If anything raises, the
        error is printed and every field except 'URL' is 'Error'.

    The driver, if it was created, is always quit before returning.
    """
    driver = None
    try:
        driver = get_driver()
        driver.get(url)

        time.sleep(5)  # fixed wait for the initial render

        # NOTE(review): the scrolling presumably triggers lazily loaded
        # sections -- confirm against the site.
        for _ in range(3):
            driver.execute_script("window.scrollBy(0, 500)")
            time.sleep(1)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Explicit None checks instead of bare ``except:``, which would also
        # swallow KeyboardInterrupt and hide unrelated bugs.
        heading = soup.find("h1")
        course_name = clean_text(heading.get_text()) if heading is not None else "Not Found"

        about_course = "Not Found"
        about_div = soup.find("div", class_="font-ad")
        if about_div is not None:
            about_paragraph = about_div.find("p")
            if about_paragraph is not None:
                about_course = clean_text(about_paragraph.get_text())

        return {
            'Course Name': course_name,
            'About Course': about_course,
            'Syllabus': extract_syllabus(soup),
            'Certificate Image URL': extract_certificate_image(soup),
            'FAQ': extract_faq_text(soup),
            'URL': url
        }

    except Exception as e:
        # Per-URL boundary: one failing page must not abort a batch run.
        print(f"Error occurred while scraping {url}: {str(e)}")
        return {
            'Course Name': 'Error',
            'About Course': 'Error',
            'Syllabus': 'Error',
            'Certificate Image URL': 'Error',
            'FAQ': 'Error',
            'URL': url
        }
    finally:
        if driver is not None:
            driver.quit()

# ================= Process Input File =================
def process_input_file(input_file, output_file):
    """Scrape every URL listed in an Excel sheet and save the results.

    Args:
        input_file: Path of an Excel workbook with a 'URL' column.
        output_file: Path of the Excel workbook to write, one row per
            scraped course.

    Raises:
        KeyError: If the input sheet has no 'URL' column.

    Rows whose URL cell is empty are skipped instead of launching a browser
    for them. Results are written only after all URLs have been processed.
    """
    df_input = pd.read_excel(input_file)
    all_data = []
    for url in df_input['URL']:
        # Blank cells are read as NaN; they would only yield an 'Error' row.
        if pd.isna(url) or not str(url).strip():
            print("Skipping row with empty URL")
            continue

        course_data = extract_course_data(url)

        # Progress output: name, link and syllabus of the scraped course.
        print(f"Course Name: {course_data['Course Name']}")
        print(f"Course Link: {course_data['URL']}\n")
        print(f"Syllabus:\n{course_data['Syllabus']}")

        all_data.append(course_data)

    df_output = pd.DataFrame(all_data)
    df_output.to_excel(output_file, index=False)
    print(f"Data saved to {output_file}")

# ================= Run Script =================
if __name__ == "__main__":
    # Batch run: read course URLs from one workbook and write the scraped
    # rows to another.
    input_file = r"C:\Users\taslim.siddiqui\Downloads\HenryHarvin_All_Courses (2).xlsx"
    output_file = r"C:\Users\taslim.siddiqui\Downloads\HENRY_Raghu.xlsx"
    process_input_file(input_file=input_file, output_file=output_file)


In [3]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
import pandas as pd
from bs4 import BeautifulSoup
import time
import os

# ================= Driver Setup =================
def get_driver():
    """Create the Chrome driver used by the scraper.

    Builds ``uc.ChromeOptions`` with three command-line flags and returns
    ``uc.Chrome`` constructed with those options and ``version_main=138``.

    NOTE(review): ``version_main`` presumably has to match the major version
    of the locally installed Chrome -- confirm before running elsewhere.
    """
    chrome_flags = (
        "--disable-blink-features=AutomationControlled",
        "--window-size=1280,720",
        "--log-level=3",
    )
    opts = uc.ChromeOptions()
    for flag in chrome_flags:
        opts.add_argument(flag)
    return uc.Chrome(options=opts, version_main=138)

def clean_text(text):
    """Return *text* with whitespace collapsed and double quotes normalised.

    Falsy input (``None`` or an empty string) yields ``""``. Otherwise every
    run of whitespace becomes a single space, non-breaking spaces become
    regular spaces, and ``"`` is replaced by ``'``.
    """
    if not text:
        return ""
    collapsed = " ".join(text.split())
    normalised = collapsed.replace("\xa0", " ").replace('"', "'")
    return normalised.strip()

# ================= Certificate Extraction =================
def extract_certificate_image(soup):
    """Return the ``src`` of the first image inside the certificate block.

    The block is the first ``<div>`` whose ``id`` contains "cerificate"
    (case-insensitive).
    NOTE(review): "cerificate" looks like a misspelling of "certificate";
    presumably it mirrors the id used in the page markup -- confirm.

    Returns "Certificate image not found" when the div, the ``<img>`` or its
    ``src`` attribute is missing.
    """
    not_found = "Certificate image not found"

    def _id_matches(value):
        return value and "cerificate" in value.lower()

    container = soup.find("div", id=_id_matches)
    if not container:
        return not_found

    image = container.find("img")
    if not (image and "src" in image.attrs):
        return not_found
    return image["src"]

# ================= Syllabus Extraction =================
def extract_syllabus(soup):
    """Render the curriculum section of a course page as indented plain text.

    Looks up ``<div id="curriculum">`` and, for each module ``<li>`` whose
    class attribute is "bg-white my-2 shadow-lg", emits the module title
    (first ``<span>``) followed by the direct children of the module's
    ``<div class="mx-6 pl-6">``: paragraphs, bullet lists and bare text.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        The formatted syllabus, or "Syllabus not available" when the section
        is missing or nothing could be extracted from it.
    """
    curriculum_div = soup.find("div", id="curriculum")
    if curriculum_div is None:
        return "Syllabus not available"

    syllabus = []
    for module in curriculum_div.find_all("li", class_="bg-white my-2 shadow-lg"):
        # Module title
        title_span = module.find("span")
        if title_span:
            module_title = clean_text(title_span.get_text())
            syllabus.append(f"\n🔹 {module_title}\n")

        # Module content
        content_div = module.find("div", class_="mx-6 pl-6")
        if content_div:
            for elem in content_div.children:
                if elem.name == "p":  # Paragraphs
                    strong = elem.find("strong")
                    if strong:
                        # A <strong> inside a paragraph is rendered as a sub-heading,
                        # with the rest of the paragraph as its body line.
                        strong_text = clean_text(strong.get_text())
                        syllabus.append(f"\n📌 {strong_text}\n")
                        remaining_text = clean_text(elem.text.replace(strong.text, ""))
                        if remaining_text:
                            syllabus.append(f"   {remaining_text}\n")
                    else:
                        text = clean_text(elem.get_text())
                        if text:
                            syllabus.append(f"   {text}\n")
                elif elem.name == "ul":  # Bullet lists
                    for li in elem.find_all("li"):
                        li_text = clean_text(li.get_text())
                        if li_text:
                            syllabus.append(f"   - {li_text}\n")
                elif elem.string and elem.string.strip():
                    # Bare text nodes, or tags wrapping a single string.
                    text = clean_text(elem.string)
                    if text:
                        syllabus.append(f"   {text}\n")
        syllabus.append("\n")  # blank line between modules

    # The per-module separator makes the list non-empty even when no text was
    # extracted, so choose the fallback from the rendered text, not the list.
    rendered = ''.join(syllabus).strip()
    return rendered or "Syllabus not available"

# ================= FAQ Extraction =================
def extract_faq_text(soup):
    """Collect the FAQ accordion of a course page as "Q: ... / A: ..." text.

    Each ``<li>`` whose class attribute is "bg-white my-2" is treated as one
    FAQ entry: the question is its first ``<h2>`` and the answer is the text
    of every ``<p>`` inside its ``<div x-ref="tab">``, joined with spaces.
    Paragraphs starting with "looking for as an it security analyst"
    (case-insensitive) are dropped.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        Entries separated by blank lines, or "FAQ not available" when no
        entry has a question or an answer.
    """

    def _text_of(tag):
        # get_text(strip=True) joins fragments with "" and glues words that are
        # separated only by inline tags; normalise whitespace on the full text.
        return " ".join(tag.get_text().split())

    faq_text_list = []
    for li in soup.find_all("li", class_="bg-white my-2"):
        question_tag = li.find("h2")
        question = _text_of(question_tag) if question_tag else ""

        answer = ""
        answer_div = li.find("div", {"x-ref": "tab"})
        if answer_div:
            answer_parts = []
            for p in answer_div.find_all("p"):
                text = _text_of(p)
                if text and not text.lower().startswith("looking for as an it security analyst"):
                    answer_parts.append(text)
            answer = " ".join(answer_parts)

        if question or answer:
            faq_text_list.append(f"Q: {question}\nA: {answer}")
    return "\n\n".join(faq_text_list) if faq_text_list else "FAQ not available"

# ================= Course Data Extraction =================
def extract_course_data(url):
    """Scrape one course page and return its fields as a dict.

    Opens *url* in a fresh driver, sleeps 5 s, scrolls down in three 500 px
    steps (1 s apart) and parses the resulting page source.

    Args:
        url: Address of the course page.

    Returns:
        Dict with the keys 'Course Name', 'About Course', 'Syllabus',
        'Certificate Image URL', 'FAQ' and 'URL'. If anything raises, the
        error is printed and every field except 'URL' is 'Error'.

    The driver, if it was created, is always quit before returning.
    """
    driver = None
    try:
        driver = get_driver()
        driver.get(url)

        time.sleep(5)  # fixed wait for the initial render

        # NOTE(review): the scrolling presumably triggers lazily loaded
        # sections -- confirm against the site.
        for _ in range(3):
            driver.execute_script("window.scrollBy(0, 500)")
            time.sleep(1)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Explicit None checks instead of bare ``except:``, which would also
        # swallow KeyboardInterrupt and hide unrelated bugs.
        heading = soup.find("h1")
        course_name = clean_text(heading.get_text()) if heading is not None else "Not Found"

        about_course = "Not Found"
        about_div = soup.find("div", class_="font-ad")
        if about_div is not None:
            about_paragraph = about_div.find("p")
            if about_paragraph is not None:
                about_course = clean_text(about_paragraph.get_text())

        return {
            'Course Name': course_name,
            'About Course': about_course,
            'Syllabus': extract_syllabus(soup),
            'Certificate Image URL': extract_certificate_image(soup),
            'FAQ': extract_faq_text(soup),
            'URL': url
        }

    except Exception as e:
        # Per-URL boundary: one failing page must not abort a batch run.
        print(f"Error occurred while scraping {url}: {str(e)}")
        return {
            'Course Name': 'Error',
            'About Course': 'Error',
            'Syllabus': 'Error',
            'Certificate Image URL': 'Error',
            'FAQ': 'Error',
            'URL': url
        }
    finally:
        if driver is not None:
            driver.quit()

# ================= Process Input File =================
def process_input_file(input_file, output_file):
    """Scrape every URL listed in an Excel sheet and save name and syllabus.

    Args:
        input_file: Path of an Excel workbook with a 'URL' column.
        output_file: Path of the Excel workbook to write. It gets the columns
            'Course Name', 'Syllabus' and 'Course UR', one row per course.

    Raises:
        KeyError: If the input sheet has no 'URL' column.

    Rows whose URL cell is empty are skipped instead of launching a browser
    for them. Results are written only after all URLs have been processed.
    """
    df_input = pd.read_excel(input_file)
    all_data = []
    for url in df_input['URL']:
        # Blank cells are read as NaN; they would only yield an 'Error' row.
        if pd.isna(url) or not str(url).strip():
            print("Skipping row with empty URL")
            continue

        course_data = extract_course_data(url)

        # Keep only the required columns. The header "Course UR" is kept
        # exactly as before so existing output files stay compatible.
        filtered_data = {
            'Course Name': course_data['Course Name'],
            'Syllabus': course_data['Syllabus'],
            'Course UR': course_data['URL']
        }

        # Progress output for debugging.
        print(f"Course Name: {filtered_data['Course Name']}")
        print(f"Course UR: {filtered_data['Course UR']}")
        print(f"Syllabus:\n{filtered_data['Syllabus']}\n")

        all_data.append(filtered_data)

    df_output = pd.DataFrame(all_data)
    df_output.to_excel(output_file, index=False)
    print(f"Data saved to {output_file}")

# ================= Run Script =================
if __name__ == "__main__":
    # Batch run: read course URLs from one workbook and write the scraped
    # rows to another.
    input_file = r"C:\Users\taslim.siddiqui\Downloads\input file.xlsx"
    output_file = r"C:\Users\taslim.siddiqui\Downloads\HENRY_course_new3 .xlsx"
    process_input_file(input_file=input_file, output_file=output_file)


Course Name: Certificate Course in Children Human Rights
Course UR: https://www.henryharvin.com/children-human-rights-course
Syllabus:
🔹 Module 1: International standards and monitoring systems
   To understand International standards and monitoring systems
   - Introduction
   - The United Nations Convention on the Rights of the Child: from conception until adoption.
   - General Principles of CRC
   - Main content of the CRC and the related States obligations
   - Role, structure, operations, and methodology of the UN Committee on the Rights of the Child
   - The CRC Communications procedure
   - Overview regional monitoring system
   - The role of the INHRIs on Childrens rights in the monitoring process at national and local level
   - NGOs role in the monitoring process
   - Children's subjective indicators: childrens participation in monitoring
   - Quiz Module 1


🔹 Module 2: The evolution of children's rights within the framework of human rights
   To understand the the evolutio