Extract course duration and "Who Should Attend" details

In [1]:
import time
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# -------------------- DRIVER SETUP --------------------
def get_driver(headless=False):
    """Create a Chrome WebDriver whose chromedriver binary is resolved by webdriver_manager.

    Args:
        headless: When true, "--headless" is passed to Chrome ahead of the
            other command-line arguments.

    Returns:
        The ``webdriver.Chrome`` instance; the caller is responsible for
        calling ``quit()`` on it.
    """
    # Arguments are collected first so they are applied in one place,
    # in the same order as before: headless flag (if any), then the rest.
    chrome_args = ["--headless"] if headless else []
    chrome_args += [
        "--disable-blink-features=AutomationControlled",
        "--start-maximized",
    ]

    options = webdriver.ChromeOptions()
    for chrome_arg in chrome_args:
        options.add_argument(chrome_arg)

    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )

# -------------------- CLEANING FUNCTION --------------------
def clean_text(text):
    """Strip HTML tags from *text* and collapse all whitespace runs to single spaces.

    Args:
        text: Raw string, possibly containing markup; may be None or empty.

    Returns:
        The cleaned string, or None when the input is empty/None or when
        nothing but tags and whitespace was present.
    """
    if not text:
        return None
    # `[^>]*` (unlike `.*?`) also crosses newlines, so a tag whose attributes
    # are wrapped over several lines is removed as a whole.
    without_tags = re.sub(r"<[^>]*>", "", text)
    # Collapse whitespace runs to one space and trim the ends.
    cleaned = re.sub(r"\s+", " ", without_tags).strip()
    # Report "nothing left" the same way as empty input: None, not "".
    return cleaned or None

# -------------------- WHO SHOULD ATTEND EXTRACTION --------------------
def extract_who_should_attend(soup):
    """Return the page's "who should attend" bullet points as newline-joined text.

    Two strategies are tried in order:

    1. The first <strong>/<span> whose own string matches
       "Who Should Attend the Course?"; the non-empty items of the next <ul>
       after it are returned as-is.
    2. Every heading (<h1>-<h6>) or <strong> whose text matches one of the
       generic audience phrases; the first such heading whose following <ul>
       has non-empty items wins, and its items are de-duplicated
       case-insensitively (first occurrence kept).

    Args:
        soup: Parsed page exposing the BeautifulSoup search API
            (``find`` / ``find_all``).

    Returns:
        str: The list items separated by newlines, or
        "N/A - Section not found" when neither strategy yields any item.
    """
    # Step 1: look for <strong> or <span> containing 'Who Should Attend the Course?'
    candidate = soup.find(
        ['strong', 'span'],
        string=re.compile(r"Who Should Attend the Course\?", re.I),
    )
    if candidate is not None:
        ul = candidate.find_next('ul')
        if ul is not None:
            who_list = [
                li.get_text(" ", strip=True)
                for li in ul.find_all('li')
                if li.get_text(strip=True)
            ]
            if who_list:
                return "\n".join(who_list)

    # Step 2: fallback to general patterns, combined into one compiled regex
    # so each heading is scanned once.
    patterns = [
        r"Who Should Attend",
        r"Who Should Enroll",
        r"Who can Enroll",
        r"Target Audience",
        r"Ideal Candidates?",
        r"Who Can Join",
        r"Eligibility",
        r"Who is this (course|program) for",
    ]
    heading_re = re.compile("|".join(patterns), re.I)

    for header in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'strong']):
        if not heading_re.search(header.get_text(strip=True)):
            continue

        ul = header.find_next('ul')
        if ul is None:
            # Matching heading without a following list: keep looking instead
            # of giving up on the whole page.
            continue

        item_texts = [li.get_text(" ", strip=True) for li in ul.find_all('li')]
        item_texts = [item for item in item_texts if item]
        if not item_texts:
            # An empty list is not an answer either; try the next heading.
            continue

        # Case-insensitive de-duplication that keeps the first spelling seen.
        seen = set()
        unique_items = []
        for item in item_texts:
            normalized = item.lower().strip()
            if normalized not in seen:
                seen.add(normalized)
                unique_items.append(item.strip())
        return "\n".join(unique_items)

    return "N/A - Section not found"

# -------------------- DURATION EXTRACTION --------------------
def extract_duration_info(soup):
    """Collect up to three distinct duration phrases from a parsed course page.

    Three lookups are attempted, each only if the previous one found nothing:

    1. <h5> elements with the purple 16px styling inside the white
       "shadow ... box-border" cards, kept when their text mentions a
       duration keyword.
    2. Any <h5> on the page whose text mentions a duration keyword.
    3. The full text of the parent of any string containing a number
       followed by hours/hrs/sessions/weeks/months (parents named
       ``script`` or ``style`` are skipped).

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find_all`` API.

    Returns:
        str: The first three unique phrases joined with " | ", or
        "N/A - Duration not found" when all three lookups come up empty.
    """
    keywords = ('hour', 'session', 'week', 'month', 'duration')
    amount_re = re.compile(r'\d+\s*(Hours?|Hrs|Sessions?|Weeks?|Months?)', re.I)
    card_class_re = re.compile(
        r'shadow.*box-border.*relative.*mt-2.*rounded-md.*bg-white.*py-2.*px-2'
    )
    label_class_re = re.compile(r'text-\[16px\].*font-medium.*text-purple')

    def mentions_duration(text):
        # Plain substring test against the lower-cased text.
        lowered = text.lower()
        return any(word in lowered for word in keywords)

    found = []

    # Lookup 1: styled <h5> labels inside the duration cards.
    for card in soup.find_all('div', class_=card_class_re):
        label_tag = card.find('h5', class_=label_class_re)
        if not label_tag:
            continue
        label = label_tag.get_text(strip=True)
        if mentions_duration(label):
            found.append(label)

    # Lookup 2: any <h5> at all.
    if not found:
        h5_texts = (h5.get_text(strip=True) for h5 in soup.find_all('h5'))
        found = [h5_text for h5_text in h5_texts if mentions_duration(h5_text)]

    # Lookup 3: free text that looks like "<number> hours/sessions/...".
    if not found:
        for node in soup.find_all(string=amount_re):
            if not hasattr(node, 'parent'):
                continue
            holder = node.parent
            if holder.name in ('script', 'style'):
                continue
            holder_text = holder.get_text(strip=True)
            if amount_re.search(holder_text):
                found.append(holder_text)

    if not found:
        return "N/A - Duration not found"

    # dict.fromkeys drops exact duplicates while keeping first-seen order.
    unique_duration = list(dict.fromkeys(found))
    return " | ".join(unique_duration[:3])

# -------------------- SCRAPER FUNCTION --------------------
def scrape_course_data(course_url):
    """Load one course page in Chrome and extract its name, audience and duration.

    Args:
        course_url: URL of the course page to open.

    Returns:
        dict: Always contains "Course Name", "Course Link",
        "Who Should Attend" and "Duration". If anything fails (starting the
        browser, loading the page, or parsing), the name is None, the two
        extracted fields are "N/A", and an extra "Error" key holds the
        exception message, so one bad URL does not abort a batch run.
    """
    try:
        driver = get_driver(headless=False)
        try:
            driver.get(course_url)
            time.sleep(3)  # fixed pause so the page can finish rendering
            page_html = driver.page_source
        finally:
            # Always shut the browser down, even when loading failed;
            # otherwise the Chrome process is leaked.
            driver.quit()

        soup = BeautifulSoup(page_html, "html.parser")

        title_tag = soup.find("h1")
        course_name = clean_text(title_tag.get_text()) if title_tag else None

        who_should_attend = extract_who_should_attend(soup)
        duration_info = extract_duration_info(soup)

        return {
            "Course Name": course_name,
            "Course Link": course_url,
            "Who Should Attend": who_should_attend,
            "Duration": duration_info
        }

    except Exception as e:
        # Per-URL boundary: record the failure in the row instead of raising.
        return {
            "Course Name": None,
            "Course Link": course_url,
            "Who Should Attend": "N/A",
            "Duration": "N/A",
            "Error": str(e)
        }

# -------------------- MAIN --------------------
# Pages to scrape in this run.
urls = ["https://www.henryharvin.com/car-designing-course"]

# Destination workbook (raw string because of the Windows backslashes).
output_file = r"C:\Users\taslim.siddiqui\Downloads\HenryHarvin_Personality_Development.xlsx"

all_data = []
for url in urls:
    print(f"\nScraping: {url}")
    course_data = scrape_course_data(url)
    # Echo the scraped fields, one per line, before storing the row.
    print(
        f"Course Name: {course_data['Course Name']}",
        f"Duration: {course_data['Duration']}",
        "Who Should Attend:\n" + course_data['Who Should Attend'],
        f"Course Link: {course_data['Course Link']}",
        sep="\n",
    )
    all_data.append(course_data)

# One row per scraped URL, written without the index column.
df = pd.DataFrame(all_data)
df.to_excel(output_file, index=False)
print(f"\n✅ Data saved to {output_file}")



Scraping: https://www.henryharvin.com/car-designing-course
Course Name: Car Designing Course
Duration: 48 Hours of Instructor-Led Sessions | 16 Hours of Live Interactive Doubt Solving Sessions | 8 Hours of Live Master Sessions by Industry Experts
Who Should Attend:
Aspiring Car Designer
Fresh Graduates
Design Intern
Vehicle Artists
Product Designer
Course Link: https://www.henryharvin.com/car-designing-course

✅ Data saved to C:\Users\taslim.siddiqui\Downloads\HenryHarvin_Personality_Development.xlsx


In [33]:
import time
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# -------------------- DRIVER SETUP --------------------
def get_driver(headless=False):
    """Create a Chrome WebDriver whose chromedriver binary is resolved by webdriver_manager.

    Args:
        headless: When true, "--headless" is passed to Chrome ahead of the
            other command-line arguments.

    Returns:
        The ``webdriver.Chrome`` instance; the caller is responsible for
        calling ``quit()`` on it.
    """
    # Arguments are collected first so they are applied in one place,
    # in the same order as before: headless flag (if any), then the rest.
    chrome_args = ["--headless"] if headless else []
    chrome_args += [
        "--disable-blink-features=AutomationControlled",
        "--start-maximized",
    ]

    options = webdriver.ChromeOptions()
    for chrome_arg in chrome_args:
        options.add_argument(chrome_arg)

    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )

# -------------------- CLEANING FUNCTION --------------------
def clean_text(text):
    """Strip HTML tags from *text* and collapse all whitespace runs to single spaces.

    Args:
        text: Raw string, possibly containing markup; may be None or empty.

    Returns:
        The cleaned string, or None when the input is empty/None or when
        nothing but tags and whitespace was present.
    """
    if not text:
        return None
    # `[^>]*` (unlike `.*?`) also crosses newlines, so a tag whose attributes
    # are wrapped over several lines is removed as a whole.
    without_tags = re.sub(r"<[^>]*>", "", text)
    # Collapse whitespace runs to one space and trim the ends.
    cleaned = re.sub(r"\s+", " ", without_tags).strip()
    # Report "nothing left" the same way as empty input: None, not "".
    return cleaned or None


# -------------------- WHO SHOULD ATTEND EXTRACTION --------------------
def extract_who_should_attend(soup):
    """Return the page's "who should attend" bullet points as newline-joined text.

    Two strategies are tried in order:

    1. The first <strong>/<span> whose own string matches
       "Who Should Attend the Course?"; the non-empty items of the next <ul>
       after it are returned as-is.
    2. Every heading (<h1>-<h6>) or <strong> whose text matches one of the
       generic audience phrases; the first such heading whose following <ul>
       has non-empty items wins, and its items are de-duplicated
       case-insensitively (first occurrence kept).

    Args:
        soup: Parsed page exposing the BeautifulSoup search API
            (``find`` / ``find_all``).

    Returns:
        str: The list items separated by newlines, or
        "N/A - Section not found" when neither strategy yields any item.
    """
    # Step 1: look for <strong> or <span> containing 'Who Should Attend the Course?'
    candidate = soup.find(
        ['strong', 'span'],
        string=re.compile(r"Who Should Attend the Course\?", re.I),
    )
    if candidate is not None:
        ul = candidate.find_next('ul')
        if ul is not None:
            who_list = [
                li.get_text(" ", strip=True)
                for li in ul.find_all('li')
                if li.get_text(strip=True)
            ]
            if who_list:
                return "\n".join(who_list)

    # Step 2: fallback to general patterns, combined into one compiled regex
    # so each heading is scanned once.
    patterns = [
        r"Who Should Attend",
        r"Who Should Enroll",
        r"Who can Enroll",
        r"Target Audience",
        r"Ideal Candidates?",
        r"Who Can Join",
        r"Eligibility",
        r"Who is this (course|program) for",
    ]
    heading_re = re.compile("|".join(patterns), re.I)

    for header in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'strong']):
        if not heading_re.search(header.get_text(strip=True)):
            continue

        ul = header.find_next('ul')
        if ul is None:
            # Matching heading without a following list: keep looking instead
            # of giving up on the whole page.
            continue

        item_texts = [li.get_text(" ", strip=True) for li in ul.find_all('li')]
        item_texts = [item for item in item_texts if item]
        if not item_texts:
            # An empty list is not an answer either; try the next heading.
            continue

        # Case-insensitive de-duplication that keeps the first spelling seen.
        seen = set()
        unique_items = []
        for item in item_texts:
            normalized = item.lower().strip()
            if normalized not in seen:
                seen.add(normalized)
                unique_items.append(item.strip())
        return "\n".join(unique_items)

    return "N/A - Section not found"

# -------------------- DURATION EXTRACTION --------------------
def extract_duration_info(soup):
    """Collect up to three distinct duration phrases from a parsed course page.

    Three lookups are attempted, each only if the previous one found nothing:

    1. <h5> elements with the purple 16px styling inside the white
       "shadow ... box-border" cards, kept when their text mentions a
       duration keyword.
    2. Any <h5> on the page whose text mentions a duration keyword.
    3. The full text of the parent of any string containing a number
       followed by hours/hrs/sessions/weeks/months (parents named
       ``script`` or ``style`` are skipped).

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find_all`` API.

    Returns:
        str: The first three unique phrases joined with " | ", or
        "N/A - Duration not found" when all three lookups come up empty.
    """
    keywords = ('hour', 'session', 'week', 'month', 'duration')
    amount_re = re.compile(r'\d+\s*(Hours?|Hrs|Sessions?|Weeks?|Months?)', re.I)
    card_class_re = re.compile(
        r'shadow.*box-border.*relative.*mt-2.*rounded-md.*bg-white.*py-2.*px-2'
    )
    label_class_re = re.compile(r'text-\[16px\].*font-medium.*text-purple')

    def mentions_duration(text):
        # Plain substring test against the lower-cased text.
        lowered = text.lower()
        return any(word in lowered for word in keywords)

    found = []

    # Lookup 1: styled <h5> labels inside the duration cards.
    for card in soup.find_all('div', class_=card_class_re):
        label_tag = card.find('h5', class_=label_class_re)
        if not label_tag:
            continue
        label = label_tag.get_text(strip=True)
        if mentions_duration(label):
            found.append(label)

    # Lookup 2: any <h5> at all.
    if not found:
        h5_texts = (h5.get_text(strip=True) for h5 in soup.find_all('h5'))
        found = [h5_text for h5_text in h5_texts if mentions_duration(h5_text)]

    # Lookup 3: free text that looks like "<number> hours/sessions/...".
    if not found:
        for node in soup.find_all(string=amount_re):
            if not hasattr(node, 'parent'):
                continue
            holder = node.parent
            if holder.name in ('script', 'style'):
                continue
            holder_text = holder.get_text(strip=True)
            if amount_re.search(holder_text):
                found.append(holder_text)

    if not found:
        return "N/A - Duration not found"

    # dict.fromkeys drops exact duplicates while keeping first-seen order.
    unique_duration = list(dict.fromkeys(found))
    return " | ".join(unique_duration[:3])

# -------------------- SCRAPER FUNCTION --------------------
def scrape_course_data(course_url):
    """Load one course page in Chrome and extract its name, audience and duration.

    Args:
        course_url: URL of the course page to open.

    Returns:
        dict: Always contains "Course Name", "Course Link",
        "Who Should Attend" and "Duration". If anything fails (starting the
        browser, loading the page, or parsing), the name is None, the two
        extracted fields are "N/A", and an extra "Error" key holds the
        exception message, so one bad URL does not abort a batch run.
    """
    try:
        driver = get_driver(headless=False)
        try:
            driver.get(course_url)
            time.sleep(3)  # fixed pause so the page can finish rendering
            page_html = driver.page_source
        finally:
            # Always shut the browser down, even when loading failed;
            # otherwise the Chrome process is leaked.
            driver.quit()

        soup = BeautifulSoup(page_html, "html.parser")

        title_tag = soup.find("h1")
        course_name = clean_text(title_tag.get_text()) if title_tag else None
        who_should_attend = extract_who_should_attend(soup)
        duration_info = extract_duration_info(soup)

        return {
            "Course Name": course_name,
            "Course Link": course_url,
            "Who Should Attend": who_should_attend,
            "Duration": duration_info
        }
    except Exception as e:
        # Per-URL boundary: record the failure in the row instead of raising.
        return {
            "Course Name": None,
            "Course Link": course_url,
            "Who Should Attend": "N/A",
            "Duration": "N/A",
            "Error": str(e)
        }

# -------------------- MAIN --------------------
# 📥 Read input Excel file (must contain column "Course Link")
# Source and destination workbooks (raw strings because of the Windows backslashes).
input_file = r"C:\Users\taslim.siddiqui\Downloads\HenryHarvin_All_Courses (3).xlsx"
output_file = r"C:\Users\taslim.siddiqui\Downloads\HH_Duration and who should tak.xlsx"

df_input = pd.read_excel(input_file)

all_data = []
# Rows with an empty "Course Link" cell are skipped.
for url in df_input["Course Link"].dropna():
    print(f"\nScraping: {url}")
    course_data = scrape_course_data(url)
    # Echo the scraped fields, one per line, before storing the row.
    print(
        f"Course Name: {course_data['Course Name']}",
        f"Duration: {course_data['Duration']}",
        f"Who Should Attend: {course_data['Who Should Attend']}",
        f"Course Link: {course_data['Course Link']}",
        sep="\n",
    )
    all_data.append(course_data)

# One row per scraped URL, written without the index column.
df_out = pd.DataFrame(all_data)
df_out.to_excel(output_file, index=False)
print(f"\n✅ Data saved to {output_file}")



Scraping: https://www.henryharvin.com/3d-vfx-course
Course Name: WHY CHOOSE HENRY HARVIN® ANIMATION ACADEMY?
Duration: N/A - Duration not found
Who Should Attend: N/A - Section not found
Course Link: https://www.henryharvin.com/3d-vfx-course

Scraping: https://www.henryharvin.com/astb-test-prep-course
Course Name: ASTB Test Prep Course
Duration: 24 Hours of Instructor-Led Sessions | 8 Hours of Live Interactive Doubt Solving Sessions | 4 Hours of Live Master Sessions by Industry Experts
Who Should Attend: N/A - Section not found
Course Link: https://www.henryharvin.com/astb-test-prep-course

Scraping: https://www.henryharvin.com/asvab-prep-course
Course Name: ASVAB Preparation Course
Duration: 24 Hours of Instructor-Led Sessions | 8 Hours of Live Interactive Doubt Solving Sessions | 4 Hours of Live Master Sessions by Industry Experts
Who Should Attend: N/A - Section not found
Course Link: https://www.henryharvin.com/asvab-prep-course

Scraping: https://www.henryharvin.com/advanced-dipl