Henry Harvin Link Extraction

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

def extract_courses(html_content, base_url):
    """Extract course names and links from an HTML document.

    An ``<a>`` tag with an ``href`` is kept when all of the following hold:

    * the href contains one of ``course``, ``training``, ``certification``
      or ``program`` (case-insensitive);
    * the href contains none of ``blog``, ``category``, ``testimonial``,
      ``about`` or ``contact`` (case-insensitive);
    * the tag's stripped text is at least three characters long.

    Args:
        html_content: HTML markup handed to BeautifulSoup's ``html.parser``.
        base_url: Base URL used to turn relative hrefs into absolute URLs.

    Returns:
        A pandas DataFrame with the columns "Course Name" and "Course Link",
        de-duplicated on the (link, name) pair, sorted by course name and
        re-indexed from 0. When no link qualifies, the frame is empty but
        still carries both columns.
    """
    columns = ["Course Name", "Course Link"]

    # Compile once; both patterns are reused for every anchor in the page.
    wanted = re.compile(r'course|training|certification|program', re.I)
    unwanted = re.compile(r'blog|category|testimonial|about|contact', re.I)

    soup = BeautifulSoup(html_content, 'html.parser')
    courses = []

    # Walk every hyperlink that actually has an href attribute.
    for link in soup.find_all('a', href=True):
        href = link['href'].strip()
        text = link.get_text(strip=True)

        # Skip anchors whose text is too short to be a course title.
        if len(text) < 3:
            continue
        # Skip anchors that do not look like course pages, or that point at
        # site sections we explicitly do not want.
        if not wanted.search(href) or unwanted.search(href):
            continue

        courses.append({
            "Course Name": re.sub(r'\s+', ' ', text),  # Collapse whitespace runs
            "Course Link": urljoin(base_url, href)     # Resolve to a full URL
        })

    # Passing explicit columns keeps the frame well-formed when `courses` is
    # empty; without them drop_duplicates(subset=...) raises KeyError.
    df = pd.DataFrame(courses, columns=columns)

    # A row is a duplicate only when BOTH the link and the name match.
    df = df.drop_duplicates(subset=["Course Link", "Course Name"])
    return df.sort_values("Course Name").reset_index(drop=True)

def main(
    input_path=r"C:\Users\taslim.siddiqui\Downloads\1200+ Certification Courses with Gold Membership-Henry Harvin.html",
    output_path=r"C:\Users\taslim.siddiqui\Downloads\HenryHarvin_Courses_Complete.xlsx",
    base_url="https://www.henryharvin.com",
):
    """Read a saved HTML page, extract its course links and write them to Excel.

    The defaults reproduce the original hard-coded locations, so calling
    ``main()`` with no arguments behaves as before.

    Args:
        input_path: Path of the saved HTML file, read as UTF-8.
        output_path: Path of the Excel workbook to write.
        base_url: Base URL used to resolve relative course links.
    """
    # Load HTML file
    with open(input_path, "r", encoding="utf-8") as f:
        html = f.read()

    # Extract courses
    df = extract_courses(html, base_url=base_url)

    # Save to Excel (row index is omitted from the sheet)
    df.to_excel(output_path, index=False)

    # Short console report: count, destination and a preview of the first rows
    print(f"✅ Extracted {len(df)} courses!")
    print(f"📂 Saved to: {output_path}\n")
    print("Sample Courses:")
    print(df.head(10).to_string(index=False))

# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()