In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import json
from urllib.parse import urljoin
import os

def extract_courses_from_html(html_file_path, base_url='https://codingblocks.com/'):
    """
    Extract all course names and links from a Coding Blocks HTML file.

    Three strategies are applied in order and their results concatenated:
    JSON-LD ``Course`` nodes, course-card containers matched by CSS
    selectors, and anchors whose ``href`` contains a course-related keyword.
    The combined list is then de-duplicated by link, so the first strategy
    that reports a link decides the name and ``Source`` kept for it.

    Args:
        html_file_path: Path to the HTML file; it is read as UTF-8.
        base_url: Root against which relative links are resolved.

    Returns:
        A list of dicts with the keys ``'Course Name'``, ``'Course Link'``
        and ``'Source'``, one per distinct link, in discovery order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(html_file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), 'html.parser')

    courses = []
    courses.extend(_courses_from_json_ld(soup, base_url))
    courses.extend(_courses_from_cards(soup, base_url))
    courses.extend(_courses_from_keyword_links(soup, base_url))
    return _dedupe_courses(courses)


# Containers that may wrap a single course, tried in this order.
_CARD_SELECTORS = (
    'div.course-card', 'div.program-card', 'div.course-item', 'div.box',
    'div[class*="course"]', 'div[class*="program"]', 'div[class*="card"]',
)

# Places inside a card where the course title is looked for, in priority order.
_NAME_SELECTORS = (
    'h2', 'h3', 'h4', '.course-title', '.program-name',
    '.card-title', '.boxhead', '[class*="title"]', '[class*="name"]',
)

# Heading tags used as a last resort when no name selector yields a title.
_HEADING_TAGS = ['h2', 'h3', 'h4', 'h5']

# Substrings that mark an href as course-related (matched case-insensitively).
_LINK_KEYWORDS = ('course', 'program', 'bootcamp', 'training', 'learning-path')

# Names shorter than this are treated as noise (icons, "New", "Go", ...).
_MIN_NAME_CHARS = 4


def _course_record(name, link, source):
    """Build one result row in the shape returned to callers."""
    return {'Course Name': name, 'Course Link': link, 'Source': source}


def _absolute_url(href, base_url):
    """Return ``href`` as an absolute URL, or ``None`` if it cannot be parsed."""
    # Test for the full scheme prefix so that relative paths which merely
    # begin with "http" (e.g. "http-basics.html") are still resolved.
    if href.startswith(('http://', 'https://')):
        return href
    try:
        return urljoin(base_url, href)
    except ValueError:
        # urljoin rejects some malformed references (e.g. a broken IPv6
        # host); skip that link instead of aborting the whole extraction.
        return None


def _iter_json_ld_nodes(payload):
    """Yield every dict node in a JSON-LD payload, descending into lists and ``@graph``."""
    if isinstance(payload, list):
        for entry in payload:
            yield from _iter_json_ld_nodes(entry)
    elif isinstance(payload, dict):
        yield payload
        graph = payload.get('@graph')
        if isinstance(graph, (list, dict)):
            yield from _iter_json_ld_nodes(graph)


def _is_course_node(node):
    """Tell whether a JSON-LD node declares the type ``Course`` (string or list form)."""
    node_type = node.get('@type')
    if isinstance(node_type, list):
        return 'Course' in node_type
    return node_type == 'Course'


def _courses_from_json_ld(soup, base_url):
    """Collect courses declared as ``Course`` nodes in JSON-LD script tags."""
    found = []
    for script_tag in soup.find_all('script', type='application/ld+json'):
        raw = script_tag.string
        if not raw or not raw.strip():
            # Nothing to parse; an empty tag is not worth an error message.
            continue
        try:
            payload = json.loads(raw)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON-LD: {e}")
            continue

        for node in _iter_json_ld_nodes(payload):
            if not _is_course_node(node):
                continue
            name = node.get('name')
            url = node.get('url')
            # A null or structured name/url must only skip this node, not
            # the remaining nodes of the same script tag.
            if not isinstance(name, str) or not isinstance(url, str):
                continue
            name = name.strip()
            if not name or not url:
                continue
            link = _absolute_url(url, base_url)
            if link:
                found.append(_course_record(name, link, 'JSON-LD'))
    return found


def _card_name(card):
    """Return the first usable title inside a card, or ``''`` when there is none."""
    for name_selector in _NAME_SELECTORS:
        name_element = card.select_one(name_selector)
        if name_element:
            text = name_element.get_text(strip=True)
            if len(text) >= _MIN_NAME_CHARS:
                return text
    # Too-short candidates are discarded rather than remembered, so this
    # fallback is always reached when the selectors produce nothing usable.
    for heading in card.find_all(_HEADING_TAGS):
        text = heading.get_text(strip=True)
        if len(text) >= _MIN_NAME_CHARS:
            return text
    return ''


def _courses_from_cards(soup, base_url):
    """Collect courses from card-like containers matched by CSS selectors."""
    found = []
    for selector in _CARD_SELECTORS:
        for card in soup.select(selector):
            # Skip elements that are too small to be actual course cards.
            if len(card.get_text(strip=True)) < 20:
                continue
            anchor = card.find('a', href=True)
            if not anchor:
                continue
            name = _card_name(card)
            if not name:
                continue
            link = _absolute_url(anchor['href'], base_url)
            if link:
                found.append(
                    _course_record(name, link, f'CSS Selector: {selector}')
                )
    return found


def _courses_from_keyword_links(soup, base_url):
    """Collect courses from anchors whose href contains a course-related keyword."""
    found = []
    for keyword in _LINK_KEYWORDS:
        pattern = re.compile(keyword, re.I)
        for link in soup.find_all('a', href=pattern):
            text = link.get_text(strip=True)

            # Icon-only or very short anchors: borrow the nearest heading
            # from the closest enclosing div/section/li instead.
            if len(text) < 5:
                parent = link.find_parent(['div', 'section', 'li'])
                heading = parent.find(_HEADING_TAGS) if parent else None
                if heading:
                    text = heading.get_text(strip=True)

            if len(text) < _MIN_NAME_CHARS:
                continue
            full_url = _absolute_url(link['href'], base_url)
            if full_url:
                found.append(
                    _course_record(text, full_url, f'Keyword: {keyword}')
                )
    return found


def _dedupe_courses(courses):
    """Keep the first record seen for each link, ignoring case and trailing slashes."""
    unique_courses = []
    seen_links = set()
    for course in courses:
        # The normalised form is only a comparison key; the stored link
        # keeps its original spelling.
        key = course['Course Link'].lower().rstrip('/')
        if key in seen_links or not course['Course Name'].strip():
            continue
        seen_links.add(key)
        unique_courses.append(course)
    return unique_courses

def main(
    html_file_path=r"C:\Users\taslim.siddiqui\Downloads\Learn Coding from India's Leading Programming Institute.html",
    output_path=r"C:\Users\taslim.siddiqui\Downloads\CodingBlocks_Courses.xlsx",
):
    """
    Extract courses from a saved HTML page and write them to an Excel file.

    Progress and a sample of up to ten extracted courses are printed to
    stdout. Nothing is written when the input file is missing or when no
    courses are found.

    Args:
        html_file_path: Path of the saved HTML page to read.
        output_path: Path of the ``.xlsx`` file to create.

    Returns:
        None.
    """
    # isfile rather than exists: a directory at this path would pass an
    # existence check and then fail when the extractor tries to open it.
    if not os.path.isfile(html_file_path):
        print(f"Error: HTML file not found at {html_file_path}")
        return

    # Extract courses
    print("🔍 Extracting courses from HTML file...")
    courses = extract_courses_from_html(html_file_path)

    if not courses:
        print("❌ No courses found in the HTML file.")
        return

    # One row per course; the columns are the record keys.
    df = pd.DataFrame(courses)

    # Save to Excel
    df.to_excel(output_path, index=False)

    print(f"✅ Successfully extracted {len(df)} unique course links.")
    print(f"📁 Saved to: {output_path}")

    # Display sample of courses, read straight from the extracted records
    # (the same rows the DataFrame was built from).
    print("\n📋 Sample of courses found:")
    for i, course in enumerate(courses[:10], 1):
        print(f"{i}. {course['Course Name']}")
        print(f"   🔗 {course['Course Link']}")
        print(f"   📍 Source: {course['Source']}")
        print()

# Entry point: run the extraction only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

🔍 Extracting courses from HTML file...
✅ Successfully extracted 41 unique course links.
📁 Saved to: C:\Users\taslim.siddiqui\Downloads\CodingBlocks_Courses.xlsx

📋 Sample of courses found:
1. Master Data structures and Algorithms using C++
   🔗 https://codingblocks.com/data-structures-and-algorithms-using-c-plus-plus.html
   📍 Source: CSS Selector: div.box

2. Master Data structures and Algorithms using Java
   🔗 https://codingblocks.com/data-structures-and-algorithms-using-java.html
   📍 Source: CSS Selector: div.box

3. Master Data structures and Algorithms using Python
   🔗 https://codingblocks.com/data-structures-and-algorithms-using-python.html
   📍 Source: CSS Selector: div.box

4. Master Competitive programming
   🔗 https://codingblocks.com/competitive-programming.html
   📍 Source: CSS Selector: div.box

5. Master Interview prep with C++
   🔗 https://codingblocks.com/interview-preparation-using-c-plus-plus.html
   📍 Source: CSS Selector: div.box

6. Master Interview prep with Ja