Eduonix CPP link extraction


In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
import re
from collections import deque

# --- Crawl configuration ---------------------------------------------------

# Site root; it is also the single seed URL placed in the crawl queue below.
base_url = "https://www.eduonix.com"

# HTTP headers used for page requests.
headers = {"User-Agent": "Mozilla/5.0"}

# A course-like URL is the site root followed by exactly one path segment made
# of letters, digits and hyphens (matched case-insensitively).
course_url_regex = re.compile(
    r"https://www\.eduonix\.com/[a-z0-9\-]+$",
    re.IGNORECASE,
)

# Substrings that disqualify a URL (tested against the lower-cased URL).
excluded = [
    # account, checkout and static/informational pages
    "/login",
    "/signup",
    "/cart",
    "/checkout",
    "/dashboard",
    "/profile",
    "/terms",
    "/privacy",
    "/contact",
    "/about",
    "/faq",
    "/category",
    # asset files, non-HTTP schemes and in-page fragments
    ".jpg",
    ".png",
    ".svg",
    ".pdf",
    "tel:",
    "mailto:",
    "javascript:",
    "#",
    # other site sections that are filtered out
    "lifetime",
    "edegree",
    "infiniti",
    "deals",
    "freebies",
    "upcoming",
]

# --- Mutable crawl state ---------------------------------------------------

visited = set()            # URLs that have already been taken off the queue
queue = deque([base_url])  # FIFO of URLs still waiting to be processed
courses = []               # collected {"Course Name", "Course Link"} records

def is_course_page(url, text) -> bool:
    """Decide whether an anchor looks like a link to an individual course.

    Args:
        url: Absolute URL the anchor points to.
        text: Visible anchor text; may be empty or ``None``.

    Returns:
        ``True`` only when all of the following hold, ``False`` otherwise
        (always a real ``bool``, never ``None``/``""``):

        * the trimmed anchor text is longer than 5 characters,
        * the URL starts with ``base_url``,
        * the lower-cased URL contains none of the ``excluded`` substrings,
        * the URL matches ``course_url_regex``,
        * the URL does not end in ``"course"`` or ``"courses"``.
    """
    # Very short (or missing) anchor text is rejected outright.
    if not text or len(text.strip()) <= 5:
        return False

    if not url.startswith(base_url):
        return False

    lowered = url.lower()
    if any(marker in lowered for marker in excluded):
        return False

    if course_url_regex.match(url) is None:
        return False

    # NOTE(review): this suffix test also rejects slugs that merely end in
    # "course" (e.g. ".../python-course"); presumably it is aimed at listing
    # pages -- confirm before narrowing it.
    return not url.endswith(('courses', 'course'))

def should_crawl(url) -> bool:
    """Decide whether a discovered URL should be added to the crawl queue.

    Args:
        url: Absolute URL found on a page.

    Returns:
        ``True`` when the URL is on the site rooted at ``base_url``, contains
        none of the ``excluded`` substrings, is not yet in ``visited`` and
        does not end in ``"course"``/``"courses"``; ``False`` otherwise.
    """
    if not url.startswith(base_url):
        return False

    # A bare prefix test would also accept look-alike hosts such as
    # "https://www.eduonix.com.evil.example/". Require that whatever follows
    # the root is the end of the URL or the start of a path/query/fragment.
    root = base_url.rstrip("/")
    if url[len(root):len(root) + 1] not in ("", "/", "?", "#"):
        return False

    lowered = url.lower()
    if any(marker in lowered for marker in excluded):
        return False

    if url in visited:
        return False

    return not url.endswith(('courses', 'course'))

# Breadth-first crawl: take URLs off `queue`, fetch each page once, queue the
# on-site links it contains and record every anchor that looks like a course.
MAX_PAGES = 100  # Limit to 100 pages to prevent excessive crawling

# Every URL that has ever been queued. Without this, a link that appears on
# many pages is queued once per appearance and then fetched again each time
# it is popped.
enqueued = set(queue)

while queue and len(visited) < MAX_PAGES:
    current_url = queue.popleft()
    if current_url in visited:
        # Defensive guard: never fetch the same page twice.
        continue
    visited.add(current_url)

    # Best-effort crawl: any failure on one page is reported and the crawl
    # moves on to the next queued URL.
    try:
        print(f"🔍 Scanning: {current_url}")
        r = requests.get(current_url, headers=headers, timeout=10)
        # Do not mine links out of HTTP error pages (4xx/5xx); the handler
        # below reports the page as skipped instead.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        # Examine every <a> tag that carries an href.
        for a in soup.find_all("a", href=True):
            # Resolve relative links against the page actually served (after
            # redirects), not against the site root.
            href = urljoin(r.url, a["href"].strip())
            text = a.get_text(strip=True)

            if href not in enqueued and should_crawl(href):
                enqueued.add(href)
                queue.append(href)

            if is_course_page(href, text):
                courses.append({
                    "Course Name": text,
                    "Course Link": href
                })

        # No separate pass over 'a[href*="/courses/"]' is needed: those
        # anchors are a subset of the ones handled above and go through the
        # same is_course_page() filter, so it could only add duplicates.

    except Exception as e:
        print(f"⚠️ Skipping {current_url}: {e}")

# Deduplicate on the link, keeping the first name seen for each course.
# The columns are pinned explicitly so that an empty `courses` list still
# produces a frame with a "Course Link" column; otherwise drop_duplicates()
# raises KeyError when nothing was scraped.
df = (
    pd.DataFrame(courses, columns=["Course Name", "Course Link"])
    .drop_duplicates(subset="Course Link")
    .reset_index(drop=True)
)

# Save to Excel
# NOTE(review): machine-specific absolute Windows path; adjust before running
# elsewhere.
output_path = "C:\\Users\\taslim.siddiqui\\Downloads\\Eduonix_Course_Links.xlsx"
df.to_excel(output_path, index=False)

# Report what was extracted and where it was written.
print(f"\n✅ Extracted {len(df)} real course links.")
print(f"📁 Saved to: {output_path}")
print("\nSample of extracted courses:")
print(df.head(10).to_string(index=False))