# JUNO SCHOOL CPP LINK EXTRACTION

In [None]:
import re
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Crawl settings and shared state for this script
headers = {"User-Agent": "Mozilla/5.0"}  # sent with every HTTP request
base_url = "https://junoschool.org"  # site root; only URLs under it are kept
course_links = list()  # collected {"Course Name", "Course Link"} records
visited_urls = set()  # URLs that have already been queued for scanning

def is_course_url(url):
    """Return True when *url* looks like an individual course page.

    The comparison is case-insensitive. A URL qualifies when it contains at
    least one course path segment, contains none of the listing/taxonomy
    segments, and does not end with a course-index suffix.
    """
    lowered = url.lower()

    # At least one of these segments has to appear somewhere in the URL.
    required_segments = (
        '/free-certificate-course/',
        '/course/',
        '/certificate/',
        '/training/',
        '/learn/',
    )
    if not any(segment in lowered for segment in required_segments):
        return False

    # Any of these segments marks a listing, taxonomy or pagination page.
    forbidden_segments = (
        '/tag/',
        '/category/',
        '/page/',
        '/author/',
        '/archive/',
        '/search/',
    )
    if any(segment in lowered for segment in forbidden_segments):
        return False

    # Course index pages are listings, not courses.
    index_suffixes = ('/courses/', '/all-courses/')
    return not lowered.endswith(index_suffixes)

def _should_crawl(candidate_url):
    """Return True when an internal link has not been visited and is worth scanning."""
    lowered = candidate_url.lower()
    skip_tokens = ("wp-admin", "wp-login", "feed", ".jpg", ".png", ".pdf")
    return (candidate_url.startswith(base_url) and
            candidate_url not in visited_urls and
            not any(token in lowered for token in skip_tokens) and
            not re.search(r'/page/\d+', lowered))


def _collect_page_links(url):
    """Scan one page: record its course links and return its internal links.

    Course links found on the page are appended to the module-level
    ``course_links`` list. The return value is the list of absolute URLs of
    the page's internal anchors (unfiltered, in document order); it is empty
    when the page could not be fetched or processed.
    """
    try:
        print(f"🔍 Scanning: {url}")
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            print(f"⚠️ Failed to access {url} - Status code: {response.status_code}")
            return []

        soup = BeautifulSoup(response.text, "html.parser")

        # Relative hrefs are relative to the page they appear on (after any
        # redirect), not to the site root.
        page_url = response.url or url

        # Build the lookup once per page instead of rescanning the whole
        # result list for every anchor.
        known_links = {entry["Course Link"] for entry in course_links}
        nav_labels = ("home", "about", "contact", "login", "sign up")

        for anchor in soup.find_all("a", href=True):
            text = anchor.get_text(strip=True)

            # Skip if no text or obviously not a course
            if not text or len(text) < 3 or text.lower() in nav_labels:
                continue

            full_url = urljoin(page_url, anchor["href"])
            if not full_url.startswith(base_url) or full_url in known_links:
                continue
            if not is_course_url(full_url):
                continue

            # Collapse runs of whitespace inside the anchor text
            clean_name = re.sub(r'\s+', ' ', text).strip()

            # NOTE: for very short names the fallback title is taken from the
            # page being scanned, not from the linked course page.
            if len(clean_name) < 5:
                title_tag = soup.find('h1') or soup.find('h2') or soup.find('title')
                if title_tag:
                    clean_name = title_tag.get_text(strip=True)

            if clean_name:
                course_links.append({"Course Name": clean_name, "Course Link": full_url})
                known_links.add(full_url)
                print(f"✅ Found course: {clean_name} - {full_url}")

        # Only root-relative links or links starting with the site root are
        # candidates for further scanning.
        internal_anchors = soup.find_all("a", href=re.compile(r"^/|^" + re.escape(base_url)))
        return [urljoin(page_url, anchor["href"]) for anchor in internal_anchors]

    except Exception as e:
        # Best-effort crawl: a failure on one page must not stop the others.
        print(f"⚠️ Error processing {url}: {str(e)}")
        return []


def extract_courses_from_url(url):
    """Crawl the site starting at *url*, collecting course links along the way.

    The page at *url* is scanned, then every eligible internal link is
    followed depth-first in document order. Results are appended to the
    module-level ``course_links`` list and every followed URL is added to
    ``visited_urls``; progress is printed. Nothing is returned.

    Traversal uses an explicit stack of link iterators instead of recursion,
    so long link chains cannot hit the interpreter's recursion limit (which
    the per-page ``except Exception`` would otherwise swallow, silently
    cutting the crawl short).
    """
    pending = [iter(_collect_page_links(url))]
    while pending:
        next_url = next(pending[-1], None)
        if next_url is None:
            # Every link of the page on top of the stack has been handled.
            pending.pop()
            continue

        if _should_crawl(next_url):
            visited_urls.add(next_url)
            pending.append(iter(_collect_page_links(next_url)))

# Start crawling from known pages
start_urls = [
    base_url,
    base_url + "/free-certificate-courses/",
    base_url + "/courses/",
    base_url + "/all-courses/",
    base_url + "/free-certificate-course/",
]

for url in start_urls:
    # A start page may already have been reached while crawling an earlier one.
    if url not in visited_urls:
        visited_urls.add(url)
        extract_courses_from_url(url)

# Save results to Excel
if course_links:
    df = pd.DataFrame(course_links)
    df = df.drop_duplicates(subset=["Course Link"]).reset_index(drop=True)

    # Write to the current user's Downloads folder rather than one hard-coded
    # Windows profile, so the export works on any account or operating system.
    output_file = Path.home() / "Downloads" / "Juno_School_All_Courses.xlsx"
    output_file.parent.mkdir(parents=True, exist_ok=True)
    df.to_excel(output_file, index=False)

    print(f"\n✅ Successfully extracted {len(df)} unique courses!")
    print(f"📁 Saved to: {output_file}")
else:
    print("❌ No courses found. Please check the website structure.")
