CERTYBOX LINK EXTRACTION 

In [None]:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor

# --- Crawl settings ---
# Root URL of the site being crawled.
BASE_URL = "https://www.certybox.com"
# HTTP headers sent with every request (a desktop Chrome User-Agent string).
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}
# Number of worker threads handed to the thread pool.
MAX_THREADS = 8

# --- Shared crawl state (module-level, mutated while crawling) ---
# One {"Course Name": ..., "Course Link": ...} dict per course found.
course_data = []
# URLs that have already been processed, so no page is fetched twice.
visited_urls = set()

def is_course_page(url):
    """Return True when the URL's path points at a single course page.

    The lower-cased path must be exactly one segment beneath one of the
    known course prefixes, optionally followed by a trailing slash. Listing
    roots (e.g. ``/courses/``) and deeper paths do not qualify.
    """
    course_path_patterns = (
        r'^/courses/[^/]+/?$',
        r'^/course/[^/]+/?$',
        r'^/training/[^/]+/?$',
        r'^/certification/[^/]+/?$',
        r'^/program/[^/]+/?$',
    )
    lowered_path = urlparse(url.lower()).path
    for candidate in course_path_patterns:
        if re.match(candidate, lowered_path):
            return True
    return False

def is_discovery_page(url):
    """Return True unless the URL's path looks like a category-style page.

    A lower-cased path containing ``category`` (which also covers
    ``course-category``), ``topic`` or ``subject`` is rejected; any other
    URL is accepted as a page that may be crawled for course links.
    """
    lowered_path = urlparse(url.lower()).path
    category_hit = re.search(r'course-category|category|topic|subject', lowered_path)
    return category_hit is None

def extract_course_info(url):
    """Fetch a course page and return the course name shown on it.

    The page is searched with a list of heading selectors, most specific
    first, with the document ``<title>`` as the last resort. The first
    selector that yields non-empty text wins.

    Args:
        url: URL of the course page to fetch.

    Returns:
        The whitespace-normalised course name, or ``"Unknown Course Name"``
        when the page cannot be fetched/parsed or no selector yields text.
    """
    # Ordered from most to least specific; 'title' is the generic fallback.
    name_selectors = [
        'h1.product_title', 'h1.course-title', 'h1.entry-title',
        'h1.title', 'h1.page-title', 'title'
    ]

    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        # Without this, a 404/500 page's own <title> would be returned as
        # if it were the course name.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        for selector in name_selectors:
            element = soup.select_one(selector)
            if not element:
                continue
            course_name = re.sub(r'\s+', ' ', element.get_text(strip=True)).strip()
            # An empty heading is not a usable name; try the next selector.
            if course_name:
                return course_name
    except Exception as exc:
        # Name lookup is best-effort, so failures fall back to the
        # placeholder. `Exception` (unlike a bare `except:`) lets
        # KeyboardInterrupt and SystemExit propagate.
        print(f"Error fetching course name from {url}: {exc}")

    return "Unknown Course Name"

def process_page(url):
    """Crawl a listing page, and the pagination pages reachable from it.

    Every course found is appended to the module-level ``course_data`` list
    as a ``{"Course Name": ..., "Course Link": ...}`` dict. URLs already in
    the module-level ``visited_urls`` set are skipped, and each processed
    URL is added to it. A failure on one page is printed and does not stop
    the crawl.

    Args:
        url: URL of the first page to process.
    """
    # An explicit stack replaces recursion so a long chain of pagination
    # pages cannot exhaust the interpreter's recursion limit. Pushing the
    # links in reverse and popping from the end keeps the same depth-first
    # visiting order a recursive walk would have.
    pending = [url]
    while pending:
        current = pending.pop()
        if current in visited_urls:
            continue
        visited_urls.add(current)
        pending.extend(reversed(_scrape_listing_page(current)))


def _scrape_listing_page(url):
    """Record the courses on one page and return its pagination URLs to follow."""
    print(f"Processing: {url}")
    next_pages = []
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        # Do not harvest links from an HTTP error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Relative hrefs (e.g. "?page=2" or "page/2/") are relative to the
        # page they appear on, after any redirect, not to the site root.
        page_base = response.url

        # Method 1: Course cards
        for card in soup.find_all(class_=re.compile(r'course|product|training|certification', re.I)):
            link = card.find('a', href=True)
            if not link:
                continue
            full_url = urljoin(page_base, link['href'])
            if is_course_page(full_url):
                # Only fetch the course page itself when the card has no usable name.
                name = extract_course_name_from_card(card) or extract_course_info(full_url)
                course_data.append({
                    "Course Name": name,
                    "Course Link": full_url
                })

        # Method 2: Direct links
        for link in soup.find_all('a', href=True):
            full_url = urljoin(page_base, link['href'])
            if is_course_page(full_url):
                name = link.get_text(strip=True)
                if len(name.split()) > 2:  # Filter out short/non-descriptive links
                    course_data.append({
                        "Course Name": re.sub(r'\s+', ' ', name).strip(),
                        "Course Link": full_url
                    })

        # Handle pagination
        pagination = soup.find(class_=re.compile(r'pagination|page-numbers', re.I))
        if pagination:
            for page in pagination.find_all('a', href=True):
                page_url = urljoin(page_base, page['href'])
                if is_discovery_page(page_url) and page_url not in visited_urls:
                    next_pages.append(page_url)

    except Exception as e:
        print(f"Error processing {url}: {str(e)}")
    return next_pages

def extract_course_name_from_card(card):
    """Return the title text found inside a course card, or None.

    An element whose class contains ``title``, ``name`` or ``heading``
    (case-insensitive) is preferred; otherwise an ``h2``/``h3``/``h4`` is
    used. None is returned when the card has neither.
    """
    title_node = card.find(class_=re.compile(r'title|name|heading', re.I))
    if not title_node:
        title_node = card.find(['h2', 'h3', 'h4'])
    if not title_node:
        return None
    return title_node.get_text(strip=True)

def main(output_file="C:\\Users\\taslim.siddiqui\\Downloads\\CertyBox_Courses_With_Links.xlsx"):
    """Crawl the course listings and export the unique courses to Excel.

    Each entry point is processed on a thread pool. The collected rows are
    then de-duplicated by course link, sorted by course name, written to
    ``output_file`` and a short sample is printed.

    Args:
        output_file: Destination ``.xlsx`` path. The default is the path
            this script originally hard-coded, so calling ``main()`` with
            no arguments behaves as before; pass another path to run on a
            different machine.
    """
    print("🚀 Starting CertyBox course extraction...")

    # Entry points that contain course listings
    entry_points = [
        BASE_URL + "/courses/",
    ]

    # Multi-threaded processing
    with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        # executor.map only re-raises a worker's exception when its result
        # is read; draining the iterator keeps such failures from being
        # silently discarded.
        list(executor.map(process_page, entry_points))

    # Process results
    if not course_data:
        print("❌ No courses found")
        return

    df = pd.DataFrame(course_data)
    # The same course is usually found more than once; keep the first hit
    # per link.
    df = df.drop_duplicates(subset=["Course Link"]).sort_values(by="Course Name")

    df.to_excel(output_file, index=False)

    print(f"\n✅ Success! Extracted {len(df)} courses")
    print(f"📁 Saved to: {output_file}")

    print("\n=== Sample Courses ===")
    print(df.head(10).to_string(index=False))


if __name__ == "__main__":
    main()