In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import re
from collections import deque
import pandas as pd
import random

class ComprehensiveCourseExtractor:
    """Discover course/program page URLs on skillsetarena.com.

    Three discovery strategies feed one shared result set (``course_links``):
    sitemap parsing, probing of guessed URLs, and a prioritised crawl of the
    site. Classification is heuristic (URL shape, page title, link text).

    Instance attributes (all created in ``__init__``):
        base_url: Site root that every seed/generated URL is built from.
        headers: Browser-like request headers installed on the session.
        session: Shared ``requests.Session`` used for every HTTP call.
        visited: URLs that have been queued for crawling (seeds included).
        course_links: URLs classified as course pages by any strategy.
        all_links: Every crawlable on-site link seen during the crawl.
    """

    # Host that a URL must equal, or be a sub-domain of, to count as on-site.
    ALLOWED_DOMAIN = "skillsetarena.com"

    # File extensions that never lead to an HTML page.
    SKIP_EXTENSIONS = (
        '.pdf', '.jpg', '.jpeg', '.png', '.gif', '.zip', '.rar', '.exe',
        '.mp4', '.avi', '.mp3',
    )

    # URL substrings identifying admin / utility pages that are not crawled.
    SKIP_KEYWORDS = (
        '/wp-admin/', '/wp-json/', '/login', '/register', '/signin', '/signup',
        '/cart/', '/checkout/', '/my-account/', '/account/', '/dashboard/',
        '/tag/', '/category/', '/author/', '/feed/', '/wp-content/',
        '/privacy-policy', '/terms-conditions', '/refund-policy',
        'replytocom', 'share=', 'comment-', '?s=', '/page/', '/blog/',
        '/about', '/contact', '/career', '/job', '/testimonial', '/faq',
        '/support', '/tnc', '/ref-policy',
    )

    # href prefixes that are not navigable pages.
    NON_PAGE_PREFIXES = ('javascript:', 'mailto:', 'tel:', '#')

    # Vocabulary used by the course heuristics.
    COURSE_WORDS = ('program', 'course', 'training')
    COURSE_SLUG_MARKERS = ('-program/', '-course/', '-training/', '-certification/')
    COURSE_DIR_MARKERS = ('/program/', '/course/', '/training/', '/certification/')
    COURSE_LINK_PHRASES = ('certification', 'enroll', 'register', 'apply now', 'view more')

    # Links containing any of these are crawled before everything else.
    PRIORITY_MARKERS = COURSE_SLUG_MARKERS + COURSE_DIR_MARKERS

    # A URL whose last path segment is a plain lowercase slug ("/some-slug/").
    _TRAILING_SLUG_RE = re.compile(r'/([a-z-]+)/$')

    # Topics and URL templates combined by generate_course_urls().
    COURSE_TOPICS = (
        'sql', 'structured-query-language', 'python', 'java', 'javascript',
        'data-science', 'machine-learning', 'artificial-intelligence', 'ai',
        'deep-learning', 'nlp', 'natural-language-processing', 'computer-vision',
        'data-analytics', 'business-analytics', 'tableau', 'power-bi', 'excel',
        'big-data', 'hadoop', 'spark', 'aws', 'azure', 'gcp', 'cloud-computing',
        'devops', 'cybersecurity', 'web-development', 'full-stack', 'frontend',
        'backend', 'react', 'angular', 'nodejs', 'html-css', 'php', 'mysql',
        'mongodb', 'oracle', 'postgresql', 'linux', 'networking', 'ethical-hacking',
        'digital-marketing', 'seo', 'social-media-marketing', 'content-marketing',
        'project-management', 'agile', 'scrum', 'product-management',
        'generative-ai', 'prompt-engineering', 'looker-studio', 'google-sheet',
        'microsoft-excel', 'cyber-security', 'investment-banking', 'life-skill',
        'executive',
    )
    COURSE_URL_TEMPLATES = (
        "/{topic}-program/",
        "/{topic}-course/",
        "/{topic}-training/",
        "/{topic}-certification/",
        "/{topic}-online-certification-program/",
        "/{topic}-certification-program/",
    )

    # Well-known sitemap locations probed by extract_from_sitemaps().
    SITEMAP_PATHS = (
        "/sitemap.xml",
        "/sitemap_index.xml",
        "/wp-sitemap.xml",
        "/sitemap.php",
        "/post-sitemap.xml",
        "/page-sitemap.xml",
    )

    # Upper bound on sitemap documents fetched (seeds + nested children).
    MAX_SITEMAPS = 50

    def __init__(self):
        self.base_url = "https://skillsetarena.com"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Referer': 'https://www.google.com/',
            'DNT': '1'
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)
        self.visited = set()
        self.course_links = set()
        self.all_links = set()

    def is_valid_url(self, url):
        """Return True when *url* is absolute and hosted on the target site.

        The host must equal ``ALLOWED_DOMAIN`` or be one of its sub-domains.
        A plain substring test is deliberately avoided: it would also accept
        look-alike hosts such as ``notskillsetarena.com`` or
        ``skillsetarena.com.evil.io``.
        """
        if not isinstance(url, str):
            return False
        try:
            host = urlparse(url).hostname
        except ValueError:
            # urlparse rejects some malformed inputs (e.g. broken IPv6 hosts).
            return False
        if not host:
            return False
        host = host.lower()
        return host == self.ALLOWED_DOMAIN or host.endswith("." + self.ALLOWED_DOMAIN)

    def should_crawl(self, url):
        """Return True unless *url* is a binary/media file or a utility page."""
        lowered = url.lower()

        # Judge the extension on the path only, so "file.pdf?download=1" is
        # skipped just like "file.pdf".
        path_part = lowered.partition('#')[0].partition('?')[0]
        if path_part.endswith(self.SKIP_EXTENSIONS):
            return False

        return not any(keyword in lowered for keyword in self.SKIP_KEYWORDS)

    def is_course_page(self, url, title="", link_text=""):
        """Heuristically decide whether a URL points at a course page.

        Args:
            url: URL being classified.
            title: Title of the page *at that URL*, if known.
            link_text: Visible text of the anchor that pointed at the URL.

        Returns:
            True when any URL, title or link-text indicator fires.
        """
        url_lower = url.lower()
        title_lower = title.lower()
        text_lower = link_text.lower()

        # --- URL indicators -------------------------------------------------
        # "...-program/", "...-course/", "...-training/", "...-certification/"
        # anywhere in the URL (this also covers the "-certification-program/"
        # and "-online-certification-program/" forms).
        if any(marker in url_lower for marker in self.COURSE_SLUG_MARKERS):
            return True
        # A "program/", "course/" or "training/" directory in a URL that has
        # at least two slashes.
        if url_lower.count('/') >= 2 and any(
                f"{word}/" in url_lower for word in self.COURSE_WORDS):
            return True
        # A simple trailing slug combined with a course word anywhere in the URL.
        if self._TRAILING_SLUG_RE.search(url_lower) and any(
                word in url_lower for word in self.COURSE_WORDS):
            return True

        # --- Title indicators -----------------------------------------------
        if 'certification' in title_lower:
            return True
        if len(title_lower) > 10 and any(word in title_lower for word in self.COURSE_WORDS):
            return True

        # --- Link-text indicators -------------------------------------------
        if any(phrase in text_lower for phrase in self.COURSE_LINK_PHRASES):
            return True
        if len(text_lower) > 5 and any(word in text_lower for word in self.COURSE_WORDS):
            return True

        return False

    def extract_all_links_from_page(self, url):
        """Fetch *url* and return every link found in its anchor tags.

        On-site links that look like course pages are added to
        ``self.course_links`` as a side effect.

        Returns:
            A list of normalised absolute URLs (fragment removed, query kept).
            Empty when the request fails or the response is not HTML.
        """
        try:
            print(f"🔍 Extracting links from: {url}")
            response = self.session.get(url, timeout=15, allow_redirects=True)
            response.raise_for_status()

            # Only HTML documents can contain anchors worth parsing.
            content_type = response.headers.get('content-type', '').lower()
            if 'text/html' not in content_type:
                return []

            soup = BeautifulSoup(response.content, 'html.parser')

            # Relative hrefs must be resolved against the URL that was actually
            # served, which differs from `url` after a redirect.
            page_url = response.url or url

            links_found = set()

            for anchor in soup.find_all('a', href=True):
                href = (anchor.get('href') or '').strip()
                if not href or href.lower().startswith(self.NON_PAGE_PREFIXES):
                    continue

                try:
                    parsed = urlparse(urljoin(page_url, href))
                except ValueError:
                    # One malformed href must not discard the whole page.
                    continue

                # Normalise: drop the fragment, keep the query string as-is.
                normalized_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
                if parsed.query:
                    normalized_url += f"?{parsed.query}"

                links_found.add(normalized_url)

                # Classify the *target* using its own URL and anchor text.
                # The current page's title is not passed: it describes this
                # page, not the link target, and would flag every link on a
                # page whose title mentions "program". Off-site links are never
                # courses of this site (same rule the sitemap strategy uses).
                link_text = anchor.get_text(strip=True)
                if (normalized_url not in self.course_links
                        and self.is_valid_url(normalized_url)
                        and self.is_course_page(normalized_url, link_text=link_text)):
                    self.course_links.add(normalized_url)
                    print(f"🎯 COURSE FOUND: {normalized_url}")
                    if link_text:
                        print(f"   Link Text: {link_text}")

            return list(links_found)

        except Exception as e:
            # Best-effort crawl: report the failure and move on to the next page.
            print(f"❌ Error extracting from {url}: {e}")
            return []

    def deep_crawl_website(self, max_pages=100):
        """Crawl the site from its entry points, course-like links first.

        Args:
            max_pages: Maximum number of pages to fetch.
        """
        print("🚀 Starting DEEP website crawl...")

        # Entry points that are likely to list course links.
        start_urls = [
            self.base_url,
            f"{self.base_url}/programs/",
            f"{self.base_url}/programs",
        ]

        queue = deque(start_urls)
        # `visited` holds everything ever queued, so membership in it is the
        # only de-duplication check needed (no linear scan of the queue).
        self.visited.update(start_urls)

        page_count = 0

        while queue and page_count < max_pages:
            url = queue.popleft()
            page_count += 1

            print(f"\n📖 [{page_count}/{max_pages}] Crawling: {url}")

            links = self.extract_all_links_from_page(url)

            new_links_added = 0
            for link in links:
                if not (self.is_valid_url(link) and self.should_crawl(link)):
                    continue
                self.all_links.add(link)

                if link in self.visited:
                    continue
                self.visited.add(link)

                # Course-looking URLs jump the queue; everything else waits.
                lowered = link.lower()
                if any(marker in lowered for marker in self.PRIORITY_MARKERS):
                    queue.appendleft(link)
                else:
                    queue.append(link)
                new_links_added += 1

            print(f"   Found {len(links)} links, added {new_links_added} new URLs")
            print(f"   Total course links so far: {len(self.course_links)}")

            # Random delay to be respectful
            time.sleep(random.uniform(0.3, 1.0))

    def generate_course_urls(self):
        """Probe guessed ``/<topic>-<suffix>/`` URLs and keep those that exist."""
        print("🔧 Generating potential course URLs...")

        generated_urls = {
            f"{self.base_url}{template.format(topic=topic)}"
            for topic in self.COURSE_TOPICS
            for template in self.COURSE_URL_TEMPLATES
        }

        print(f"   Testing {len(generated_urls)} generated URLs...")
        # Sorted so progress output and request order are reproducible.
        for tested_count, url in enumerate(sorted(generated_urls), 1):
            if tested_count % 50 == 0:
                print(f"   Progress: {tested_count}/{len(generated_urls)}")

            try:
                response = self.session.head(url, timeout=5, allow_redirects=True)
            except requests.RequestException:
                # Unreachable guess: simply not a course URL.
                continue

            if response.status_code != 200:
                continue

            # Use the post-redirect URL and make sure it is still on-site.
            final_url = response.url
            if (final_url not in self.course_links
                    and self.is_valid_url(final_url)
                    and self.is_course_page(final_url)):
                self.course_links.add(final_url)
                print(f"✅ Generated URL found: {final_url}")

    @staticmethod
    def _sitemap_locations(response):
        """Return the URL strings listed in a fetched sitemap response."""
        try:
            soup = BeautifulSoup(response.content, 'lxml')
        except Exception:
            # lxml missing or markup rejected: the built-in parser can still
            # locate <loc> elements.
            soup = BeautifulSoup(response.content, 'html.parser')

        locations = [tag.get_text().strip() for tag in soup.find_all('loc')]
        if locations:
            return locations

        # Plain-text sitemap: one URL per line.
        return [line.strip() for line in response.text.splitlines() if line.strip()]

    def extract_from_sitemaps(self):
        """Collect course URLs from the site's sitemaps.

        Well-known sitemap locations are probed; entries that themselves point
        at ``.xml`` documents (sitemap indexes) are followed, up to
        ``MAX_SITEMAPS`` documents in total.
        """
        print("🗺️  Extracting from sitemaps...")

        pending = deque(f"{self.base_url}{path}" for path in self.SITEMAP_PATHS)
        seen = set(pending)
        fetched = 0

        while pending and fetched < self.MAX_SITEMAPS:
            sitemap_url = pending.popleft()
            fetched += 1
            print(f"   Checking: {sitemap_url}")

            try:
                response = self.session.get(sitemap_url, timeout=10)
                if response.status_code != 200:
                    continue
                locations = self._sitemap_locations(response)
            except Exception as e:
                print(f"⚠️  Could not process {sitemap_url}: {e}")
                continue

            for url_text in locations:
                if not self.is_valid_url(url_text):
                    continue

                # A nested sitemap is queued for fetching, not classified.
                if url_text.lower().partition('?')[0].endswith('.xml'):
                    if url_text not in seen:
                        seen.add(url_text)
                        pending.append(url_text)
                    continue

                if url_text not in self.course_links and self.is_course_page(url_text):
                    self.course_links.add(url_text)
                    print(f"✅ Course from sitemap: {url_text}")

    def get_all_courses(self):
        """Run every discovery strategy and return the sorted course URLs."""
        print("🎯 Starting COMPREHENSIVE course extraction...")
        print("=" * 70)
        print("Target format: https://skillsetarena.com/topic-program/")
        print("=" * 70)

        # Method 1: Extract from sitemaps (fastest)
        self.extract_from_sitemaps()

        # Method 2: Generate and test potential URLs
        self.generate_course_urls()

        # Method 3: Deep crawl website
        self.deep_crawl_website(max_pages=100)

        print(f"\n✅ Extraction complete! Found {len(self.course_links)} course links")
        return sorted(self.course_links)

    @staticmethod
    def _course_name_from_url(url):
        """Derive a display name from a course URL's slug.

        Returns the title-cased second-to-last ``/``-separated piece when the
        URL contains a course slug marker, otherwise ``"Course Program"``.
        """
        markers = ('-program/', '-course/', '-training/', '-certification/')
        if any(marker in url for marker in markers):
            return url.split('/')[-2].replace('-', ' ').title()
        return "Course Program"

    def save_to_excel(self, file_path=None):
        """Write every collected course link to an Excel workbook.

        Args:
            file_path: Destination ``.xlsx`` path. Defaults to the original
                author's Downloads folder when omitted.

        Returns:
            True on success, False when building or writing the sheet failed.
        """
        if file_path is None:
            file_path = r"C:\Users\taslim.siddiqui\Downloads\skill_course_links.xlsx"

        try:
            # One timestamp for the whole export so all rows agree.
            extracted_at = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
            data = [
                {
                    'S.No': serial,
                    'Course_Name': self._course_name_from_url(url),
                    'Course_URL': url,
                    'Website': 'Skillset Arena',
                    'Extraction_Date': extracted_at,
                }
                for serial, url in enumerate(sorted(self.course_links), 1)
            ]

            df = pd.DataFrame(data)

            # Save to Excel
            df.to_excel(file_path, index=False)

            print(f"\n💾 RESULTS SAVED!")
            print(f"   📁 File: {file_path}")
            print(f"   📊 Total Courses Saved: {len(df)}")
            print(f"   📄 Pages Crawled: {len(self.visited)}")

            # Show ALL found courses
            if len(df) > 0:
                print(f"\n📋 ALL COURSES FOUND ({len(df)} total):")
                print("-" * 80)
                for _, row in df.iterrows():
                    print(f"   {row['S.No']:2d}. {row['Course_URL']}")

            return True

        except Exception as e:
            # Export boundary: missing Excel engine, locked file, bad path, ...
            print(f"❌ Error saving to Excel: {e}")
            return False

# Main execution
if __name__ == "__main__":
    # Script entry point: run every discovery strategy, print a summary, then
    # export the collected course links to an Excel workbook.
    divider = "=" * 70

    print("🎯 Skillset Arena Course Extractor")
    print("Targeting format: https://skillsetarena.com/topic-program/")
    print(divider)

    # Build the extractor and collect the course links.
    extractor = ComprehensiveCourseExtractor()
    all_courses = extractor.get_all_courses()

    # Report what was gathered.
    print("\n🎉 EXTRACTION COMPLETE!")
    print(divider)
    print("📊 FINAL SUMMARY:")
    print(f"   ✅ Total Course Links Found: {len(all_courses)}")
    print(f"   🔍 Total Pages Crawled: {len(extractor.visited)}")
    print(f"   🌐 Total Links Discovered: {len(extractor.all_links)}")

    # Persist the results.
    output_file = r"C:\Users\taslim.siddiqui\Downloads\skill_course_links.xlsx"
    extractor.save_to_excel(output_file)

🎯 Skillset Arena Course Extractor
Targeting format: https://skillsetarena.com/topic-program/
🎯 Starting COMPREHENSIVE course extraction...
Target format: https://skillsetarena.com/topic-program/
🗺️  Extracting from sitemaps...
   Checking: https://skillsetarena.com/sitemap.xml
   Checking: https://skillsetarena.com/sitemap_index.xml
   Checking: https://skillsetarena.com/wp-sitemap.xml
   Checking: https://skillsetarena.com/sitemap.php
   Checking: https://skillsetarena.com/post-sitemap.xml
   Checking: https://skillsetarena.com/page-sitemap.xml
🔧 Generating potential course URLs...
   Testing 360 generated URLs...
   Progress: 50/360
   Progress: 100/360


In [3]:
import time
import os
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import requests
import json
from collections import Counter

# -------------------- DRIVER SETUP --------------------
def get_driver(headless=True):
    """Create a Chrome WebDriver configured for scraping.

    Args:
        headless: When true, Chrome is started with ``--headless=new``.

    Returns:
        A ``webdriver.Chrome`` instance whose driver binary is resolved by
        ``ChromeDriverManager``.
    """
    chrome_flags = [
        "--disable-blink-features=AutomationControlled",
        "--disable-dev-shm-usage",
        "--no-sandbox",
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "--window-size=1920,1080",
        "--log-level=3",
    ]
    if headless:
        chrome_flags.append("--headless=new")

    options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        options.add_argument(flag)

    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)

# -------------------- SYLLABUS EXTRACTION --------------------
def _collect_section_outline(soup, heading, stop_markers):
    """Collect the month/module outline that follows a section heading.

    Starting at the ``<p class="has-small-font-size">`` whose text contains
    *heading*, the following siblings are walked in order:

    * a ``<p>`` whose class list is exactly ``["has-small-font-size"]`` and
      whose text contains "Month" becomes a month heading;
    * such a ``<p>`` containing any of *stop_markers* ends the section;
    * a ``<details class="wp-block-details">`` becomes a module line followed
      by its ``<ul class="wp-block-list">`` items.

    Returns:
        The stripped outline text (possibly empty), or ``None`` when the
        heading paragraph is not present.
    """
    anchor = soup.find(
        "p",
        class_="has-small-font-size",
        string=lambda text: text and heading in text,
    )
    if not anchor:
        return None

    parts = []
    node = anchor.find_next_sibling()
    while node:
        if node.name == "p" and node.get("class") == ["has-small-font-size"]:
            label = node.get_text(strip=True)
            if "Month" in label:
                parts.append(f"\n{label}\n")
            elif any(marker in label for marker in stop_markers):
                # Next major section reached. This test has to live inside the
                # paragraph branch: as a separate `elif` with the same
                # condition it could never run, so the walk used to continue
                # into the following sections.
                break

        elif node.name == "details" and "wp-block-details" in node.get("class", []):
            summary = node.find("summary")
            if summary:
                parts.append(f"🔹 {summary.get_text(strip=True)}\n")

            lessons_list = node.find("ul", class_="wp-block-list")
            if lessons_list:
                for lesson in lessons_list.find_all("li"):
                    lesson_text = lesson.get_text(strip=True)
                    if lesson_text:
                        parts.append(f"    . {lesson_text}\n")

            parts.append("\n")

        node = node.find_next_sibling()

    return "".join(parts).strip()


def extract_syllabus(soup):
    """Return the "Program Structure" outline of a course page.

    Args:
        soup: Parsed course page (BeautifulSoup document).

    Returns:
        The outline text; ``"Syllabus not available"`` when the section is
        missing or empty; ``"Syllabus extraction failed"`` on an unexpected
        error.
    """
    try:
        outline = _collect_section_outline(
            soup, "Program Structure", ("Key Features", "Tools & Technologies")
        )
        if outline is None:
            print("⚠️ Program Structure section not found")
            return "Syllabus not available"
        return outline or "Syllabus not available"

    except Exception as e:
        print(f"❌ Syllabus extraction error: {str(e)}")
        import traceback
        print(f"Detailed error: {traceback.format_exc()}")
        return "Syllabus extraction failed"

# -------------------- SYLLABUS CURRICULUM EXTRACTION --------------------
def extract_syllabus_curriculum(soup):
    """Return the "Detailed Curriculum" outline of a course page.

    Args:
        soup: Parsed course page (BeautifulSoup document).

    Returns:
        The outline text; ``"Curriculum not available"`` when the section is
        missing or empty; ``"Syllabus curriculum extraction failed"`` on an
        unexpected error.
    """
    try:
        outline = _collect_section_outline(
            soup,
            "Detailed Curriculum",
            ("Program Summary", "Final Outcome", "Learning Approach"),
        )
        if outline is None:
            print("⚠️ Detailed Curriculum section not found")
            return "Curriculum not available"
        return outline or "Curriculum not available"

    except Exception as e:
        print(f"❌ Syllabus curriculum extraction error: {str(e)}")
        import traceback
        print(f"Detailed error: {traceback.format_exc()}")
        return "Syllabus curriculum extraction failed"

# -------------------- EXTRACT DURATION --------------------
def extract_duration(soup):
    """Return the course duration text found on the page.

    The program-overview bullets (``<li class="has-small-font-size">``) are
    checked first for the word "Duration"; failing that, the first text node
    anywhere in the document matching "duration" (case-insensitive) is used.
    Returns ``"Duration not available"`` when nothing matches or on error.
    """
    fallback = "Duration not available"
    try:
        bullet_texts = (
            bullet.get_text(strip=True)
            for bullet in soup.find_all("li", class_="has-small-font-size")
        )
        from_overview = next((t for t in bullet_texts if "Duration" in t), None)
        if from_overview is not None:
            return from_overview

        # Looser document-wide search.
        loose_match = soup.find(string=re.compile("Duration", re.IGNORECASE))
        return loose_match.strip() if loose_match else fallback
    except Exception as e:
        print(f"⚠️ Duration extraction error: {str(e)}")
        return fallback

# -------------------- EXTRACT CERTIFICATE INFO --------------------
def extract_certificate(soup):
    """Return the certificate description found on the page.

    Looks first for any ``<li class="has-small-font-size">`` mentioning
    "certificate" (case-insensitive); otherwise takes the first such list item
    in the ``<ul class="wp-block-list">`` that follows the
    "Certificate of Completion" paragraph. Returns
    ``"Certificate information not available"`` when neither exists or on
    error.
    """
    fallback = "Certificate information not available"
    try:
        for bullet in soup.find_all("li", class_="has-small-font-size"):
            bullet_text = bullet.get_text(strip=True)
            if "certificate" in bullet_text.lower():
                return bullet_text

        # Dedicated "Certificate of Completion" section.
        section_heading = soup.find(
            "p",
            class_="has-small-font-size",
            string=lambda text: text and "Certificate of Completion" in text,
        )
        if not section_heading:
            return fallback

        bullet_list = section_heading.find_next("ul", class_="wp-block-list")
        if not bullet_list:
            return fallback

        first_bullet = bullet_list.find("li", class_="has-small-font-size")
        if not first_bullet:
            return fallback

        return first_bullet.get_text(strip=True)
    except Exception as e:
        print(f"⚠️ Certificate extraction error: {str(e)}")
        return fallback

# -------------------- EXTRACT WHO SHOULD TAKE --------------------
def extract_who_should_take(soup):
    """Return the text describing the course's target audience.

    Primary source: the first non-empty ``has-small-font-size`` paragraph that
    follows the course-name paragraph (``has-upper-heading-font-size``) and
    does not start with "Program Overview". Fallback: an overview bullet that
    mentions "Prerequisites" or "suitable for beginners". Returns
    ``"Target audience information not available"`` otherwise or on error.
    """
    fallback = "Target audience information not available"
    try:
        heading = soup.find("p", class_="has-upper-heading-font-size")
        sibling = heading.find_next_sibling() if heading else None
        while sibling:
            is_small_paragraph = (
                sibling.name == "p"
                and "has-small-font-size" in sibling.get("class", [])
            )
            if is_small_paragraph:
                blurb = sibling.get_text(strip=True)
                if blurb and not blurb.startswith("Program Overview"):
                    return blurb
            sibling = sibling.find_next_sibling()

        # No description paragraph: fall back to the overview bullets.
        for bullet in soup.find_all("li", class_="has-small-font-size"):
            bullet_text = bullet.get_text(strip=True)
            mentions_prerequisites = "Prerequisites" in bullet_text
            mentions_beginners = "suitable for beginners" in bullet_text.lower()
            if mentions_prerequisites or mentions_beginners:
                return bullet_text

        return fallback
    except Exception as e:
        print(f"⚠️ Who should take extraction error: {str(e)}")
        return fallback

# -------------------- EXTRACT LEARNING MODE --------------------
def extract_learning_mode(soup):
    """Return the delivery-mode text found on the page.

    The overview bullets (``<li class="has-small-font-size">``) are searched
    for the word "Mode"; otherwise the first text node matching "Mode:",
    "Instructor-led" or "live online" (case-insensitive) is used. Returns
    ``"Learning mode not available"`` when nothing matches or on error.
    """
    fallback = "Learning mode not available"
    try:
        bullet_texts = [
            bullet.get_text(strip=True)
            for bullet in soup.find_all("li", class_="has-small-font-size")
        ]
        for bullet_text in bullet_texts:
            # Any text containing "Mode:" also contains "Mode".
            if "Mode" in bullet_text:
                return bullet_text

        # Document-wide search for typical learning-mode wording.
        mode_pattern = re.compile(r"Mode:|Instructor-led|live online", re.IGNORECASE)
        loose_match = soup.find(string=mode_pattern)
        if not loose_match:
            return fallback
        return loose_match.strip()
    except Exception as e:
        print(f"⚠️ Learning mode extraction error: {str(e)}")
        return fallback

# -------------------- SCRAPER --------------------
def scrape_course_data(url, *, headless=False, debug_dump_path="page_source.html"):
    """Load a course page in Chrome and extract its details.

    Args:
        url: Course page URL.
        headless: Forwarded to ``get_driver``. Defaults to False (visible
            browser), matching the previous hard-coded debugging setup.
        debug_dump_path: File the rendered HTML is written to for inspection;
            pass ``None`` or ``""`` to skip the dump.

    Returns:
        On success an 8-tuple ``(course_name, about_course, syllabus,
        syllabus_curriculum, certificate_info, duration, who_should_take,
        learning_mode)``. On any failure, including failure to start the
        browser, the list ``["Error"] * 8``.
    """
    driver = None
    try:
        driver = get_driver(headless=headless)
        print(f"🌐 Accessing URL: {url}")
        driver.get(url)

        # Wait for page to load
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        # Scroll down and back up a few times so lazily loaded content renders.
        for _ in range(3):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
            driver.execute_script("window.scrollTo(0, 0);")
            time.sleep(1)

        # Take ONE snapshot so the dumped file is exactly what gets parsed.
        page_source = driver.page_source

        if debug_dump_path:
            with open(debug_dump_path, "w", encoding="utf-8") as f:
                f.write(page_source)
            print(f"💾 Page source saved as {debug_dump_path}")

        soup = BeautifulSoup(page_source, "html.parser")

        # 1. Course Name
        course_name_elem = soup.find("p", class_="has-upper-heading-font-size")
        course_name = course_name_elem.get_text(strip=True) if course_name_elem else "Course Name Not Found"
        print(f"📛 Course Name: {course_name}")

        # 2. About Course - first descriptive paragraph after the course name
        about_course = "About course not available"
        if course_name_elem:
            next_elem = course_name_elem.find_next_sibling()
            while next_elem:
                if next_elem.name == "p" and "has-small-font-size" in next_elem.get("class", []):
                    about_text = next_elem.get_text(strip=True)
                    if about_text and not about_text.startswith("Program Overview"):
                        about_course = about_text
                        break
                next_elem = next_elem.find_next_sibling()
        print(f"📝 About Course: {about_course[:100]}...")

        # 3. Syllabus
        syllabus = extract_syllabus(soup)
        print(f"📚 Syllabus extracted ({len(syllabus)} characters)")
        print(f"📖 Syllabus preview: {syllabus[:500]}...")

        # 4. Syllabus Curriculum
        syllabus_curriculum = extract_syllabus_curriculum(soup)
        print(f"📘 Syllabus Curriculum extracted ({len(syllabus_curriculum)} characters)")
        print(f"📖 Syllabus Curriculum preview: {syllabus_curriculum[:500]}...")

        # 5. Certificate
        certificate_info = extract_certificate(soup)
        print(f"🏆 Certificate: {certificate_info}")

        # 6. Duration
        duration = extract_duration(soup)
        print(f"⏳ Duration: {duration}")

        # 7. Who Should Take
        who_should_take = extract_who_should_take(soup)
        print(f"🎯 Who Should Take: {who_should_take[:100]}...")

        # 8. Learning Mode
        learning_mode = extract_learning_mode(soup)
        print(f"🎓 Learning Mode: {learning_mode}")

        return course_name, about_course, syllabus, syllabus_curriculum, certificate_info, duration, who_should_take, learning_mode

    except Exception as e:
        print(f"🔥 Scraping failed: {str(e)}")
        import traceback
        print(f"Detailed error: {traceback.format_exc()}")
        return ["Error"] * 8
    finally:
        # The driver only exists when start-up succeeded.
        if driver is not None:
            driver.quit()
            print("🚪 Browser closed")

# -------------------- SAVE TO EXCEL --------------------
def save_to_excel(data, file_path):
    """Append one scraped course to the workbook at *file_path*.

    Args:
        data: Sequence whose first eight entries are the course fields (in
            the order of the first eight headers below) and whose last entry
            is the course URL.
        file_path: Workbook to create or extend.

    The workbook is loaded when it already exists (any expected header it
    lacks is added as an empty column); otherwise an empty table is started.
    A row is written only when the URL is not yet present in "Course URL".
    Any exception is reported on stdout and swallowed.
    """
    columns = [
        "Course Name",
        "About Course",
        "Syllabus",
        "Syllabus Curriculum",
        "Certificate",
        "Duration",
        "Who Should Take",
        "Learning Mode",
        "Course URL",
    ]

    try:
        if not os.path.exists(file_path):
            df = pd.DataFrame(columns=columns)
        else:
            df = pd.read_excel(file_path)
            # Older workbooks may predate some headers; add them as blanks.
            for header in columns:
                if header not in df.columns:
                    df[header] = None

        # The URL (last entry) is the de-duplication key.
        course_url = data[-1]
        if course_url in df["Course URL"].values:
            print(f"🔄 Course already exists: {data[0]}")
            return

        # Pair each header with its value: eight fields, then the URL.
        row_values = list(data[:8])
        row_values.append(course_url)
        record = {header: value for header, value in zip(columns, row_values)}

        df = pd.concat([df, pd.DataFrame([record])], ignore_index=True)
        df.to_excel(file_path, index=False)
        print(f"💾 Saved data for: {data[0]}")

    except Exception as err:
        print(f"❌ Excel save error: {err}")

# -------------------- MAIN EXECUTION --------------------
if __name__ == "__main__":
    # Trial run: scrape the hard-coded course pages and append each result
    # to a single workbook.
    file_path = r"C:\Users\taslim.siddiqui\Downloads\course_smartset.xlsx"
    course_urls = [
        "https://skillsetarena.com/data-science-and-machine-learning-with-ai/",
    ]

    print("🚀 Starting scraping process...")

    for course_url in course_urls:
        course_data = scrape_course_data(course_url)

        # scrape_course_data reports failure as a list of "Error" markers.
        if any(item == "Error" for item in course_data):
            print(f"❌ Failed to scrape complete data for {course_url}")
            continue

        save_to_excel((*course_data, course_url), file_path)
        print(f"✅ Successfully scraped: {course_data[0]}")

    print("🎉 Process completed!")

🚀 Starting scraping process...
🌐 Accessing URL: https://skillsetarena.com/data-science-and-machine-learning-with-ai/
💾 Page source saved as page_source.html
📛 Course Name: Data Science and Machine learning Certification Program With AI(Gen AI & Prompt Engineering)
📝 About Course: Generative AI & Prompt Engineering:...
⚠️ Program Structure section not found
📚 Syllabus extracted (22 characters)
📖 Syllabus preview: Syllabus not available...
⚠️ Detailed Curriculum section not found
📘 Syllabus Curriculum extracted (24 characters)
📖 Syllabus Curriculum preview: Curriculum not available...
🏆 Certificate: Certificate information not available
⏳ Duration: • Duration: 6 months
🎯 Who Should Take: Generative AI & Prompt Engineering:...
🎓 Learning Mode: LLM –Large Language Models
🚪 Browser closed
💾 Saved data for: Data Science and Machine learning Certification Program With AI(Gen AI & Prompt Engineering)
✅ Successfully scraped: Data Science and Machine learning Certification Program With AI(Gen AI

In [1]:
import time
import os
import re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import requests
import json
from collections import Counter

# -------------------- DRIVER SETUP --------------------
def get_driver(headless=True):
    """Create a Chrome WebDriver configured for scraping.

    Args:
        headless: When true, "--headless=new" is added so no browser window
            is shown.

    Returns:
        A ``webdriver.Chrome`` instance using the chromedriver path returned
        by ``ChromeDriverManager().install()``.
    """
    chrome_flags = [
        "--disable-blink-features=AutomationControlled",
        "--disable-dev-shm-usage",
        "--no-sandbox",
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "--window-size=1920,1080",
        "--log-level=3",
    ]
    if headless:
        chrome_flags.append("--headless=new")

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    chromedriver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(chromedriver_path), options=chrome_options)

# -------------------- SYLLABUS EXTRACTION --------------------
def extract_syllabus(soup):
    """Collect the outline that follows the "Program Structure" heading.

    Starting at the first ``<p class="has-small-font-size">`` whose string
    contains "Program Structure", the following siblings are walked in order:

    * a small-font paragraph containing "Month" becomes a month heading;
    * a ``<details class="wp-block-details">`` block contributes its
      ``<summary>`` text and every non-empty ``<li>`` of its
      ``wp-block-list``;
    * a small-font paragraph containing "Key Features" or
      "Tools & Technologies" ends the walk.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` API.

    Returns:
        The formatted outline; "Syllabus not available" when the heading is
        missing or nothing was collected; "Syllabus extraction failed" when
        an exception was raised while parsing (details are printed).
    """
    stop_markers = ("Key Features", "Tools & Technologies")

    try:
        # Find the Program Structure section
        program_structure = soup.find(
            "p",
            class_="has-small-font-size",
            string=lambda text: text and "Program Structure" in text,
        )

        if not program_structure:
            print("⚠️ Program Structure section not found")
            return "Syllabus not available"

        chunks = []
        current_element = program_structure.find_next_sibling()

        while current_element:
            # Membership test (not list equality) so paragraphs carrying extra
            # CSS classes are still recognised, matching the lookup above.
            if current_element.name == "p" and "has-small-font-size" in current_element.get("class", []):
                heading_text = current_element.get_text(strip=True)
                if "Month" in heading_text:
                    chunks.append(f"\n{heading_text}\n")
                elif any(marker in heading_text for marker in stop_markers):
                    # Next major section reached. This check used to sit in a
                    # duplicate-condition elif and could never run.
                    break

            elif current_element.name == "details" and "wp-block-details" in current_element.get("class", []):
                # Module title
                summary = current_element.find("summary")
                if summary:
                    chunks.append(f"🔹 {summary.get_text(strip=True)}\n")

                # Lessons of the module
                lessons_list = current_element.find("ul", class_="wp-block-list")
                if lessons_list:
                    for lesson in lessons_list.find_all("li"):
                        lesson_text = lesson.get_text(strip=True)
                        if lesson_text:
                            chunks.append(f"    . {lesson_text}\n")

                chunks.append("\n")

            current_element = current_element.find_next_sibling()

        # Whitespace-only output counts as "nothing collected".
        syllabus_text = "".join(chunks).strip()
        return syllabus_text or "Syllabus not available"

    except Exception as e:
        print(f"❌ Syllabus extraction error: {str(e)}")
        import traceback
        print(f"Detailed error: {traceback.format_exc()}")
        return "Syllabus extraction failed"

# -------------------- SYLLABUS CURRICULUM EXTRACTION --------------------
def extract_syllabus_curriculum(soup):
    """Collect the outline that follows the "Detailed Curriculum" heading.

    Starting at the first ``<p class="has-small-font-size">`` whose string
    contains "Detailed Curriculum", the following siblings are walked in
    order:

    * a small-font paragraph containing "Month" becomes a month heading;
    * a ``<details class="wp-block-details">`` block contributes its
      ``<summary>`` text and every non-empty ``<li>`` of its
      ``wp-block-list``;
    * a small-font paragraph containing "Program Summary", "Final Outcome"
      or "Learning Approach" ends the walk.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` API.

    Returns:
        The formatted outline; "Curriculum not available" when the heading is
        missing or nothing was collected; "Syllabus curriculum extraction
        failed" when an exception was raised while parsing (details are
        printed).
    """
    stop_markers = ("Program Summary", "Final Outcome", "Learning Approach")

    try:
        # Find the Detailed Curriculum section
        detailed_curriculum = soup.find(
            "p",
            class_="has-small-font-size",
            string=lambda text: text and "Detailed Curriculum" in text,
        )

        if not detailed_curriculum:
            print("⚠️ Detailed Curriculum section not found")
            return "Curriculum not available"

        chunks = []
        current_element = detailed_curriculum.find_next_sibling()

        while current_element:
            # Membership test (not list equality) so paragraphs carrying extra
            # CSS classes are still recognised, matching the lookup above.
            if current_element.name == "p" and "has-small-font-size" in current_element.get("class", []):
                heading_text = current_element.get_text(strip=True)
                if "Month" in heading_text:
                    chunks.append(f"\n{heading_text}\n")
                elif any(marker in heading_text for marker in stop_markers):
                    # Next major section reached. This check used to sit in a
                    # duplicate-condition elif and could never run.
                    break

            elif current_element.name == "details" and "wp-block-details" in current_element.get("class", []):
                # Module title
                summary = current_element.find("summary")
                if summary:
                    chunks.append(f"🔹 {summary.get_text(strip=True)}\n")

                # Lessons of the module
                lessons_list = current_element.find("ul", class_="wp-block-list")
                if lessons_list:
                    for lesson in lessons_list.find_all("li"):
                        lesson_text = lesson.get_text(strip=True)
                        if lesson_text:
                            chunks.append(f"    . {lesson_text}\n")

                chunks.append("\n")

            current_element = current_element.find_next_sibling()

        # Whitespace-only output counts as "nothing collected".
        curriculum_text = "".join(chunks).strip()
        return curriculum_text or "Curriculum not available"

    except Exception as e:
        print(f"❌ Syllabus curriculum extraction error: {str(e)}")
        import traceback
        print(f"Detailed error: {traceback.format_exc()}")
        return "Syllabus curriculum extraction failed"

# -------------------- EXTRACT DURATION --------------------
def extract_duration(soup):
    """Return the text that states the course duration.

    The first ``<li class="has-small-font-size">`` whose text contains
    "Duration" wins. Failing that, the first string anywhere in the document
    matching "Duration" (case-insensitive) is returned, stripped.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` API.

    Returns:
        The matching text, or "Duration not available" when nothing matches
        or an exception is raised (the error is printed).
    """
    fallback = "Duration not available"
    try:
        # Lazily yield bullet texts so scanning stops at the first hit.
        bullet_texts = (
            bullet.get_text(strip=True)
            for bullet in soup.find_all("li", class_="has-small-font-size")
        )
        first_hit = next((text for text in bullet_texts if "Duration" in text), None)
        if first_hit is not None:
            return first_hit

        # Nothing in the bullets: take any string node mentioning it.
        loose_match = soup.find(string=re.compile("Duration", re.IGNORECASE))
        return loose_match.strip() if loose_match else fallback
    except Exception as err:
        print(f"⚠️ Duration extraction error: {str(err)}")
        return fallback

# -------------------- EXTRACT CERTIFICATE INFO --------------------
def extract_certificate(soup):
    """Return the text describing the course certificate.

    Two lookups are tried in order:

    1. the first ``<li class="has-small-font-size">`` whose text contains
       "certificate" (case-insensitive);
    2. the first small-font ``<li>`` inside the ``wp-block-list`` that
       follows a small-font paragraph containing "Certificate of Completion".

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` API.

    Returns:
        The matching text, or "Certificate information not available" when
        neither lookup succeeds or an exception is raised (the error is
        printed).
    """
    not_found = "Certificate information not available"
    try:
        for bullet in soup.find_all("li", class_="has-small-font-size"):
            bullet_text = bullet.get_text(strip=True)
            if "certificate" in bullet_text.lower():
                return bullet_text

        # Fall back to the dedicated "Certificate of Completion" section.
        heading = soup.find(
            "p",
            class_="has-small-font-size",
            string=lambda text: text and "Certificate of Completion" in text,
        )
        if not heading:
            return not_found

        following_list = heading.find_next("ul", class_="wp-block-list")
        if not following_list:
            return not_found

        first_item = following_list.find("li", class_="has-small-font-size")
        if not first_item:
            return not_found

        return first_item.get_text(strip=True)
    except Exception as err:
        print(f"⚠️ Certificate extraction error: {str(err)}")
        return not_found

# -------------------- EXTRACT WHO SHOULD TAKE --------------------
def extract_who_should_take(soup):
    """Return the text used as the course's target-audience description.

    Primary source: the first non-empty ``<p>`` carrying the
    "has-small-font-size" class that follows the course title paragraph
    (``has-upper-heading-font-size``) as a sibling and does not start with
    "Program Overview".

    Fallback: the first small-font ``<li>`` mentioning "Prerequisites" or
    "suitable for beginners" (the latter case-insensitively).

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` API.

    Returns:
        The selected text, or "Target audience information not available"
        when nothing matches or an exception is raised (the error is
        printed).
    """
    unavailable = "Target audience information not available"
    try:
        title_tag = soup.find("p", class_="has-upper-heading-font-size")

        # Walk the siblings after the title looking for the description.
        candidate = title_tag.find_next_sibling() if title_tag else None
        while candidate:
            is_small_paragraph = (
                candidate.name == "p"
                and "has-small-font-size" in candidate.get("class", [])
            )
            if is_small_paragraph:
                blurb = candidate.get_text(strip=True)
                if blurb and not blurb.startswith("Program Overview"):
                    return blurb
            candidate = candidate.find_next_sibling()

        # No description paragraph: settle for a prerequisites bullet.
        bullet_texts = (
            bullet.get_text(strip=True)
            for bullet in soup.find_all("li", class_="has-small-font-size")
        )
        prerequisite = next(
            (
                text
                for text in bullet_texts
                if "Prerequisites" in text or "suitable for beginners" in text.lower()
            ),
            None,
        )
        if prerequisite is not None:
            return prerequisite

        return unavailable
    except Exception as err:
        print(f"⚠️ Who should take extraction error: {str(err)}")
        return unavailable

# -------------------- EXTRACT LEARNING MODE --------------------
def extract_learning_mode(soup):
    """Return the text that states how the course is delivered.

    The first ``<li class="has-small-font-size">`` whose text contains the
    word "Mode" wins. "Mode" immediately followed by a lowercase letter is
    ignored, so bullets such as "Large Language Models" are no longer
    mistaken for the learning mode. Failing that, the first string anywhere
    in the document matching "Mode:", "Instructor-led" or "live online"
    (case-insensitive) is returned, stripped.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` API.

    Returns:
        The matching text, or "Learning mode not available" when nothing
        matches or an exception is raised (the error is printed).
    """
    # "Mode" as a label, not as the prefix of "Model", "Models", "Modern", ...
    # A lookahead is used instead of \b because get_text(strip=True) can glue
    # the label to the preceding word (e.g. "LearningMode:Online").
    mode_label = re.compile(r"Mode(?![a-z])")

    try:
        # Look for mode in program overview
        for item in soup.find_all("li", class_="has-small-font-size"):
            text = item.get_text(strip=True)
            if mode_label.search(text):
                return text

        # Alternative search for learning mode
        mode_elem = soup.find(string=re.compile(r"Mode:|Instructor-led|live online", re.IGNORECASE))
        if mode_elem:
            return mode_elem.strip()

        return "Learning mode not available"
    except Exception as e:
        print(f"⚠️ Learning mode extraction error: {str(e)}")
        return "Learning mode not available"

# -------------------- SCRAPER --------------------
def scrape_course_data(url):
    """Load *url* in Chrome and extract the eight course fields from the page.

    The rendered HTML is also written to "page_source.html" in the working
    directory for inspection. Progress is reported on stdout throughout.

    Args:
        url: Address of the course page.

    Returns:
        On success, the tuple ``(course_name, about_course, syllabus,
        syllabus_curriculum, certificate_info, duration, who_should_take,
        learning_mode)``. If anything raises, a list of eight "Error"
        strings. The browser is closed in both cases.
    """
    driver = get_driver(headless=False)  # visible window, useful while debugging
    try:
        print(f"🌐 Accessing URL: {url}")
        driver.get(url)

        # Give the page up to 15 seconds to produce a <body>.
        body_locator = (By.TAG_NAME, "body")
        WebDriverWait(driver, 15).until(EC.presence_of_element_located(body_locator))

        # Bounce bottom -> top three times, pausing a second after each jump,
        # to ensure all content loads.
        scroll_scripts = (
            "window.scrollTo(0, document.body.scrollHeight);",
            "window.scrollTo(0, 0);",
        )
        for _ in range(3):
            for script in scroll_scripts:
                driver.execute_script(script)
                time.sleep(1)

        # Keep a copy of the rendered HTML for debugging.
        with open("page_source.html", "w", encoding="utf-8") as dump:
            dump.write(driver.page_source)
        print("💾 Page source saved as page_source.html")

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # 1. Course name: the large heading paragraph.
        title_tag = soup.find("p", class_="has-upper-heading-font-size")
        if title_tag:
            course_name = title_tag.get_text(strip=True)
        else:
            course_name = "Course Name Not Found"
        print(f"📛 Course Name: {course_name}")

        # 2. About: first non-empty small-font paragraph after the title that
        #    is not the "Program Overview" heading.
        about_course = "About course not available"
        sibling = title_tag.find_next_sibling() if title_tag else None
        while sibling:
            if sibling.name == "p" and "has-small-font-size" in sibling.get("class", []):
                candidate = sibling.get_text(strip=True)
                if candidate and not candidate.startswith("Program Overview"):
                    about_course = candidate
                    break
            sibling = sibling.find_next_sibling()
        print(f"📝 About Course: {about_course[:100]}...")

        # 3. Syllabus (with a 500-character preview for debugging).
        syllabus = extract_syllabus(soup)
        print(f"📚 Syllabus extracted ({len(syllabus)} characters)")
        print(f"📖 Syllabus preview: {syllabus[:500]}...")

        # 4. Syllabus curriculum (with a 500-character preview for debugging).
        syllabus_curriculum = extract_syllabus_curriculum(soup)
        print(f"📘 Syllabus Curriculum extracted ({len(syllabus_curriculum)} characters)")
        print(f"📖 Syllabus Curriculum preview: {syllabus_curriculum[:500]}...")

        # 5. Certificate
        certificate_info = extract_certificate(soup)
        print(f"🏆 Certificate: {certificate_info}")

        # 6. Duration
        duration = extract_duration(soup)
        print(f"⏳ Duration: {duration}")

        # 7. Who should take
        who_should_take = extract_who_should_take(soup)
        print(f"🎯 Who Should Take: {who_should_take[:100]}...")

        # 8. Learning mode
        learning_mode = extract_learning_mode(soup)
        print(f"🎓 Learning Mode: {learning_mode}")

        record = (
            course_name,
            about_course,
            syllabus,
            syllabus_curriculum,
            certificate_info,
            duration,
            who_should_take,
            learning_mode,
        )
        return record

    except Exception as err:
        print(f"🔥 Scraping failed: {str(err)}")
        import traceback
        print(f"Detailed error: {traceback.format_exc()}")
        # One "Error" marker per expected field.
        return ["Error"] * 8
    finally:
        driver.quit()
        print("🚪 Browser closed")

# -------------------- SAVE TO EXCEL --------------------
def save_to_excel(data, file_path):
    """Append one scraped course to the workbook at *file_path*.

    Args:
        data: Sequence whose first eight entries are the course fields (in
            the order of the first eight headers below) and whose last entry
            is the course URL.
        file_path: Workbook to create or extend.

    The workbook is loaded when it already exists (any expected header it
    lacks is added as an empty column); otherwise an empty table is started.
    A row is written only when the URL is not yet present in "Course URL".
    Any exception is reported on stdout and swallowed.
    """
    columns = [
        "Course Name",
        "About Course",
        "Syllabus",
        "Syllabus Curriculum",
        "Certificate",
        "Duration",
        "Who Should Take",
        "Learning Mode",
        "Course URL",
    ]

    try:
        if not os.path.exists(file_path):
            df = pd.DataFrame(columns=columns)
        else:
            df = pd.read_excel(file_path)
            # Older workbooks may predate some headers; add them as blanks.
            for header in columns:
                if header not in df.columns:
                    df[header] = None

        # The URL (last entry) is the de-duplication key.
        course_url = data[-1]
        if course_url in df["Course URL"].values:
            print(f"🔄 Course already exists: {data[0]}")
            return

        # Pair each header with its value: eight fields, then the URL.
        row_values = list(data[:8])
        row_values.append(course_url)
        record = {header: value for header, value in zip(columns, row_values)}

        df = pd.concat([df, pd.DataFrame([record])], ignore_index=True)
        df.to_excel(file_path, index=False)
        print(f"💾 Saved data for: {data[0]}")

    except Exception as err:
        print(f"❌ Excel save error: {err}")

# -------------------- READ URLS FROM EXCEL --------------------
def read_urls_from_excel(file_path):
    """Load the list of course URLs stored in an Excel workbook.

    The URL column is the first of 'Course URL', 'URL', 'course_url', 'url',
    'Link', 'link' that exists in the sheet. Empty cells are dropped.

    Args:
        file_path: Workbook to read.

    Returns:
        The URL values as a list. An empty list when no known column is
        present or when reading fails (the reason is printed).
    """
    candidate_headers = ('Course URL', 'URL', 'course_url', 'url', 'Link', 'link')

    try:
        sheet = pd.read_excel(file_path)

        # First recognised header wins, in the priority order above.
        url_column = next(
            (header for header in candidate_headers if header in sheet.columns),
            None,
        )
        if url_column is None:
            print("❌ No URL column found in the Excel file.")
            print(f"Available columns: {list(sheet.columns)}")
            return []

        urls = sheet[url_column].dropna().tolist()
        print(f"📖 Found {len(urls)} URLs in the Excel file")
        return urls

    except Exception as err:
        print(f"❌ Error reading Excel file: {err}")
        return []

# -------------------- MAIN EXECUTION --------------------
if __name__ == "__main__":
    # Batch run: read course URLs from one workbook, scrape each page and
    # append the results to another workbook.
    input_file_path = r"C:\Users\taslim.siddiqui\Downloads\skill_course_links.xlsx" # Change this to your input file path
    output_file_path = r"C:\Users\taslim.siddiqui\Downloads\course_smartset_all.xlsx"

    print("🚀 Starting scraping process...")

    # Read URLs from Excel file
    course_urls = read_urls_from_excel(input_file_path)

    if not course_urls:
        print("❌ No URLs found to scrape. Please check your input Excel file.")
        # exit() is an interactive-shell helper injected by the site module
        # (and terminates the kernel under IPython); raising SystemExit stops
        # the run with the same exit status in every environment.
        raise SystemExit

    print(f"🔗 URLs to scrape: {course_urls}")

    separator = "=" * 50
    for course_url in course_urls:
        print(f"\n{separator}")
        print(f"Scraping: {course_url}")
        print(f"{separator}")

        course_data = scrape_course_data(course_url)
        # scrape_course_data reports failure as a list of "Error" markers.
        if all(item != "Error" for item in course_data):
            save_to_excel((*course_data, course_url), output_file_path)
            print(f"✅ Successfully scraped: {course_data[0]}")
        else:
            print(f"❌ Failed to scrape complete data for {course_url}")

    print("🎉 Process completed!")

🚀 Starting scraping process...
📖 Found 17 URLs in the Excel file
🔗 URLs to scrape: ['https://skillsetarena.com/cyber-security-program/', 'https://skillsetarena.com/data-science-11-online-certification-program/', 'https://skillsetarena.com/data-science-21-online-certification-program/', 'https://skillsetarena.com/data-science-and-machine-learning-with-ai/', 'https://skillsetarena.com/executive-certification-program/', 'https://skillsetarena.com/full-stack-development/', 'https://skillsetarena.com/generative-ai-and-prompt-engineering/', 'https://skillsetarena.com/google-looker-studio-program/', 'https://skillsetarena.com/google-sheet-certification-program/', 'https://skillsetarena.com/investment-banking/', 'https://skillsetarena.com/life-skill-program/', 'https://skillsetarena.com/machine-learning-artificial-intelligence-11-online-certification-program/', 'https://skillsetarena.com/machine-learning-artificial-intelligence-21-online-certification-program/', 'https://skillsetarena.com/micr