# Course link extraction: MKCL links only

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

def get_all_course_links(base_url="https://klic.mkcl.org"):
    """Collect every course name and link from the MKCL KLiC course listing.

    Listing pages are requested one after another, starting at
    ``{base_url}/klic-courses`` and continuing with
    ``{base_url}/klic-courses/page:N``. Scraping stops when a page contains
    no course cards, when the page has no ``rel="next"`` link, when a request
    fails, or after 50 pages.

    Args:
        base_url: Site root used to build the listing URLs and to resolve
            relative course links.

    Returns:
        A list of dicts with the keys ``"Course Name"`` and ``"Course Link"``,
        in the order the cards were encountered.
    """
    page_limit = 50  # hard stop so a misbehaving "next" link cannot loop forever
    collected = []

    for page in range(1, page_limit + 1):
        # The first listing page has no page suffix.
        suffix = "" if page == 1 else f"/page:{page}"
        url = f"{base_url}/klic-courses{suffix}"

        print(f"Scraping page {page}: {url}")

        try:
            reply = requests.get(url, timeout=10)
            reply.raise_for_status()
            markup = reply.content
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page {page}: {e}")
            break

        soup = BeautifulSoup(markup, 'html.parser')

        # Each course is rendered as an <a class="card" href="..."> element.
        cards = soup.find_all('a', class_='card', href=True)
        if not cards:
            print("No more courses found. Ending scraping.")
            break

        for anchor in cards:
            heading = anchor.find('div', class_='ct-title')
            if heading:
                label = heading.get_text(strip=True)
            else:
                label = "Unknown Course"

            collected.append({
                "Course Name": label,
                "Course Link": urljoin(base_url, anchor['href'])
            })

        # Without a rel="next" link this was the last listing page.
        if not soup.find('a', rel='next'):
            print("No next page found. Ending scraping.")
            break

        time.sleep(1)  # polite pause before requesting the following page

    return collected

def main(output_path=r'C:\Users\taslim.siddiqui\Downloads\MKCL_KLiC_All_Courses.xlsx'):
    """Scrape all MKCL KLiC courses and save them to an Excel workbook.

    Duplicate course links are dropped before saving, and the first ten rows
    are printed as a sample.

    Args:
        output_path: Destination ``.xlsx`` file. Defaults to the location the
            script has always written to, so calling ``main()`` with no
            arguments behaves as before.
    """
    # Get all courses from the website
    all_courses = get_all_course_links()

    # An empty result yields a DataFrame with no "Course Link" column, and
    # drop_duplicates(subset=...) would then raise KeyError. Bail out first.
    if not all_courses:
        print("❌ No courses found. Please check the source.")
        return

    # Create DataFrame and remove duplicates
    df = pd.DataFrame(all_courses)
    df = df.drop_duplicates(subset=["Course Link"])

    # Save to Excel
    df.to_excel(output_path, index=False)

    print(f"✅ Successfully extracted {len(df)} courses!")
    print(f"📂 Saved to: {output_path}")

    # Display sample of the results
    print("\nSample of extracted courses:")
    print(df.head(10).to_string(index=False))

# Run the KLiC scraper only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

# Course link extraction for Megasoft links only (only 14 courses)

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import os
import re

def extract_megasoft_courses(html_content, base_url="https://megasofttech.in"):
    """Extract course names and links from Megasoft Technologies HTML content.

    Every ``<div class="edublink-single-course">`` is inspected. The course
    name comes from its ``<h6 class="title">`` (or, failing that, any
    ``h2``-``h6`` whose class contains "title"). The link comes from an
    anchor inside that title, falling back to the ``<a class="course-thumb">``
    thumbnail link. Courses with no title or no link are skipped.

    Args:
        html_content: HTML markup of a course listing page.
        base_url: URL that relative links are resolved against.

    Returns:
        A list of dicts with the keys ``"Course Name"`` and ``"Course Link"``
        (always an absolute URL), in document order.
    """
    # Local import so this function does not depend on imports made elsewhere.
    from urllib.parse import urljoin

    soup = BeautifulSoup(html_content, 'html.parser')
    title_class = re.compile(r'title')  # compiled once, reused for every course

    courses = []

    for course in soup.find_all('div', class_='edublink-single-course'):
        title_element = (
            course.find('h6', class_='title')
            or course.find(['h6', 'h5', 'h4', 'h3', 'h2'], class_=title_class)
        )
        if not title_element:
            continue

        # Prefer the link inside the title; if that anchor is missing or has
        # no href, fall back to the thumbnail link rather than losing the course.
        link_element = (
            title_element.find('a', href=True)
            or course.find('a', class_='course-thumb', href=True)
        )
        if not link_element:
            continue

        # urljoin leaves absolute URLs untouched and correctly resolves
        # root-relative, path-relative and protocol-relative ("//host/...") hrefs.
        course_link = urljoin(base_url, link_element['href'].strip())

        courses.append({
            "Course Name": title_element.get_text(strip=True),
            "Course Link": course_link
        })

    return courses

def _has_next_page(soup, current_page):
    """Return True when a pagination link on *soup* leads past *current_page*."""
    # NOTE(review): assumes pagination anchors carry the "page-numbers" class
    # and page URLs look like ".../page/N/" — confirm against the live markup.
    next_href = re.compile(rf"/page/{current_page + 1}(?:[/?#]|$)")
    return any(
        'next' in anchor.get('class', []) or next_href.search(anchor['href'])
        for anchor in soup.find_all('a', class_='page-numbers', href=True)
    )


def fetch_all_courses(max_pages=10):
    """Fetch courses from all listing pages of the Megasoft course catalogue.

    Pages are requested in order (``/courses/``, ``/courses/page/2/``, ...).
    Fetching stops when a page cannot be downloaded, when a page contains no
    course elements, when no pagination link points to the following page, or
    once ``max_pages`` pages have been processed.

    Previously only the first ``page-numbers`` link was inspected. From page 2
    onward that may be a "previous" or first-page link, which could end
    pagination early. Every pagination link is now checked for a "next" link
    or a link to page N+1.

    Args:
        max_pages: Safety limit on the number of pages to fetch.

    Returns:
        A list of course dicts as produced by ``extract_megasoft_courses``.
    """
    base_url = "https://megasofttech.in/courses/"
    all_courses = []

    for page in range(1, max_pages + 1):
        url = base_url if page == 1 else f"{base_url}page/{page}/"

        print(f"📄 Fetching page {page}: {url}")

        html_content = fetch_html_from_url(url)
        if not html_content:
            print(f"❌ Failed to fetch page {page}")
            break

        # Parsed here for the emptiness check and for the pagination links.
        soup = BeautifulSoup(html_content, 'html.parser')
        if not soup.find_all('div', class_='edublink-single-course'):
            print("⏹️ No more courses found")
            break

        # Extract courses from this page
        page_courses = extract_megasoft_courses(html_content)
        all_courses.extend(page_courses)

        print(f"✅ Found {len(page_courses)} courses on page {page}")

        if not _has_next_page(soup, page):
            break
    else:
        # The loop ran out of pages while a next page was still advertised.
        print(f"⚠️ Safety limit reached ({max_pages} pages)")

    return all_courses

def fetch_html_from_url(url):
    """Download *url* and return the response body as text.

    The request is sent with a desktop-browser ``User-Agent`` header and a
    10-second timeout. Any ``requests.RequestException`` (including HTTP
    error statuses raised by ``raise_for_status``) is reported on stdout.

    Returns:
        The decoded response text, or ``None`` if the request failed.
    """
    browser_headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/91.0.4472.124 Safari/537.36'
        )
    }

    try:
        reply = requests.get(url, headers=browser_headers, timeout=10)
        reply.raise_for_status()
        page_text = reply.text
    except requests.RequestException as e:
        print(f"❌ Error fetching website: {e}")
        return None

    return page_text

def read_html_from_file(file_path):
    """Return the UTF-8 decoded contents of *file_path*.

    Failures are reported on stdout rather than raised: a missing file gets
    a "File not found" message, any other error a generic read-error message.

    Returns:
        The file contents as a string, or ``None`` if the file could not be
        read.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as source:
            contents = source.read()
    except FileNotFoundError:
        print(f"❌ File not found: {file_path}")
        return None
    except Exception as e:
        print(f"❌ Error reading file: {e}")
        return None
    else:
        return contents

def main(output_path=r'C:\Users\taslim.siddiqui\Downloads\Megasoft_Technologies_Coursespage2.xlsx'):
    """Scrape the Megasoft course catalogue and save it to an Excel workbook.

    Exact duplicate rows (same name and link) are dropped. If writing to
    ``output_path`` fails, the workbook is written to
    ``Megasoft_Technologies_Courses.xlsx`` in the current directory instead.

    Args:
        output_path: Preferred destination ``.xlsx`` file. Defaults to the
            location the script has always written to, so calling ``main()``
            with no arguments behaves as before.
    """
    # Option 1: Fetch all courses from the website (with pagination)
    all_courses = fetch_all_courses()

    # Option 2: Load HTML from a local file (uncomment below lines)
    # file_path = "All Courses - Megasoft Technologies.html"
    # html_content = read_html_from_file(file_path)
    # if html_content:
    #     all_courses = extract_megasoft_courses(html_content)

    if not all_courses:
        print("❌ No courses found. Please check the source.")
        return

    # Create DataFrame and remove duplicates (just in case)
    df = pd.DataFrame(all_courses)
    df = df.drop_duplicates(subset=['Course Name', 'Course Link'])

    # Only the write itself is guarded, so a failure while printing the
    # results is not mistaken for a save failure and does not trigger a
    # second write.
    try:
        df.to_excel(output_path, index=False)
    except Exception as e:
        print(f"❌ Error saving to Excel: {e}")
        # Try alternative path if the original fails
        alternative_path = "Megasoft_Technologies_Courses.xlsx"
        df.to_excel(alternative_path, index=False)
        print(f"📂 Saved to alternative location: {os.path.abspath(alternative_path)}")
    else:
        print(f"✅ Successfully extracted {len(df)} courses!")
        print(f"📂 Saved to: {output_path}")

        # Display the results
        print("\nExtracted courses:")
        print(df.to_string(index=False))

# Run the Megasoft scraper only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()