# gitinfo.com CPPP LINK EXTRACTION


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from urllib.parse import urljoin

# --- Crawl configuration (constants) ---
# Site root; every link found while crawling is resolved against it.
base_url = "https://gitinfo.com"
# Headers sent with each HTTP request.
headers = {"User-Agent": "Mozilla/5.0"}

# --- Crawl state (mutated by the extract_* functions) ---
# Category URLs that have already been fetched, so each is scanned once.
visited_urls = set()
# Collected results: dicts with "Course Name" and "Course Link" keys.
course_links = []

def is_course_url(url):
    """Return True when *url* looks like a course page.

    The match is case-insensitive and purely substring based: the URL must
    contain at least one of the course markers and none of the excluded
    markers (listing, archive and WordPress-internal paths).
    """
    lowered = url.lower()

    # A single excluded fragment is enough to reject the URL outright.
    rejected_fragments = (
        '/tag/',
        '/category/',
        '/page/',
        '/author/',
        '/search/',
        '/wp-',
        '/feed/',
    )
    for fragment in rejected_fragments:
        if fragment in lowered:
            return False

    # Otherwise accept as soon as any course marker is present.
    accepted_fragments = (
        '/courses/',
        '/course/',
        '/training/',
        '/learn/',
        '/comptia-',
        '/ccna/',
        '/ccnp-',
        '/ceh-',
        '/chfi-',
        '/certified-',
        '/exam-',
    )
    for fragment in accepted_fragments:
        if fragment in lowered:
            return True

    return False

def extract_courses_from_page(url):
    """Fetch *url* and record it as a course, named after its first <h1>.

    A non-200 response is reported and skipped. When the page has an <h1>
    and the URL is not yet present in ``course_links``, a new entry is
    appended. Any exception is printed rather than raised, so one bad page
    does not stop the crawl.
    """
    try:
        print(f"🔍 Scanning: {url}")
        page = requests.get(url, headers=headers, timeout=10)
        if page.status_code != 200:
            print(f"⚠️ Failed to access {url} - Status code: {page.status_code}")
            return

        # The first <h1> on the page is taken as the course title.
        heading = BeautifulSoup(page.text, "html.parser").find('h1')
        if not heading:
            return

        course_name = heading.get_text(strip=True)

        # Skip pages whose link has already been collected.
        known_links = [entry['Course Link'] for entry in course_links]
        if url in known_links:
            return

        course_links.append({
            "Course Name": course_name,
            "Course Link": url
        })
        print(f"✅ Found course: {course_name} - {url}")

    except Exception as e:
        print(f"⚠️ Error processing {url}: {str(e)}")

def extract_courses_from_menu():
    """Collect course links from the "Training Courses" entry of the main menu.

    Fetches the page at ``base_url``, finds the top-level items of
    ``#menu-main-menu`` whose text contains "Training Courses", and walks
    their first-level dropdown anchors. An anchor whose resolved URL
    contains ``/courses/`` is treated as a category page and passed to
    ``extract_courses_from_url``; any other anchor is recorded directly in
    ``course_links`` using its text as the course name.

    Errors are printed rather than raised so the rest of the crawl can
    continue.
    """
    try:
        print("🍔 Extracting courses from main menu...")
        response = requests.get(base_url, headers=headers, timeout=10)
        # Do not parse an error page as if it were the home page (same
        # status handling as extract_courses_from_page).
        if response.status_code != 200:
            print(f"⚠️ Failed to access {base_url} - Status code: {response.status_code}")
            return

        soup = BeautifulSoup(response.text, "html.parser")

        for item in soup.select('#menu-main-menu > li.menu-item'):
            # Only the training-courses menu entry is of interest.
            if 'Training Courses' not in item.get_text():
                continue

            sub_menus = item.select('ul.gm-dropdown-menu--lvl-1 li.gm-menu-item a')
            for menu in sub_menus:
                # .get() instead of ['href']: an anchor without an href
                # would raise KeyError and, via the outer except, abort the
                # whole menu scan instead of just being skipped.
                href = menu.get('href')
                text = menu.get_text(strip=True)

                # Skip empty or non-course links
                if not text or not href:
                    continue

                full_url = urljoin(base_url, href)

                # Category pages are scanned for the course links they list.
                if '/courses/' in full_url.lower():
                    extract_courses_from_url(full_url)
                    continue

                # Direct course link: record it unless already collected.
                if not any(x['Course Link'] == full_url for x in course_links):
                    course_links.append({
                        "Course Name": text,
                        "Course Link": full_url
                    })
                    print(f"✅ Found course: {text} - {full_url}")

    except Exception as e:
        print(f"⚠️ Error extracting menu: {str(e)}")

def extract_courses_from_url(url):
    """Scan the category page at *url* and record the course links it lists.

    Each URL is fetched at most once (tracked in ``visited_urls``). Anchors
    whose href contains ``/courses/`` or ``/course/`` are resolved to
    absolute URLs, filtered through ``is_course_url`` and appended to
    ``course_links`` (anchor text as the course name) unless the link is
    already present.

    Errors are printed rather than raised so the rest of the crawl can
    continue.
    """
    try:
        if url in visited_urls:
            return

        # Mark before fetching so a failing page is not retried.
        visited_urls.add(url)
        print(f"🌐 Scanning category: {url}")

        response = requests.get(url, headers=headers, timeout=10)
        # Do not harvest links from an error page (same status handling as
        # extract_courses_from_page).
        if response.status_code != 200:
            print(f"⚠️ Failed to access {url} - Status code: {response.status_code}")
            return

        soup = BeautifulSoup(response.text, "html.parser")

        # Find course links on category pages
        course_anchors = soup.select('a[href*="/courses/"], a[href*="/course/"]')

        for a in course_anchors:
            href = a.get('href')
            text = a.get_text(strip=True)

            if not text or not href:
                continue

            # Resolve against the page being scanned, not the site root:
            # absolute and root-relative hrefs come out the same, while
            # document-relative hrefs resolve the way a browser would.
            full_url = urljoin(url, href)

            if is_course_url(full_url) and not any(x['Course Link'] == full_url for x in course_links):
                course_links.append({
                    "Course Name": text,
                    "Course Link": full_url
                })
                print(f"✅ Found course: {text} - {full_url}")

    except Exception as e:
        print(f"⚠️ Error processing category {url}: {str(e)}")

# Start scraping: the main menu is the primary source of course links.
extract_courses_from_menu()

# Also scan known category pages directly, in case the menu markup changes
# or does not link to all of them. Pages already visited are skipped.
important_pages = [
    base_url + "/courses/",
    base_url + "/courses/cisco/",
    base_url + "/courses/comptia/",
    base_url + "/courses/ec-council/",
    base_url + "/courses/certnexus/",
    base_url + "/courses/microsoft/",
    base_url + "/courses/cyber-security-master-program/"
]

for page in important_pages:
    extract_courses_from_url(page)

# Save results to Excel
if course_links:
    # Local import: only needed to build the output path.
    from pathlib import Path

    df = pd.DataFrame(course_links)
    # Final safety net: keep one row per course link.
    df = df.drop_duplicates(subset=["Course Link"]).reset_index(drop=True)

    # Write to the current user's Downloads folder instead of a path
    # hard-coded to one Windows account, so the script runs on any machine.
    output_dir = Path.home() / "Downloads"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = str(output_dir / "GIT_Academy_Courses.xlsx")
    df.to_excel(output_file, index=False)

    print(f"\n✅ Successfully extracted {len(df)} unique courses!")
    print(f"📁 Saved to: {output_file}")
else:
    print("❌ No courses found. Please check the website structure.")