# Global Institute of Regulatory Affairs (GIRA) course link extraction

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from collections import defaultdict

def extract_courses_from_boxes(soup, base_url):
    """Collect course links from the ``.single-causes-box`` cards.

    For each card, the first ``h4 a`` anchor supplies the course name (its
    whitespace-stripped text) and the course link (its ``href`` resolved
    against *base_url*).

    Args:
        soup: Parsed document exposing ``select`` (e.g. a BeautifulSoup tree).
        base_url: URL used to resolve relative ``href`` values.

    Returns:
        dict mapping absolute course link -> course name. When the same link
        appears in several cards, the name from the first card is kept.
    """
    link_to_name = {}
    for box in soup.select('.single-causes-box'):
        link = box.select_one('h4 a')
        if link is None:
            continue
        # An <a> without an href has no link to record; indexing
        # link['href'] would raise KeyError and abort the whole extraction.
        href = link.get('href')
        if href is None:
            continue
        course_link = urljoin(base_url, href).strip()
        # setdefault keeps the first name seen for a given link.
        link_to_name.setdefault(course_link, link.get_text(strip=True))
    return link_to_name

def extract_courses_from_footer(soup, base_url):
    """Collect course links from footer widgets mentioning "Our Courses".

    Every ``.f-widget`` element whose text contains "Our Courses" is scanned
    for ``a[href]`` anchors; each anchor contributes its stripped text as the
    name and its ``href`` resolved against *base_url* as the link.

    Returns:
        dict mapping absolute link -> name, keeping the first name seen for
        each link.
    """
    courses = {}
    for widget in soup.select('.f-widget'):
        # Only the footer column that lists the courses is of interest.
        if "Our Courses" not in widget.get_text():
            continue
        for anchor in widget.select('a[href]'):
            title = anchor.get_text(strip=True)
            target = urljoin(base_url, anchor['href']).strip()
            courses.setdefault(target, title)
    return courses

def extract_courses_from_courses_page(soup, base_url):
    """Collect anchors whose raw ``href`` looks like a course URL.

    An ``a[href]`` anchor is kept when it has non-empty text and its
    ``href`` contains one of ``/course``, ``/diploma``, ``/training`` or
    ``/validation``. The match is made on the ``href`` as written in the
    page, before it is resolved against *base_url*.

    Returns:
        dict mapping absolute link -> name, keeping the first name seen for
        each link.
    """
    markers = ('/course', '/diploma', '/training', '/validation')
    found = {}
    for anchor in soup.select('a[href]'):
        title = anchor.get_text(strip=True)
        href = anchor.get('href')
        if not (href and title):
            continue
        if not any(marker in href for marker in markers):
            continue
        found.setdefault(urljoin(base_url, href).strip(), title)
    return found

def collect_all_links(soup, base_url):
    """Return the set of absolute URLs of every ``a[href]`` anchor in *soup*.

    Each ``href`` is resolved against *base_url* and stripped of surrounding
    whitespace; duplicates collapse because the result is a set.
    """
    return {
        urljoin(base_url, anchor['href']).strip()
        for anchor in soup.select('a[href]')
    }

def find_links_with_multiple_names(soup, base_url):
    """Print a warning for every link that appears under different names.

    All ``a[href]`` anchors are grouped by their absolute URL (``href``
    resolved against *base_url*). Anchors with empty text are ignored. A
    link is reported when its anchors carry two or more distinct texts.

    Returns:
        None. The findings are written to stdout only.
    """
    names_by_link = defaultdict(list)
    for anchor in soup.select('a[href]'):
        target = urljoin(base_url, anchor['href']).strip()
        label = anchor.get_text(strip=True)
        if not (target and label):
            continue
        names_by_link[target].append(label)

    for target, labels in names_by_link.items():
        distinct = set(labels)
        if len(distinct) >= 2:
            print(f"⚠️ Link with multiple names: {target} -> {distinct}")

def extract_all_courses(html_content, base_url):
    """Parse *html_content* and return every course link found as a DataFrame.

    The page is parsed with ``html.parser`` and scanned by three extractors
    in priority order: course boxes, footer widget, then generic course-like
    anchors. When a link is reported by more than one extractor, the name
    from the earliest one is kept.

    Two audit messages are printed along the way: links that appear under
    several names, and the total number of unique hrefs on the page.

    Returns:
        DataFrame with columns "Course Name" and "Course Link", sorted by
        course name with a fresh 0-based index.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Order matters: earlier extractors win when a link is found twice.
    extractors = (
        extract_courses_from_boxes,
        extract_courses_from_footer,
        extract_courses_from_courses_page,
    )
    merged = {}
    for extractor in extractors:
        for url, title in extractor(soup, base_url).items():
            merged.setdefault(url, title)

    # Audit output: conflicting names first, then the overall href count.
    find_links_with_multiple_names(soup, base_url)
    audit_links = collect_all_links(soup, base_url)
    print(f"🔍 Total unique hrefs found on page (for audit): {len(audit_links)}")

    rows = [(title, url) for url, title in merged.items()]
    table = pd.DataFrame(rows, columns=["Course Name", "Course Link"])
    return table.sort_values("Course Name").reset_index(drop=True)

def main(file_path=None, base_url=None, output_path=None):
    """Extract course links from a saved HTML page and save them to Excel.

    The HTML file is read as UTF-8, passed to ``extract_all_courses``, and
    the resulting table is printed and written with ``DataFrame.to_excel``.

    Args:
        file_path: Path of the saved HTML page to read. Defaults to the
            original hard-coded download location.
        base_url: URL used to resolve relative links. Defaults to the
            institute's site root.
        output_path: Destination of the Excel file. Defaults to the original
            hard-coded download location.
    """
    # Defaults reproduce the previous hard-coded values, so a bare main()
    # call behaves exactly as before.
    if file_path is None:
        file_path = "C:\\Users\\taslim.siddiqui\\Downloads\\Global Institute of Regulatory Affairs-Pharmaceutical Training Courses.html"
    if base_url is None:
        base_url = "https://www.regulatoryinstitute.com/"
    if output_path is None:
        output_path = "C:\\Users\\taslim.siddiqui\\Downloads\\GIRA_Courses_UniqueByLink.xlsx"

    with open(file_path, "r", encoding="utf-8") as f:
        html = f.read()

    df = extract_all_courses(html, base_url)

    print(f"\n✅ Found {len(df)} unique course links:")
    print(df.to_string(index=False))

    df.to_excel(output_path, index=False)
    print(f"\n📂 Saved to: {output_path}")

# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
