# iPEC Solutions Pvt Ltd - course link extraction


# In [None]:

import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def extract_all_courses(html_content: str, base_url: str) -> pd.DataFrame:
    """Extract course names and their button links from a page's HTML.

    Every element matching ``.elementor-icon-box-description`` is treated as
    a candidate course name. For each one, the enclosing element with class
    ``elementor-widget`` is located, and the first
    ``<a class="elementor-button" href="...">`` that follows the start of
    that widget in document order is taken as the course link.

    Args:
        html_content: HTML markup of the courses page.
        base_url: Base URL used to resolve relative ``href`` values.

    Returns:
        A DataFrame with the columns ``"Course Name"`` and ``"Course Link"``,
        de-duplicated on the link, sorted by course name and re-indexed from
        0. When nothing matches, an empty DataFrame with those same two
        columns is returned.
    """
    columns = ["Course Name", "Course Link"]
    # Texts carrying the description class that are skipped rather than
    # recorded (presumably section headings, not courses -- verify against
    # the page). Compared against the lower-cased text.
    skipped_names = {"top courses", "fast track learning programmes"}

    soup = BeautifulSoup(html_content, "html.parser")
    courses = []

    for desc in soup.select(".elementor-icon-box-description"):
        course_name = desc.get_text(separator=" ", strip=True)
        if not course_name or course_name.lower() in skipped_names:
            continue

        parent_section = desc.find_parent(class_="elementor-widget")
        if not parent_section:
            continue

        # NOTE: find_next() walks forward through the rest of the document
        # (the widget's own descendants first), so the match is not limited
        # to the widget itself; it is simply the next button link in
        # document order.
        link_tag = parent_section.find_next(
            "a", class_="elementor-button", href=True
        )
        if link_tag:
            courses.append({
                "Course Name": course_name,
                "Course Link": urljoin(base_url, link_tag["href"]),
            })

    # Naming the columns explicitly keeps the frame well-formed when no
    # course was found; a column-less frame would make
    # sort_values("Course Name") raise KeyError instead of yielding an
    # empty result.
    df = pd.DataFrame(courses, columns=columns)
    return (
        df.drop_duplicates(subset="Course Link")
        .sort_values("Course Name")
        .reset_index(drop=True)
    )

def main(
    html_file: str = "C:\\Users\\taslim.siddiqui\\Downloads\\Courses List – iPEC Solutions Private Limited.html",
    output_file: str = "C:\\Users\\taslim.siddiqui\\Downloads\\iPEC_Solutions_Courses_Clean.xlsx",
    base_url: str = "https://ipecsolutions.com",
) -> None:
    """Read a saved courses page, extract the course links, and export them.

    The defaults reproduce the original hard-coded locations, so calling
    ``main()`` with no arguments behaves as before; pass other values to run
    the extraction against a different file or site.

    Args:
        html_file: Path of the saved HTML page to read (decoded as UTF-8).
        output_file: Path of the Excel workbook to write.
        base_url: Base URL used to resolve relative course links.

    Prints a warning when the extracted DataFrame is empty; otherwise writes
    the workbook and prints a summary with up to the first 10 rows.
    """
    with open(html_file, "r", encoding="utf-8") as f:
        html = f.read()

    df = extract_all_courses(html, base_url)

    # Guard clause: nothing to export, so no workbook is written.
    if df.empty:
        print("⚠️ No course data found.")
        return

    df.to_excel(output_file, index=False)
    print(f"✅ Extracted {len(df)} courses.")
    print(f"📂 Saved to: {output_file}")
    print("📊 Sample:")
    print(df.head(10).to_string(index=False))

# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
