In [22]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def fetch_rendered_html(url, wait_selector=None, wait_time=15):
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )
    # Todo - Change the path to windows path
    service = Service(r"c:\Users\Nachappa\chromedriver.exe")
    driver = webdriver.Chrome(service=service, options=chrome_options)
    try:
        driver.get(url)
        if wait_selector:
            WebDriverWait(driver, wait_time).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, wait_selector))
            )
        rendered_html = driver.page_source
    except Exception as ex:
        print(ex)
    finally:
        driver.quit()
    return rendered_html

In [23]:
from bs4 import BeautifulSoup
import csv
from urllib.parse import urlparse

class EmptyExtraction(Exception):
    pass

def extract(url):
    try:
        rendered_html = fetch_rendered_html(url)
    except Exception as e:
        print(f"Error fetching HTML from {url}: {e}")
        return []

    

    try:
        soup = BeautifulSoup(rendered_html, "html.parser")
        text_len = len(soup.get_text(strip=True))
        if text_len < 50:  # threshold you decide
            raise EmptyExtraction(f"Extraction failed for {url}. HTML content is junk/empty.")
    except EmptyExtraction as e:
        print(e)
        return []

    

    try:
        soup = BeautifulSoup(rendered_html, "html.parser")
    except Exception as e:
        print(f"Error parsing HTML from {url}: {e}")
        return []

    clickable_links=[a['href'] for a in soup.find_all("a", href=True)]
    
    cleaned_clickable_links = []
    for a in clickable_links:
        if ('https' in a) or (('#' not in a) and ('mailto' not in a) and (a != '/')):
            cleaned_clickable_links.append(a)
        
    
    # Completing the unfinished links
    cleaned_clickable_links = [(url[:-1] + a) if ('https' not in a) else a for a in cleaned_clickable_links]
    
    # Removing Duplicates
    cleaned_clickable_links = list(set(cleaned_clickable_links))
    
    
    

    social_media_platforms = [
        "facebook",
        "instagram",
        "whatsapp",
        "messenger",
        "twitter",
        "x",
        "linkedin",
        "snapchat",
        "tiktok",
        "youtube",
        "pinterest",
        "reddit",
        "discord",
        "telegram",
        "spotify"
    ]
    social_media_links = [
        a for a in cleaned_clickable_links
        if any(platform.lower() in a.lower() for platform in social_media_platforms)
    ]
    
    links = list(set(cleaned_clickable_links) - set(social_media_links))
    
    

    # ---- CSV Writing Section ----
    parsed_url = urlparse(url)
    domain = parsed_url.netloc.replace("www.", "")  # e.g., startupindia.gov.in
    base_name = domain.split(".")[0]                # e.g., startupindia
    csv_filename = f"{base_name}.csv"
    try:
        with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.writer(file)
            writer.writerow(["Link"])  # Header row
            
            
            
            for link in links:
                writer.writerow([link])
        
        print(f"✅ Links saved to {csv_filename}")
    except Exception as e:
        print(f"Error writing CSV: {e}")
    # -----------------------------

    return links

In [24]:
extract('https://www.startupindia.gov.in/')

✅ Links saved to startupindia.csv


['https://www.startupindia.gov.in/content/sih/en/search.html?roles=Accelerator&page=0',
 'https://www.startupindia.gov.in/content/sih/en/ams-application/application-listing.html',
 'https://www.startupindia.gov.in/content/sih/en/logo-form.html',
 'https://investorconnect.startupindia.gov.in/',
 'https://www.startupindia.gov.in/content/sih/en/user/my-dashboard',
 'https://www.startupindia.gov.in/content/sih/en/about-startup-india-initiative.html',
 'https://www.startupindia.gov.in/content/sih/en/BRICS.html',
 'https://www.startupindia.gov.in/content/sih/en/international.html',
 'https://www.startupindia.gov.in/content/sih/en/intellectual-property-rights.html',
 'https://www.startupindia.gov.in/content/sih/en/reources/market-research.html',
 'https://www.startupindia.gov.in/content/sih/en/disclaimer.html',
 'https://apna.co/contests/bharat-livelihood-challenge?utm_source=dpiit&utm_medium=dpiit&utm_campaign=dpiit',
 'https://www.startupindia.gov.in/content/dam/invest-india/Templates/publi