In [None]:
import json
from bs4 import BeautifulSoup

def get_ps_id(table):
    """Return the value of the "Problem Statement ID" row in *table*.

    Rows (``tr``) are scanned in document order. The first row whose first
    ``th`` text contains the label decides the result: the stripped text of
    that row's first ``td``, or "" when the row has no ``td``.
    Returns "" when no row matches.
    """
    label = "Problem Statement ID"
    for row in table.select("tr"):
        header = row.find("th")
        if not header or label not in header.get_text(strip=True):
            continue
        cell = row.find("td")
        if not cell:
            return ""
        return cell.get_text(strip=True)
    return ""

def get_ps_title(table):
    """Return the value of the "Problem Statement Title" row in *table*.

    Rows (``tr``) are scanned in document order. The first row whose first
    ``th`` text contains the label decides the result: the stripped text of
    that row's first ``td``, or "" when the row has no ``td``.
    Returns "" when no row matches.
    """
    label = "Problem Statement Title"
    for row in table.select("tr"):
        header = row.find("th")
        if not header or label not in header.get_text(strip=True):
            continue
        cell = row.find("td")
        if not cell:
            return ""
        return cell.get_text(strip=True)
    return ""

def get_ps_description(table):
    """Return the description text of a problem statement, keeping line breaks.

    Rows (``tr``) are scanned in document order. A row qualifies when its
    first ``th`` text contains "Description" and it holds a ``div`` with class
    ``style-2``; rows that match the label but lack that div are skipped.

    Every ``<br>`` inside the div is replaced by a newline, so the returned
    text has one line per visual line. Note that this mutates the parsed tree
    passed in (the ``<br>`` tags are gone afterwards).

    Returns the div's text with leading/trailing whitespace removed, or ""
    when no qualifying row exists.
    """
    for tr in table.select("tr"):
        th = tr.find("th")
        if not th or "Description" not in th.get_text(strip=True):
            continue
        div = tr.find("div", class_="style-2")
        if not div:
            continue
        for br in div.find_all("br"):
            br.replace_with("\n")
        # get_text(strip=True) strips every text node individually and drops
        # the ones that become empty, which discards the "\n" nodes inserted
        # above and glues neighbouring fragments together. Extract the raw
        # text instead and trim only the outer whitespace.
        return div.get_text().strip()
    return ""

def get_ps_organization(table):
    """Return the value of the "Organization" row in *table*.

    Rows (``tr``) are scanned in document order. The first row whose first
    ``th`` text contains the label decides the result: the stripped text of
    that row's first ``td``, or "" when the row has no ``td``.
    Returns "" when no row matches.
    """
    label = "Organization"
    for row in table.select("tr"):
        header = row.find("th")
        if not header or label not in header.get_text(strip=True):
            continue
        cell = row.find("td")
        if not cell:
            return ""
        return cell.get_text(strip=True)
    return ""

def get_ps_department(table):
    """Return the value of the "Department" row in *table*.

    Rows (``tr``) are scanned in document order. The first row whose first
    ``th`` text contains the label decides the result: the stripped text of
    that row's first ``td``, or "" when the row has no ``td``.
    Returns "" when no row matches.
    """
    label = "Department"
    for row in table.select("tr"):
        header = row.find("th")
        if not header or label not in header.get_text(strip=True):
            continue
        cell = row.find("td")
        if not cell:
            return ""
        return cell.get_text(strip=True)
    return ""

def get_ps_category(table):
    """Return the value of the "Category" row in *table*.

    Rows (``tr``) are scanned in document order. The first row whose first
    ``th`` text contains the label decides the result: the stripped text of
    that row's first ``td``, or "" when the row has no ``td``.
    Returns "" when no row matches.
    """
    label = "Category"
    for row in table.select("tr"):
        header = row.find("th")
        if not header or label not in header.get_text(strip=True):
            continue
        cell = row.find("td")
        if not cell:
            return ""
        return cell.get_text(strip=True)
    return ""

def get_ps_theme(table):
    """Return the value of the "Theme" row in *table*.

    Rows (``tr``) are scanned in document order. The first row whose first
    ``th`` text contains the label decides the result: the stripped text of
    that row's first ``td``, or "" when the row has no ``td``.
    Returns "" when no row matches.
    """
    label = "Theme"
    for row in table.select("tr"):
        header = row.find("th")
        if not header or label not in header.get_text(strip=True):
            continue
        cell = row.find("td")
        if not cell:
            return ""
        return cell.get_text(strip=True)
    return ""

def get_youtube_link(table):
    """Return the value of the "YouTube Link" row in *table*.

    Rows (``tr``) are scanned in document order. The first row whose first
    ``th`` text contains the label decides the result: the stripped text of
    that row's first ``td`` (the cell's text, not an ``href`` attribute), or
    "" when the row has no ``td``. Returns "" when no row matches.
    """
    label = "YouTube Link"
    for row in table.select("tr"):
        header = row.find("th")
        if not header or label not in header.get_text(strip=True):
            continue
        cell = row.find("td")
        if not cell:
            return ""
        return cell.get_text(strip=True)
    return ""

def get_dataset_link(table):
    """Return the value of the "Dataset Link" row in *table*.

    Rows (``tr``) are scanned in document order. The first row whose first
    ``th`` text contains the label decides the result: the stripped text of
    that row's first ``td`` (the cell's text, not an ``href`` attribute), or
    "" when the row has no ``td``. Returns "" when no row matches.
    """
    label = "Dataset Link"
    for row in table.select("tr"):
        header = row.find("th")
        if not header or label not in header.get_text(strip=True):
            continue
        cell = row.find("td")
        if not cell:
            return ""
        return cell.get_text(strip=True)
    return ""

# Fetch the SIH 2025 problem-statement listing and collect one record per
# problem statement into `content`, keyed by the serial number shown in the
# first column of the listing.
#
# Offline alternative: parse a saved copy of the page instead of fetching it.
# html = open("sih2025PS.html", "r", encoding="utf-8").read()
import requests

url = "https://sih.gov.in/sih2025PS"
response = requests.get(
    url=url,
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/",
        "Accept-Language": "en-US,en;q=0.9",
        # Accept-Encoding is deliberately not overridden: requests advertises
        # only the encodings it can actually decode. Forcing "br" without a
        # Brotli package installed would leave response.text undecoded.
    },
    # Without a timeout requests waits indefinitely on an unresponsive server.
    timeout=30,
)
print("Status Code:", response.status_code)
# Stop on 4xx/5xx instead of parsing an error page as if it were the listing.
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, "html.parser")
tbody = soup.select_one("#dataTablePS > tbody")
if tbody is None:
    raise RuntimeError(
        "Could not find '#dataTablePS > tbody' in the response from " + url
    )
content = {}

for tr in tbody.select("tr"):
    # Only listing rows carry "colomn_border" cells (class name spelled as it
    # is matched here); other rows selected from the tbody are skipped.
    tds = tr.find_all("td", class_="colomn_border")
    if not tds:
        continue
    sno = tds[0].get_text(strip=True)

    # Per-row details table that the get_* helpers read from.
    settings_table = tr.find("table", id="settings")
    if not settings_table:
        continue

    content[sno] = {
        "ps_id": get_ps_id(settings_table),
        "ps_title": get_ps_title(settings_table),
        "ps_description": get_ps_description(settings_table),
        "ps_organization": get_ps_organization(settings_table),
        "ps_department": get_ps_department(settings_table),
        "ps_category": get_ps_category(settings_table),
        "ps_theme": get_ps_theme(settings_table),
        # The sixth "colomn_border" cell is read as the submitted-ideas count;
        # rows with fewer cells get an empty value.
        "submission_ideas_count": tds[5].get_text(strip=True) if len(tds) > 5 else "",
        "youtube_link": get_youtube_link(settings_table),
        "dataset_link": get_dataset_link(settings_table),
    }



In [None]:
import pandas as pd

# Turn the scraped records into a table: each key of `content` becomes a row
# label and each record's fields become the columns. The row labels are
# written to the CSV under the header "sno".
csv_path = 'sih2025PS_extracted.csv'
df = pd.DataFrame.from_dict(content, orient='index')
df.to_csv(csv_path, index_label='sno', encoding='utf-8')
print('Exported to sih2025PS_extracted.csv')