In [11]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import json
import re

In [12]:
# Root of the MOSDAC site; each entry of `urls` is appended to it to form a page URL.
BASE_URL = "https://mosdac.gov.in/"

# Page slugs (relative to BASE_URL) whose MetaData table is scraped below.
urls = [
    "global-ocean-surface-current",
    "high-resolution-sea-surface-salinity",
    "indian-mainland-coastal-product",
    "ocean-subsurface",
    "oceanic-eddies-detection",
    "sea-ice-occurrence-probability",
    "wave-based-renewable-energy",
]


In [13]:
def is_parent(sr: str) -> bool:
    """Tell whether a serial number denotes a top-level (parent) entry.

    Args:
        sr: Serial-number text taken from the first column of a table row.

    Returns:
        True when `sr` is non-empty and made up only of digit characters
        (e.g. "4"); False for anything else (e.g. "4.1" or "").
    """
    digits_only = sr.isdigit()
    return digits_only

def extract_parent(sr):
    """Return the leading run of digits of a serial number, e.g. "4.1" -> "4".

    Args:
        sr: Serial-number text such as "4.1" or "12(a)". Leading whitespace
            is ignored.

    Returns:
        The digits at the start of `sr`, as a string.

    Raises:
        ValueError: If `sr` does not start with a digit. Previously this
            surfaced as an opaque AttributeError on ``None.group``.
    """
    match = re.match(r"\s*(\d+)", sr)
    if match is None:
        raise ValueError(f"serial number {sr!r} does not start with a digit")
    return match.group(1)

In [14]:
# Scrape the "MetaData" table of every page in `urls` and save each one as
# nested JSON (top-level entries keyed by serial number, sub-entries under
# "children").

# NOTE(review): machine-specific absolute directory, kept byte-for-byte from the
# original so files land where they did before. It must already exist.
OUTPUT_DIR = "C:\\Users\\WELCOME\\Desktop\\mosdac_chatbot\\open_data_section\\ocean"

# Seconds to wait for the server; without a timeout requests.get() can block forever.
REQUEST_TIMEOUT = 30


def _blank_entry(sr_no):
    """Return an empty top-level entry for `sr_no`, ready to receive children."""
    return {
        "sr_no": sr_no,
        "metadata_element": "",
        "definition": "",
        "children": {},
    }


def _build_metadata(table):
    """Convert the rows of a MetaData <table> element into a nested dict.

    Rows that do not have exactly three <td> cells are ignored. A row whose
    serial number is all digits becomes a top-level entry; any other row is
    stored under the "children" of the entry named by its leading digits.
    Entries that end up without children get ``"children": None``.
    """
    metadata = {}

    # Skip the first <tr> (presumably the header row, as in the original rows[1:]).
    for row in table.find_all("tr")[1:]:
        cols = [c.get_text(strip=True) for c in row.find_all("td")]
        if len(cols) != 3:
            continue

        sr_no, element, definition = cols

        if is_parent(sr_no):
            # setdefault keeps a placeholder created by an earlier child row,
            # so children seen before their parent are not thrown away.
            entry = metadata.setdefault(sr_no, _blank_entry(sr_no))
            entry["metadata_element"] = element
            entry["definition"] = definition
            continue

        # A sub-entry has to start with its parent's number; anything else
        # cannot be placed in the tree, so skip it instead of crashing the run.
        if not re.match(r"\d", sr_no):
            print("Skipped row with unrecognised serial number:", repr(sr_no))
            continue

        parent_key = extract_parent(sr_no)
        parent = metadata.setdefault(parent_key, _blank_entry(parent_key))
        parent["children"][sr_no] = {
            "sr_no": sr_no,
            "metadata_element": element,
            "definition": definition,
        }

    # Convert empty children {} -> null in the JSON output.
    for entry in metadata.values():
        if entry["children"] == {}:
            entry["children"] = None

    return metadata


for url in urls:
    try:
        response = requests.get(BASE_URL + url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # One unreachable page should not abort the remaining downloads.
        print(url, ": request failed -", exc)
        continue

    print(url, ": ", response.status_code)
    if not response.ok:
        print("Skipped:", url, "(unsuccessful HTTP status)")
        continue

    soup = BeautifulSoup(response.text, "html.parser")

    # The table of interest is the first <table> after the <h3> whose text
    # contains "MetaData"; both lookups return None when nothing matches.
    heading = soup.find("h3", string=lambda x: x and "MetaData" in x)
    table = heading.find_next("table") if heading is not None else None
    if table is None:
        print("Skipped:", url, "(no MetaData table found)")
        continue

    # `metadata` stays a module-level name because a later cell prints it.
    metadata = _build_metadata(table)

    # Save this page's metadata as <slug>.json.
    save_path = f"{OUTPUT_DIR}\\{url}.json"

    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=4, ensure_ascii=False)

    print("Saved:", save_path)

global-ocean-surface-current :  200
Saved: C:\Users\WELCOME\Desktop\mosdac_chatbot\open_data_section\ocean\global-ocean-surface-current.json
high-resolution-sea-surface-salinity :  200
Saved: C:\Users\WELCOME\Desktop\mosdac_chatbot\open_data_section\ocean\high-resolution-sea-surface-salinity.json
indian-mainland-coastal-product :  200
Saved: C:\Users\WELCOME\Desktop\mosdac_chatbot\open_data_section\ocean\indian-mainland-coastal-product.json
ocean-subsurface :  200
Saved: C:\Users\WELCOME\Desktop\mosdac_chatbot\open_data_section\ocean\ocean-subsurface.json
oceanic-eddies-detection :  200
Saved: C:\Users\WELCOME\Desktop\mosdac_chatbot\open_data_section\ocean\oceanic-eddies-detection.json
sea-ice-occurrence-probability :  200
Saved: C:\Users\WELCOME\Desktop\mosdac_chatbot\open_data_section\ocean\sea-ice-occurrence-probability.json
wave-based-renewable-energy :  200
Saved: C:\Users\WELCOME\Desktop\mosdac_chatbot\open_data_section\ocean\wave-based-renewable-energy.json


In [15]:
# Show whatever `metadata` currently holds (after the scraping loop: the last
# page it assigned) as indented JSON, keeping non-ASCII characters readable.
print(
    json.dumps(
        metadata,
        ensure_ascii=False,
        indent=4,
    )
)

{
    "1": {
        "sr_no": "1",
        "metadata_element": "Metadata language",
        "definition": "English",
        "children": null
    },
    "2": {
        "sr_no": "2",
        "metadata_element": "Metadata Contact",
        "definition": "MOSDAC",
        "children": null
    },
    "3": {
        "sr_no": "3",
        "metadata_element": "Metadata date",
        "definition": "August 3rd ,2015",
        "children": null
    },
    "4": {
        "sr_no": "4",
        "metadata_element": "Data Lineage",
        "definition": "Wave power in Kilowatt/meter from Altimeters over Indian Ocean Region",
        "children": null
    },
    "5": {
        "sr_no": "5",
        "metadata_element": "Title",
        "definition": "Altimeters for Wave based Renewable Energy (AWARE)",
        "children": null
    },
    "6": {
        "sr_no": "6",
        "metadata_element": "Abstract",
        "definition": "The wave power is computed from the altimeters (Jason-2 and SARAL/AltiKa) fo