In [1]:
import json
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup



## Defining constants


In [2]:
# Root of the site; page slugs from URLS are appended to it.
BASE_URL = "https://mosdac.gov.in/"

# Section headings (<h3> text) whose content is extracted from each page.
HEADINGS = [
    "Data Access",
    "Data Version",
    "Data Sources",
    "Data Citation",
    "Processing Steps",
    "References",
    "Derivation Techniques and Algorithm",
    "Limitations",
    "Known problems with data",
    "Related data collections",
    "File Naming Convention",
]

# Page slugs (relative to BASE_URL) of the product pages to scrape.
URLS = [
    "3d-volumetric-terls-dwrproduct",
    "inland-water-height",
    "river-discharge",
    "soil-moisture-0",
]


In [3]:

def checker(data):
    """Return True when ``data`` is truthy (e.g. a non-empty list), else False."""
    return bool(data)


In [4]:
# this is used inside the get_data()
def _fix_link(link):
    # if link starts with '/', prepend BASE_URL
    if re.match(r"^/", link):
        return BASE_URL + link
    return link

In [5]:
#gets the data in json format

def get_data(all_h3):
    """Collect the text and links found under each recognised heading.

    Args:
        all_h3: Iterable of BeautifulSoup ``<h3>`` tags to inspect.

    Returns:
        A list of dicts, one per heading whose text is in ``HEADINGS`` and
        that has a following sibling ``<div>``. Each dict has ``"title"`` and
        ``"text"``; ``"links"`` is added only when the div contains at least
        one ``<a>`` tag with an ``href`` attribute.
    """
    data = []  # JSON-serialisable records, one per matched heading

    for h in all_h3:
        title = h.get_text(strip=True)
        if title not in HEADINGS:
            continue

        content_div = h.find_next_sibling("div")
        if content_div is None:
            # Heading without a content block: nothing to extract.
            continue

        # Flatten the section text and replace non-breaking spaces.
        full_text = content_div.get_text(" ", strip=True).replace("\xa0", " ")

        # href=True skips anchors that have no href (their value would be
        # None and cannot be normalised).
        links = [_fix_link(a["href"]) for a in content_div.find_all("a", href=True)]

        entry = {"title": title, "text": full_text}
        if links:
            entry["links"] = links
        data.append(entry)

    return data


In [6]:

# Directory that receives one JSON file per scraped page.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
OUTPUT_DIR = r"C:\Users\WELCOME\Desktop\mosdac_chatbot\testing_data"

# Seconds to wait for the server before a request is abandoned.
REQUEST_TIMEOUT = 30

for url in URLS:
    N_url = BASE_URL + url

    # Fetch the page; without a timeout a stalled server would hang the cell,
    # and raise_for_status() surfaces HTTP errors instead of parsing an error page.
    res = requests.get(N_url, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    # The headings live inside the element with id="accordion"; if it is
    # missing, fall through to the empty-data handling below.
    parent_div = soup.find(id="accordion")
    all_h3 = parent_div.find_all("h3") if parent_div is not None else []

    # Extract the structured records for this page.
    data = get_data(all_h3)

    if not checker(data):
        print(f"the data is empty for url: {url}")
        # NOTE(review): break also skips all remaining URLs; use `continue`
        # if later pages should still be scraped.
        break

    # Save JSON file
    path = rf"{OUTPUT_DIR}\{url}.json"
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    


In [7]:
# Show the records left in `data` by the last iteration of the scraping loop.
data

[{'title': 'Data Access',
  'text': 'Click Here to access the Science Products . Request to use MOSDAC Single Sign On user credentials to download the data.',
  'links': ['https://mosdac.gov.in//opendata/soil_moisture/']},
 {'title': 'Data Version', 'text': 'Version 1.0 (beta)'},
 {'title': 'Data Sources', 'text': 'SMAP L-band radiometer data'},
 {'title': 'Processing Steps',
  'text': 'Following are the three major processing steps : i. SMAP L1C daily data. ii. Import previous 3 days data iii. Generate global mosaic of Tb iv. Generate Indian mosaic of Tb v. Generate SWI from Tb and gridding the data'},
 {'title': 'Derivation Techniques and Algorithm',
  'text': 'Soil Wetness Index (SWI) is derived using a time series based methodology using SMAP L-band radiometer data, normalized to the extreme values of 0 and 1, corresponding to the dry and saturated soil wetness conditions respectively.'},
 {'title': 'File Naming Convention',
  'text': 'The Geotiff file names follows naming conventi

In [8]:
# Report how many page slugs are configured in URLS.
print(len(URLS))

4
