## Imports

In [16]:
from urllib.request import urlopen
import json
import pandas as pd

# url = "https://data.nrao.edu/archive-service/restapi_product_details_view?sdm_id=25B-104.sb49474829.eb49602275.60965.561376909725"

## Helper Functions

In [17]:
def get_html(url):
    '''
    Returns the JSON data read from the url given in parameter

    The response body is read in full, decoded as UTF-8 and parsed with
    json.loads, so the return value is whatever Python object the JSON
    document describes (a dict for the archive endpoints used in this notebook).

    Parameters:
        url: address to open with urllib.request.urlopen

    Raises:
        urllib.error.URLError: if the url cannot be opened
        UnicodeDecodeError: if the body is not valid UTF-8
        json.JSONDecodeError: if the body is not valid JSON
    '''
    # The context manager closes the response even if read() fails,
    # so the underlying connection is never leaked.
    with urlopen(url) as page:
        html_bytes = page.read()

    return json.loads(html_bytes.decode("utf-8"))

## Scraping NRAO Data Archive

The following code extracts the table data from the NRAO Data Archive Portal: https://data.nrao.edu/portal/#/ 

It fetches the JSON response that the web page requests to obtain the list of latest projects. It then looks into the details of the observations performed in each project, in particular the frequency bands in which the observations were performed.

In [18]:
# Paging window for the project listing: offset of the first row and page size
START = 0
NUM_ROWS = 25

# Project listing endpoint, sorted by project stop date (descending)
NRAO_PORTAL_URL = (
    "https://data.nrao.edu/archive-service/restapi_get_eb_project_view"
    f"?start={START}&rows={NUM_ROWS}&sort=proj_stop%20desc"
)

# Download the list of projects from the NRAO data archive portal
nrao_portal_data = get_html(NRAO_PORTAL_URL)

In [19]:
# Pull the project records out of the portal response and tabulate them
projects = nrao_portal_data["project_dict"]["projects"]
pd_projects = pd.DataFrame(projects)
print(projects)

# Collect the code of every project record that carries one
project_codes = [
    record["project_code"]
    for record in projects
    if "project_code" in record
]
print(project_codes)


# Keep only the projects whose data rights are marked "PUBLIC"
public_projects = pd_projects[pd_projects["data_rights"] == "PUBLIC"]
print("\nList of public projects:\n", public_projects["project_code"])

[{'id': 'nrao:623334', 'eb_id': 623334, 'sci_prod_locator': 'uid://evla/execblock/4edace58-af9f-4123-9239-aba6f6369549', 'obs_id': '25B-250.sb49363256.eb49611267.60969.423685601854', 'schedBlockName': None, 'filegroup_id': None, 'legacy_id': 'AB1981', 'project_code': '25B-250', 'title': 'PEARRLS: the Proplyd EVLA/ALMA Radio Recombination Line Survey', 'abstract': 'Most Sun-like stars form in clusters, where protoplanetary disks are externally photoevaporated by nearby massive stars. While previous studies of the Orion "proplyds" have shed light on the prevalence of external photoevaporation in typical star-forming environments, there is still much we do not know about the effects of external photoevaporation on planet formation. We propose to use joint VLA K-band, VLA Ka-band, and ALMA Band 4 observations of hydrogen (H) and helium (He) radio recombination lines (RRLs) to conduct a detailed study of the ionized gas that is ejected off photoevaporating disks. We will target 17 proplyds 

In [23]:
def get_project_observations(project_codes, max_projects=2):
    '''
    Returns a DataFrame listing the observations of the given projects

    For each project code, the paged execution-block endpoint of the NRAO
    archive is queried (first 25 rows, newest obs_stop first) and the
    "obs_id" and "obs_band" fields of every entry of "eb_list" are kept.

    Parameters:
        project_codes: sequence of project code strings (e.g. "25B-250")
        max_projects: number of leading project codes to query; pass None to
            query all of them. Defaults to 2, the limit that used to be
            hard-coded in the loop.

    Returns:
        DataFrame with the columns "obs_id" and "obs_band" and one row per
        retained entry. The columns are present even when nothing was found.
    '''
    records = []
    for code in project_codes[:max_projects]:
        project_url = f"https://data.nrao.edu/archive-service/restapi_get_paged_exec_blocks?start=0&rows=25&sort=obs_stop%20desc&project_code=%22{code}%22"
        print('url:', project_url)
        project_data = get_html(project_url)

        # Skip entries that lack any of the fields we rely on
        records.extend(
            {"obs_id": p["obs_id"],
             "obs_band": p["obs_band"]}
            for p in project_data["eb_list"]
            if "obs_id" in p and "project_code" in p and "obs_band" in p
        )

    # Build the frame once: concatenating inside the loop copies all
    # previously collected rows on every iteration.
    return pd.DataFrame(records, columns=["obs_id", "obs_band"])

# Fetch the observations for the collected project codes and print the resulting table
observations = get_project_observations(project_codes)
print(observations)

url: https://data.nrao.edu/archive-service/restapi_get_paged_exec_blocks?start=0&rows=25&sort=obs_stop%20desc&project_code=%2225B-250%22
url: https://data.nrao.edu/archive-service/restapi_get_paged_exec_blocks?start=0&rows=25&sort=obs_stop%20desc&project_code=%2225A-310%22
                                              obs_id obs_band
0   25B-250.sb49363256.eb49611267.60969.423685601854     [KA]
1    25B-250.sb49363115.eb49611263.60969.26939707176     [KA]
2    25B-250.sb49363256.eb49605086.60967.44715115741     [KA]
3    25B-250.sb49362732.eb49604915.60967.27364846064      [K]
4    25B-250.sb49362855.eb49604897.60966.45654993056      [K]
5   25B-250.sb49362597.eb49569883.60955.515199421294      [K]
6   25B-250.sb49362732.eb49569527.60953.313976840276      [K]
7   25B-250.sb49362732.eb49565159.60950.328320706016      [K]
8     25B-250.sb49362855.eb49557156.60948.4727197801      [K]
9    25B-250.sb49363115.eb49549282.60943.35614350694     [KA]
10   25B-250.sb49362855.eb49547736.60941.511

### Observations in Bands of Interest

In [21]:
# Bands of interest, held in a set so membership tests stay cheap
target_bands = {"KA", "Q"}

# Select the rows whose band list shares at least one band with target_bands
interest_obs = observations[
    observations["obs_band"].apply(lambda bands: not target_bands.isdisjoint(bands))
]

print(interest_obs)

                                              obs_id obs_band
0   25B-250.sb49363256.eb49611267.60969.423685601854     [KA]
1    25B-250.sb49363115.eb49611263.60969.26939707176     [KA]
2    25B-250.sb49363256.eb49605086.60967.44715115741     [KA]
9    25B-250.sb49363115.eb49549282.60943.35614350694     [KA]
14   25A-310.sb49268828.eb49611265.60969.38213347222     [KA]
15   25A-310.sb49268828.eb49569561.60954.45835519676     [KA]
16  25A-310.sb49268828.eb49569556.60954.416629444444     [KA]
17   25A-310.sb49268828.eb49549284.60943.46696909722     [KA]
18   25A-310.sb49268828.eb49549179.60942.48157337963     [KA]


In [22]:
# Take the first observation among those in the bands of interest
interest_obs_id = interest_obs["obs_id"].iat[0]
print(interest_obs_id)

# Product-details endpoint, queried with the observation id as sdm_id
observation_url = (
    "https://data.nrao.edu/archive-service/restapi_product_details_view"
    f"?sdm_id={interest_obs_id}"
)

# Download the details and pretty-print them
obs_data = get_html(observation_url)
print(json.dumps(obs_data, indent=2))

25B-250.sb49363256.eb49611267.60969.423685601854
{
  "details": {
    "dataset_title": "25B-250.sb49363256.eb49611267.60969.423685601854",
    "execution_blocks": [
      {
        "sdm_id": "25B-250.sb49363256.eb49611267.60969.423685601854",
        "project_code": "25B-250",
        "obs_start": "2025-10-21 10:10:07.100",
        "obs_stop": 60969.534458333335,
        "cal_status": "Ready",
        "band_code": "KA",
        "num_antennas": 25,
        "configuration": "B",
        "instrument_name": "EVLA",
        "alma_ous_id": null,
        "legacy_id": "AB1981",
        "access_estsize": 106182932000,
        "configurations": [
          {
            "observation_configuration_id": "418459",
            "observation_configuration_number": "0",
            "band": "Ka",
            "aggregate_bandwidth": 2557772222.222229,
            "min_frequency": 28095733741.77062,
            "max_frequency": 36663452651.459946,
            "min_spec_resolution": 100000.0,
            "m