In [33]:
# Imports
import yaml
import json
import requests
import re
import time
from datetime import datetime
from pprint import pp
from urllib.parse import quote
from dateutil import parser


In [41]:
OpenDataSites = yaml.safe_load(open(r"C:\Users\srappel\Documents\GitHub\GeoDiscovery-Utils\opendataharvest\OpenDataSites.yaml", "r"))


CATALOG = OpenDataSites["TestSites"]
assert isinstance(CATALOG, dict)

MAXRETRY = 4
SLEEPTIME = 2

In [7]:
class Site:
    def __init__(self, site_name: str, site_details: dict, site_json: dict):
        self.site_name = site_name
        self.site_details = site_details
        self.site_json = site_json

    def __getitem__(self, key):
        return getattr(self, key)

    def __setitem__(self, key, value):
        setattr(self, key, value)

In [52]:
def harvest_sites() -> list:
    site_list = [] # list of Site objects
    for site, details in CATALOG.items():
        for i in range(MAXRETRY):  # Retry up to 5 times
            try:
                response = requests.get(details["SiteURL"], timeout=3)
                response.raise_for_status()
                site_json = response.json()
                current_Site = Site(details["SiteName"], details, site_json)
                site_list.append(current_Site)
                break  # If the request is successful, break the retry loop
            except json.JSONDecodeError:
                print(f"The content from {site} is not a valid JSON document.")
                break  # If the content is not valid JSON, break the retry loop
            except (requests.HTTPError, requests.exceptions.Timeout) as e:
                print(f"Received bad response from {site}. Retrying after {SLEEPTIME} seconds...")
                time.sleep(SLEEPTIME)  # Wait for 1 second before retrying
                if i == (MAXRETRY - 1):  # If this was the last retry
                    print(f"Failed to connect to {site} after {MAXRETRY + 1} attempts.")
                    print(e)
    return site_list

list_of_sites = harvest_sites()

for website in list_of_sites:
    print(f'Website for {website.site_name}: {website.site_details["SiteURL"]}')
    print(f'Details for {website.site_name}:\n{website.site_details}')
    print(f'JSON for {website.site_name}:\n{website.site_json}')
    print(f'The following items are on the skiplist:')
    for skip in website.site_details["SkipList"]:
        print(skip['UUID'])
    print()

Website for MilwaukeeCounty: https://data-mclio.hub.arcgis.com/api/feed/dcat-us/1.1.json
Details for MilwaukeeCounty:
{'CreatedBy': 'Milwaukee County', 'SiteURL': 'https://data-mclio.hub.arcgis.com/api/feed/dcat-us/1.1.json', 'SiteName': 'MilwaukeeCounty', 'SkipList': [{'UUID': 'd7f707071cd24b83ab3b9adb8a7d10ce'}, {'UUID': 'bce9201dd312445b9b4567ee14d8032a'}, {'UUID': '84c7b8d95af04cdda6b0c2ae26590531'}, {'UUID': '7a1d3d055d4b4457845c721088c132f0'}], 'Spatial': ['Milwaukee County', 'United States']}
JSON for MilwaukeeCounty:
{'@context': 'https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld', '@type': 'dcat:Catalog', 'conformsTo': 'https://project-open-data.cio.gov/v1.1/schema', 'describedBy': 'https://project-open-data.cio.gov/v1.1/schema/catalog.json', 'dataset': [{'@type': 'dcat:Dataset', 'identifier': 'https://www.arcgis.com/home/item.html?id=84c7b8d95af04cdda6b0c2ae26590531', 'landingPage': 'https://data-mclio.hub.arcgis.com/apps/MCLIO::data-download-topoplanimetric-and-ca

In [53]:
def extract_id_sublayer(url):
    id_pattern = r'id=([a-zA-Z0-9]+)'
    sublayer_pattern = r'sublayer=(\d+)'

    id_match = re.search(id_pattern, url)
    sublayer_match = re.search(sublayer_pattern, url)

    id_value = id_match.group(1) if id_match else None
    sublayer_value = sublayer_match.group(1) if sublayer_match else None

    return id_value, sublayer_value

url = "https://www.arcgis.com/home/item.html?id=0286131de8884484b26bbec4176ea403"
id_value, sublayer_value = extract_id_sublayer(url)

print(f"id: {id_value}, sublayer: {sublayer_value}")

id: 0286131de8884484b26bbec4176ea403, sublayer: None


In [73]:
dataset_dict = list_of_sites[0].site_json["dataset"][0]
print(dataset_dict.__class__)
pp(dataset_dict)



<class 'dict'>
{'@type': 'dcat:Dataset',
 'identifier': 'https://www.arcgis.com/home/item.html?id=84c7b8d95af04cdda6b0c2ae26590531',
 'landingPage': 'https://data-mclio.hub.arcgis.com/apps/MCLIO::data-download-topoplanimetric-and-cadastral',
 'title': 'Data Download: TopoPlanimetric and Cadastral',
 'description': "<div><span style='font-family:&quot;Avenir Next W01&quot;, "
                '&quot;Avenir Next W00&quot;, &quot;Avenir Next&quot;, Avenir, '
                '&quot;Helvetica Neue&quot;, sans-serif; font-size:16px; '
                "font-weight:bold;'>Updated Jan 2021 to include option to "
                'preview the Topoplanimetric and Cadastral '
                "data</span></div><span style='font-family:&quot;Avenir Next "
                'W01&quot;, &quot;Avenir Next W00&quot;, &quot;Avenir '
                'Next&quot;, Avenir, &quot;Helvetica Neue&quot;, sans-serif; '
                "font-size:16px; font-weight:bold;'><div><span "
                "style='font-famil

You can open the link to the SiteURL in OpenRefine to get a easy to see tabular view.

In [89]:
class Aardvark:
    
    def __init__(self, dataset_dict, website):
        # Required fields
        self.gbl_resourceClass_sm = ["Datasets"]
        self.dct_accessRights_s = "public"
        self.gbl_mdVersion_s = "Aardvark"
        self.dct_language_sm = ["English"]
        self.schema_provider_s = "American Geographical Society Library – UWM Libraries"
        self.gbl_suppressed_b = False

        # From YAML:
        self.md_id = f"{website.site_name}-{extract_id_sublayer(dataset_dict['identifier'])}"
        assert isinstance(self.md_id, str) and len(self.md_id) > 0, "id is required"

        self.dct_spatial_sm = dataset_dict['spatial']

        assert "title" in dataset_dict and "identifier" in dataset_dict, "Dataset missing title or identifier"

        # dct_title_s (REQUIRED)
        self.dct_title_s = dataset_dict.get('title', '')
        assert isinstance(self.dct_title_s, str) and len(self.dct_title_s) > 0, "Title is required"

        # gbl_mdModified_dt (Required)
        self.gbl_mdModified_dt = datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
        assert isinstance(self.gbl_mdModified_dt, str) and len(self.gbl_mdModified_dt) > 0, "mdModified is required"

        # id (Required)
        id_pattern = r'id=([^&]+)'
        sublayer_pattern = r'sublayer=(\d+)'

        identifier_match = re.search(id_pattern, dataset_dict['identifier'])
        sublayer_match = re.search(sublayer_pattern, dataset_dict['identifier'])

        identifier = identifier_match.group(1) if identifier_match else ""
        sublayer = sublayer_match.group(1) if sublayer_match else ""

        

        # dct_description_sm
        self.dct_description_sm = [re.sub('<[^<]+?>', '', dataset_dict.get('description', []))]
        self.dct_description_sm.append(f"This dataset was automatically cataloged from the author's Open Data Portal. In some cases, publication year and bounding coordinates shown here may be incorrect. Additional download formats may be available on the author's website. Please check the 'More details at' link for additional information.")

        # dct_creator_sm
        self.dct_creator_sm = [dataset_dict['publisher']['name']] if 'publisher' in dataset_dict else []

        # dct_issued_s
        self.dct_issued_s = dataset_dict.get('issued', '')

        # locn_geometry & dcat_bbox
        if 'spatial' in dataset_dict:
            bbox = ','.join(coord.strip() for coord in dataset_dict['spatial'].split(','))
            self.locn_geometry = self.dcat_bbox = f"ENVELOPE({bbox})"

        # dcat_keyword_sm (string multiple!)
        self.dcat_keyword_sm = json.dumps(dataset_dict.get('keyword', []))

        # dct_references_s

        def getURL(refs):
            url = refs.get('accessURL', refs.get('downloadURL', 'invalid'))
            return quote(url, safe=':/?=')

        if 'distribution' in dataset_dict:
            references = {"http://schema.org/url": dataset_dict["landingPage"]}
            for dist in dataset_dict['distribution']:
                url = getURL(dist)
                if 'format' in dist and url != "invalid":
                    if dist['format'] == 'ArcGIS GeoServices REST API':
                        if 'FeatureServer' in url:
                            references['urn:x-esri:serviceType:ArcGIS#FeatureLayer'] = url
                        elif 'ImageServer' in url:
                            references['urn:x-esri:serviceType:ArcGIS#ImageMapLayer'] = url
                        elif 'MapServer' in url:
                            references['urn:x-esri:serviceType:ArcGIS#DynamicMapLayer'] = url
                    elif dist['format'] == "ZIP":
                        references['http://schema.org/downloadUrl'] = url
            self.dct_references_s = json.dumps(references).replace("\"", "\\\"").replace(" ","")

        # Years
        if 'modified' in dataset_dict:
            try:
                index_date = parser.parse(dataset_dict['modified'])
                index_year = int(index_date.year)
            except ImportError:
                index_year = int(dataset_dict['modified'][:4])
            except Exception as e:
                print(f"An error occurred: {e}")
            
            self.gbl_indexYear_im = [index_year]
        else:
            self.gbl_indexYear_im = []

        if 'issued' in dataset_dict:
            try:
                index_date = parser.parse(dataset_dict['issued'])
                index_year = int(index_date.year)
            except ImportError:
                index_year = int(dataset_dict['issued'][:4])
            except Exception as e:
                print(f"An error occurred: {e}")

            self.gbl_indexYear_im.append(index_year)      

    def __str__(self):
        return f"""
        Title: {self.dct_title_s}
        Id: {self.md_id}
        Index Year: {self.gbl_indexYear_im}
        Metadata Modified: {self.gbl_mdModified_dt}
        Description: {self.dct_description_sm}
        Creator: {self.dct_creator_sm}
        Issued: {self.dct_issued_s}
        Spatial bbox: {self.locn_geometry}
        References: {self.dct_references_s}
        """
    
    # TODO:
    gbl_resourceType_sm: list
    dcat_theme_sm: list
    dct_temporal_sm: list
    dct_spatial_sm: list
    dct_rights_sm: list
    dct_format_s: str
    dct_identifier_sm: list
    dct_alternative_sm: list
    pcdm_memberOf_sm: list

    

  


In [90]:
for website in list_of_sites:
    for dataset in website.site_json["dataset"]:
        new_aardvark_object = Aardvark(dataset, website)
        print(new_aardvark_object)




        Title: Data Download: TopoPlanimetric and Cadastral
        Id: MilwaukeeCounty-('84c7b8d95af04cdda6b0c2ae26590531', None)
        Index Year: [2022, 2017]
        Metadata Modified: 2024-05-08T17:56:44Z
        Description: ['Updated Jan 2021 to include option to preview the Topoplanimetric and Cadastral dataDownloading Data:Locate your area of interest by panning and zooming the map, or using the search function in the upper-right corner of the window.Click on the map within your area of interest. A pop-up should appear for that square mile section.Click on some or all of the links to download data in ESRI Geodatabase (.gdb) or CAD Drawing (.dwg) formatOnce the files are downloaded, unzip them. Within the output folders will be the original source data.Data Included (Variable Geometries; Point, Line, Polygon and Annotation):Topographic/Planimetric Data Types:EnvironmentalStructures and BridgesHydrologyParkSurvey ControlElevation ContoursTransportation FeaturesUtility Feature