In [None]:
# NOTE(review): this bare string literal is neither assigned nor used anywhere
# in the visible notebook; running the cell only echoes the string as output.
# Presumably a leftover scratch path -- confirm before removing.
"GitHub/Geodata_Geoinfo_GIS_WS2023/data/derived/xanten_bounds.csv"

# Extract DTM file names from an HTML table and create a polygon vector layer with the tiles' bounding boxes (BBs)

The German Federal State of North Rhine-Westphalia (NRW or NW) provides wonderful and huge open data on its **[NRW Open Geodata Portal](https://www.opengeodata.nrw.de/produkte/)**.

One of the fantastic data sets is the [**Digital Terrain Model (DTM) tiles of NRW in 1m horizontal resolution**](https://www.opengeodata.nrw.de/produkte/geobasis/hm/dgm1_xyz/dgm1_xyz/). This link provides a table with the filenames and download links of the 35860 DTM tiles, each 1 km² in size, that cover NRW.

To see on a map where these tiles are located, a square polygon is created for each tile, which shows the location and extent of that tile. The coordinates of the corners of each square are derived from the filename, which contains the lower left corner, and from the tile size (here: 1000 x 1000 grid points with 1m spacing, yielding 1000 m x 1000 m total tile size).  

Web scraping ([Wikipedia](https://en.wikipedia.org/wiki/Web_scraping)) with [`BeautifulSoup`](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) is used to extract the DTM filenames from the HTML table listing all the files [here](https://www.opengeodata.nrw.de/produkte/geobasis/hm/dgm1_xyz/dgm1_xyz/).

In [1]:
# Index page of the NRW DTM (1 m resolution) tile collection.
dtm_url = "https://www.opengeodata.nrw.de/produkte/geobasis/hm/dgm1_xyz/dgm1_xyz/"

In [2]:
# Output location of the tile bounding-box layer.
# data_dir is given relative to the current working directory and ends with "/".
data_dir = "./data/derived/NRW_DTM_NRW_EPSG_25832_Tiles_BB/"
# File name of the GeoPackage written into data_dir.
out_fname = "NRW_DTM_NRW_EPSG_25832_Tiles_BB.gpkg"

In [5]:
import requests
from bs4 import BeautifulSoup
from shapely.geometry import Polygon
import geopandas as gpd
import os

In [21]:
# Create the output directory (including missing parents); exist_ok=True makes
# the cell safe to re-run when the directory already exists.
os.makedirs(data_dir,exist_ok = True)

In [22]:
def points_from_fname(fname, Dx=1000, Dy=1000):
    """
    Return the corner points of the bounding box of a NRW DTM tile (EPSG:25832).

    Usage: [(x_LL,y_LL),(x_UL,y_UL),(x_UR,y_UR),(x_LR,y_LR)]  = points_from_fname(...)

    The coordinates of the lower left corner are extracted from the tile's file
    name, which is formatted like 'dgm1_32_280_5652_1_nw.xyz.gz'. Filename elements:
      dgm1: name of product with 1x1 m² grid cell size
      32:   UTM Zone 32, EPSG:25832
      280:  Easting of the lower left corner in km
      5652: Northing of the lower left corner in km
      1:    1m x 1m grid cell size
      nw:   North Rhine-Westphalia
      xyz:  ASCII fixed width file format with three columns (easting, northing, elevation)
      gz:   GNU zipped
    URL of NRW DTM tile collection:
    https://www.opengeodata.nrw.de/produkte/geobasis/hm/dgm1_xyz/dgm1_xyz/

    Parameters
    ----------
    fname : str
        Tile file name. A leading directory or URL part (separated by "/" or a
        backslash) is ignored, so full paths and download links work even if
        their directories contain underscores.
    Dx, Dy : int
        Extent of the tile in x and y direction, added to the lower left
        corner (same unit as the returned coordinates, i.e. km * 1000).
        With the defaults the box is a 1000 x 1000 square.

    Returns
    -------
    list of four (x, y) tuples
        Corners in the order lower left, upper left, upper right, lower right.

    Raises
    ------
    IndexError
        If the file name has fewer than four "_"-separated fields.
    ValueError
        If the easting or northing field is not an integer.
    """
    M_PER_KM = 1000  # file names carry the coordinates in km

    # Only the last path component is the tile name; directories or URL parts
    # may contain "_" themselves and would otherwise shift the field indices.
    tile_name = fname.replace("\\", "/").rsplit("/", 1)[-1]

    fields = tile_name.split("_")
    x_Left = int(fields[2]) * M_PER_KM
    y_Low = int(fields[3]) * M_PER_KM
    x_Right = x_Left + Dx
    y_Up = y_Low + Dy

    P_LL = (x_Left, y_Low)
    P_UL = (x_Left, y_Up)
    P_UR = (x_Right, y_Up)
    P_LR = (x_Right, y_Low)

    return [P_LL, P_UL, P_UR, P_LR]

In [23]:
# Print the docstring of points_from_fname as a quick reference.
help(points_from_fname)

Help on function points_from_fname in module __main__:

points_from_fname(fname, Dx=1000, Dy=1000)
    Usage: [(x_LL,y_LL),(x_UL,y_UL),(x_UR,y_UR),(x_LR,y_LR)]  = points_from_fname(...)
       
    Returns a list of four 2-tuples representing the corner points of a square. 
    Create corner points of the squared bounding box for a NRW DTM tile in ESPG:25832.
    URL of NRW DTM tile collection: https://www.opengeodata.nrw.de/produkte/geobasis/hm/dgm1_xyz/dgm1_xyz/
    The coordinates of the lower left corner are extracted from the filename which is 
    formatted like 'dgm1_32_280_5652_1_nw.xyz.gz'. Filename elements:
    dgm1: name of product with 1x1 m² grid cell size, 32: UTM Zone 32, EPSG:25832, 
    280: Easting in km, 5652: Northing in km, 1: 1m x 1m grid cell size, nw: North Rhine-Wastphalia,
    xyz: ASCII fixed width file format with three columns (easting, northing, elevation), gz: GNU zipped



In [37]:
# Only needed if BeautifulSoup is used with the 'lxml' parser (this notebook uses 'html.parser'):
# %pip install lxml

Collecting lxml
  Downloading lxml-4.9.3.tar.gz (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: lxml
  Building wheel for lxml (setup.py) ... [?25ldone
[?25h  Created wheel for lxml: filename=lxml-4.9.3-cp39-cp39-macosx_11_0_arm64.whl size=1567242 sha256=4c8b986a40ef300057c1ce572151f910cb7179e70169f16d6fb0142e3908aa02
  Stored in directory: /Users/sac/Library/Caches/pip/wheels/5c/05/aa/530f84480d476c5bb9ea09877eea78fb144ec047fbb00ee2ca
Successfully built lxml
Installing collected packages: lxml
Successfully installed lxml-4.9.3


In [41]:
# Download the tile index.
# The timeout keeps the cell from hanging indefinitely if the server does not
# respond; raise_for_status() turns an HTTP error response into an exception
# instead of letting an error page be parsed into an empty tile list.
r = requests.get(dtm_url, timeout=60)
r.raise_for_status()

# Parse the response so that the file entries can be looked up by tag name.
soup = BeautifulSoup(r.content, 'html.parser')

## Inspect the downloaded content

In [42]:
#soup

In [45]:
# Peek at the first 100 bytes of the raw response to see what the server delivers.
r.content[:100]

b'<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>\n<opengeodata>\n\t<metafiles>\n\t\t<metafile name'

In [46]:
# Every <file> element of the index describes one tile; its "name" attribute
# holds the tile's file name.
tag_list = soup.find_all('file')
fname_list = [file_tag["name"] for file_tag in tag_list]

# One bounding-box polygon per tile, built from the corner points derived
# from the file name.
geom_list = [Polygon(corners) for corners in map(points_from_fname, fname_list)]

In [51]:
# Columns of the layer:
#   fname    -> attribute: file name of the tile
#   geometry -> bounding-box polygon of the tile
dic = dict(fname=fname_list, geometry=geom_list)
gdf = gpd.GeoDataFrame(
    data=dic,
    crs="EPSG:25832",
)

In [52]:
# Sanity check: print (rows, columns) -- one row per tile is expected -- and
# display the first few records.
print(f"gdf.shape: {gdf.shape}")
gdf.head()

gdf.shape: (35860, 2)


Unnamed: 0,fname,geometry
0,dgm1_32_280_5652_1_nw.xyz.gz,"POLYGON ((280000.000 5652000.000, 280000.000 5..."
1,dgm1_32_280_5653_1_nw.xyz.gz,"POLYGON ((280000.000 5653000.000, 280000.000 5..."
2,dgm1_32_280_5654_1_nw.xyz.gz,"POLYGON ((280000.000 5654000.000, 280000.000 5..."
3,dgm1_32_280_5655_1_nw.xyz.gz,"POLYGON ((280000.000 5655000.000, 280000.000 5..."
4,dgm1_32_280_5656_1_nw.xyz.gz,"POLYGON ((280000.000 5656000.000, 280000.000 5..."


In [29]:
# Build the output path with os.path.join instead of string concatenation so
# that it stays valid even if data_dir is configured without a trailing slash.
out_path = os.path.join(data_dir, out_fname)

# Write the tile bounding boxes as a GeoPackage layer.
print(f"Write gdf to file {out_path:s}")
gdf.to_file(out_path, driver = "GPKG")

Write gdf to file ../data/derived/NRW_DTM_NRW_EPSG_25832_Tiles_BB/NRW_DTM_NRW_EPSG_25832_Tiles_BB.gpkg
