In [None]:
!pip3 install beautifulsoup4 requests pydantic

In [159]:
import requests 
from bs4 import BeautifulSoup
import bs4
from dataclasses import dataclass, asdict
import json
import re

In [64]:
resp = requests.get("https://en.wikipedia.org/wiki/List_of_TCP_and_UDP_port_numbers", headers={"User-Agent": "curl/8.7.1"})
resp.raise_for_status()

soup = BeautifulSoup(resp.text)
soup.title

<title>List of TCP and UDP port numbers - Wikipedia</title>

In [173]:
@dataclass
class Port:
    start: int
    end: int
    
    category: str 
    description: str 

    types: dict[str, str]

In [174]:
def purify(v: str) -> str:
    return re.sub(r'\[.*\d+\]', '', v).strip()

purify("De-assigned on 2025-02-13, previously compressnet[2]\n")

'De-assigned on 2025-02-13, previously compressnet'

In [175]:
def port_number_to_range(v: str) -> tuple[int, int]:
    parts = v.split("–")
    if len(parts) == 1:
        port = int(parts[0])
        return port, port

    return int(parts[0]), int(parts[1])

port_number_to_range("10"), port_number_to_range("50–70")

((10, 10), (50, 70))

In [178]:
ports: list[Port] = []

for table in soup.select("table.sortable"):
    tbody = table.select_one("tbody")

    category = purify(table.select_one("caption").text)
    
    prev_colspan = 1
    port_number = (1, 1)
    
    for tr in list(tbody.children)[1:]:
        if isinstance(tr, bs4.element.NavigableString):
            continue
            
        children = []
        for child in tr.children:
            if not isinstance(child, bs4.element.Tag):
                continue

            children.append(child)

        rowspaned = True
        prev_colspan -= 1
        if prev_colspan <= 0:
            rowspaned = False
            port_number = port_number_to_range(purify(children[0].text))
            prev_colspan = int(children[0].attrs.get("rowspan", 1))

        types = {}
        port_types = iter(["tcp", "upd", "sctp", "dccp"])

        skip = 0 if rowspaned else 1
        for child in children[skip:-1]:
            text = purify(child.text)
            colspan = int(child.attrs.get("colspan", 1))

            for _ in range(colspan):
                port_type = next(port_types)
                
                if text != "":
                    types[port_type] = text    

        start, end = port_number
        port = Port(start=start, end=end, description = purify(children[-1].text), category=category, types=types)
        ports.append(port)

print(len(ports))
for port in ports[:20]:
    print(port)

1522
Port(start=0, end=0, category='Well-known ports', description='In programming APIs (not in communication between hosts), requests a system-allocated (dynamic) port', types={'tcp': 'Reserved', 'upd': 'Reserved'})
Port(start=1, end=1, category='Well-known ports', description='TCP Port Service Multiplexer (TCPMUX). Historic. Both TCP and UDP have been assigned to TCPMUX by IANA,', types={'tcp': 'Yes', 'upd': 'Assigned'})
Port(start=2, end=2, category='Well-known ports', description='De-assigned on 2025-02-13, previously compressnet', types={'tcp': 'Reserved', 'upd': 'Reserved'})
Port(start=3, end=3, category='Well-known ports', description='De-assigned on 2025-02-13, previously compressnet', types={'tcp': 'Reserved', 'upd': 'Reserved'})
Port(start=5, end=5, category='Well-known ports', description='Remote Job Entry and IANA has assigned both TCP and UDP 5 to it.', types={'tcp': 'Assigned', 'upd': 'Assigned'})
Port(start=7, end=7, category='Well-known ports', description='Echo Protoco

In [179]:
with open("ports.json", "w") as f:
    json.dump([asdict(port) for port in ports], f)