In [9]:
import pandas as pd

base = "../sample"
files = [
    f"{base}/2024-2 UNLOCODE CodeListPart1.csv",
    f"{base}/2024-2 UNLOCODE CodeListPart2.csv",
    f"{base}/2024-2 UNLOCODE CodeListPart3.csv",
]

cols = [
    "change", "country", "locode", "name", "name_wo_diacritics",
    "subdiv", "status", "function", "date", "iata", "coordinates", "remarks",
]

dfs = [
    pd.read_csv(f, encoding="latin-1", header=None, names=cols)
    for f in files
]
df = pd.concat(dfs, ignore_index=True)

# Drop fully empty columns
df = df.dropna(axis=1, how="all")
# Strip whitespace
df = df.apply(lambda c: c.astype(str).str.strip() if c.dtype == object else c)
# Restore empty strings as NaN
df = df.replace("", pd.NA).replace("nan", pd.NA)

df

Unnamed: 0,change,country,locode,name,name_wo_diacritics,subdiv,status,function,date,iata,coordinates,remarks
0,,AD,,.ANDORRA,,,,,,,,
1,,AD,ALV,Andorra la Vella,Andorra la Vella,,--34-6--,AI,601.0,,4230N 00131E,
2,,AD,CAN,Canillo,Canillo,,--3-----,RL,307.0,,4234N 00135E,
3,,AD,ENC,Encamp,Encamp,,--3-----,RL,307.0,,4232N 00134E,
4,,AD,ESC,Escaldes-Engordany,Escaldes-Engordany,,--3-----,RL,307.0,,4231N 00133E,
...,...,...,...,...,...,...,...,...,...,...,...,...
116468,,ZW,STH,Southerton,Southerton,,--3-----,RL,201.0,,1751S 03101E,
116469,,ZW,THJ,Thompson Junction,Thompson Junction,,-23-----,RL,701.0,,1800S 02626E,
116470,,ZW,VFA,Victoria Falls,Victoria Falls,,---4----,AI,9501.0,,,
116471,,ZW,ZMZ,Zimbabwe,Zimbabwe,MV,1-3-----,RL,1401.0,,2016S 03055E,


In [10]:
import re
from shapely.geometry import Point

def parse_unlocode_coords(s):
    """Parse UNLOCODE coordinate string (DDMMN/S DDDMME/W) to (lat, lon) decimal degrees."""
    if pd.isna(s) or not str(s).strip():
        return None, None
    s = str(s).strip()
    # Format: 4230N 00131E or 1751S 03101E — lat DDMM/DDDMM + N|S, lon DDDMM + E|W
    m = re.match(r"(\d{4,5})([NS])\s+(\d{5})([EW])", s)
    if not m:
        return None, None
    lat_dm, lat_dir, lon_dm, lon_dir = m.groups()
    lat_deg = int(lat_dm[:-2])
    lat_min = int(lat_dm[-2:])
    lon_deg = int(lon_dm[:-2])
    lon_min = int(lon_dm[-2:])
    lat = lat_deg + lat_min / 60
    lon = lon_deg + lon_min / 60
    if lat_dir == "S":
        lat = -lat
    if lon_dir == "W":
        lon = -lon
    return lat, lon

# Parse and create Point geometries (WGS84)
def coords_to_point(s):
    lat, lon = parse_unlocode_coords(s)
    if lat is None:
        return None
    return Point(lon, lat)  # shapely uses (x, y) = (lon, lat)

df["geometry"] = df["coordinates"].apply(coords_to_point)
df[["lat", "lon"]] = df["coordinates"].apply(
    lambda s: pd.Series(parse_unlocode_coords(s))
)

# Decode status to location_type (port, airport, rail, road, etc.)
FUNCTION_LABELS = {1: "port", 2: "rail", 3: "road", 4: "airport", 5: "mail", 6: "multimodal", 7: "fixed_transport", 8: "inland_water"}
def status_to_location_types(s):
    if pd.isna(s) or not str(s).strip() or len(str(s)) < 8:
        return None
    types = [FUNCTION_LABELS.get(i + 1, f"f{i+1}") for i, c in enumerate(str(s).strip()[:8]) if c not in ("-", " ", "0")]
    return ",".join(types) if types else None
df["location_type"] = df["status"].apply(status_to_location_types)

# Load SubdivisionCodes: lookup subdiv_type (Parish, Province, City, Region, etc.) by (country, subdiv)
subdiv_codes = pd.read_csv(
    f"{base}/2024-2 SubdivisionCodes.csv",
    encoding="latin-1",
    header=None,
    names=["country", "subdiv", "subdiv_name", "subdiv_type"],
)
df = df.merge(
    subdiv_codes[["country", "subdiv", "subdiv_type"]],
    on=["country", "subdiv"],
    how="left",
)

# GeoDataFrame with WGS84
import geopandas as gpd
gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
gdf[gdf["geometry"].notna()].head()

Unnamed: 0,change,country,locode,name,name_wo_diacritics,subdiv,status,function,date,iata,coordinates,remarks,geometry,lat,lon,location_type,subdiv_type
1,,AD,ALV,Andorra la Vella,Andorra la Vella,,--34-6--,AI,601.0,,4230N 00131E,,POINT (1.51667 42.5),42.5,1.516667,"road,airport,multimodal",
2,,AD,CAN,Canillo,Canillo,,--3-----,RL,307.0,,4234N 00135E,,POINT (1.58333 42.56667),42.566667,1.583333,road,
3,,AD,ENC,Encamp,Encamp,,--3-----,RL,307.0,,4232N 00134E,,POINT (1.56667 42.53333),42.533333,1.566667,road,
4,,AD,ESC,Escaldes-Engordany,Escaldes-Engordany,,--3-----,RL,307.0,,4231N 00133E,,POINT (1.55 42.51667),42.516667,1.55,road,
5,,AD,EAC,Escàs,Escas,4.0,--3-----,RL,1407.0,,4233N 00131E,,POINT (1.51667 42.55),42.55,1.516667,road,Parish


In [11]:
# Export to GeoPackage
cols_export = ["country", "subdiv", "subdiv_type", "locode", "name", "name_wo_diacritics", "geometry"]
cols_export = [c for c in cols_export if c in gdf.columns]
out = gdf.dropna(subset=["geometry"])[cols_export]
out.to_file("../sample/unlocode_2024-2.gpkg", driver="GPKG", layer="locations")
out.shape

(92907, 7)