In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml


In [2]:
import geopandas as gpd

# Load the district boundaries from the cleaned GeoJSON file.
# NOTE: the former `sep=';'` argument was dropped. It is a CSV-reader option
# with no meaning for GeoJSON; geopandas forwards unknown keywords to the I/O
# engine, which is presumably what triggered the warning seen in the saved
# output of this cell (its tail reads `return ogr_read(`) — confirm on re-run.
gdf = gpd.read_file("../data/clean/bezirksgrenzen.geojson", encoding="utf-8")

# Quick check: first rows and the available columns
print(gdf.head())
print(gdf.columns)


                       gml_id               Gemeinde_name Gemeinde_schluessel  \
0  s_wfs_alkis_bezirk.F176__1               Reinickendorf                 012   
1  s_wfs_alkis_bezirk.F176__2  Charlottenburg-Wilmersdorf                 004   
2  s_wfs_alkis_bezirk.F176__3           Treptow-KÃ¶penick                 009   
3  s_wfs_alkis_bezirk.F176__4                      Pankow                 003   
4  s_wfs_alkis_bezirk.F176__5                   NeukÃ¶lln                 008   

  Land_name Land_schluessel Schluessel_gesamt  \
0    Berlin              11          11000012   
1    Berlin              11          11000004   
2    Berlin              11          11000009   
3    Berlin              11          11000003   
4    Berlin              11          11000008   

                                            geometry  
0  MULTIPOLYGON (((13.32074 52.6266, 13.32045 52....  
1  MULTIPOLYGON (((13.32111 52.52446, 13.32103 52...  
2  MULTIPOLYGON (((13.57925 52.39083, 13.57958 52... 

  return ogr_read(


In [3]:
# 1) Repair the garbled (mojibake) names ONLY in this column
col = "Gemeinde_name"

# Known garbled sequences -> intended German characters. These are the strings
# produced when the UTF-8 bytes of each character are decoded as cp1252.
# Kept as a fallback for values the generic round-trip below cannot handle.
bad2good = {
    "Ã¤": "ä", "Ã¶": "ö", "Ã¼": "ü", "ÃŸ": "ß",
    "Ã„": "Ä", "Ã–": "Ö", "Ãœ": "Ü"
}


def repair_mojibake(value):
    """Undo UTF-8-decoded-as-cp1252 mojibake in a single cell value.

    Parameters
    ----------
    value : object
        A cell value from the name column.

    Returns
    -------
    object
        For strings: the repaired text. Non-string values (e.g. None/NaN)
        are returned unchanged instead of being stringified to "nan".

    The generic round-trip (re-encode as cp1252, decode as UTF-8) repairs any
    character, not just the German ones in ``bad2good``. It raises when the
    text is already correct (e.g. contains a real "ö") or only partly garbled;
    in that case the original table-based replacement is applied, which is a
    no-op on clean text.
    """
    if not isinstance(value, str):
        return value
    try:
        return value.encode("cp1252").decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        for bad, good in bad2good.items():
            value = value.replace(bad, good)
        return value


# The raw column is left untouched; the repaired text goes into a new column,
# e.g. 'Treptow-KÃ¶penick' -> 'Treptow-Köpenick'
gdf[col + "_utf8"] = gdf[col].map(repair_mojibake)

# 2) (Optional) also create a join-safe ASCII version from the fixed text
trans = str.maketrans({
    "ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss",
    "Ä": "Ae", "Ö": "Oe", "Ü": "Ue"
})
# Guard on str so missing values pass through instead of raising in translate()
gdf[col + "_ascii"] = gdf[col + "_utf8"].map(
    lambda name: name.translate(trans) if isinstance(name, str) else name
)

# 3) Quick check
print(gdf[[col, col + "_utf8", col + "_ascii"]].head(10))


                Gemeinde_name          Gemeinde_name_utf8  \
0               Reinickendorf               Reinickendorf   
1  Charlottenburg-Wilmersdorf  Charlottenburg-Wilmersdorf   
2           Treptow-KÃ¶penick            Treptow-Köpenick   
3                      Pankow                      Pankow   
4                   NeukÃ¶lln                    Neukölln   
5                 Lichtenberg                 Lichtenberg   
6         Marzahn-Hellersdorf         Marzahn-Hellersdorf   
7                     Spandau                     Spandau   
8         Steglitz-Zehlendorf         Steglitz-Zehlendorf   
9                       Mitte                       Mitte   

          Gemeinde_name_ascii  
0               Reinickendorf  
1  Charlottenburg-Wilmersdorf  
2           Treptow-Koepenick  
3                      Pankow  
4                   Neukoelln  
5                 Lichtenberg  
6         Marzahn-Hellersdorf  
7                     Spandau  
8         Steglitz-Zehlendorf  
9       

In [8]:
# Persist the frame, including the added name columns, as a GeoJSON file.
fixed_geojson_path = "../data/clean/bezirksgrenzen_fixed.geojson"
gdf.to_file(
    fixed_geojson_path,
    driver="GeoJSON",
    encoding="utf-8",
)


In [None]:
# Source file: data/clean/bezirksgrenzen.geojson