In [None]:
# REQUIREMENTS.TXT

import warnings
warnings.filterwarnings("ignore", 'This pattern has match groups')


In [None]:
import pandas as pd
# Load the raw station list (semicolon-separated CSV).
stations = pd.read_csv("./datasets/station_service_stations.csv", sep=';', encoding="utf-8")

# FIX ENCODING
# The text columns contain two-character sequences where a German umlaut is
# expected (e.g. 'Ã¼' instead of 'ü'). These look like UTF-8 bytes that were
# decoded as Windows-1252 somewhere upstream -- NOTE(review): confirm against
# the data source. All replacements are applied in a single pass; the previous
# version replaced 'Ã¶' twice and never handled 'Ã¤' (ä).
encoding_fixes = {
    'Ã¤': 'ä',
    'Ã¶': 'ö',
    'Ã¼': 'ü',
    'ÃŸ': 'ß',
    'Ã„': 'Ä',
    'Ã–': 'Ö',
    'Ãœ': 'Ü',
    # Apostrophes are stripped from all values, as before.
    '\'': '',
}
stations = stations.replace(encoding_fixes, regex=True)

# PRINT RAW RESULT
stations.head(5)

In [None]:
# FILTER STATIONS
# 1. 'Verkehr' must be "RV" (RV = Regionalverkehr, i.e. regional traffic)
# 2. 'Betreiber_Name' (operator) must equal BN_TYPE
#    NOTE(review): an earlier comment said "main stations only" here, but this
#    cell filters by operator; the Hbf filter happens further down.
BN_TYPE = "DB Station und Service AG"

is_regional_traffic = stations['Verkehr'] == "RV"
has_expected_operator = stations['Betreiber_Name'] == BN_TYPE

stations_filtered = stations[is_regional_traffic & has_expected_operator]
stations_filtered

In [None]:
# SIMPLIFY THE DATA: keep only the station id, the station name and the two
# coordinate columns ('Laenge' / 'Breite').
station_columns = ['EVA_NR', 'NAME', 'Laenge', 'Breite']
stations_filtered = stations_filtered[station_columns]
stations_filtered.head(5)

In [None]:
# NOW WE HAVE CLEANED STATION DATA WITH LAT AND LONG INFORMATION FOR EACH DB STATION.
# THE NEXT STEP IS TO MATCH THE STATIONS WITH THE LANDKREISE (DISTRICTS) OF THE RKI
# SO THAT THE SIMULATED RESULTS CAN BE MAPPED TO THE CORONA DATA

In [None]:
import geopandas as gpd
# Read the RKI Landkreis GeoJSON file into a GeoDataFrame and preview it.
landkreis_geo = gpd.read_file(
    filename="./datasets/RKI_Corona_Landkreise.geojson",
)
landkreis_geo.head(n=5)

In [None]:
import geoplot
import geoplot.crs as gcrs
# PLOT THE LOADED GEOJSON DATA (polygon outlines only) as a visual sanity check.
polyplot_style = dict(
    edgecolor='darkgrey',
    facecolor='lightgrey',
    linewidth=.3,
    figsize=(12, 8),
)
geoplot.polyplot(landkreis_geo, projection=gcrs.AlbersEqualArea(), **polyplot_style)

In [None]:
# AS WE CAN SEE IN THE RENDERED GEOJSON, THE FILE CONTAINS A POLYGON FOR EACH LANDKREIS.
# TO MATCH THOSE POLYGONS WITH THE DB STATIONS, EACH STATION'S COORDINATES
# ARE TURNED INTO A POINT GEOMETRY FIRST.

def _german_decimal_to_float(column):
    """Convert a column of decimals written with a comma ("8,5") to floats."""
    return column.astype(str).str.replace(',', '.', regex=False).astype(float)

# points_from_xy expects x first, then y. 'Laenge' (longitude) is used as x
# and 'Breite' (latitude) as y -- the same assignment as before, but without
# the swapped "lat"/"long" list names and without a row-by-row loop.
station_x = _german_decimal_to_float(stations_filtered['Laenge']).to_numpy()
station_y = _german_decimal_to_float(stations_filtered['Breite']).to_numpy()

gpd_points = gpd.points_from_xy(station_x, station_y)

# Copy the slice so the GeoDataFrame is built from an independent frame
# rather than from a view-like selection of stations_filtered.
stations_geo_preperation = stations_filtered[['EVA_NR', 'NAME']].copy()

station_geo_points = gpd.GeoDataFrame(stations_geo_preperation, geometry=gpd_points, crs="EPSG:4326")
station_geo_points.head(5)

In [None]:
# VISUALIZE THE STATIONS ON TOP OF THE GEOJSON MAP
import matplotlib.pyplot as plt
# USE THE LANDKREIS GEO DATA AS THE BASE LAYER
lkg_plot = landkreis_geo.plot(edgecolor='darkgrey', facecolor='lightgrey', linewidth=.2, figsize=(12, 8))
# DRAW THE STATION POINTS ON THE SAME AXES
station_geo_points.plot(ax=lkg_plot, color='red')
# pyplot.show() only accepts the keyword argument `block`; passing the Axes
# positionally (as was done before) raises a TypeError on current matplotlib.
plt.show()


In [None]:
# THE LAST STEP IS TO DETERMINE WHICH LANDKREIS CONTAINS WHICH STATIONS.
# THE RESULT SHOULD BE A DATAFRAME COMBINING STATION AND RKI LANDKREIS DATA

In [None]:
# Spatial join of the Landkreis polygons (left) with the station points (right),
# using sjoin's default settings. Each result row pairs one Landkreis with one
# matching station.
# NOTE(review): geopandas documents that an inner sjoin retains the geometry of
# the left frame, so the 'geometry' column here should hold the Landkreis
# polygons, not the station points -- confirm this is what is wanted downstream.
station_geo_lkid = gpd.sjoin(landkreis_geo, station_geo_points)

# Number of (Landkreis, station) pairs found.
len(station_geo_lkid)

In [None]:
# The joined frame is wide (about 5000 x 51 according to the original note).
# At this stage only the Landkreis identifiers, the station identifiers and
# the geometry are needed, so select those columns and give them
# source-prefixed names (db_* for station data, rki_* for Landkreis data).
column_renames = {
    'NAME': 'db_station_name',
    'GEN': 'rki_landkreisname',
    'BEZ': 'rki_bezeichner',
    'geometry': 'geometry',
    'EVA_NR': 'db_station_id',
    'AGS': 'rki_ags',
}
station_geo_lkid = station_geo_lkid[list(column_renames)].rename(columns=column_renames)
station_geo_lkid.head(5)

# FINALLY, FILTER THE RESULT DOWN TO ONE STATION PER LANDKREIS (SEE THE NEXT CELL)

In [None]:
# KEEP ONLY ONE TRAIN STATION PER LANDKREIS (identified by 'rki_ags'):
# rows are sorted by station id and, for each rki_ags value, the last row in
# that order is kept. Afterwards every row containing a missing value is dropped.
# NOTE(review): the station kept per Landkreis is chosen by id only; the later
# 'Hbf' filter therefore only sees this one station per Landkreis -- confirm
# that this order of steps is intended.
station_geo_lkid = (
    station_geo_lkid
    .sort_values('db_station_id')
    .drop_duplicates(subset=['rki_ags'], keep='last')
    .dropna()
)
station_geo_lkid.head(5)

In [None]:
# GENERATE A SMALLER DATASET WITHOUT THE GEOMETRY COLUMN
# (every column of station_geo_lkid except 'geometry', in the same order).
attribute_columns = [
    'db_station_name',
    'rki_landkreisname',
    'rki_bezeichner',
    'db_station_id',
    'rki_ags',
]
station_geo_lkid_wo_geometry = station_geo_lkid[attribute_columns]
station_geo_lkid_wo_geometry.head(5)

In [None]:
# TO SIMPLIFY THINGS WE ONLY USE "MAIN" STATIONS, I.E. STATIONS WHOSE NAME
# CONTAINS THE SUBSTRING 'Hbf', AND WE EXCLUDE S-BAHN STATIONS, WHICH ARE
# MARKED WITH THE LITERAL TEXT '(S-Bahn)' IN THEIR NAME.
station_names = station_geo_lkid_wo_geometry['db_station_name']

# regex=False makes both checks plain substring matches. Previously '(S-Bahn)'
# was parsed as a regular expression with a capture group, which matched
# 'S-Bahn' anywhere in the name (with or without parentheses) and triggered
# pandas' "match groups" warning. na=False treats missing names as non-matching.
is_main_station = station_names.str.contains('Hbf', regex=False, na=False)
is_sbahn_station = station_names.str.contains('(S-Bahn)', regex=False, na=False)

station_geo_lkid_wo_geometry_hbf = station_geo_lkid_wo_geometry[is_main_station & ~is_sbahn_station]
station_geo_lkid_wo_geometry_hbf.head(5)

In [None]:
# FINALLY SAVE THE PREPARED DATA
import os

# All outputs go to ./generated; create it first so that a fresh checkout
# (where the directory may not exist yet) does not fail on the first write.
os.makedirs("./generated", exist_ok=True)

# Station/Landkreis mapping including geometry, as GeoJSON and as GeoPackage.
station_geo_lkid.to_file("./generated/0_db_station_rki_lk.geojson", driver='GeoJSON')
station_geo_lkid.to_file("./generated/0_db_station_rki_lk.gpkg", layer='world', driver="GPKG")

# Same mapping without geometry, and the Hbf-only subset, as semicolon-separated CSV.
station_geo_lkid_wo_geometry.to_csv("./generated/0_db_station_lk.csv", encoding='utf-8', index=False, sep=';')
station_geo_lkid_wo_geometry_hbf.to_csv("./generated/0_db_station_lk_hbf.csv", encoding='utf-8', index=False, sep=';')