### Links zu den Daten
[Strecke Link](https://geovdbn.deutschebahn.com/pgv-map/geoserver.action?LAYERS=ISR%3AISR_V_GEO_TEN_KLASSIFIZIERUNG&TRANSPARENT=TRUE&FORMAT=kml&VERSION=1.1.1&TILED=false&USERDEFINEDSLD=&SERVICE=WMS&REQUEST=GetMap&VIEWPARAMS=ZEITSCHEIBE%3AUNDEFINED%3BLANG%3ADE%3BALG_DBNETZ_STRECKE%3Aalle%20Strecken%3BJAHR%3A2020&SRS=EPSG%3A31467&BBOX=3250000,5200000,3950000,6100000&WIDTH=700&HEIGHT=900)


[Bahnhöfe Link](https://geovdbn.deutschebahn.com/pgv-map/geoserver.action?LAYERS=ISR%3AISR_V_GEO_BETRIEBSSTELLEN_PUNKT&TRANSPARENT=TRUE&FORMAT=kml&VERSION=1.1.1&TILED=false&USERDEFINEDSLD=&SERVICE=WMS&REQUEST=GetMap&VIEWPARAMS=ZEITSCHEIBE%3AUNDEFINED%3BLANG%3ADE%3BALG_DBNETZ_STRECKE%3Aalle%20Strecken%3BJAHR%3A2020&SRS=EPSG%3A31467&BBOX=3250000,5200000,3950000,6100000&WIDTH=700&HEIGHT=900)

In [None]:
import os, sys
sys.path.append("../")
sys.path.append('../rtd_crawler')
import fiona
import geopandas as gpd
from geopandas.plotting import plot_linestring_collection
import matplotlib.pyplot as plt
import shapely
from shapely.geometry import GeometryCollection, MultiPoint, Point
from shapely.ops import nearest_points, linemerge
import networkx as nx
import random
import lxml.etree as etree
import re
import pandas as pd
from tqdm.auto import tqdm
from tqdm import tqdm_notebook as tqdm_bar
from helpers import StationPhillip, BetriebsstellenBill, NoLocationError
import geopy.distance

# Enable KML support in fiona (read + write) under both driver spellings.
fiona.drvsupport.supported_drivers.update({'kml': 'rw', 'KML': 'rw'})

# UTF-8 XML parser that does not collect element ids.
parser = etree.XMLParser(encoding='utf-8', collect_ids=False)

# Project helpers, created in notebook mode.
stations = StationPhillip(notebook=True)
betriebstellen = BetriebsstellenBill(notebook=True)

# Registers DataFrame.progress_apply, which the matching cells below use.
tqdm.pandas(desc='progress')

def parse_atr(atr):
    """
    Parse the HTML attribute markup of every entry into one table.

    Each element of ``atr`` is searched for
    ``<span class="atr-name">...</span>`` and
    ``<span class="atr-value">...</span>`` tags. The n-th name found in an
    element is paired with the n-th value found in the same element.

    Parameters
    ----------
    atr : pd.Series (or any iterable of str)
        Strings containing the attribute markup.

    Returns
    -------
    pd.DataFrame
        One row per element of ``atr`` (row labels 0..n-1) and one column
        per attribute name; values are kept as strings. An empty DataFrame
        is returned for empty input.

    Raises
    ------
    ValueError
        If an element contains a different number of names and values.
    """
    atr_name = re.compile(r"<span class=\"atr-name\">(.*?)<\/span>")
    atr_value = re.compile(r"<span class=\"atr-value\">(.*?)<\/span>")

    # With a single capture group, findall() returns the captured strings.
    atr_list = [
        pd.Series(atr_value.findall(attributes), atr_name.findall(attributes))
        for attributes in atr
    ]

    # pd.concat() refuses an empty list, so answer empty input directly.
    if not atr_list:
        return pd.DataFrame()

    # Each Series becomes a column; transpose to get one row per element.
    return pd.concat(atr_list, axis=1).T

def tranform_geo(strecke):
    """
    Flatten the nested geometry of every row and merge it into lines.

    transform
    ```python
        GeometryCollection[
            GeometryCollection[
                ...
            ]
        ]
    ```
    to
    ```python
        Linestring[
            ...
        ]
    ```

    The geometry is read from and written back to the third column
    (position 2) of ``strecke``. Members of directly nested collections are
    kept as they are; top-level Points are dropped; everything else is kept.
    The collected parts are then joined with ``shapely.ops.linemerge``.

    Parameters
    ----------
    strecke : GeoDataFrame
        Table whose third column holds multi-part geometries.

    Returns
    -------
    GeoDataFrame
        The same object, modified in place.
    """
    for i in range(len(strecke)):
        new_geo = []
        # Multi-part geometries are walked through ``.geoms``: iterating the
        # geometry object itself is no longer supported by Shapely 2.0.
        for g in strecke.iat[i, 2].geoms:
            if isinstance(g, GeometryCollection):
                new_geo.extend(g.geoms)
            elif not isinstance(g, shapely.geometry.point.Point):
                new_geo.append(g)
        strecke.iat[i, 2] = linemerge(GeometryCollection(new_geo))
    return strecke

def get_map_positions():
    """
    Derive a 2D position for every node from the line geometries.

    Reads the module-level ``strecke`` (geometry in column position 2) and
    ``strecke_atr`` (columns ``source`` and ``target``). For row ``i`` the
    first coordinate of the line is stored under the row's ``source`` name
    and the last coordinate under its ``target`` name. If a name occurs in
    several rows, the last row processed wins.

    For a MultiLineString only its first part is used. Geometries that do
    not expose ``coords`` (NotImplementedError) and empty lines are skipped.

    Returns
    -------
    dict
        Maps node name to ``shapely.geometry.Point(x, y)``; any z value is
        dropped.
    """
    map_positions = {}
    for i in range(len(strecke)):
        geom = strecke.iat[i, 2]
        if isinstance(geom, shapely.geometry.multilinestring.MultiLineString):
            # ``.geoms`` instead of direct indexing: Shapely 2.0 removed
            # indexing on multi-part geometries.
            geom = geom.geoms[0]
        try:
            coords = geom.coords
        except NotImplementedError:
            continue
        if len(coords) == 0:
            # An empty line has no end points.
            continue
        start, end = coords[0], coords[-1]
        map_positions[strecke_atr.loc[i, 'source']] = Point(start[0], start[1])
        map_positions[strecke_atr.loc[i, 'target']] = Point(end[0], end[1])
    return map_positions

def get_strecke_atr():
    """
    Build the attribute table of the module-level ``strecke`` frame.

    The second column of ``strecke`` is parsed with ``parse_atr``. Each
    ``ISR_STRECKE_VON_BIS`` entry is then split at ``' - '``: the first part
    is stored in a new ``source`` column, the second in a new ``target``
    column.

    Returns
    -------
    pd.DataFrame
        The parsed attributes plus the ``source`` and ``target`` columns.
    """
    edges = parse_atr(strecke.iloc[:, 1])
    edges = edges.assign(source='', target='')
    for row in range(len(edges)):
        endpoints = edges.at[row, 'ISR_STRECKE_VON_BIS'].split(' - ')
        edges.at[row, 'source'], edges.at[row, 'target'] = endpoints[0], endpoints[1]
    return edges

def geo_map(row, destination, val, dest_unary, col="geometry"):
    """
    Find the entry of ``destination`` that lies closest to ``row[col]``.

    Intended for ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    row : mapping / pd.Series
        Row holding a shapely point under ``col``.
    destination : GeoDataFrame
        Candidates; must have the columns ``geometry`` and ``bhf``.
    val : any
        Unused; kept so that existing callers keep working.
    dest_unary : shapely geometry
        Union of all ``destination`` geometries.
    col : str
        Name of the geometry entry in ``row``.

    Returns
    -------
    pd.Series
        ``geom_match`` (matched geometry), ``geo_match_node`` (its ``bhf``
        value) and ``geomatch_dist`` (distance in km). All three are None
        if anything goes wrong, so one bad row does not abort the apply.
    """
    result_index = ['geom_match', 'geo_match_node', 'geomatch_dist']
    try:
        source_geom = row[col]
        nearest_geom = nearest_points(source_geom, dest_unary)[1]
        match_rows = destination.loc[destination.geometry == nearest_geom]
        match_geom, match_node = match_rows.loc[:, ['geometry', 'bhf']].to_numpy()[0]
        # geopy expects (latitude, longitude), whereas shapely points are
        # (x, y) = (longitude, latitude), so the order is swapped here.
        # Assumes both geometries are lon/lat points - TODO confirm the CRS.
        dist_km = geopy.distance.distance(
            (source_geom.y, source_geom.x),
            (match_geom.y, match_geom.x),
        ).km
        return pd.Series(data=[match_geom, match_node, dist_km], index=result_index)
    except Exception:
        # Deliberate best effort: any failure yields an empty match.
        return pd.Series(data=[None, None, None], index=result_index)

def name_replace(name):
    """
    Normalise a name for fuzzy comparison.

    The name is lower-cased and every space as well as each of the
    characters ``( ) . - _ /`` is removed.
    """
    return name.lower().translate(str.maketrans('', '', ' ().-_/'))

import difflib
def get_closest_match(row, names_to_match):
    """
    Find the entry of ``names_to_match`` most similar to ``row['name']``.

    The name is normalised with ``name_replace`` before it is compared.
    The position of the best candidate in ``names_to_match`` is then used
    to look up the ``bhf`` value in the module-level ``map_positions_gdf``.

    Returns
    -------
    pd.Series
        ``string_match`` (the ``bhf`` value) and ``string_match_score``
        (difflib similarity ratio of the normalised names).
    """
    normalized = name_replace(row['name'])

    # cutoff=0 accepts any similarity, n=1 keeps only the best candidate.
    candidates = difflib.get_close_matches(normalized, names_to_match, n=1, cutoff=0)
    best = candidates[0]
    similarity = difflib.SequenceMatcher(None, normalized, best).ratio()

    position = names_to_match.index(best)
    matched_bhf = map_positions_gdf.at[position, 'bhf']

    return pd.Series(
        data=[matched_bhf, similarity],
        index=['string_match', 'string_match_score'],
    )

### Read KML files

In [None]:
# Load the two KML exports: the line layer (strecke) and the point layer (bahnhöfe).
strecke = gpd.read_file('../data/ISR-ISR_V_GEO_TEN_KLASSIFIZIERUNG.kml')
bahnhöfe = gpd.read_file('../data/ISR-ISR_V_GEO_BETRIEBSSTELLEN_PUNKT.kml')

In [None]:
# Flatten/merge the geometries, then parse the attribute table.
strecke = tranform_geo(strecke)
strecke_atr = get_strecke_atr()
# ALG_LAENGE_ABSCHNITT is a string with a decimal comma; convert it to float.
strecke_atr['distance'] = strecke_atr.loc[:, 'ALG_LAENGE_ABSCHNITT'].str.replace(',', '.').astype('float')
# Node name -> Point, taken from the end points of each line.
map_positions = get_map_positions()
# Graph with one edge per row; no edge attributes are attached.
strecke_graph = nx.from_pandas_edgelist(strecke_atr, source='source', target='target')

### Add mapping match columns

In [None]:
# Station table from the StationPhillip helper; the matching results are added to it below.
stations_gdf = stations.get_geopandas()

In [None]:
# Make sure every result column of the two matching steps exists;
# missing ones are created and filled with empty strings.
for match_column in ('geom_match', 'geo_match_node', 'geomatch_dist',
                     'string_match', 'string_match_score'):
    if match_column not in stations_gdf:
        stations_gdf[match_column] = ''

### Geomap stations

In [None]:
# NOTE: map_positions_gdf is only created in the "String match" section
# further down; that cell has to be executed before this one.
# Union of all map positions, computed once and reused for every station.
dest_unary = map_positions_gdf["geometry"].unary_union
# Row-wise nearest-position lookup; fills the three geo match columns.
stations_gdf.loc[:, ['geom_match', 'geo_match_node', 'geomatch_dist']] = stations_gdf.loc[:, :].progress_apply(geo_map, destination=map_positions_gdf, dest_unary=dest_unary, val='geometry', axis=1)

### String match (difflib) stations

In [None]:
# Turn the {name: Point} mapping into a table with one row per node ...
map_positions_df = pd.DataFrame({
    'bhf': list(map_positions),
    'location': list(map_positions.values()),
})
# ... and use the points as the geometry of a GeoDataFrame.
map_positions_gdf = gpd.GeoDataFrame(
    map_positions_df,
    geometry=map_positions_df['location'],
)

In [None]:
# Normalised node names, in the same order as the rows of map_positions_gdf.
map_names = [name_replace(bhf) for bhf in map_positions_gdf['bhf'].to_list()]

In [None]:
# Row-wise fuzzy name match against map_names; fills the two string match columns.
stations_gdf.loc[:, ['string_match', 'string_match_score']] = stations_gdf.loc[:, :].progress_apply(get_closest_match, names_to_match=map_names, axis=1)

### Upload data to db

In [None]:
from urllib.parse import quote

from config import db_database, db_password, db_server, db_username
import sqlalchemy

# User name and password are percent-encoded so that characters such as
# '@', ':' or '/' in the credentials cannot break the structure of the URL.
# Credentials without special characters yield the same URL as before.
engine = sqlalchemy.create_engine(
    'postgresql://{user}:{password}@{server}/{database}?sslmode=require'.format(
        user=quote(db_username, safe=''),
        password=quote(db_password, safe=''),
        server=db_server,
        database=db_database,
    )
)

### Upload Streckennetz

In [None]:
# Write the attribute table to the 'streckennetz' table, replacing an existing one.
strecke_atr.to_sql('streckennetz', if_exists='replace', method='multi', con=engine)

### Upload Mappings

### Other Stuff

In [None]:
# Dump the stations together with their match columns to a CSV file.
stations_gdf.to_csv('station_matches.csv')

In [None]:
# Collect the closest map name for every item yielded by `stations`.
# NOTE(review): get_closest_match() reads row['name'] from its first
# argument; if iterating `stations` yields plain name strings this call
# fails - confirm what the iteration returns.
string_matches = []
for name in tqdm_bar(stations):
    string_matches.append(get_closest_match(name, map_names))

In [None]:
# One Point(lon, lat) per station, looked up by row label 0..len(stations)-1.
sta_pos = [
    Point(stations.station_df.at[row, 'lon'], stations.station_df.at[row, 'lat'])
    for row in range(len(stations))
]

In [None]:
# Combine the list of station points into a single MultiPoint geometry.
sta_pos = MultiPoint(sta_pos)

In [None]:
# All node positions of the network as a single MultiPoint geometry.
map_positions_points = MultiPoint(list(map_positions.values()))

In [None]:
# Look up a location for every graph node via the Betriebsstellen helper.
positions = {}
for station in strecke_graph.nodes():
    try:
        positions[station] = betriebstellen.get_location(name=station)
    except (KeyError, NoLocationError):
        # The lookup failed for this node: leave it out.
        continue

In [None]:
# spring_layout works on plain coordinate pairs, so the shapely Points in
# map_positions are unpacked to (x, y) tuples before they are handed over.
# Every node listed in `fixed` keeps its given position.
pos = nx.spring_layout(
    strecke_graph,
    pos={node: (point.x, point.y) for node, point in map_positions.items()},
    fixed=map_positions.keys(),
    k=0.001,
)

In [None]:
fig, ax = plt.subplots(figsize=(21*2, 24*2))
# nx.draw() draws onto `ax` and returns None, so its result must not be
# assigned back to `ax`: the stations would otherwise be plotted with
# ax=None, i.e. on a new figure instead of on top of the graph.
nx.draw(strecke_graph, pos=pos, ax=ax)
bahnhöfe.plot(ax=ax)

In [None]:
number_of_colors = len(strecke)

# One random '#RRGGBB' string per row of strecke.
color = [
    "#" + ''.join(random.choice('0123456789ABCDEF') for _digit in range(6))
    for _row in range(number_of_colors)
]
strecke['colors'] = color

In [None]:
# Plot the lines using the 'colors' column, then draw the stations onto the same axes.
ax = strecke.plot(column='colors', figsize=(21*2, 24*2))
bahnhöfe.plot(ax=ax)

In [None]:
# Plot the stations on their own figure.
bahnhöfe.plot(figsize=(21*2, 24*2))