Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Spoortakmodel #8

Merged
merged 7 commits into from
May 6, 2022
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
11 changes: 11 additions & 0 deletions data/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Spoortak models

## Source

All of these spoortak models were downloaded from the BBMS platform. However, some of the BBMS_BERICHT_xx.csv files were
Nozziel marked this conversation as resolved.
Show resolved Hide resolved
missing and were provided by a ProRail employee who happened to have them

## Spoortak model 18+

Spoortak model 18 uses the new 'spoortak model 2.0' as defined in BID00023, in which 'spoortak' has been replaced by
'segment'.
4 changes: 4 additions & 0 deletions openspoor/spoortakmodel/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .spoortak_models_data import SpoortakModelsData
from .spoortak_subsection import SpoortakSubsection
from .spoortak_model_inspector import SpoortakModelInspector
from .spoortak_model_mapper import SpoortakModelMapper
8 changes: 8 additions & 0 deletions openspoor/spoortakmodel/singleton.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
class Singleton:
    """ Base class implementing the singleton pattern: one shared instance per concrete class """

    # Created instances, keyed by the class they were created for.
    _instances = {}

    def __new__(cls, *args, **kwargs):
        # Constructor arguments are accepted but not used here; only the first
        # call for a given class actually allocates an object.
        try:
            return cls._instances[cls]
        except KeyError:
            instance = super().__new__(cls)
            cls._instances[cls] = instance
            return instance
61 changes: 61 additions & 0 deletions openspoor/spoortakmodel/spoortak_model_inspector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from pprint import pprint
import pandas as pd

from ..spoortakmodel import SpoortakModelsData


class SpoortakModelInspector:
    """ Debugging aid that prints everything the loaded data contains about one spoortak.

    The result is written to stdout (it is not returned).
    """

    # Display options that are widened while printing so no rows/columns are truncated.
    _PD_DISPLAY_OPTIONS = {
        'display.max_columns': None,
        'display.max_rows': None,
        'display.expand_frame_repr': False,
    }

    def __init__(self, spoortak_model_data: 'SpoortakModelsData'):
        """
        :param spoortak_model_data: object exposing ``models`` and ``model_changes``, both dicts that map a
                                    model version to a DataFrame
        """
        self.data = spoortak_model_data
        self.old_pd_option_values = dict()

    def _set_pd_options(self):
        """ Remember the current pandas display options and replace them with the wide ones """
        for key, value in self._PD_DISPLAY_OPTIONS.items():
            self.old_pd_option_values[key] = pd.get_option(key)
            pd.set_option(key, value)

    def _reset_pd_options(self):
        """ Restore the pandas options saved by ``_set_pd_options`` """
        for key, value in self.old_pd_option_values.items():
            pd.set_option(key, value)

    @staticmethod
    def _concat_or_empty(frames: list, frames_per_version: dict, baseline_model: int) -> pd.DataFrame:
        """ Stack the collected frames, or build an empty frame with the baseline columns if nothing was found """
        if frames:
            return pd.concat(frames, axis=0)

        baseline = frames_per_version.get(baseline_model)
        columns = list(baseline.columns) if baseline is not None else []
        return pd.DataFrame(columns=columns + ['spoortakmodel'])

    def inspect(self, spoortak_identifier: str, baseline_model: int = 3):
        """ Scans the spoortak model data (including change data) and prints everything related to the given spoortak

        :param spoortak_identifier: index label to look up in every model, and value to match against the
                                    MODFWE / DASSIGNNAME / FWENAME columns of the change data
        :param baseline_model: model version whose columns are used for the (empty) table when nothing matches
        """
        self._set_pd_options()
        try:
            print(f'--- {spoortak_identifier} spoortakmodel data ---')

            # .loc[[label]] always yields a DataFrame and .assign returns a copy, so the loaded data is
            # never modified and no chained-assignment warning has to be silenced.
            model_frames = [
                model.loc[[spoortak_identifier]].assign(spoortakmodel=spoortakmodel_version)
                for spoortakmodel_version, model in self.data.models.items()
                if spoortak_identifier in model.index
            ]
            pprint(self._concat_or_empty(model_frames, self.data.models, baseline_model))

            print(f'--- {spoortak_identifier} spoortakmodel changes ---')

            change_frames = []
            for spoortakmodel_version, model_changes in self.data.model_changes.items():
                mask = (
                        (model_changes['MODFWE'] == spoortak_identifier)
                        | (model_changes['DASSIGNNAME'] == spoortak_identifier)
                        | (model_changes['FWENAME'] == spoortak_identifier)
                )
                if mask.any():
                    change_frames.append(model_changes[mask].assign(spoortakmodel=spoortakmodel_version))
            pprint(self._concat_or_empty(change_frames, self.data.model_changes, baseline_model))
        finally:
            # always hand the caller back their own pandas display settings, even if a lookup failed
            self._reset_pd_options()
169 changes: 169 additions & 0 deletions openspoor/spoortakmodel/spoortak_model_mapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
import logging
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd

from .spoortak_models_data import SpoortakModelsData
from .spoortak_subsection import SpoortakSubsection

log = logging.getLogger(__name__)


class SpoortakModelMapper:
    """ Best-effort mapper of a spoortak subsection onto the other loaded spoortak model versions.

    Limitations:
     - RENAME keyword not supported
     - can't map if there is no overlapping geocode
     - assumes singular kilometer lint if geocodes match with older segments
     - does not support spoortak 2.0 model (v18) yet
     - unknown what happens if geocode_begin and geocode_end differ and do not share a km lint

    TODO:
     - now assume the input exists in 17 and we are searching backward... Make this more flexible to search forward from 'midway'
     - Probably should have a validation that we have the same 'length' for all spoortak models in our output

    changes as described in BBMS_BERICHT_xx.csv
        ASSIGN - implemented
        NEWTOP - implemented
        EDITCORR - Don't need (?)
        EDITREAL - Don't need (?)
        RENAME - TODO
            rename is often used to swap two names. If we want to support rename we need to keep track of both else we'll
            report back on both the spoortakken that swapped the names

    """

    # Columns of the change data that can hold a spoortak identifier.
    _NAME_COLUMNS = ['MODFWE', 'DASSIGNNAME', 'FWENAME']

    def __init__(self, spoortak_model_data: 'SpoortakModelsData'):
        """
        :param spoortak_model_data: loaded model data; ``models``, ``model_changes`` and
                                    ``model_version_numbers`` are read from it
        """
        self._data = spoortak_model_data
        log.warning("SpoortakModelMapper is a best effort mapper to older/newer models, it is far from perfect.")

    def _is_new_spoortak(self, spoortak_identifier: str, spoortak_model: int) -> bool:
        """ True if the change data of this model version has a NEWTOP action for the spoortak """
        changes = self._data.model_changes[spoortak_model]
        is_new = (changes['FWENAME'] == spoortak_identifier) & (changes['ACTION'] == 'NEWTOP')
        return bool(is_new.any())

    def _retrieve_spoortak(self, spoortak_identifier: str, model_version: int) -> Optional[pd.Series]:
        """ Retrieve the spoortak data for a specific model version

        :return: the model row, or None if the version is not loaded or does not contain the spoortak
        """
        # .get(): callers probe `model_version - 1`, which does not exist for the oldest loaded version
        model = self._data.models.get(model_version)
        if model is None or spoortak_identifier not in model.index:
            return None

        return model.loc[spoortak_identifier]

    def _related_spoortakken(self, spoortak_identifier: str, geocodes: List[int]) -> List[Tuple[str, int]]:
        """ Find spoortakken that share a (non-RENAME) change record and a geocode with the given spoortak

        :param spoortak_identifier: spoortak to find relatives for
        :param geocodes: geocodes of that spoortak; a relative must begin or end in one of them
        :return: (identifier, model version of the change record) tuples
        :raises ValueError: if a referenced spoortak is in neither that model version nor the one before it
        """
        related_spoortakken = []

        for model_version in self._data.model_version_numbers:
            model_changes = self._data.model_changes[model_version]
            mentions_spoortak = model_changes[self._NAME_COLUMNS].eq(spoortak_identifier).any(axis=1)

            # RENAME is not implemented
            changes = model_changes[mentions_spoortak & (model_changes['ACTION'] != 'RENAME')]

            candidates = set()
            for column in self._NAME_COLUMNS:
                # dropna() instead of `np.nan in ...`: NaN never compares equal, so a membership test
                # only works when the very same NaN object happens to be stored
                candidates.update(changes[column].dropna().unique())
            candidates.discard(spoortak_identifier)

            # sorted only to make the output order reproducible between runs
            for entry in sorted(candidates, key=str):
                # if there is no geocode overlap we can't assume the kilometrering uses the same 'kilometer lint'
                spoortak = self._retrieve_spoortak(entry, model_version)
                if spoortak is None:
                    spoortak = self._retrieve_spoortak(entry, model_version - 1)
                if spoortak is None:
                    raise ValueError(
                        f"Could not find spoortak {entry} in model {model_version} and {model_version - 1}")

                if spoortak['GEOCODE_BEGIN'] in geocodes or spoortak['GEOCODE_EIND'] in geocodes:
                    related_spoortakken.append((entry, model_version))

        return related_spoortakken

    @staticmethod
    def _limit_start_end(subsections: List['SpoortakSubsection'], km_start: int,
                         km_end: int) -> List['SpoortakSubsection']:
        """ Return new subsections, each limited to the km_start - km_end range """
        return [subsection.limit_start_end(km_start, km_end) for subsection in subsections]

    @staticmethod
    def _remove_duplicates(subsections: List['SpoortakSubsection']) -> List['SpoortakSubsection']:
        """ Drop equal subsections; the order of the result is not defined """
        return list(set(subsections))

    def map(self, spoortak_subsection: 'SpoortakSubsection',
            _ignore_list: List[str] = None) -> List['SpoortakSubsection']:
        """ Maps a spoortak subsection to all other spoortak models

        :param spoortak_subsection: subsection to map
        :param _ignore_list: list of spoortak identifiers to ignore (need this to avoid infinite loops)
        :return: found subsections ordered by model version and start; empty if the spoortak is in no model
        :raises ValueError: if the spoortak disappears from an older model without having been marked as new

        """
        ignore_list = list(_ignore_list) if _ignore_list else []
        identification = spoortak_subsection.identification
        km_start = spoortak_subsection.kilometrering_start
        km_end = spoortak_subsection.kilometrering_end

        found_subsections = []
        geocodes = None

        # scan backwards
        for model_version in self._data.model_version_numbers[::-1]:
            spoortak_data = self._retrieve_spoortak(identification, model_version)

            if spoortak_data is None:
                if found_subsections:
                    raise ValueError(f'The spoortak was not new, but could not find it in version {model_version}.')
                continue

            # overwritten on every hit, so the geocodes of the oldest version containing the spoortak win
            geocodes = [spoortak_data['GEOCODE_BEGIN'], spoortak_data['GEOCODE_EIND']]

            found_subsections.append(
                SpoortakSubsection(spoortak_data.name,
                                   max(km_start, spoortak_data['kilometrering_start']),
                                   min(km_end, spoortak_data['kilometrering_end']),
                                   model_version))

            if self._is_new_spoortak(identification, model_version):
                log.info('Spoortak was new, not searching further back in the model history')
                break

        if geocodes is None:
            # not present in any loaded model: there is nothing to map and no geocode to match relatives on
            log.warning('Spoortak %s was not found in any loaded spoortak model', identification)
            return []

        # find anything that is related, assume same km lint if one of the geocodes matches and just add them all to the list
        already_mapped = set()
        for related_spoortak, related_model_version in self._related_spoortakken(identification, geocodes):
            if related_spoortak in ignore_list or related_spoortak in already_mapped:
                continue
            already_mapped.add(related_spoortak)

            # separate name on purpose: the subsection being mapped must not be overwritten, its
            # identification is what the recursive call has to ignore
            related_subsection = SpoortakSubsection(related_spoortak, km_start, km_end, related_model_version)
            found_subsections.extend(self.map(related_subsection, ignore_list + [identification]))

        limited = self._limit_start_end(found_subsections, km_start, km_end)
        cleaned = self._remove_duplicates(limited)

        return sorted(cleaned, key=lambda x: (x.spoortak_model_version, x.kilometrering_start))

    def map_to(self, spoortak_subsection: 'SpoortakSubsection', model_version: int) -> List['SpoortakSubsection']:
        """ Maps a spoortak subsection to a specific model version """
        results = self.map(spoortak_subsection)
        return [result for result in results if result.spoortak_model_version == model_version]
103 changes: 103 additions & 0 deletions openspoor/spoortakmodel/spoortak_models_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import os
from glob import glob

import numpy as np
from typing import Optional
import logging
import pandas as pd

from ..spoortakmodel.singleton import Singleton

log = logging.getLogger(__name__)


class SpoortakModelsData(Singleton):
    """ Helper class that loads spoortak model data (only once)

    After loading, the instance exposes:
     - model_version_numbers: ascending list of the loaded model versions
     - models: model version => DataFrame read from SPOORTAK_<version>.csv, indexed on SPOORTAK_IDENTIFICATIE
     - model_changes: model version => DataFrame read from BBMS_BERICHT_<version>.csv

    Improvement backlog:
     - Support V18 (spoortak 2.0)
    """

    models: dict = None
    model_changes: dict = None

    @staticmethod
    def _convert_dutch_number(value: str) -> float:
        """ '55.132,56' => 55132.56, an empty (or blank) value => nan """
        value = value.strip() if value else value
        if not value:
            return np.nan
        return float(value.replace('.', '').replace(',', '.'))

    @staticmethod
    def _km_to_meters(km: float) -> Optional[int]:
        """ Convert kilometers to whole meters, None/nan => None """
        if km is None or np.isnan(km):
            return None
        # round before converting: int() truncates, and e.g. 1.005 * 1000 is 1004.9999999999999 in
        # floating point, which would end up as 1004 instead of 1005
        return int(round(km * 1000))

    @staticmethod
    def _replace_km_columns(df: pd.DataFrame) -> None:
        """ In-place correction of the km columns and renaming them to avoid confusion

        Adds kilometrering_start (smallest of KM_BEGIN / KM_EIND) and kilometrering_end (largest of the two)
        and drops the KM_BEGIN / KM_EIND columns.
        """
        km_values = df[['KM_BEGIN', 'KM_EIND']].values
        df['kilometrering_start'] = km_values.min(1)
        df['kilometrering_end'] = km_values.max(1)

        # rows whose begin and end were swapped while their begin and end geocode differ
        df_changed = df[df['kilometrering_start'] != df['KM_BEGIN']]
        df_changed_geo = df_changed[df_changed['GEOCODE_BEGIN'] != df_changed['GEOCODE_EIND']]

        for spoortak in df_changed_geo.index:
            log.warning(
                f'Swapped begin & eind km for {spoortak} met verschillende geocode begin & eind mogenlijk was dit een ander kilometerlint')

        df.drop(columns=['KM_BEGIN', 'KM_EIND'], inplace=True)

    @staticmethod
    def _get_model_numbers(data_path: str) -> [int]:
        """ Return the available model numbers (the 'Versie_<number>' directories in data_path), ascending """
        dirs = glob(os.path.join(data_path, 'Versie_*'))

        # str.removeprefix is deliberately not used here: it needs python 3.9.
        # Sorted because glob gives no ordering guarantee while users of this list iterate it
        # in reverse to walk back through the model history.
        return sorted(int(os.path.basename(directory).split('_')[1]) for directory in dirs)

    def __init__(self, data_path: str):
        """
        :param data_path: directory containing the Versie_<number> directories; only used the first time,
                          afterwards the already loaded data is kept
        """
        # we've applied the singleton pattern here so we can check if the data is already there.
        if self.models:
            return

        log.info('loading data...')

        unsupported_version_start = 18
        version_numbers = [version for version in self._get_model_numbers(data_path)
                           if version < unsupported_version_start]

        def dutch_km_to_meters(km: str) -> Optional[int]:
            return self._km_to_meters(self._convert_dutch_number(km))

        models = dict()
        for model_version in version_numbers:
            try:
                model = pd.read_csv(
                    os.path.join(data_path, f'Versie_{model_version:02d}/SPOORTAK_{model_version}.csv'),
                    delimiter=';',
                    header=0,
                    converters={
                        'KM_BEGIN': dutch_km_to_meters,
                        'KM_EIND': dutch_km_to_meters,
                        'LENGTE': dutch_km_to_meters,
                    },
                    index_col='SPOORTAK_IDENTIFICATIE',
                    encoding='latin1'
                )
                self._replace_km_columns(model)
            except ValueError:
                log.error(f'Failed to read model {model_version}')
                raise
            models[model_version] = model

        model_changes = {
            model_version: pd.read_csv(
                os.path.join(data_path, f'Versie_{model_version:02d}/BBMS_BERICHT_{model_version}.csv'),
                delimiter=';',
                header=0,
                encoding='latin1'
            )
            for model_version in version_numbers
        }

        # publish only after everything was read: a failed load must not leave a half filled
        # singleton behind that a later instantiation would mistake for loaded data
        self.model_version_numbers = version_numbers
        self.models = models
        self.model_changes = model_changes
29 changes: 29 additions & 0 deletions openspoor/spoortakmodel/spoortak_subsection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from dataclasses import dataclass


@dataclass(frozen=True)
class SpoortakSubsection:
    """ Immutable part of a spoortak, delimited by a start and end kilometrering

    :param identification: identifier of the spoortak
    :param kilometrering_start: Kilometrering in meters
    :param kilometrering_end: Kilometrering in meters
    :param spoortak_model_version: spoortak model version this subsection belongs to (optional)

    Remarks: This is not a spoortak 2.0 segment

    """

    identification: str
    kilometrering_start: int
    kilometrering_end: int

    spoortak_model_version: int = None

    def limit_start_end(self, start: int, end: int):
        """ Return a new SpoortakSubsection clamped to [start, end]; this instance is left untouched """
        clamped_start = max(self.kilometrering_start, start)
        clamped_end = min(self.kilometrering_end, end)
        return SpoortakSubsection(
            identification=self.identification,
            kilometrering_start=clamped_start,
            kilometrering_end=clamped_end,
            spoortak_model_version=self.spoortak_model_version,
        )