# Data transformation

## Input data: railway_rails and railway_stations from the `original` folder

In [1]:
import json
import os

import geojson
import pandas as pd

def open_json(filename):
    """
    Load a JSON file from the ``original`` folder.

    The folder is resolved relative to the current working directory.

    Args:
        filename: str = name of the file inside the ``original`` folder
    Returns:
        The parsed JSON content (a dictionary when the file holds a JSON object)
    Raises:
        FileNotFoundError: if the file does not exist in ``original``
        json.JSONDecodeError: if the file content is not valid JSON
    """
    # Build the path with os.path.join instead of a hard-coded "\\" separator,
    # which only resolves on Windows.
    path = os.path.join("original", filename)
    # NOTE(review): the file is read with the platform default encoding, as before;
    # confirm how the download step wrote it before pinning an explicit encoding.
    with open(path) as file1:
        return json.load(file1)

In [2]:
# Load the raw station and rail downloads from the "original" folder
railway_stations = open_json('railway_stations.json')
railway_rails = open_json('railway_rails.json')

## Conversion of Overpass JSON to a geopandas GeoDataFrame

The conversion creates a geometry column and also columns for the desired tags

In [3]:
# Besides the geometry column we want to extract selected OSM tags and store them in columns of their own.
# The function below copies each desired tag into the element dictionary; it is called by the conversion function further down.
def append_tags(element, new_element, desired_tags: list):
    """
    Copy the requested OSM tags of one Overpass element into a flat dictionary.

    Every name in ``desired_tags`` becomes a key of ``new_element`` so that it
    later turns into its own column of the GeoDataFrame. Tags that the element
    does not carry are stored as ``None``.

    Args:
        element: source element dictionary; its optional 'tags' entry is read
        new_element: target dictionary, modified in place
        desired_tags: list[str] names of the tags to copy
    Return:
        new_element: the same dictionary that was passed in, with one key per desired tag
    """
    for wanted in desired_tags:
        # a value is only available if the element has tags at all AND this tag is among them
        has_value = 'tags' in element and wanted in element['tags']
        new_element[wanted] = element['tags'][wanted] if has_value else None
    return new_element


In [4]:
# Conversion function overpass_json_to_gpd_gdf(), which uses append_tags() from the previous cell

import shapely.geometry as sg
import geopandas as gpd

def _overpass_geometry(element):
    """Return a GeoJSON-like geometry mapping for a node or way element, else None."""
    if element['type'] == 'node':
        # GeoJSON order is (lon, lat), i.e. (x, y)
        return {'type': 'Point', 'coordinates': (element['lon'], element['lat'])}
    if element['type'] == 'way':
        return {
            'type': 'LineString',
            'coordinates': [(node['lon'], node['lat']) for node in element['geometry']],
        }
    return None


def overpass_json_to_gpd_gdf(overpass_json, desired_tags):
    """
        This function takes an Overpass JSON dictionary containing nodes or ways and
        transforms it into a geopandas GeoDataFrame.

        Elements whose 'type' is neither 'node' nor 'way' are skipped, because no
        geometry can be built for them here.

        Args:
            desired_tags: list[str] containing tags to become new columns in the GeoDataFrame

            overpass_json: a json dictionary whose 'elements' entry is a list of
            elements (nodes or ways)

            For node elements each element is structured as following:
            [
             first_element,
             # this is an element:
             {'type': 'node',
              'id': 25414208,
              'lat': 38.7404678,
              'lon': -9.1656799,
              'tags': {'local_ref': '3',...},
             },
             last_element
             ]

            For the ways each element is structured as following:
            [
            first_element,
            # this is an element
            {'type': 'way',
             'geometry': [{'lat': lat, 'lon': lon}, {'lat': lat, 'lon': lon}],
             'tags': {'maxspeed': '190', ...}
            },
             last_element
            ]

        Returns:
            gpd.GeoDataFrame in EPSG:4326 with a 'geometry' column and one column
            per desired tag (None where an element does not carry the tag)

            Based on the conversion from a following list of dictionaries:
            [first_element,
             # this is an element
             {'geometry': shape.object,
              'desired_tag1': 'value1',
              'desired_tag2': 'value2'
             },
             last_element
             ]
    """

    new_data = []
    for element in overpass_json['elements']:
        # GeoJSON-like mapping: {'type': 'Point', 'coordinates': (lon, lat)} or
        # {'type': 'LineString', 'coordinates': [(lon, lat), (lon, lat), ...]}
        new_geometry = _overpass_geometry(element)
        if new_geometry is None:
            # Unsupported element type: skip it instead of silently reusing the
            # geometry of the previously processed element.
            continue

        # shape the mapping into a shapely object and store it under 'geometry'
        new_element = {'geometry': sg.shape(new_geometry)}

        # add the attribute tags (None where missing) and collect the finished row
        new_data.append(append_tags(element, new_element, desired_tags))

    # transform the list of row dictionaries into a geopandas GeoDataFrame
    return gpd.GeoDataFrame(new_data, crs="EPSG:4326")

Check the functions

In [5]:
# Convert the stations, keeping the 'name' and 'network' tags as columns, and preview the result
gdf_stations = overpass_json_to_gpd_gdf(railway_stations, ['name', 'network'])
gdf_stations.head()

Unnamed: 0,geometry,name,network
0,POINT (-9.10155 38.60990),Fogueteiro,
1,POINT (-9.16568 38.74047),Sete Rios,CP;Fertagus
2,POINT (-9.10320 38.74623),Braço de Prata,
3,POINT (-9.10243 38.74785),Braço de Prata,
4,POINT (-8.61662 40.86398),Ovar,


In [6]:
# Convert the rails, keeping the 'maxspeed' and 'name' tags as columns, and preview the result
gdf_rails = overpass_json_to_gpd_gdf(railway_rails, ['maxspeed', 'name'])
gdf_rails.head()

Unnamed: 0,geometry,maxspeed,name
0,"LINESTRING (-9.03409 38.88987, -9.03521 38.888...",190.0,Linha 3 (Alverca)
1,"LINESTRING (-9.10767 38.73436, -9.10704 38.735...",190.0,
2,"LINESTRING (-9.05349 38.87156, -9.05382 38.871...",190.0,Linha do Norte
3,"LINESTRING (-9.04803 38.58256, -9.04901 38.583...",,
4,"LINESTRING (-8.46391 39.52075, -8.46423 39.521...",,Ramal de Tomar


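The following code makes changes between platforms possible: stations that appear more than once under the same name are connected by a line, and these connecting lines are added to the rails dataset with the name `<station name>_change`.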

TODO: Write it into a function

In [7]:
def add_platform_change_lines(stations, rails):
    """
    Connect station points that share a name and add the connections to the rails.

    For every station name that occurs more than once, one LineString through all
    points of that name is created and stored with the name "<station name>_change".

    Args:
        stations: GeoDataFrame with the columns 'name' and 'geometry' (points)
        rails: GeoDataFrame of rail lines the connections are appended to
    Returns:
        A new GeoDataFrame: ``rails`` followed by one row per repeated station name.
        Columns of ``rails`` that a connection row does not set stay empty.
        ``rails`` itself is returned when no station name occurs more than once.
    """
    # number of points per station name; names that occur once need no connection
    points_per_name = stations.groupby('name')['geometry'].count()
    repeated_names = points_per_name[points_per_name > 1].index

    change_rows = [
        {
            'name': f"{station_name}_change",
            'geometry': sg.LineString(
                list(stations.loc[stations['name'] == station_name, 'geometry'])
            ),
        }
        for station_name in repeated_names
    ]
    if not change_rows:
        return rails

    # Build all rows first and concatenate once: DataFrame.append was removed in
    # pandas 2.0, and appending row by row copies the whole frame on every iteration.
    changes = gpd.GeoDataFrame(change_rows, crs=rails.crs)
    return pd.concat([rails, changes], ignore_index=True)


# NOTE: this rebinds gdf_rails; re-running the cell without re-running the
# conversion cell first adds the connection lines a second time.
gdf_rails = add_platform_change_lines(gdf_stations, gdf_rails)

In [8]:
# Show the last rows, i.e. the newly added "<station name>_change" connection lines
gdf_rails.tail(10)

Unnamed: 0,geometry,maxspeed,name
7037,"LINESTRING (-8.62027 41.12974, -8.62029 41.12977)",,Vila Nova de Gaia_change
7038,"LINESTRING (-8.52586 40.19290, -8.52412 40.19304)",,Vila Pouca do Campo_change
7039,"LINESTRING (-7.42145 37.19967, -7.42149 37.199...",,Vila Real de Santo António_change
7040,"LINESTRING (-8.40628 41.35743, -8.40627 41.35741)",,Vila das Aves_change
7041,"LINESTRING (-6.83008 40.60614, -6.83008 40.606...",,Vilar Formoso_change
7042,"LINESTRING (-8.44133 40.26324, -8.44138 40.26325)",,Vilela-Fornos_change
7043,"LINESTRING (-8.31240 41.37910, -8.31236 41.37911)",,Vizela_change
7044,"LINESTRING (-8.60150 40.83420, -8.60151 40.834...",,Válega_change
7045,"LINESTRING (-8.44829 40.57755, -8.44829 40.577...",,Águeda_change
7046,"LINESTRING (-7.90721 38.56047, -7.90709 38.560...",,Évora_change


In [10]:
# save the GeoDataFrames as shapefiles
import os

# Build the paths with os.path.join so the cell does not depend on the
# Windows-only "\\" separator.
rails_dir = os.path.join("processed", "railway_rails")
stations_dir = os.path.join("processed", "railway_stations")

# exist_ok=True keeps the cell re-runnable when the folders already exist, while
# real failures (e.g. missing permissions) are raised instead of being swallowed
# by a bare "except: pass". makedirs also creates the parent "processed" folder.
for output_dir in (rails_dir, stations_dir):
    os.makedirs(output_dir, exist_ok=True)

gdf_rails.to_file(driver='ESRI Shapefile', filename=os.path.join(rails_dir, "railway_rails.shp"))
gdf_stations.to_file(driver='ESRI Shapefile', filename=os.path.join(stations_dir, "railway_stations.shp"))