In [6]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon, LineString

In [7]:
#reading in the routes and poi data
routes = pd.read_csv('cleaned_routes_data/cleaned_all_routes_data_long.csv')
poi = pd.read_csv('cleaned_all_poi_data.csv').iloc[0:5000, :] #using smaller sample for experimenting

In [8]:
#converting routes df into a GeoDataFrame
gdf_routes = gpd.GeoDataFrame(routes, geometry=gpd.points_from_xy(routes.latitude, routes.longitude))
gdf_routes.drop(['Unnamed: 0', 'lat_lgt'], axis=1, inplace=True)
gdf_routes

Unnamed: 0,route_id,num_of_waypoint,latitude,longitude,geometry
0,1005019,0,52.50607,13.33208,POINT (52.50607 13.33208)
1,1005019,1,52.50553,13.33163,POINT (52.50553 13.33163)
2,1005019,2,52.50525,13.33148,POINT (52.50525 13.33148)
3,1005019,3,52.50515,13.33337,POINT (52.50515 13.33337)
4,1005019,4,52.50520,13.33366,POINT (52.50520 13.33366)
...,...,...,...,...,...
216798,933359,151,52.50444,13.38246,POINT (52.50444 13.38246)
216799,933359,152,52.50525,13.38633,POINT (52.50525 13.38633)
216800,933359,153,52.50643,13.38615,POINT (52.50643 13.38615)
216801,933359,154,52.50648,13.39023,POINT (52.50648 13.39023)


In [9]:
#converting points from routes data to linestring and adding these into a new column
gdf_routes['route_linestring'] = gdf_routes['route_id'].map(gdf_routes.groupby(['route_id'])['geometry'].apply(lambda x: LineString(x.tolist())))
gdf_routes

Unnamed: 0,route_id,num_of_waypoint,latitude,longitude,geometry,route_linestring
0,1005019,0,52.50607,13.33208,POINT (52.50607 13.33208),"LINESTRING (52.50607 13.33208, 52.50553 13.331..."
1,1005019,1,52.50553,13.33163,POINT (52.50553 13.33163),"LINESTRING (52.50607 13.33208, 52.50553 13.331..."
2,1005019,2,52.50525,13.33148,POINT (52.50525 13.33148),"LINESTRING (52.50607 13.33208, 52.50553 13.331..."
3,1005019,3,52.50515,13.33337,POINT (52.50515 13.33337),"LINESTRING (52.50607 13.33208, 52.50553 13.331..."
4,1005019,4,52.50520,13.33366,POINT (52.50520 13.33366),"LINESTRING (52.50607 13.33208, 52.50553 13.331..."
...,...,...,...,...,...,...
216798,933359,151,52.50444,13.38246,POINT (52.50444 13.38246),"LINESTRING (52.50157 13.49061, 52.50300 13.484..."
216799,933359,152,52.50525,13.38633,POINT (52.50525 13.38633),"LINESTRING (52.50157 13.49061, 52.50300 13.484..."
216800,933359,153,52.50643,13.38615,POINT (52.50643 13.38615),"LINESTRING (52.50157 13.49061, 52.50300 13.484..."
216801,933359,154,52.50648,13.39023,POINT (52.50648 13.39023),"LINESTRING (52.50157 13.49061, 52.50300 13.484..."


In [10]:
#converting poi df into a GeoDataFrame
gdf_poi = gpd.GeoDataFrame(poi, geometry=gpd.points_from_xy(poi.lat, poi.lon))
gdf_poi

Unnamed: 0.1,Unnamed: 0,category,name,id,lat,lon,geometry
0,0,['atm'],Bank für Sozialwirtschaft,78252154,52.523744,13.398627,POINT (52.52374 13.39863)
1,1,['atm'],Sparda-Bank,87036263,52.532985,13.384282,POINT (52.53299 13.38428)
2,2,['atm'],Bankhaus August Lenz,89275133,52.518025,13.406956,POINT (52.51802 13.40696)
3,3,['atm'],,213106623,52.542170,13.441137,POINT (52.54217 13.44114)
4,4,['atm'],Berliner Sparkasse,213113204,52.542750,13.392862,POINT (52.54275 13.39286)
...,...,...,...,...,...,...,...
4995,4995,['bench'],,2687646399,52.407078,13.562105,POINT (52.40708 13.56210)
4996,4996,['bench'],,2687646403,52.407081,13.562048,POINT (52.40708 13.56205)
4997,4997,['bench'],,2687646405,52.407066,13.564322,POINT (52.40707 13.56432)
4998,4998,['bench'],,2687646415,52.407187,13.563088,POINT (52.40719 13.56309)


In [11]:
#spacial joining both datasets on nearest distance of a route from a poi
#playing with it leads to unexpected behavior --> needs to be explored further with a fresh head
poi_routes = gpd.sjoin_nearest(gdf_poi, gdf_routes, how='left', max_distance=0.01, distance_col='distance')

#followed by some manipulation for the optics
poi_routes.drop(['Unnamed: 0', 'id', 'lat', 'lon', 'index_right', 'latitude', 'longitude', 'num_of_waypoint'], axis=1, inplace=True) #dropping columns
poi_routes = poi_routes.iloc[:, [3,4,2,0,1,5]] #rearranging columns
poi_routes.rename({'geometry': 'poi_lat_lgt', 'category': 'poi_category', 'name': 'poi_name'}, axis=1, inplace=True) #renaming cloumns
poi_routes['route_id'] = poi_routes['route_id'].astype(int, errors='ignore') #converting float to int for some reason isn't working
poi_routes.dropna(thresh=5, inplace=True) #dropping any poi without a route passing by in max_distance
poi_routes.sort_values(by=['route_id']) #sorting by route_id
poi_routes.reset_index(drop=True, inplace=True) #reset index
poi_routes


Unnamed: 0,route_id,route_linestring,poi_lat_lgt,poi_category,poi_name,distance
0,3543318.0,"LINESTRING (52.54966 13.41400, 52.54965 13.413...",POINT (52.52374 13.39863),['atm'],Bank für Sozialwirtschaft,0.000095
1,3507857.0,"LINESTRING (52.54159 13.41241, 52.54160 13.412...",POINT (52.53299 13.38428),['atm'],Sparda-Bank,0.001917
2,3507825.0,"LINESTRING (52.51791 13.39475, 52.51792 13.394...",POINT (52.53299 13.38428),['atm'],Sparda-Bank,0.001917
3,3546402.0,"LINESTRING (52.50191 13.32906, 52.50191 13.329...",POINT (52.51802 13.40696),['atm'],Bankhaus August Lenz,0.000217
4,3650542.0,"LINESTRING (52.54368 13.44052, 52.54369 13.440...",POINT (52.54217 13.44114),['atm'],,0.000651
...,...,...,...,...,...,...
6327,2573979.0,"LINESTRING (52.42384 13.62925, 52.42169 13.628...",POINT (52.40692 13.56596),['bench'],,0.009639
6328,2573979.0,"LINESTRING (52.42384 13.62925, 52.42169 13.628...",POINT (52.40698 13.56692),['bench'],,0.008871
6329,2573979.0,"LINESTRING (52.42384 13.62925, 52.42169 13.628...",POINT (52.40699 13.56696),['bench'],,0.008838
6330,2573979.0,"LINESTRING (52.42384 13.62925, 52.42169 13.628...",POINT (52.40698 13.56775),['bench'],,0.008286


In [12]:
#writing joint sample data into a csv file
poi_routes.to_csv('joint_sample_data.csv')