In [1]:
import pandas as pd
from scipy.spatial.distance import cdist
import numpy as np
from tqdm import tqdm

In [2]:
df = pd.read_csv('data/locations_data.csv')
df['date'] = df['date'].astype('datetime64[ns, Europe/Warsaw]')

In [3]:
stops = pd.read_csv('data/stops.txt')

In [5]:
stop_radius = 0.001
batch_size = 500000 # more should be faster, too much will overload ram

df.loc[:, 'at_stop'] = False
df.loc[:, 'stop'] = None
df.loc[:, 'stop_dist'] = 0. # for dubugging

stop_coords = stops[['stop_lat', 'stop_lon']].to_numpy()

for i in tqdm(range(0, len(df), batch_size)):
    batch = df.iloc[i:i + batch_size]
    coords = batch[['x', 'y']].to_numpy()
    dists = cdist(coords, stop_coords)
    at_stop = dists.min(1) < stop_radius
    batch.loc[:, 'at_stop'] = at_stop
    batch.loc[at_stop, 'stop'] = dists.argmin(1)[at_stop]
    batch.loc[at_stop, 'stop_dist'] = dists.min(1)[at_stop] # for debugging
    df.iloc[i:i + batch_size] = batch

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
100%|██████████| 7/7 [00:52<00:00,  7.54s/it]


In [6]:
df

Unnamed: 0,name,type,y,x,k,date,at_stop,stop,stop_dist
0,126,bus,17.012617,51.087154,19246880,2022-01-08 23:16:02+01:00,True,1769,0.000119
1,128,bus,16.954292,51.134167,19061372,2022-01-08 23:16:02+01:00,False,,0.000000
2,11,tram,17.041996,51.118793,19227664,2022-01-08 23:16:02+01:00,True,1132,0.000056
3,11,tram,16.976880,51.089455,19227689,2022-01-08 23:16:02+01:00,True,629,0.000274
4,246,bus,16.954378,51.126790,19246649,2022-01-08 23:16:02+01:00,True,491,0.000733
...,...,...,...,...,...,...,...,...,...
3063069,241,bus,17.031246,51.094580,19255991,2022-01-12 01:21:14+01:00,False,,0.000000
3063070,206,bus,17.029966,51.102406,19255945,2022-01-12 01:21:14+01:00,True,1301,0.000389
3063071,101,bus,17.022598,51.149773,18497802,2022-01-12 01:21:14+01:00,False,,0.000000
3063072,242,bus,17.080970,51.139520,19256007,2022-01-12 01:21:14+01:00,True,379,0.000109


In [7]:
df = df[df['at_stop']]
df['stop_name'] = df['stop'].apply(lambda x: stops['stop_name'][x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['stop_name'] = df['stop'].apply(lambda x: stops['stop_name'][x])


In [9]:
def find_changes(x: pd.Series):
    return x.ne(x.shift())

In [11]:
df[df['k'] == df['k'][0]]['stop'].apply(lambda x: stops.iloc[x][['stop_id','stop_name']])[:15]

Unnamed: 0,stop_id,stop_name
0,3891,Hallera
144,3891,Hallera
288,3891,Hallera
432,3891,Hallera
577,3903,Hallera
720,3903,Hallera
863,3904,Hallera
1006,3904,Hallera
1149,3904,Hallera
1293,3814,Racławicka (szkoła)


In [27]:
trips_cols = ['source_name', 'destination_name', 'start_time', 'end_time', 'line', 'k']
trips = []

In [28]:
for k in tqdm(df['k'].unique()):
    k_mask = df['k'] == k
    changes = find_changes(df[k_mask]['stop_name'])
    idxs = np.flatnonzero(changes)
    new_rows = []
    for i in idxs[1:]:
        prev_row, cur_row = df.loc[k_mask].iloc[i-1], df.loc[k_mask].iloc[i]
        if prev_row['name'] == cur_row['name']:
            new_rows.append([prev_row['stop_name'], cur_row['stop_name'], prev_row['date'], cur_row['date'], cur_row['name'], k])
    new_rows = pd.DataFrame(new_rows, columns=trips_cols)
    trips.append(new_rows)
trips = pd.concat(trips, ignore_index=True)

100%|██████████| 10144/10144 [07:30<00:00, 22.50it/s]


In [29]:
trips

Unnamed: 0,source_name,destination_name,start_time,end_time,line,k
0,Hallera,Racławicka (szkoła),2022-01-08 23:17:22+01:00,2022-01-08 23:17:32+01:00,126,19246880
1,Racławicka (szkoła),Modlińska,2022-01-08 23:18:02+01:00,2022-01-08 23:18:52+01:00,126,19246880
2,Modlińska,Wawrzyniaka,2022-01-08 23:19:42+01:00,2022-01-08 23:20:52+01:00,126,19246880
3,Wawrzyniaka,Chłodna,2022-01-08 23:21:22+01:00,2022-01-08 23:21:33+01:00,126,19246880
4,Chłodna,Sowia,2022-01-08 23:22:22+01:00,2022-01-08 23:22:32+01:00,126,19246880
...,...,...,...,...,...,...
252737,rondo Św. Ojca Pio,Ożynowa,2022-01-12 01:17:34+01:00,2022-01-12 01:18:14+01:00,259,19256282
252738,Ożynowa,Malinowa,2022-01-12 01:18:54+01:00,2022-01-12 01:19:04+01:00,259,19256282
252739,Malinowa,Wyścigowa,2022-01-12 01:19:04+01:00,2022-01-12 01:19:44+01:00,259,19256282
252740,Wyścigowa,PARK POŁUDNIOWY,2022-01-12 01:20:15+01:00,2022-01-12 01:20:24+01:00,259,19256282


In [30]:
trips.to_csv('data/recorded_trips.csv')