In [1]:
import pandas as pd
from scipy.spatial.distance import cdist
import numpy as np
from tqdm import tqdm
from datetime import datetime, timedelta

In [2]:
df = pd.read_csv('data/locations_data.csv')
df['date'] = df['date'].astype('datetime64[ns, Europe/Warsaw]')

In [3]:
stops = pd.read_csv('data/stops.txt')

In [4]:
stop_radius = 0.001
batch_size = 500000 # more should be faster, too much will overload ram

df.loc[:, 'at_stop'] = False
df.loc[:, 'stop'] = None
df.loc[:, 'stop_dist'] = 0. # for dubugging

stop_coords = stops[['stop_lat', 'stop_lon']].to_numpy()

for i in tqdm(range(0, len(df), batch_size)):
    batch = df.iloc[i:i + batch_size]
    coords = batch[['x', 'y']].to_numpy()
    dists = cdist(coords, stop_coords)
    at_stop = dists.min(1) < stop_radius
    batch.loc[:, 'at_stop'] = at_stop
    batch.loc[at_stop, 'stop'] = dists.argmin(1)[at_stop]
    batch.loc[at_stop, 'stop_dist'] = dists.min(1)[at_stop] # for debugging
    df.iloc[i:i + batch_size] = batch

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
100%|██████████| 7/7 [00:53<00:00,  7.57s/it]


In [5]:
df = df[df['at_stop']]
df['stop_name'] = df['stop'].apply(lambda x: stops['stop_name'][x])

In [6]:
def find_changes(x: pd.Series):
    return x.ne(x.shift())

In [7]:
df[df['k'] == df['k'][0]]['stop'].apply(lambda x: stops.iloc[x][['stop_id','stop_name']])[:15]

Unnamed: 0,stop_id,stop_name
0,3891,Hallera
144,3891,Hallera
288,3891,Hallera
432,3891,Hallera
577,3903,Hallera
720,3903,Hallera
863,3904,Hallera
1006,3904,Hallera
1149,3904,Hallera
1293,3814,Racławicka (szkoła)


In [10]:
trips_cols = ['source_name', 'destination_name', 'start_time', 'end_time', 'line', 'k']
trips = []

In [11]:
for k in tqdm(df['k'].unique()):
    k_mask = df['k'] == k
    changes = find_changes(df[k_mask]['stop_name'])
    idxs = np.flatnonzero(changes)
    new_rows = []
    for i in idxs[1:]:
        prev_row, cur_row = df.loc[k_mask].iloc[i-1], df.loc[k_mask].iloc[i]
        if prev_row['name'] == cur_row['name']:
            new_rows.append([prev_row['stop_name'], cur_row['stop_name'], prev_row['date'], cur_row['date'], cur_row['name'], k])
    new_rows = pd.DataFrame(new_rows, columns=trips_cols)
    trips.append(new_rows)
trips = pd.concat(trips, ignore_index=True)

100%|██████████| 10144/10144 [07:51<00:00, 21.51it/s]


In [12]:
trips

Unnamed: 0,source_name,destination_name,start_time,end_time,line,k
0,Hallera,Racławicka (szkoła),2022-01-08 23:17:22+01:00,2022-01-08 23:17:32+01:00,126,19246880
1,Racławicka (szkoła),Modlińska,2022-01-08 23:18:02+01:00,2022-01-08 23:18:52+01:00,126,19246880
2,Modlińska,Wawrzyniaka,2022-01-08 23:19:42+01:00,2022-01-08 23:20:52+01:00,126,19246880
3,Wawrzyniaka,Chłodna,2022-01-08 23:21:22+01:00,2022-01-08 23:21:33+01:00,126,19246880
4,Chłodna,Sowia,2022-01-08 23:22:22+01:00,2022-01-08 23:22:32+01:00,126,19246880
...,...,...,...,...,...,...
252737,rondo Św. Ojca Pio,Ożynowa,2022-01-12 01:17:34+01:00,2022-01-12 01:18:14+01:00,259,19256282
252738,Ożynowa,Malinowa,2022-01-12 01:18:54+01:00,2022-01-12 01:19:04+01:00,259,19256282
252739,Malinowa,Wyścigowa,2022-01-12 01:19:04+01:00,2022-01-12 01:19:44+01:00,259,19256282
252740,Wyścigowa,PARK POŁUDNIOWY,2022-01-12 01:20:15+01:00,2022-01-12 01:20:24+01:00,259,19256282


In [13]:
# trips.to_csv('data/recorded_trips.csv')

In [14]:
# trips = pd.read_csv('data/recorded_trips.csv', index_col=0)

In [15]:
trips

Unnamed: 0,source_name,destination_name,start_time,end_time,line,k
0,Hallera,Racławicka (szkoła),2022-01-08 23:17:22+01:00,2022-01-08 23:17:32+01:00,126,19246880
1,Racławicka (szkoła),Modlińska,2022-01-08 23:18:02+01:00,2022-01-08 23:18:52+01:00,126,19246880
2,Modlińska,Wawrzyniaka,2022-01-08 23:19:42+01:00,2022-01-08 23:20:52+01:00,126,19246880
3,Wawrzyniaka,Chłodna,2022-01-08 23:21:22+01:00,2022-01-08 23:21:33+01:00,126,19246880
4,Chłodna,Sowia,2022-01-08 23:22:22+01:00,2022-01-08 23:22:32+01:00,126,19246880
...,...,...,...,...,...,...
252737,rondo Św. Ojca Pio,Ożynowa,2022-01-12 01:17:34+01:00,2022-01-12 01:18:14+01:00,259,19256282
252738,Ożynowa,Malinowa,2022-01-12 01:18:54+01:00,2022-01-12 01:19:04+01:00,259,19256282
252739,Malinowa,Wyścigowa,2022-01-12 01:19:04+01:00,2022-01-12 01:19:44+01:00,259,19256282
252740,Wyścigowa,PARK POŁUDNIOWY,2022-01-12 01:20:15+01:00,2022-01-12 01:20:24+01:00,259,19256282


In [16]:
routes = pd.read_csv('routes_df.csv')

In [17]:
def name_id_split(s):
    l = s.split('-')
    return ''.join(l[:-1]), l[-1]

routes['source_name'] = routes['source'].apply(lambda x: name_id_split(x)[0])
routes['source_id'] = routes['source'].apply(lambda x: name_id_split(x)[1])
routes['target_name'] = routes['target'].apply(lambda x: name_id_split(x)[0])
routes['target_id'] = routes['target'].apply(lambda x: name_id_split(x)[1])

In [18]:
source_ids = []
destination_ids = []

for row in tqdm(trips.itertuples()):
    df2 = routes[(routes['route_id'] == row.line) & (routes['source_name'] == row.source_name) & (routes['target_name'] == row.destination_name)]
    try:
        row2 = df2.iloc[0]
        source_ids.append(row2['source_id'])
        destination_ids.append(row2['target_id'])
    except IndexError:
        # print(row)
        source_ids.append(None)
        destination_ids.append(None)

trips['source_id'] = pd.Series(source_ids)
trips['destination_id'] = pd.Series(destination_ids)

252742it [04:25, 952.92it/s]


In [19]:
trips['likely_incorrect'] = trips['source_id'].isna()

In [32]:
# trips['start_time'] = trips['start_time'].apply(datetime.fromisoformat)
# trips['end_time'] = trips['end_time'].apply(datetime.fromisoformat)

In [34]:
for k in tqdm(trips['k'].values):
    ktrips = trips.loc[trips['k'] == k]

    starts = ktrips.loc[:, 'start_time']
    ends = ktrips.loc[:, 'end_time']

    for i in range(len(ktrips) - 1):
        # row1 = trips.iloc[i]
        # row2 = trips.iloc[i+1]
        delta = starts.iat[i+1] - ends.iat[i]
        if delta < timedelta(minutes=2):
            ends.iat[i] += delta / 2
            starts.iat[i+1] -= delta / 2
        else:
            ends.iat[i] += timedelta(seconds=15)
            starts.iat[i+1] -= timedelta(seconds=15)

    starts.iat[0] -= timedelta(seconds=15)
    ends.iat[len(ends)-1] += timedelta(seconds=15)

100%|██████████| 252742/252742 [47:03<00:00, 89.50it/s] 


In [35]:
trips['duration'] = trips['end_time'] - trips['start_time']

In [41]:
trips.iloc[0:10]

Unnamed: 0,source_name,destination_name,start_time,end_time,line,k,source_id,destination_id,likely_incorrect,duration
0,Hallera,Racławicka (szkoła),2022-01-08 23:17:22+01:00,2022-01-08 23:17:32+01:00,126,19246880,11303,11545,False,0 days 00:00:10
1,Racławicka (szkoła),Modlińska,2022-01-08 23:18:02+01:00,2022-01-08 23:18:52+01:00,126,19246880,11545,11543,False,0 days 00:00:50
2,Modlińska,Wawrzyniaka,2022-01-08 23:19:42+01:00,2022-01-08 23:20:52+01:00,126,19246880,11543,16518,False,0 days 00:01:10
3,Wawrzyniaka,Chłodna,2022-01-08 23:21:22+01:00,2022-01-08 23:21:33+01:00,126,19246880,16518,16516,False,0 days 00:00:11
4,Chłodna,Sowia,2022-01-08 23:22:22+01:00,2022-01-08 23:22:32+01:00,126,19246880,16516,16514,False,0 days 00:00:10
5,Sowia,KRZYKI,2022-01-08 23:23:22+01:00,2022-01-08 23:23:43+01:00,126,19246880,16514,11314,False,0 days 00:00:21
6,KRZYKI,PARK POŁUDNIOWY,2022-01-08 23:24:53+01:00,2022-01-08 23:25:33+01:00,126,19246880,11314,11367,False,0 days 00:00:40
7,PARK POŁUDNIOWY,Wyścigowa,2022-01-08 23:26:03+01:00,2022-01-08 23:26:13+01:00,126,19246880,11367,16521,False,0 days 00:00:10
8,Na Szańcach,Jedności Narodowej,2022-01-08 23:16:52+01:00,2022-01-08 23:17:02+01:00,11,19227664,20623,20625,False,0 days 00:00:10
9,Jedności Narodowej,Nowowiejska,2022-01-08 23:17:32+01:00,2022-01-08 23:17:42+01:00,11,19227664,20625,20627,False,0 days 00:00:10


In [42]:
trips2 = pd.read_csv('data/recorded_trips.csv')

In [46]:
trips.drop('duration', axis=1)

Unnamed: 0,source_name,destination_name,start_time,end_time,line,k,source_id,destination_id,likely_incorrect
0,Hallera,Racławicka (szkoła),2022-01-08 23:17:07+01:00,2022-01-08 23:17:47+01:00,126,19246880,11303,11545,False
1,Racławicka (szkoła),Modlińska,2022-01-08 23:17:47+01:00,2022-01-08 23:19:17+01:00,126,19246880,11545,11543,False
2,Modlińska,Wawrzyniaka,2022-01-08 23:19:17+01:00,2022-01-08 23:21:07+01:00,126,19246880,11543,16518,False
3,Wawrzyniaka,Chłodna,2022-01-08 23:21:07+01:00,2022-01-08 23:21:57.500000+01:00,126,19246880,16518,16516,False
4,Chłodna,Sowia,2022-01-08 23:21:57.500000+01:00,2022-01-08 23:22:57+01:00,126,19246880,16516,16514,False
...,...,...,...,...,...,...,...,...,...
252737,rondo Św. Ojca Pio,Ożynowa,2022-01-12 01:17:24+01:00,2022-01-12 01:18:34+01:00,259,19256282,26340,26328,False
252738,Ożynowa,Malinowa,2022-01-12 01:18:34+01:00,2022-01-12 01:19:04+01:00,259,19256282,26328,26326,False
252739,Malinowa,Wyścigowa,2022-01-12 01:19:04+01:00,2022-01-12 01:19:59.500000+01:00,259,19256282,26326,16522,False
252740,Wyścigowa,PARK POŁUDNIOWY,2022-01-12 01:19:59.500000+01:00,2022-01-12 01:20:24+01:00,259,19256282,,,True


In [45]:
trips['start_time'] = trips2['start_time']
trips['end_time'] = trips2['end_time']

In [47]:
trips.to_csv('data/recorded_trips.csv')