In [40]:
import pandas as pd
from pandas_profiling import ProfileReport
from pathlib import Path

In [41]:
#import data, this may take some time
gdf = pd.read_parquet(Path('full_data_cleaned_fixed.parquet'))

In [42]:
#dataset size:
print('Dataset size',gdf.size)
print('Data options',gdf.columns)

Dataset size 726963399
Data options Index(['Rental Id', 'Duration', 'Bike Id', 'End Date', 'EndStation Id',
       'EndStation Name', 'Start Date', 'StartStation Id',
       'StartStation Name'],
      dtype='object')


In [43]:
#list all possible stations
end_list = gdf['EndStation Name'].unique()
start_list = gdf['StartStation Name'].unique()

#### Önálló csomók a gráfban
Olyan pontokat, megállókat keresek, amikbe egyirányú a forgalom, azaz a biciklik onnan csak kijönnek, vagy csak bemennek
Ilyenből pont van 3 olyan állomás, ahova csak bemennek a biciklik, ezek karbantartási jellegű állomások.

In [44]:
one_way_list = list()
print("Egyedi végállomás, ahonnan senki sem idult tovább (soha):")
for i in end_list:
    if i not in start_list:
        print('\t',i)
        one_way_list.append(i)

print("Egyedi indulási állomás, ahova senki sem érkezett (soha):")
for i in start_list:
    if i not in end_list:
        print('\t',i)
        one_way_list.append(i)

print("Végállomások száma:" , len(end_list))
print("Kezdőállomások száma:" , len(start_list))

Egyedi végállomás, ahonnan senki sem idult tovább (soha):
	 Mechanical Workshop Clapham
	 None
	 Electrical Workshop PS
	 Canada Water Station
Egyedi indulási állomás, ahova senki sem érkezett (soha):
Végállomások száma: 949
Kezdőállomások száma: 945


In [45]:
print("Has None", None in gdf['EndStation Name'])
print("Has None", None in gdf['StartStation Name'])

Has None False
Has None False


In [46]:
# select one way results
one_way = gdf[gdf['EndStation Name'].isin(one_way_list)]

In [47]:
print('one way size:',one_way.size)
one_way.head()

one way size: 1512225


       Rental Id  Duration  Bike Id            End Date  EndStation Id  \
118     50754348    456120    11903 2016-01-15 07:25:00            434   
984     50755271      1260     5954 2016-10-01 09:01:00           <NA>   
7996    50762735    405720    13200 2016-01-15 07:23:00            434   
11847   50766802      1560     5954 2016-10-01 18:11:00           <NA>   
18796   50774060      1620    12072 2016-11-01 09:16:00           <NA>   

                   EndStation Name          Start Date  StartStation Id  \
118    Mechanical Workshop Clapham 2016-10-01 00:43:00              706   
984                           None 2016-10-01 08:40:00              239   
7996   Mechanical Workshop Clapham 2016-10-01 14:41:00              775   
11847                         None 2016-10-01 17:45:00              594   
18796                         None 2016-11-01 08:49:00              334   

                         StartStation Name  
118             Snowsfields, London Bridge  
984          W

In [48]:
def mini_duration_stat(df:pd.DataFrame)->None:
    print('IQR',f"({df['Duration'].quantile(.25)},{df['Duration'].quantile(.5)},{df['Duration'].quantile(.75)})")
    print('min',df['Duration'].min(),"max",df["Duration"].max())

###

#### Invalid leadások
Mivel az adatszettez nem tartozik magyarázat, ezért nem tudni pontosan, milyen esetek tartoznak abba, amikor a leadásnak a helyszíne invalid.
Tapasztalataimból kiindulva ez a gyűjtőpoton kívüli leadás lehet.

In [49]:
invalid_dropoff = one_way[one_way['EndStation Id'].isna()]
invalid_dropoff.size

1505493

In [50]:
mini_duration_stat(invalid_dropoff)

IQR (660,1020,1320)
min 60 max 1182300


In [51]:
service_routes = one_way[~one_way['EndStation Id'].isna()]
service_routes.size

6732

In [52]:
mini_duration_stat(service_routes)

IQR (177345.0,369120.0,740325.0)
min 60 max 2533620


In [53]:
valid_routes = gdf[~gdf['EndStation Name'].isin(one_way_list)]

In [54]:
import bamboolib as bam
sampled = valid_routes.sample(10000)
sampled

        Rental Id  Duration  Bike Id            End Date  EndStation Id  \
202379   41430085       900     1778 2015-02-25 15:29:00            288   
178811   70471374       720     7339 2017-10-15 23:34:00            170   
389773   46023251       180    13172 2015-07-23 08:26:00            498   
52783    72347671       600     2611 2018-01-01 20:44:00            227   
103690   52647084       120    13453 2016-05-04 06:35:00            379   
...           ...       ...      ...                 ...            ...   
207565   97597784     24120    12254 2020-05-30 18:23:00            484   
457085   47106603       360     1124 2015-08-21 16:58:00            352   
8717    119916614       780    18745 2022-09-05 16:12:00            251   
140571   51797111       360    10949 2016-02-27 13:30:00            214   
1851     88042483       720     5190 2019-06-19 14:27:00            153   

                            EndStation Name          Start Date  \
202379           Elizabeth Bridg

In [55]:
invalid_dropoff

        Rental Id  Duration  Bike Id            End Date  EndStation Id  \
984      50755271      1260     5954 2016-10-01 09:01:00           <NA>   
11847    50766802      1560     5954 2016-10-01 18:11:00           <NA>   
18796    50774060      1620    12072 2016-11-01 09:16:00           <NA>   
80371    50838789       540     2092 2016-01-13 19:47:00           <NA>   
82208    50840709       900     6251 2016-01-13 22:16:00           <NA>   
...           ...       ...      ...                 ...            ...   
275570   47442702       600    10283 2015-02-09 15:56:00           <NA>   
298575   47465758      1260    12101 2015-03-09 09:40:00           <NA>   
224350   47739138       600     4455 2015-11-09 17:49:00           <NA>   
262279   47779054       480     8028 2015-12-09 22:53:00           <NA>   
402649   47924790      1140     9558 2015-09-18 09:11:00           <NA>   

       EndStation Name          Start Date  StartStation Id  \
984               None 2016-10-01 08

In [56]:
bam.plot(invalid_dropoff, 'StartStation Name')

TabSection(children=(BrowserCheck(), HBox(children=(Tab(closable=False, title='plot', _dom_classes=('bamboolib…

In [57]:
service_routes

        Rental Id  Duration  Bike Id            End Date  EndStation Id  \
118      50754348    456120    11903 2016-01-15 07:25:00            434   
7996     50762735    405720    13200 2016-01-15 07:23:00            434   
181099   50944517    542460      759 2016-01-25 14:47:00            434   
50628    51106908    246960    10767 2016-01-29 06:14:00            434   
75340    51132789    423600     7910 2016-01-02 14:47:00            434   
...           ...       ...      ...                 ...            ...   
177946   47690164    110820     4218 2015-11-09 18:40:00            434   
191652   47704703    503280     3037 2015-09-16 14:03:00            434   
235588   47750706    417660      870 2015-09-16 22:01:00            434   
279430   47797716    251100    11827 2015-09-16 13:59:00            434   
280080   47797358    714720    12235 2015-09-21 22:37:00            434   

                    EndStation Name          Start Date  StartStation Id  \
118     Mechanical Work

In [58]:
bam.plot(service_routes, 'EndStation Name')

TabSection(children=(BrowserCheck(), HBox(children=(Tab(closable=False, title='plot', _dom_classes=('bamboolib…

Az egyik legizgalmasabb invalid leadás a anada Water Station területén történt, ami egy metróállomás. Nem tisztázott, hogy ez hogyan történt, mindenesetre elszigetelt eset, mindössze egyszer történt meg.
Relatív gyakran előforul ugyanakkor, hogy negatív időratamot utaznak emberek.

In [59]:
# based on TfL it's a metro station
print(gdf[gdf["EndStation Name"]=='Canada Water Station'])
print(gdf[gdf["Duration"]<0])


       Rental Id  Duration  Bike Id            End Date  EndStation Id  \
34757  104637793      1680    17268 2020-12-22 19:30:00            844   

            EndStation Name          Start Date  StartStation Id  \
34757  Canada Water Station 2020-12-22 19:02:00              358   

                  StartStation Name  
34757  High Holborn , Covent Garden  
        Rental Id  Duration  Bike Id            End Date  EndStation Id  \
182402   49063704     -2880     1146 2015-10-25 01:00:00            464   
182407   49063563     -1500      551 2015-10-25 01:00:00            508   
182408   49063692     -2640     7467 2015-10-25 01:00:00             37   
182411   49063526     -1200     8354 2015-10-25 01:01:00            158   
182412   49063634     -1980     7159 2015-10-25 01:01:00             62   
...           ...       ...      ...                 ...            ...   
194041   70877104      -360    11391 2017-10-29 01:12:00            128   
196186   70877305     -2640     7744 2

In [60]:
overnight = gdf[gdf["Duration"]<0]
print(overnight.size)
overnight

2142


        Rental Id  Duration  Bike Id            End Date  EndStation Id  \
182402   49063704     -2880     1146 2015-10-25 01:00:00            464   
182407   49063563     -1500      551 2015-10-25 01:00:00            508   
182408   49063692     -2640     7467 2015-10-25 01:00:00             37   
182411   49063526     -1200     8354 2015-10-25 01:01:00            158   
182412   49063634     -1980     7159 2015-10-25 01:01:00             62   
...           ...       ...      ...                 ...            ...   
194041   70877104      -360    11391 2017-10-29 01:12:00            128   
196186   70877305     -2640     7744 2017-10-29 01:03:00            206   
196187   70877308     -2640     5760 2017-10-29 01:04:00            206   
202014   70877312     -2940    14205 2017-10-29 01:00:00             78   
204784   70877230     -1800    14705 2017-10-29 01:05:00            371   

                               EndStation Name          Start Date  \
182402  St. Mary & St. Michae

## Negative values:
these values are the result from misshandling the daylight saving clock change in 2015, 2016 and 2017.
source: 
- [2015 daylight change](https://www.timeanddate.com/time/change/uk/london?year=2015)
- [2016 daylight change](https://www.timeanddate.com/time/change/uk/london?year=2016)
- [2017 daylight change](https://www.timeanddate.com/time/change/uk/london?year=2017)

the date was changed while a ride was in progress, so the actual duration of the ride is longer with 60*60 seconds
this probably affects the other way around a

In [61]:
import datetime
valid_routes[valid_routes['Duration']<0]['Duration'] += 60*60
valid_routes[valid_routes['Duration']<0]['End Date'] += datetime.timedelta(hours=1)

Drop invalid dates, and add Duration in minutes for convinience, and Start hour, and End hour in time format, for convinient aggregation, and data visualization

In [62]:
invalid_dates = valid_routes[valid_routes['Start Date'].isna() | valid_routes['End Date'].isna()]
valid_routes = valid_routes[~(valid_routes['Start Date'].isna() | valid_routes['End Date'].isna())]
valid_routes["Duration minute"] = valid_routes["Duration"].floordiv(60)
valid_routes["Start hour"] = valid_routes["Start Date"].apply(lambda x:x.time())
valid_routes["End hour"] = valid_routes["End Date"].apply(lambda x:x.time())

In [63]:
from pandas_profiling import ProfileReport

report = ProfileReport(valid_routes.sample(1000000))
report

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [64]:
time_limit_12_hours = 60*60*6
valid_data_cleaned = valid_routes[valid_routes["Duration"] < time_limit_12_hours]
extra_long_rides = valid_routes[~(valid_routes["Duration"] < time_limit_12_hours)]

In [65]:
ext_long_report = ProfileReport(extra_long_rides)
ext_long_report

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [66]:
extra_long_rides

        Rental Id  Duration  Bike Id            End Date  EndStation Id  \
170      50754401    112680     3670 2016-11-01 08:19:00            640   
495      50754755     33000    12410 2016-10-01 12:08:00            695   
499      50754759     40920      893 2016-10-01 14:21:00            695   
512      50754772     22620     3130 2016-10-01 09:22:00            351   
513      50754773    106080     3532 2016-11-01 08:36:00            351   
...           ...       ...      ...                 ...            ...   
414764   47938576     54660     1137 2015-09-19 08:41:00            644   
415964   47939624     66420     3729 2015-09-19 12:13:00            572   
416451   47938688     35640     8705 2015-09-19 03:26:00            588   
418872   47942177     68340     2731 2015-09-19 13:29:00            688   
420062   47943958     22440     1768 2015-09-19 01:24:00            724   

                     EndStation Name          Start Date  StartStation Id  \
170     Silverthorne R

In [67]:
excess_time_travellers = valid_data_cleaned[valid_data_cleaned['Duration'] < 0]

In [68]:
valid_data_cleaned.to_parquet("full_data_cleaned.parquet")
print(valid_data_cleaned.size)

965485860
