In [None]:
import pandas as pd
import pyreadr
import geopy.distance
from tqdm.notebook import tqdm
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point
import contextily as ctx
import os
from datetime import datetime


In [None]:
# Paths to the pickled nextbike rent/return event logs (one per city).
filename_DD, filename_FB = (
    '../data/nextbike/rents_returns_by_bike_Dresden_01-03_09-10.24.pkl',
    '../data/nextbike/rents_returns_by_bike_Freiburg_06-07.23_09-10.24.pkl',
)
# NOTE(security): unpickling executes arbitrary code; only load files you produced yourself.
df_DD, df_FB = (pd.read_pickle(pickle_path) for pickle_path in (filename_DD, filename_FB))

In [None]:
# Notebook display settings: show every column, up to 100 rows, and wide cell contents.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
pd.options.display.max_colwidth = 400

In [None]:
# Peek at the first five Dresden events to check the schema.
df_DD.head(n=5)

In [None]:
# Remove columns that are not needed for trip building; errors="ignore" keeps the cell re-runnable.
df_DD = df_DD.drop(["maintenance", "time_stamp"], axis="columns", errors="ignore")

In [None]:
# Restrict the Dresden log to one bike (930034) and split it into rent and return events.
df_rents_copy = df_DD[(df_DD["rent_or_return"] == "rent") & (df_DD["bike_number"] == 930034)]
df_return_copy = df_DD[(df_DD["rent_or_return"] == "returns") & (df_DD["bike_number"] == 930034)]

# Chronological order per bike.
rents = df_rents_copy.sort_values(['bike_number', 'datetime'])
returns = df_return_copy.sort_values(['bike_number', 'datetime'])

In [None]:
# Number of rent events for the selected bike.
rents.shape[0]

In [None]:
# Number of return events for the selected bike.
returns.shape[0]

In [None]:
# First three rent events of the selected bike.
rents.iloc[:3]

In [None]:
# First three return events of the selected bike.
returns.iloc[:3]

In [None]:
# Give each side its own timestamp column name so both survive the as-of merge.
rents = rents.rename({"datetime": "datetime_rent"}, axis="columns")
returns = returns.rename({"datetime": "datetime_return"}, axis="columns")

In [None]:
# Pair every rent with the next return of the same bike:
# direction="forward" searches later timestamps, and allow_exact_matches=False
# excludes a return carrying the very same timestamp as the rent.
df_trips = pd.merge_asof(
    left=rents,
    right=returns,
    left_on="datetime_rent",
    right_on="datetime_return",
    by="bike_number",
    direction="forward",
    allow_exact_matches=False,
    suffixes=("_rent", "_return"),
)

In [None]:
# Sanity check: rent timestamps from merge_asof match the loop-based reference.
# NOTE: `trips` is filled by the row-by-row loop cell further down; run that cell first.
(df_trips["datetime_rent"] == pd.DataFrame(trips)["datetime_rent"]).all()

In [None]:
# Sanity check: return timestamps from merge_asof match the loop-based reference.
# NOTE: `trips` is filled by the row-by-row loop cell further down; run that cell first.
(df_trips["datetime_return"] == pd.DataFrame(trips)["datetime_return"]).all()

In [None]:
# Rent/return timestamps of the first merge_asof trips.
df_trips.head()[['datetime_rent', 'datetime_return']]

In [None]:
# Rent/return timestamps of the first loop-based trips
# (`df_trips_loop` is created in a cell further down).
df_trips_loop.head()[['datetime_rent', 'datetime_return']]

In [None]:
# Do both approaches produce the same number of rows?
df_trips.shape[0] == df_trips_loop.shape[0]

In [None]:
# Boolean mask for loop-based trips rented on 2024-01-15
# (presumably `date_rent` holds datetime.date objects - TODO confirm).
flt = df_trips_loop["date_rent"] == pd.to_datetime("2024-01-15").date()
df_trips_loop.loc[flt][['datetime_rent', 'datetime_return']]

In [None]:
# Same date filter applied to the merge_asof result.
# NOTE: `flt` was built from df_trips_loop, so it is applied to df_trips by index label.
df_trips.loc[flt][['datetime_rent', 'datetime_return']]

In [None]:
# Parse the timestamp under investigation; the lookup cells below compare against it.
pd.to_datetime("2024-01-15 07:07:00")

In [None]:
# Rent events of the selected bike that happened exactly at 2024-01-15 07:07:00.
rents[rents["datetime_rent"] == pd.to_datetime("2024-01-15 07:07:00")]

In [None]:
# All raw events (rents and returns) of bike 930034 at 2024-01-15 07:07:00.
df_DD[(df_DD["datetime"] == pd.to_datetime("2024-01-15 07:07:00")) & (df_DD["bike_number"] == 930034)]

In [None]:
# All raw events of bike 930034 on 2024-01-15
# (presumably `date` holds datetime.date objects - TODO confirm).
df_DD[(df_DD["date"] == pd.to_datetime("2024-01-15").date()) & (df_DD["bike_number"] == 930034)]

In [None]:
# All raw events of bike 930034 at 2024-01-15 07:07:00.
# Fix: removed the stray trailing backticks that made this cell a SyntaxError.
df_DD.loc[(df_DD.datetime == pd.to_datetime("2024-01-15 07:07:00")) & (df_DD.bike_number == 930034)]

In [None]:
# Tiny demo of Series.all(): it is only True when every element is True.
tmp_series = pd.Series(data=[True, False])
tmp_series.all()

In [None]:
# Counterpart demo: Series.any() is True as soon as one element is True.
tmp_series.any()

In [None]:
# Reference implementation: build trips row by row for the rents in
# `df_rents_copy`, pairing each rent with the earliest strictly-later return
# of the same bike. Used to cross-check the vectorised merge_asof result.
trips = []

# Only "returns" events can close a trip; filter once instead of re-scanning
# the full event log on every iteration.
df_returns_all = df_DD.loc[df_DD.rent_or_return == "returns"]

for i, rent_row in tqdm(df_rents_copy.iterrows(), total=len(df_rents_copy)):  # rent_row is a pandas Series
    bike_number = rent_row['bike_number']
    rent_time = rent_row['datetime']

    bike_returns = df_returns_all.loc[df_returns_all.bike_number == bike_number]
    return_time = bike_returns.loc[bike_returns.datetime > rent_time, 'datetime'].min()
    # .min() of an empty selection is missing (NaT/NaN): this rent has no later return.
    if pd.isna(return_time):
        continue

    return_row = bike_returns.loc[bike_returns.datetime == return_time]
    # A real message instead of `print(...)`, which evaluates to None.
    assert len(return_row) == 1, (
        f"expected exactly one return row at {return_time}, got {len(return_row)}:\n{return_row}"
    )
    return_row = return_row.iloc[0]

    # Drop the event-type flag; bike_number is kept once, on the rent side.
    rent_row = rent_row.drop("rent_or_return")
    return_row = return_row.drop(["rent_or_return", "bike_number"])

    # Suffix the field names so both sides can live in one record.
    rent_row = rent_row.rename(lambda x: x if x == "bike_number" else x + "_rent")
    return_row = return_row.rename(lambda x: x + "_return")
    trips.append(pd.concat([rent_row, return_row]))


In [None]:
# Stack the per-trip Series collected by the loop into one DataFrame (one row per trip).
df_trips_loop = pd.DataFrame(data=trips)

# For the whole dataset

In [None]:
def group_minutes(x):
    """Return the duration-bin label for a trip length given in minutes.

    Bins are half-open on the right: values below 10 map to 'under 10 min',
    [10, 30) to '10-30 min', [30, 60) to '30 min-1 hour', [60, 1440) to
    '1-24 hours'. Anything that matches none of these (1440 and above, or a
    value such as NaN for which every comparison is False) maps to
    'more than 24 hours'.
    """
    if x < 10:
        return 'under 10 min'
    if 10 <= x < 30:
        return '10-30 min'
    if 30 <= x < 60:
        return '30 min-1 hour'
    if 60 <= x < 1440:
        return '1-24 hours'
    return 'more than 24 hours'
    

# Save

In [None]:
# Persist both trip tables as pickles, tagged with the current timestamp so
# earlier exports are not overwritten.
# NOTE: df_trips_DD / df_trips_FB are built in the cells below; run those first.
# NOTE(review): the file names contain a space before the timestamp, and the
# variable `time` would shadow the stdlib module of the same name if it were imported.
time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
df_trips_DD.to_pickle(f'../data/nextbike/trips_Dresden {time}.pkl')
df_trips_FB.to_pickle(f'../data/nextbike/trips_Freiburg {time}.pkl')

# How many outliers were cut

In [None]:
# Quantify how many Dresden trips the 24-hour cut-off removes.
# NOTE: relies on df_rents_DD / df_returns_DD, which are prepared in the
# full-dataset Dresden cell further down; run that cell first.
df_trips_DD = pd.merge_asof(
    df_rents_DD,
    df_returns_DD,
    by="bike_number",
    left_on="datetime_rent",
    right_on="datetime_return",
    suffixes=("_rent", "_return"),
    allow_exact_matches=False,
    direction="forward"
)

# Rents without any later return cannot form a trip.
df_trips_DD = df_trips_DD.dropna(subset=["datetime_return"])
df_trips_DD['duration'] = df_trips_DD['datetime_return'] - df_trips_DD['datetime_rent']

# Only the last expression of a cell is displayed, so intermediate results
# are printed explicitly instead of being silently discarded.
print(
    df_trips_DD.sort_values(by="duration", ascending=False)
    .head()[["bike_number", "datetime_rent", "datetime_return", "duration"]]
    .to_string()
)

# Trips of 150 days or more are dropped before the baseline is counted,
# so they are not part of the outlier share below.
df_trips_DD = df_trips_DD.loc[~(df_trips_DD.duration.dt.days >= 150)]
initial_len = len(df_trips_DD)
df_trips_DD = df_trips_DD.loc[~(df_trips_DD.duration.dt.days >= 1)]
print(f"trips before 24h cut-off: {initial_len}")
print(f"trips removed by 24h cut-off: {initial_len - len(df_trips_DD)}")

# Share of removed trips in percent (NaN instead of ZeroDivisionError when there are no trips).
(initial_len - len(df_trips_DD)) / initial_len * 100 if initial_len else float("nan")

In [None]:
# Quantify how many Freiburg trips the 24-hour cut-off removes.
# NOTE: relies on df_rents_FB / df_returns_FB, which are prepared in the
# full-dataset Freiburg cell further down; run that cell first.
df_trips_FB = pd.merge_asof(
    df_rents_FB,
    df_returns_FB,
    by="bike_number",
    left_on="datetime_rent",
    right_on="datetime_return",
    suffixes=("_rent", "_return"),
    allow_exact_matches=False,
    direction="forward"
)

# Rents without any later return cannot form a trip.
df_trips_FB = df_trips_FB.dropna(subset=["datetime_return"])
df_trips_FB['duration'] = df_trips_FB['datetime_return'] - df_trips_FB['datetime_rent']

# Only the last expression of a cell is displayed, so intermediate results
# are printed explicitly instead of being silently discarded.
print(
    df_trips_FB.sort_values(by="duration", ascending=False)
    .head()[["bike_number", "datetime_rent", "datetime_return", "duration"]]
    .to_string()
)

# Trips of 150 days or more are dropped before the baseline is counted,
# so they are not part of the outlier share below.
df_trips_FB = df_trips_FB.loc[~(df_trips_FB.duration.dt.days >= 150)]
initial_len = len(df_trips_FB)
df_trips_FB = df_trips_FB.loc[~(df_trips_FB.duration.dt.days >= 1)]
print(f"trips before 24h cut-off: {initial_len}")
print(f"trips removed by 24h cut-off: {initial_len - len(df_trips_FB)}")

# Share of removed trips in percent (NaN instead of ZeroDivisionError when there are no trips).
(initial_len - len(df_trips_FB)) / initial_len * 100 if initial_len else float("nan")


In [None]:
# Build rent -> return trips for all Dresden bikes and bin them by duration.
# Requires `group_minutes` to be defined.
df_DD = df_DD.drop(columns=["maintenance", "time_stamp"], errors="ignore")
df_rents_DD = df_DD.loc[df_DD.rent_or_return == "rent"]
df_returns_DD = df_DD.loc[df_DD.rent_or_return == "returns"]

# merge_asof needs both frames sorted by their "on" key.
df_rents_DD = df_rents_DD.sort_values(by=['datetime', 'bike_number'])
# Fix: sort the filtered returns. Sorting df_DD here threw the "returns" filter
# away, so a rent could be matched to the next *rent* event of the same bike.
df_returns_DD = df_returns_DD.sort_values(by=['datetime', 'bike_number'])
df_rents_DD = df_rents_DD.rename(columns={"datetime": "datetime_rent"})
df_returns_DD = df_returns_DD.rename(columns={"datetime": "datetime_return"})

# `start` / `end` bracket the merge so its run time can be inspected (end - start).
start = datetime.now()
df_trips_DD = pd.merge_asof(
    df_rents_DD,
    df_returns_DD,
    by="bike_number",
    left_on="datetime_rent",
    right_on="datetime_return",
    suffixes=("_rent", "_return"),
    allow_exact_matches=False,  # a return with the identical timestamp does not close the rent
    direction="forward"         # search later timestamps for the match
)
end = datetime.now()

# The event-type flags are redundant once rents and returns are paired.
df_trips_DD = df_trips_DD.drop(columns=["rent_or_return_rent", "rent_or_return_return"], errors="ignore")
# Rents without any later return cannot form a trip.
df_trips_DD = df_trips_DD.dropna(subset=["datetime_return"])
df_trips_DD['duration'] = df_trips_DD['datetime_return'] - df_trips_DD['datetime_rent']
# Keep trips shorter than one full day; .copy() so the column assignments
# below write to an independent frame rather than a slice.
df_trips_DD = df_trips_DD.loc[~(df_trips_DD.duration.dt.days >= 1)].copy()
# Whole minutes, truncated.
df_trips_DD['duration_min'] = (df_trips_DD.duration.dt.total_seconds() / 60).astype(int)
df_trips_DD['duration_min_bin'] = df_trips_DD.duration_min.apply(group_minutes)

In [None]:
# Build rent -> return trips for all Freiburg bikes and bin them by duration.
# Requires `group_minutes` to be defined.
df_FB = df_FB.drop(columns=["maintenance", "time_stamp"], errors="ignore")
df_rents_FB = df_FB.loc[df_FB.rent_or_return == "rent"]
df_returns_FB = df_FB.loc[df_FB.rent_or_return == "returns"]

# merge_asof needs both frames sorted by their "on" key.
df_rents_FB = df_rents_FB.sort_values(by=['datetime', 'bike_number'])
# Fix: sort the filtered returns. Sorting df_FB here threw the "returns" filter
# away, so a rent could be matched to the next *rent* event of the same bike.
df_returns_FB = df_returns_FB.sort_values(by=['datetime', 'bike_number'])
df_rents_FB = df_rents_FB.rename(columns={"datetime": "datetime_rent"})
df_returns_FB = df_returns_FB.rename(columns={"datetime": "datetime_return"})

# `start` / `end` bracket the merge so its run time can be inspected (end - start).
start = datetime.now()
df_trips_FB = pd.merge_asof(
    df_rents_FB,
    df_returns_FB,
    by="bike_number",
    left_on="datetime_rent",
    right_on="datetime_return",
    suffixes=("_rent", "_return"),
    allow_exact_matches=False,  # a return with the identical timestamp does not close the rent
    direction="forward"         # search later timestamps for the match
)
end = datetime.now()

# The event-type flags are redundant once rents and returns are paired.
df_trips_FB = df_trips_FB.drop(columns=["rent_or_return_rent", "rent_or_return_return"], errors="ignore")
# Rents without any later return cannot form a trip.
df_trips_FB = df_trips_FB.dropna(subset=["datetime_return"])
df_trips_FB['duration'] = df_trips_FB['datetime_return'] - df_trips_FB['datetime_rent']
# Keep trips shorter than one full day; .copy() so the column assignments
# below write to an independent frame rather than a slice.
df_trips_FB = df_trips_FB.loc[~(df_trips_FB.duration.dt.days >= 1)].copy()
# Whole minutes, truncated.
df_trips_FB['duration_min'] = (df_trips_FB.duration.dt.total_seconds() / 60).astype(int)
df_trips_FB['duration_min_bin'] = df_trips_FB.duration_min.apply(group_minutes)


In [None]:
# Percentage of Freiburg trips falling into each duration bin.
grouping_FB = df_trips_FB["duration_min_bin"].value_counts(normalize=True).mul(100)

In [None]:
# Display order of the duration bins, shortest first.
category_order = [
    'under 10 min',
    '10-30 min',
    '30 min-1 hour',
    '1-24 hours',
]


In [None]:
# Inspect which duration-bin labels are present in the Freiburg shares.
grouping_FB.index

In [None]:
# Reorder the Freiburg shares from shortest to longest bin
# (raises KeyError if a bin listed in category_order is absent).
grouping_FB = grouping_FB[category_order]

In [None]:
# Side-by-side bar charts of the trip-duration shares for both cities.
plt.figure(figsize=(10, 5))

# Dresden shares are derived here; Freiburg shares (grouping_FB) come from the cells above.
grouping_DD = df_trips_DD.duration_min_bin.value_counts(normalize=True) * 100
grouping_DD = grouping_DD.loc[category_order]

for panel_no, (city_label, shares) in enumerate(
    (('Dresden', grouping_DD), ('Freiburg', grouping_FB)), start=1
):
    plt.subplot(1, 2, panel_no)
    ax = shares.plot(kind='bar', color="midnightblue")
    # Annotate each bar with its value, rounded to one decimal.
    for x, y in enumerate(shares):
        ax.text(x, y, f'{y:.1f}', ha='center', va='bottom', fontsize=10)
    plt.xlabel(city_label)
    plt.ylabel('Share, %')

# NOTE(review): absolute, machine-specific output path; the directory must already exist.
plt.savefig(f'/Users/v.sinichenko/Downloads/plots/trip_duration_bar.png', format='png', bbox_inches='tight')
plt.show()

In [None]:
# Number of Dresden trips, in thousands.
df_trips_DD.shape[0] / 1000

In [None]:
# Number of Freiburg trips, in thousands.
df_trips_FB.shape[0] / 1000

In [None]:
# (df_trips_DD.duration.dt.total_seconds() / 60).plot(kind='hist')

In [None]:
# end-start

In [None]:
# len(df_trips_DD)

In [None]:
# len(df_rents_DD)


In [None]:
# (~df_trips_DD.datetime_return.isna()).sum()

In [None]:
# df_DD[df_DD.bike_number == 930686]

In [None]:
# df_trips_DD[(df_trips_DD.datetime_return.isna())].date_rent.value_counts()

In [None]:
# df_trips_DD[(df_trips_DD.datetime_return.isna()) & (df_trips_DD.bike_number == 930686)]

In [None]:
# df_trips_DD[df_trips_DD.bike_number == 930034]