### Imports

In [9]:
import pyreadr
import pandas as pd
from tqdm.notebook import tqdm
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
from datetime import datetime, timedelta

In [10]:
columns_to_keep = ["time_stamp", "bike_number", "bike", "uid", "lat", "lng", "name", "station_number", "maintenance"]

In [21]:
START_TIME_DD_1 = datetime(2024, 1, 1, 0, 1, 0) # the second filename
END_TIME_DD_1 = datetime(2024, 3, 31, 23, 59, 0) # the very last filename

START_TIME_DD_2 = datetime(2024, 9, 3, 17, 46, 0) # the second filename
END_TIME_DD_2 = datetime(2024, 10, 31, 23, 59, 0) # the very last filename

FOLDER_NAME_1 ="Dresden_01-03.24"
FOLDER_NAME_2 = "Dresden_09-10.24"

In [90]:
OUTPUT_FILE_PATH_1 = "../data/nextbike/rents_returns_by_bike_Dresden_01-03.24.pkl"
OUTPUT_FILE_PATH_2 = "../data/nextbike/rents_returns_by_bike_Dresden_09-10.24.pkl"
OUTPUT_FILE_PATH_BOTH = "../data/nextbike/rents_returns_by_bike_Dresden_01-03_09-10.24.pkl" 

In [13]:
def get_filename(time, FOLDER_NAME):
    return f'../nextbike_original_data/{FOLDER_NAME}/{time.strftime("%Y-%m-%d")}/{time.strftime("%Y-%m-%d-%H-%M-00")}.rds'

# Iterate over datasets to create df of rents and returns

In [22]:
def loop_over_time_period(START_TIME, END_TIME, FOLDER_NAME):
    rent_list = []
    return_list = []

    lag_time = START_TIME - timedelta(minutes=1)
    lag_filename = get_filename(lag_time, FOLDER_NAME)
    df_lag = pyreadr.read_r(lag_filename)[None][columns_to_keep]
    lag_bike_numbers = set(df_lag.bike_number)

    for current_time in tqdm(pd.date_range(start=START_TIME, end=END_TIME, freq="min")):
            current_filename = get_filename(current_time, FOLDER_NAME)
            try:
                    df_current = pyreadr.read_r(current_filename)[None][columns_to_keep]
                    current_bike_numbers = set(df_current.bike_number)

                    rented_bikes = df_lag[df_lag.bike_number.isin(lag_bike_numbers-current_bike_numbers)]
                    if not rented_bikes.empty:
                            rent_list.extend(rented_bikes.to_dict(orient="records"))

                    returned_bikes = df_current[df_current.bike_number.isin(current_bike_numbers-lag_bike_numbers)]
                    if not returned_bikes.empty:
                            return_list.extend(returned_bikes.to_dict(orient="records"))
                    
                    df_lag = df_current
                    lag_bike_numbers = current_bike_numbers
            except Exception as e:
                    print(f"Skip timestamp {current_time} due to {e}")
                    pass 
    return rent_list, return_list
    

# time slot 1 

In [28]:
rent_list_1, return_list_1 = loop_over_time_period(START_TIME=START_TIME_DD_1, END_TIME=END_TIME_DD_1, FOLDER_NAME=FOLDER_NAME_1)

  0%|          | 0/131039 [00:00<?, ?it/s]

Skip timestamp 2024-03-27 14:52:00 due to File b'../nextbike_original_data/Dresden_01-03.24/2024-03-27/2024-03-27-14-52-00.rds' does not exist!
Skip timestamp 2024-03-27 14:53:00 due to File b'../nextbike_original_data/Dresden_01-03.24/2024-03-27/2024-03-27-14-53-00.rds' does not exist!
Skip timestamp 2024-03-27 14:54:00 due to File b'../nextbike_original_data/Dresden_01-03.24/2024-03-27/2024-03-27-14-54-00.rds' does not exist!
Skip timestamp 2024-03-27 14:55:00 due to File b'../nextbike_original_data/Dresden_01-03.24/2024-03-27/2024-03-27-14-55-00.rds' does not exist!
Skip timestamp 2024-03-27 14:56:00 due to File b'../nextbike_original_data/Dresden_01-03.24/2024-03-27/2024-03-27-14-56-00.rds' does not exist!
Skip timestamp 2024-03-27 14:57:00 due to File b'../nextbike_original_data/Dresden_01-03.24/2024-03-27/2024-03-27-14-57-00.rds' does not exist!
Skip timestamp 2024-03-27 14:58:00 due to File b'../nextbike_original_data/Dresden_01-03.24/2024-03-27/2024-03-27-14-58-00.rds' does not

In [30]:
df_rent = pd.DataFrame(rent_list_1)
df_rent['rent_or_return']="rent"
df_return = pd.DataFrame(return_list_1)
df_return['rent_or_return']="returns" # return is a reserved word
df = pd.concat([df_rent,df_return], ignore_index=True)
df = df.sort_values(["bike_number", "time_stamp"])

### data type transformations

In [31]:
df.head()

Unnamed: 0,time_stamp,bike_number,bike,uid,lat,lng,name,station_number,maintenance,rent_or_return
481181,2024-01-19 08:43:00,930000,False,32939517,51.04893,13.7442,MOBIpunkt Pirnaischer Platz,43003.0,False,returns
64361,2024-01-21 15:39:00,930000,False,32939517,51.04893,13.7442,MOBIpunkt Pirnaischer Platz,43003.0,False,rent
486104,2024-01-21 15:46:00,930000,False,32939517,51.04893,13.7442,MOBIpunkt Pirnaischer Platz,43003.0,False,returns
64421,2024-01-21 15:50:00,930000,False,32939517,51.04893,13.7442,MOBIpunkt Pirnaischer Platz,43003.0,False,rent
486217,2024-01-21 16:09:00,930000,True,348898836,51.033422,13.707902,,,False,returns


In [32]:
df.time_stamp.max()

'2024-03-31 23:59:00'

In [33]:
df.dtypes

time_stamp         object
bike_number         int64
bike                 bool
uid                 int64
lat               float64
lng               float64
name               object
station_number    float64
maintenance          bool
rent_or_return     object
dtype: object

In [34]:
df.time_stamp.min()

'2024-01-01 00:02:00'

In [35]:
df.time_stamp.apply(len).unique()

array([19, 10])

In [36]:
df.time_stamp = df.time_stamp.apply(lambda x: x + " 00:00:00" if len(x) == 10 else x)

In [37]:
df["date"]=pd.to_datetime(df.time_stamp).dt.date

In [37]:
# df = pd.read_pickle(FILE_PATH)

In [38]:
df.bike_number = df.bike_number.astype(int)

In [39]:
df["datetime"]= pd.to_datetime(df.time_stamp)

In [40]:
df["datetime_hour"]= df.datetime.dt.floor(freq="h")

In [41]:
df.station_number = df.station_number.astype(pd.Int64Dtype())

In [42]:
print(len(df))

843909


### remove cases when the bike "appears" as return without bein rent before, and vice versa

In [47]:
def filter_group(group):
    if group.iloc[0]['rent_or_return'] == "returns":
        group = group.iloc[1:]
    if group.iloc[-1]['rent_or_return'] == "rent":
        group = group.iloc[:-1]
    return group

In [52]:
def filter_out_first_returns(group):
    if group.iloc[0]['rent_or_return'] == "returns":
        group = group.iloc[1:]
    return group

In [53]:
df_cut = df.groupby('bike_number', group_keys=False).apply(filter_out_first_returns)


  df_cut = df.groupby('bike_number', group_keys=False).apply(filter_out_first_returns)


In [54]:
print(len(df_cut))

843014


In [55]:
def filter_out_last_rents(group):
    try:
        if group.iloc[-1]['rent_or_return'] == "returns":
            group = group.iloc[1:]
    except Exception as e:
        group = group
    return group

In [56]:
df_cut = df_cut.groupby('bike_number', group_keys=False).apply(filter_out_last_rents)

  df_cut = df_cut.groupby('bike_number', group_keys=False).apply(filter_out_last_rents)


In [57]:
print(len(df_cut))

841428


In [58]:
df.to_pickle(OUTPUT_FILE_PATH_1)

# time slot 2

In [59]:
rent_list_2, return_list_2 = loop_over_time_period(START_TIME=START_TIME_DD_2, END_TIME=END_TIME_DD_2, FOLDER_NAME=FOLDER_NAME_2)

  0%|          | 0/83894 [00:00<?, ?it/s]

Skip timestamp 2024-09-09 17:48:00 due to File b'../nextbike_original_data/Dresden_09-10.24/2024-09-09/2024-09-09-17-48-00.rds' does not exist!
Skip timestamp 2024-09-11 22:02:00 due to File b'../nextbike_original_data/Dresden_09-10.24/2024-09-11/2024-09-11-22-02-00.rds' does not exist!
Skip timestamp 2024-09-12 10:45:00 due to File b'../nextbike_original_data/Dresden_09-10.24/2024-09-12/2024-09-12-10-45-00.rds' does not exist!
Skip timestamp 2024-09-13 02:41:00 due to File b'../nextbike_original_data/Dresden_09-10.24/2024-09-13/2024-09-13-02-41-00.rds' does not exist!
Skip timestamp 2024-09-13 12:05:00 due to File b'../nextbike_original_data/Dresden_09-10.24/2024-09-13/2024-09-13-12-05-00.rds' does not exist!
Skip timestamp 2024-09-13 21:52:00 due to File b'../nextbike_original_data/Dresden_09-10.24/2024-09-13/2024-09-13-21-52-00.rds' does not exist!
Skip timestamp 2024-09-14 13:45:00 due to File b'../nextbike_original_data/Dresden_09-10.24/2024-09-14/2024-09-14-13-45-00.rds' does not

In [60]:
df_rent = pd.DataFrame(rent_list_2)
df_rent['rent_or_return']="rent"
df_return = pd.DataFrame(return_list_2)
df_return['rent_or_return']="returns" # return is a reserved word
df = pd.concat([df_rent,df_return], ignore_index=True)
df = df.sort_values(["bike_number", "time_stamp"])

### data type transformations

In [61]:
df.head()

Unnamed: 0,time_stamp,bike_number,bike,uid,lat,lng,name,station_number,maintenance,rent_or_return
636,2024-09-03 18:53:00,930000,True,474071761,51.054213,13.734416,BIKE 930000,0,True,rent
348786,2024-09-03 18:57:00,930000,True,474115452,51.05456,13.727954,BIKE 930000,0,True,returns
3744,2024-09-04 09:22:00,930000,True,474248361,51.054507,13.728052,BIKE 930000,0,True,rent
351968,2024-09-04 09:28:00,930000,True,474289559,51.051929,13.723389,BIKE 930000,0,True,returns
5038,2024-09-04 14:02:00,930000,True,474324488,51.051902,13.723566,BIKE 930000,0,True,rent


In [62]:
df.time_stamp.max()

'2024-10-31 23:58:00'

In [63]:
df.dtypes

time_stamp         object
bike_number         int64
bike                 bool
uid                 int64
lat               float64
lng               float64
name               object
station_number      int64
maintenance          bool
rent_or_return     object
dtype: object

In [64]:
df.time_stamp.min()

'2024-09-03 17:45:00'

In [65]:
df.time_stamp.apply(len).unique()

array([19, 10])

In [66]:
df.time_stamp = df.time_stamp.apply(lambda x: x + " 00:00:00" if len(x) == 10 else x)

In [67]:
df["date"]=pd.to_datetime(df.time_stamp).dt.date

In [68]:
# df = pd.read_pickle(FILE_PATH)

In [69]:
df.bike_number = df.bike_number.astype(int)

In [70]:
df["datetime"]= pd.to_datetime(df.time_stamp)

In [71]:
df["datetime_hour"]= df.datetime.dt.floor(freq="h")

In [72]:
df.station_number = df.station_number.astype(pd.Int64Dtype())

In [73]:
print(len(df))

696295


### remove cases when the bike "appears" as return without bein rent before, and vice versa

In [74]:
def filter_group(group):
    if group.iloc[0]['rent_or_return'] == "returns":
        group = group.iloc[1:]
    if group.iloc[-1]['rent_or_return'] == "rent":
        group = group.iloc[:-1]
    return group

In [75]:
def filter_out_first_returns(group):
    if group.iloc[0]['rent_or_return'] == "returns":
        group = group.iloc[1:]
    return group

In [76]:
df_cut = df.groupby('bike_number', group_keys=False).apply(filter_out_first_returns)


  df_cut = df.groupby('bike_number', group_keys=False).apply(filter_out_first_returns)


In [77]:
print(len(df_cut))

695897


In [78]:
def filter_out_last_rents(group):
    try:
        if group.iloc[-1]['rent_or_return'] == "returns":
            group = group.iloc[1:]
    except Exception as e:
        group = group
    return group

In [79]:
df_cut = df_cut.groupby('bike_number', group_keys=False).apply(filter_out_last_rents)

  df_cut = df_cut.groupby('bike_number', group_keys=False).apply(filter_out_last_rents)


In [80]:
print(len(df_cut))

694385


In [81]:
df.to_pickle(OUTPUT_FILE_PATH_2)

# Combine 2 dfs

In [82]:
df1 = pd.read_pickle(OUTPUT_FILE_PATH_1)
df2 = pd.read_pickle(OUTPUT_FILE_PATH_2)

In [83]:
len(df1)

843909

In [84]:
len(df2)

696295

In [85]:
df2

Unnamed: 0,time_stamp,bike_number,bike,uid,lat,lng,name,station_number,maintenance,rent_or_return,date,datetime,datetime_hour
636,2024-09-03 18:53:00,930000,True,474071761,51.054213,13.734416,BIKE 930000,0,True,rent,2024-09-03,2024-09-03 18:53:00,2024-09-03 18:00:00
348786,2024-09-03 18:57:00,930000,True,474115452,51.054560,13.727954,BIKE 930000,0,True,returns,2024-09-03,2024-09-03 18:57:00,2024-09-03 18:00:00
3744,2024-09-04 09:22:00,930000,True,474248361,51.054507,13.728052,BIKE 930000,0,True,rent,2024-09-04,2024-09-04 09:22:00,2024-09-04 09:00:00
351968,2024-09-04 09:28:00,930000,True,474289559,51.051929,13.723389,BIKE 930000,0,True,returns,2024-09-04,2024-09-04 09:28:00,2024-09-04 09:00:00
5038,2024-09-04 14:02:00,930000,True,474324488,51.051902,13.723566,BIKE 930000,0,True,rent,2024-09-04,2024-09-04 14:02:00,2024-09-04 14:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
688627,2024-10-30 18:30:00,932011,True,493018921,51.072151,13.726703,BIKE 932011,0,True,returns,2024-10-30,2024-10-30 18:30:00,2024-10-30 18:00:00
340731,2024-10-30 18:41:00,932011,True,493018921,51.072129,13.726697,BIKE 932011,0,True,rent,2024-10-30,2024-10-30 18:41:00,2024-10-30 18:00:00
688839,2024-10-30 18:47:00,932011,True,493025650,51.074226,13.723983,BIKE 932011,0,True,returns,2024-10-30,2024-10-30 18:47:00,2024-10-30 18:00:00
340948,2024-10-30 18:56:00,932011,True,493025650,51.074278,13.723977,BIKE 932011,0,True,rent,2024-10-30,2024-10-30 18:56:00,2024-10-30 18:00:00


In [86]:
df_both = pd.concat([df1, df2], ignore_index=True)

In [87]:
len(df_both)

1540204

In [88]:
df_both = df_both.sort_values(["datetime", "bike_number"])

In [91]:
df_both.to_pickle(OUTPUT_FILE_PATH_BOTH)

In [92]:
df_both = pd.read_pickle(OUTPUT_FILE_PATH_BOTH)

In [93]:
df_both.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1540204 entries, 13231 to 1332721
Data columns (total 13 columns):
 #   Column          Non-Null Count    Dtype         
---  ------          --------------    -----         
 0   time_stamp      1540204 non-null  object        
 1   bike_number     1540204 non-null  int64         
 2   bike            1540204 non-null  bool          
 3   uid             1540204 non-null  int64         
 4   lat             1540204 non-null  float64       
 5   lng             1540204 non-null  float64       
 6   name            943069 non-null   object        
 7   station_number  943069 non-null   Int64         
 8   maintenance     1540204 non-null  bool          
 9   rent_or_return  1540204 non-null  object        
 10  date            1540204 non-null  object        
 11  datetime        1540204 non-null  datetime64[ns]
 12  datetime_hour   1540204 non-null  datetime64[ns]
dtypes: Int64(1), bool(2), datetime64[ns](2), float64(2), int64(2), object(4)


In [94]:
df_both.to_csv(OUTPUT_FILE_PATH_BOTH.replace(".pkl", ".csv"), index=False)

In [96]:
df_both.groupby("maintenance").size()

maintenance
False    842443
True     697761
dtype: int64