### Imports

In [None]:
import pyreadr
import pandas as pd
from tqdm.notebook import tqdm
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
from datetime import datetime, timedelta

In [None]:
# Columns selected from every snapshot that is read; all other columns
# in the .rds files are discarded on load.
columns_to_keep = [
    "time_stamp",
    "bike_number",
    "bike",
    "uid",
    "lat",
    "lng",
    "name",
    "station_number",
    "maintenance",
]

In [None]:
# Period 1. The start is the *second* snapshot file of the period because
# loop_over_time_period loads the file one minute earlier as its baseline.
START_TIME_DD_1 = datetime(year=2024, month=1, day=1, hour=0, minute=1)
# The very last snapshot file of period 1.
END_TIME_DD_1 = datetime(year=2024, month=3, day=31, hour=23, minute=59)

# Period 2, same convention: second file of the period up to the very last one.
START_TIME_DD_2 = datetime(year=2024, month=9, day=3, hour=17, minute=46)
END_TIME_DD_2 = datetime(year=2024, month=10, day=31, hour=23, minute=59)

# Sub-folders of the raw-data directory holding each period's snapshots.
FOLDER_NAME_1 = "Dresden_01-03.24"
FOLDER_NAME_2 = "Dresden_09-10.24"

In [None]:
# Pickle destinations for the rent/return event tables: one file per
# period, plus one for the concatenation of both periods.
OUTPUT_FILE_PATH_1 = "../data/nextbike/rents_returns_by_bike_Dresden_01-03.24.pkl"
OUTPUT_FILE_PATH_2 = "../data/nextbike/rents_returns_by_bike_Dresden_09-10.24.pkl"
OUTPUT_FILE_PATH_BOTH = "../data/nextbike/rents_returns_by_bike_Dresden_01-03_09-10.24.pkl" 

In [None]:
def get_filename(time, FOLDER_NAME):
    """Return the relative path of the snapshot file for the minute ``time``.

    The path has the form
    ``../nextbike_original_data/<FOLDER_NAME>/<YYYY-MM-DD>/<YYYY-MM-DD-HH-MM>-00.rds``;
    the seconds component is always the literal ``00``, regardless of
    ``time.second``.
    """
    day_folder = time.strftime("%Y-%m-%d")
    snapshot_stem = time.strftime("%Y-%m-%d-%H-%M-00")
    return f'../nextbike_original_data/{FOLDER_NAME}/{day_folder}/{snapshot_stem}.rds'

# Iterate over datasets to create df of rents and returns

In [None]:
def loop_over_time_period(START_TIME, END_TIME, FOLDER_NAME):
    """Diff consecutive per-minute snapshots into rent and return events.

    The snapshot one minute before ``START_TIME`` is loaded as the initial
    baseline; that read is not guarded, so a missing or unreadable file
    raises. Each minute from ``START_TIME`` to ``END_TIME`` (inclusive) is
    then compared with the most recent successfully processed snapshot:

    * bike numbers present in the baseline but absent from the current
      snapshot are recorded as rents, using their rows from the baseline;
    * bike numbers present in the current snapshot but absent from the
      baseline are recorded as returns, using their rows from the current
      snapshot.

    If anything raises while a minute is processed, a message is printed and
    that minute is skipped; the baseline is left unchanged, so the next
    readable snapshot is compared with the last good one.

    Returns:
        A ``(rent_list, return_list)`` tuple of lists of row dicts.
    """
    rents = []
    returns = []

    baseline_time = START_TIME - timedelta(minutes=1)
    previous_df = pyreadr.read_r(get_filename(baseline_time, FOLDER_NAME))[None][columns_to_keep]
    previous_bikes = set(previous_df.bike_number)

    minutes = pd.date_range(start=START_TIME, end=END_TIME, freq="min")
    for current_time in tqdm(minutes):
        current_filename = get_filename(current_time, FOLDER_NAME)
        try:
            snapshot = pyreadr.read_r(current_filename)[None][columns_to_keep]
            snapshot_bikes = set(snapshot.bike_number)

            # Bikes that vanished since the previous snapshot -> rents.
            vanished = previous_bikes.difference(snapshot_bikes)
            rented_rows = previous_df[previous_df.bike_number.isin(vanished)]
            if not rented_rows.empty:
                rents.extend(rented_rows.to_dict(orient="records"))

            # Bikes that newly appeared in this snapshot -> returns.
            appeared = snapshot_bikes.difference(previous_bikes)
            returned_rows = snapshot[snapshot.bike_number.isin(appeared)]
            if not returned_rows.empty:
                returns.extend(returned_rows.to_dict(orient="records"))

            # Only advance the baseline after a fully successful minute.
            previous_df, previous_bikes = snapshot, snapshot_bikes
        except Exception as e:
            # Best-effort: report the problem and carry on with the next minute.
            print(f"Skip timestamp {current_time} due to {e}")
    return rents, returns
    

# time slot 1 

In [None]:
# Extract the rent/return events of period 1 from its per-minute snapshots.
rent_list_1, return_list_1 = loop_over_time_period(
    START_TIME=START_TIME_DD_1,
    END_TIME=END_TIME_DD_1,
    FOLDER_NAME=FOLDER_NAME_1,
)

In [None]:
# Build one event table for period 1: tag each row with its event type,
# stack rents and returns, then order the events per bike by timestamp.
df_rent = pd.DataFrame(rent_list_1).assign(rent_or_return="rent")
# The label is "returns" rather than "return" (original note: "return is a reserved word").
df_return = pd.DataFrame(return_list_1).assign(rent_or_return="returns")
df = pd.concat([df_rent, df_return], ignore_index=True).sort_values(
    ["bike_number", "time_stamp"]
)

### data type transformations

In [None]:
df.head()

In [None]:
df.time_stamp.max()

In [None]:
df.dtypes

In [None]:
df.time_stamp.min()

In [None]:
df.time_stamp.apply(len).unique()

In [None]:
# Timestamps that are exactly 10 characters long get " 00:00:00" appended so
# that every value has a time part. Presumably these are date-only strings
# from midnight snapshots; verify against the raw data.
df.time_stamp = df.time_stamp.apply(lambda x: x + " 00:00:00" if len(x) == 10 else x)

In [None]:
# Calendar date of each event, parsed from the timestamp string.
df["date"]=pd.to_datetime(df.time_stamp).dt.date

In [None]:
# df = pd.read_pickle(FILE_PATH)

In [None]:
# Cast bike numbers to integers.
df.bike_number = df.bike_number.astype(int)

In [None]:
# Timestamp parsed into a proper datetime column.
df["datetime"]= pd.to_datetime(df.time_stamp)

In [None]:
# Event time rounded down to the start of its hour.
df["datetime_hour"]= df.datetime.dt.floor(freq="h")

In [None]:
# Nullable integer dtype: rows without a station number (if any) stay <NA>.
df.station_number = df.station_number.astype(pd.Int64Dtype())

In [None]:
print(len(df))

### Remove cases where a bike "appears" as a return without having been rented before, and vice versa

In [None]:
def filter_group(group):
    """Trim unmatched events from both ends of one bike's event rows.

    Drops the first row if it is a ``"returns"`` event (a return with no
    preceding rent in the data) and then drops the last row if it is a
    ``"rent"`` event (a rent with no following return).

    Args:
        group: DataFrame with a ``rent_or_return`` column, rows in event order.

    Returns:
        The trimmed DataFrame; an empty group is returned unchanged.
    """
    if not group.empty and group.iloc[0]['rent_or_return'] == "returns":
        group = group.iloc[1:]
    # Re-check emptiness: removing the leading row may have left nothing,
    # and indexing iloc[-1] on an empty frame raises IndexError.
    if not group.empty and group.iloc[-1]['rent_or_return'] == "rent":
        group = group.iloc[:-1]
    return group

In [None]:
def filter_out_first_returns(group):
    """Drop a leading ``"returns"`` event from one bike's event rows.

    A return in first position has no preceding rent in the data, so it
    cannot be paired.

    Args:
        group: DataFrame with a ``rent_or_return`` column, rows in event order.

    Returns:
        The group without its first row if that row is a ``"returns"``
        event; otherwise the group unchanged (including when it is empty).
    """
    # Guard against empty input: iloc[0] on an empty frame raises IndexError.
    if not group.empty and group.iloc[0]['rent_or_return'] == "returns":
        group = group.iloc[1:]
    return group

In [None]:
# First filtering pass: apply filter_out_first_returns to each bike's rows.
df_cut = df.groupby('bike_number', group_keys=False).apply(filter_out_first_returns)


In [None]:
print(len(df_cut))

In [None]:
def filter_out_last_rents(group):
    """Drop a trailing ``"rent"`` event from one bike's event rows.

    A rent in last position has no following return in the data, so it
    cannot be paired.

    Args:
        group: DataFrame with a ``rent_or_return`` column, rows in event order.

    Returns:
        The group without its last row if that row is a ``"rent"`` event;
        otherwise the group unchanged (including when it is empty).
    """
    # An explicit emptiness check replaces the former blanket try/except,
    # which only existed to survive iloc[-1] on an empty frame.
    if not group.empty and group.iloc[-1]['rent_or_return'] == "rent":
        group = group.iloc[:-1]
    return group

In [None]:
# Second filtering pass: apply filter_out_last_rents to each bike's rows in
# the result of the first pass.
df_cut = df_cut.groupby('bike_number', group_keys=False).apply(filter_out_last_rents)

In [None]:
print(len(df_cut))

In [None]:
# NOTE(review): this saves `df` (the unfiltered event table), not the
# filtered `df_cut` computed in the cells above; confirm which is intended.
df.to_pickle(OUTPUT_FILE_PATH_1)

# time slot 2

In [None]:
# Extract the rent/return events of period 2 from its per-minute snapshots.
rent_list_2, return_list_2 = loop_over_time_period(
    START_TIME=START_TIME_DD_2,
    END_TIME=END_TIME_DD_2,
    FOLDER_NAME=FOLDER_NAME_2,
)

In [None]:
# Build one event table for period 2: tag each row with its event type,
# stack rents and returns, then order the events per bike by timestamp.
df_rent = pd.DataFrame(rent_list_2).assign(rent_or_return="rent")
# The label is "returns" rather than "return" (original note: "return is a reserved word").
df_return = pd.DataFrame(return_list_2).assign(rent_or_return="returns")
df = pd.concat([df_rent, df_return], ignore_index=True).sort_values(
    ["bike_number", "time_stamp"]
)

### data type transformations

In [None]:
df.head()

In [None]:
df.time_stamp.max()

In [None]:
df.dtypes

In [None]:
df.time_stamp.min()

In [None]:
df.time_stamp.apply(len).unique()

In [None]:
# Timestamps that are exactly 10 characters long get " 00:00:00" appended so
# that every value has a time part. Presumably these are date-only strings
# from midnight snapshots; verify against the raw data.
df.time_stamp = df.time_stamp.apply(lambda x: x + " 00:00:00" if len(x) == 10 else x)

In [None]:
# Calendar date of each event, parsed from the timestamp string.
df["date"]=pd.to_datetime(df.time_stamp).dt.date

In [None]:
# df = pd.read_pickle(FILE_PATH)

In [None]:
# Cast bike numbers to integers.
df.bike_number = df.bike_number.astype(int)

In [None]:
# Timestamp parsed into a proper datetime column.
df["datetime"]= pd.to_datetime(df.time_stamp)

In [None]:
# Event time rounded down to the start of its hour.
df["datetime_hour"]= df.datetime.dt.floor(freq="h")

In [None]:
# Nullable integer dtype: rows without a station number (if any) stay <NA>.
df.station_number = df.station_number.astype(pd.Int64Dtype())

In [None]:
print(len(df))

### Remove cases where a bike "appears" as a return without having been rented before, and vice versa

In [None]:
def filter_group(group):
    """Trim unmatched events from both ends of one bike's event rows.

    Drops the first row if it is a ``"returns"`` event (a return with no
    preceding rent in the data) and then drops the last row if it is a
    ``"rent"`` event (a rent with no following return).

    Args:
        group: DataFrame with a ``rent_or_return`` column, rows in event order.

    Returns:
        The trimmed DataFrame; an empty group is returned unchanged.
    """
    if not group.empty and group.iloc[0]['rent_or_return'] == "returns":
        group = group.iloc[1:]
    # Re-check emptiness: removing the leading row may have left nothing,
    # and indexing iloc[-1] on an empty frame raises IndexError.
    if not group.empty and group.iloc[-1]['rent_or_return'] == "rent":
        group = group.iloc[:-1]
    return group

In [None]:
def filter_out_first_returns(group):
    """Drop a leading ``"returns"`` event from one bike's event rows.

    A return in first position has no preceding rent in the data, so it
    cannot be paired.

    Args:
        group: DataFrame with a ``rent_or_return`` column, rows in event order.

    Returns:
        The group without its first row if that row is a ``"returns"``
        event; otherwise the group unchanged (including when it is empty).
    """
    # Guard against empty input: iloc[0] on an empty frame raises IndexError.
    if not group.empty and group.iloc[0]['rent_or_return'] == "returns":
        group = group.iloc[1:]
    return group

In [None]:
# First filtering pass: apply filter_out_first_returns to each bike's rows.
df_cut = df.groupby('bike_number', group_keys=False).apply(filter_out_first_returns)


In [None]:
print(len(df_cut))

In [None]:
def filter_out_last_rents(group):
    """Drop a trailing ``"rent"`` event from one bike's event rows.

    A rent in last position has no following return in the data, so it
    cannot be paired.

    Args:
        group: DataFrame with a ``rent_or_return`` column, rows in event order.

    Returns:
        The group without its last row if that row is a ``"rent"`` event;
        otherwise the group unchanged (including when it is empty).
    """
    # An explicit emptiness check replaces the former blanket try/except,
    # which only existed to survive iloc[-1] on an empty frame.
    if not group.empty and group.iloc[-1]['rent_or_return'] == "rent":
        group = group.iloc[:-1]
    return group

In [None]:
# Second filtering pass: apply filter_out_last_rents to each bike's rows in
# the result of the first pass.
df_cut = df_cut.groupby('bike_number', group_keys=False).apply(filter_out_last_rents)

In [None]:
print(len(df_cut))

In [None]:
# NOTE(review): this saves `df` (the unfiltered event table), not the
# filtered `df_cut` computed in the cells above; confirm which is intended.
df.to_pickle(OUTPUT_FILE_PATH_2)

# Combine 2 dfs

In [None]:
# Reload the two per-period event tables from their pickle files.
df1 = pd.read_pickle(OUTPUT_FILE_PATH_1)
df2 = pd.read_pickle(OUTPUT_FILE_PATH_2)

In [None]:
len(df1)

In [None]:
len(df2)

In [None]:
df2

In [None]:
# Stack both periods into one table with a fresh 0..n-1 index.
df_both = pd.concat([df1, df2], ignore_index=True)

In [None]:
len(df_both)

In [None]:
# Order events chronologically, with ties broken by bike number.
df_both = df_both.sort_values(["datetime", "bike_number"])

In [None]:
# Persist the combined, sorted event table.
df_both.to_pickle(OUTPUT_FILE_PATH_BOTH)

In [None]:
# Reload the combined table from disk (presumably so the cells below can be
# run without repeating the extraction; confirm).
df_both = pd.read_pickle(OUTPUT_FILE_PATH_BOTH)

In [None]:
df_both.info()

In [None]:
# CSV export: same path as the combined pickle with ".pkl" replaced by
# ".csv"; the DataFrame index is not written.
df_both.to_csv(OUTPUT_FILE_PATH_BOTH.replace(".pkl", ".csv"), index=False)

In [None]:
df_both.groupby("maintenance").size()