### Imports

In [None]:
import pyreadr
import pandas as pd
from tqdm.notebook import tqdm
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
from datetime import datetime, timedelta

In [None]:
# Columns selected from the snapshot data frames in the cells below.
columns_to_keep = ["time_stamp", "bike_number", "bike", "uid", "lat", "lng", "name", "station_number", "maintenance"]

In [None]:
# Minute-by-minute window iterated by the main loop further down.
START_TIME = datetime(2024, 9, 3, 17, 46, 0) # the second filename (the file one minute earlier seeds the lag snapshot)
END_TIME = datetime(2024, 10, 31, 23, 59, 0) # the very last filename

# Sub-folder that get_filename inserts into the snapshot path.
FOLDER_NAME = "Dresden_09-10.24"
# Pickle file the processed rents/returns frame is written to.
FILE_PATH = "../data/nextbike/rents_returns_by_bike_Dresden_09-10.24.pkl"

In [None]:
# Inputs and output of the "Combine 2 dfs" section at the end of the notebook:
# FILE_PATH_1 and FILE_PATH_2 are read and concatenated, the result is written
# to FILE_PATH_BOTH (plus a .csv copy next to it).
FILE_PATH_1 = "../data/nextbike/rents_returns_by_bike_Dresden_01-03.24.pkl"
FILE_PATH_2 = FILE_PATH
FILE_PATH_BOTH = "../data/nextbike/rents_returns_by_bike_Dresden_01-03_09-10.24.pkl" 

In [None]:
def get_filename(time, folder_name=None):
    """Return the path of the snapshot file for the minute of *time*.

    Args:
        time: A ``datetime``-like object providing ``strftime``.
        folder_name: Dataset sub-folder below ``../nextbike_original_data``.
            When omitted, the module-level ``FOLDER_NAME`` is used.

    Returns:
        The ``.rds`` path as a string. The seconds part of the file name is
        always ``00``, whatever the seconds of *time* are.
    """
    if folder_name is None:
        folder_name = FOLDER_NAME
    day = time.strftime("%Y-%m-%d")
    minute = time.strftime("%Y-%m-%d-%H-%M-00")
    return f"../nextbike_original_data/{folder_name}/{day}/{minute}.rds"

# Practice joins and filters

In [None]:
# Take the first minute of the window and the minute right before it,
# then resolve both to their snapshot file paths.
current_time = START_TIME
lag_time = current_time - timedelta(minutes=1)
current_filename, lag_filename = (
    get_filename(moment) for moment in (current_time, lag_time)
)



In [None]:
# Read the current snapshot; the data frame is looked up under the key None.
result = pyreadr.read_r(current_filename)
# Restrict the frame to the columns of interest.
df_current = result[None][columns_to_keep]


In [None]:
df_current

In [None]:
# Read the snapshot from one minute earlier. Unlike df_current, this frame
# keeps all of its columns here; the cells below subset it where needed.
result = pyreadr.read_r(lag_filename)
df_lag = result[None]

In [None]:
df_lag

# Outer-join both snapshots on the bike number. The indicator column
# ("_merge") records whether a bike occurs in one snapshot or in both, and
# validate="one_to_one" fails loudly if a bike number is duplicated.
df_merge = pd.merge(
    df_current[columns_to_keep],
    df_lag[columns_to_keep],
    how="outer",
    on="bike_number",
    suffixes=("_current", "_lag"),
    indicator=True,
    validate="one_to_one",
)

df_merge.columns

# Bikes that are present in only one of the two snapshots.
df_merge.loc[df_merge["_merge"] != "both"]

In [None]:
# Rows of the lag snapshot whose bike no longer shows up in the current one.
df_lag.loc[~df_lag["bike_number"].isin(df_current["bike_number"]), columns_to_keep]

In [None]:
# Same selection as above, stored for later use: bikes that vanished between
# the lag and the current snapshot.
to_add_rent = df_lag.loc[
    ~df_lag["bike_number"].isin(df_current["bike_number"]), columns_to_keep
]
to_add_rent

In [None]:
# Bikes present in the lag snapshot but gone from the current one are treated
# as rented. An explicit copy is taken so that adding the label column below
# writes to an independent frame instead of a slice of df_lag; isin() accepts
# the Series directly, so no list conversion is needed.
to_add_rent = df_lag.loc[
    ~df_lag.bike_number.isin(df_current.bike_number), columns_to_keep
].copy()
to_add_rent["rent_or_return"] = "rent"


In [None]:
# Start from an empty frame; the next cell appends the practice rows to it.
df = pd.DataFrame()

In [None]:
# Append the rent rows; ignore_index=True renumbers the result from 0.
df = pd.concat([df, to_add_rent], ignore_index=True)

In [None]:
df

In [None]:
to_add_rent

In [None]:
# Rows of the current snapshot whose bike was absent from the lag snapshot.
df_current.loc[~df_current["bike_number"].isin(df_lag["bike_number"]), columns_to_keep]

# Iterate over datasets to create df of rents and returns

In [None]:
# Accumulators filled by the loop below: one dict per detected event row.
rent_list = []
return_list = []

In [None]:
# Seed the comparison with the snapshot taken one minute before START_TIME.
lag_time = START_TIME - timedelta(minutes=1)
lag_filename = get_filename(lag_time)
df_lag = pyreadr.read_r(lag_filename)[None][columns_to_keep]
lag_bike_numbers = set(df_lag.bike_number)

# Walk through the window minute by minute and diff each snapshot against the
# last one that could be read.
for current_time in tqdm(pd.date_range(start=START_TIME, end=END_TIME, freq="min")):
    current_filename = get_filename(current_time)

    # Best-effort read: a snapshot that cannot be loaded is reported and
    # skipped, and the previous snapshot stays in place as the lag. Only the
    # read is guarded, so errors in the comparison logic are not swallowed.
    try:
        df_current = pyreadr.read_r(current_filename)[None][columns_to_keep]
    except Exception as err:
        print(f"Skip timestamp {current_time} due to {err}")
        continue

    current_bike_numbers = set(df_current.bike_number)

    # In the lag snapshot but not in the current one -> rent event, recorded
    # with the bike's row from the lag snapshot.
    rented_bikes = df_lag[df_lag.bike_number.isin(lag_bike_numbers - current_bike_numbers)]
    if not rented_bikes.empty:
        rent_list.extend(rented_bikes.to_dict(orient="records"))

    # In the current snapshot but not in the lag one -> return event, recorded
    # with the bike's row from the current snapshot.
    returned_bikes = df_current[df_current.bike_number.isin(current_bike_numbers - lag_bike_numbers)]
    if not returned_bikes.empty:
        return_list.extend(returned_bikes.to_dict(orient="records"))

    # The current snapshot becomes the reference for the next minute.
    df_lag = df_current
    lag_bike_numbers = current_bike_numbers


        



In [None]:
# Last timestamp the loop reached.
# NOTE(review): the value quoted below lies outside the START_TIME/END_TIME
# window defined in this notebook (it is in March 2024) — presumably recorded
# during a run on the January–March data; confirm or remove.
current_time # Timestamp('2024-03-15 06:17:00')

In [None]:
# Turn the collected event rows into frames and label them. The label for
# returns is the plural "returns"; filter_group compares against this exact
# string.
df_rent = pd.DataFrame(rent_list)
df_rent["rent_or_return"] = "rent"
df_return = pd.DataFrame(return_list)
df_return["rent_or_return"] = "returns"
df = pd.concat([df_rent, df_return], ignore_index=True)

# Order events per bike chronologically. A bike that is visible in exactly one
# snapshot yields a return row and a rent row taken from that same snapshot
# row, so both carry the same time_stamp. The return necessarily happened
# first (the bike appeared, then vanished), hence the tie-breaker: sorting the
# label in descending order places "returns" before "rent".
df = df.sort_values(
    ["bike_number", "time_stamp", "rent_or_return"],
    ascending=[True, True, False],
)

# adjustments

In [None]:
df.time_stamp.max()

In [None]:
df.dtypes

In [None]:
df.time_stamp.min()

In [None]:
df.time_stamp.apply(len).unique()

In [None]:
# Stamps that are only 10 characters long carry no time part; append midnight
# so every stamp has the same layout.
df["time_stamp"] = df["time_stamp"].apply(
    lambda stamp: stamp if len(stamp) != 10 else stamp + " 00:00:00"
)

In [None]:
# Calendar date of each event, parsed from the time_stamp strings.
df["date"] = pd.to_datetime(df["time_stamp"]).dt.date

In [None]:
# df = pd.read_pickle(FILE_PATH)

In [None]:
# Store bike numbers as integers.
df["bike_number"] = df["bike_number"].astype(int)

In [None]:
# Full timestamp of each event as a datetime column.
df["datetime"] = pd.to_datetime(df["time_stamp"])

In [None]:
# Event time rounded down to the start of its hour.
df["datetime_hour"] = df["datetime"].dt.floor("h")

In [None]:
# Nullable integer dtype, so missing station numbers stay missing.
df["station_number"] = df["station_number"].astype("Int64")

In [None]:
len(df)

### Remove cases where a bike "appears" as a return without having been rented before, and vice versa

In [None]:
def filter_group(group):
    """Trim unmatched events from the ends of one bike's event sequence.

    A leading "returns" row has no preceding rent inside the observed window,
    and a trailing "rent" row has no following return, so both are dropped.
    Only the first and the last row are examined.

    Args:
        group: Data frame with the events of a single bike, in chronological
            order, with a ``rent_or_return`` column holding "rent" or
            "returns".

    Returns:
        The group without the unmatched first and/or last row; it may be
        empty.
    """
    # Each check is guarded: the group can be empty on entry, or become empty
    # after the first trim (e.g. a bike with a single "returns" row).
    if not group.empty and group.iloc[0]["rent_or_return"] == "returns":
        group = group.iloc[1:]
    if not group.empty and group.iloc[-1]["rent_or_return"] == "rent":
        group = group.iloc[:-1]
    return group

In [None]:
# Apply filter_group to the events of each bike separately; group_keys=False
# keeps bike_number from being added as an extra index level on the result.
df = df.groupby('bike_number', group_keys=False).apply(filter_group)


In [None]:
df

In [None]:
# Persist the cleaned rents/returns frame for this dataset.
df.to_pickle(FILE_PATH)

# Combine 2 dfs

In [None]:
# Load the two processed frames that are combined below.
df1 = pd.read_pickle(FILE_PATH_1)
df2 = pd.read_pickle(FILE_PATH_2)

In [None]:
len(df1)

In [None]:
len(df2)

In [None]:
df2

In [None]:
# Stack both frames; ignore_index=True renumbers the rows from 0.
df_both = pd.concat([df1, df2], ignore_index=True)

In [None]:
len(df_both)

In [None]:
# Order the combined events by time first, then by bike.
df_both = df_both.sort_values(["datetime", "bike_number"])

In [None]:
# Persist the combined frame.
df_both.to_pickle(FILE_PATH_BOTH)

In [None]:
# Reload the combined frame from disk (allows resuming from this cell).
df_both = pd.read_pickle(FILE_PATH_BOTH)

In [None]:
df_both.info()

In [None]:
# Also export the combined frame as CSV next to the pickle (same path with
# the .pkl suffix replaced by .csv), without writing the index column.
df_both.to_csv(FILE_PATH_BOTH.replace(".pkl", ".csv"), index=False)