### Imports

In [None]:
import seaborn as sns
import pandas as pd
from tqdm.notebook import tqdm
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
from datetime import datetime, timedelta
import matplotlib.pyplot as plt


In [None]:
# Short city code; plot_day() uses it to build the output image directory.
CITY = "FB"

In [None]:
# Alternative input file (Dresden), kept so the city can be switched by hand:
#file_path = "../data/nextbike/rents_returns_by_bike_Dresden_01-03_09-10.24.pkl" 
file_path = "../data/nextbike/rents_returns_by_bike_Freiburg_06-07.23_09-10.24.pkl" 
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
df = pd.read_pickle(file_path)

# define tools

In [None]:
from datetime import datetime
import pyreadr


def get_filename(time, data_dir="../data/Dresden_01-03.24"):
    """Build the path of the raw snapshot file that belongs to ``time``.

    Parameters
    ----------
    time : datetime.datetime
        Timestamp of the snapshot. Only year, month, day, hour and minute are
        used; the seconds part of the file name is always ``00``.
    data_dir : str, optional
        Directory that holds one sub-directory per day. Defaults to the
        Dresden directory this notebook was written for, so existing calls
        keep their behaviour.

    Returns
    -------
    str
        ``<data_dir>/<YYYY-MM-DD>/<YYYY-MM-DD-HH-MM>-00.rds``
    """
    day_part = time.strftime("%Y-%m-%d")
    minute_part = time.strftime("%Y-%m-%d-%H-%M")
    return f'{data_dir}/{day_part}/{minute_part}-00.rds'



def import_raw_file(year,month, day, hour, minute, second):
    """Read the raw snapshot file for one timestamp.

    The six arguments are handed to ``datetime`` to form the timestamp, and
    ``get_filename`` turns that timestamp into a file path. The file is read
    with ``pyreadr.read_r`` and the object stored under the ``None`` key of
    the result is returned.
    """
    snapshot_time = datetime(year, month, day, hour, minute, second)
    r_objects = pyreadr.read_r(get_filename(snapshot_time))
    return r_objects[None]


# Explore anomalies

In [None]:
# Preview the first rows of df.
df.head()

# datetime_counts

In [None]:
# Count rows per "datetime" value, one column per rent_or_return value (0 where a value does not occur).
datetime_counts = df.groupby("datetime").rent_or_return.value_counts().unstack(fill_value=0)

In [None]:
datetime_counts.columns

In [None]:
datetime_counts

In [None]:
# Order the timestamps by their "rent" count, largest first.
datetime_counts = datetime_counts.sort_values("rent", ascending=False)

In [None]:
datetime_counts.head(10)

In [None]:
# Re-order by the "returns" count, largest first (this replaces the ordering by rents).
datetime_counts = datetime_counts.sort_values("returns", ascending=False)

In [None]:
datetime_counts.head(10)
# too many bikes returned at 2024-02-14 00:27:00

In [None]:
# Timestamps where rents exceed returns by more than 100.
datetime_counts.query("rent-returns>100")

In [None]:
# Timestamps with more than 50 rents or more than 50 returns, sorted by timestamp again.
tmp = datetime_counts.query("rent > 50 or returns > 50").sort_index()

In [None]:
tmp

In [None]:
df.columns

# explore the strange time period on 14 Feb 2024 (minutes 00:27 and 00:40)

In [None]:
# Window to inspect: 2024-02-14 00:00:00 up to and including 01:59:00.
start = datetime(2024, 2, 14, 0, 0, 0)
end = datetime(2024, 2, 14, 1, 59, 0)

In [None]:
# Rows of df inside the window (both bounds included), reduced to the columns needed here.
df_anomaly = df[(df.datetime>=start) & (df.datetime<=end)][["datetime", "bike_number", "station_number", "uid", "rent_or_return", "name", "lat", "lng"]]

In [None]:
df_anomaly = df_anomaly.sort_values(["bike_number", "datetime"])

In [None]:
# Bike numbers that have a row exactly at the window start ...
start_bikes = set(df[df.datetime==start].bike_number)

In [None]:
# ... and exactly at the window end.
end_bikes = set(df[df.datetime==end].bike_number)

In [None]:
len(start_bikes)

In [None]:
len(end_bikes)

In [None]:
# Bike numbers present at the start but not at the end.
len(start_bikes-end_bikes)

In [None]:
# Bike numbers present at the end but not at the start.
len(end_bikes-start_bikes)

In [None]:
# looks like bikes were replaced by new ones ... or renamed? or is everything normal?

In [None]:
# Raw snapshot at 00:27 (the minute flagged above for too many returns).
df_start = import_raw_file(2024, 2, 14, 0, 27, 0)

In [None]:
# Raw snapshot at 00:40.
df_end = import_raw_file(2024, 2, 14, 0, 40, 0)

In [None]:
# Distinct bike numbers / board computers in each snapshot.
len(set(df_end.bike_number))

In [None]:
len(set(df_end.boardcomputer))

In [None]:
len(set(df_start.boardcomputer))

In [None]:
len(set(df_start.bike_number))

In [None]:
# Board computers in the 00:27 snapshot that are missing from the 00:40 snapshot.
len(set(df_start.boardcomputer)-set(df_end.boardcomputer))

In [None]:
# Board computers in the 00:40 snapshot that were not in the 00:27 snapshot.
len(set(df_end.boardcomputer)-set(df_start.boardcomputer))

In [None]:
# Bike numbers in the 00:27 snapshot that are missing from the 00:40 snapshot.
len(set(df_start.bike_number)-set(df_end.bike_number))

In [None]:
# Bike numbers in the 00:40 snapshot that were not in the 00:27 snapshot.
len(set(df_end.bike_number)-set(df_start.bike_number))

# Recompute rents and returns for the anomaly window straight from the raw
# snapshot files by comparing the bike numbers of consecutive snapshots.
# The result is kept in its own frame (df_recomputed): the notebook-wide `df`
# is still needed by later cells and has different columns.
columns_to_keep = ["time_stamp", "bike_number", "bike", "uid", "lat", "lng", "name", "station_number", "boardcomputer"]
start_time = start
end_time = end

rent_list = []
return_list = []

# Baseline snapshot: one minute before the window start.
lag_time = start_time - timedelta(minutes=1)
lag_filename = get_filename(lag_time)
df_lag = pyreadr.read_r(lag_filename)[None][columns_to_keep]
lag_bike_numbers = set(df_lag.bike_number)

# Only the snapshots at `start` and `end` are compared (each against the
# snapshot processed before it), not every minute in between.
for current_time in [start, end]:
    current_filename = get_filename(current_time)
    try:
        df_current = pyreadr.read_r(current_filename)[None][columns_to_keep]
    except Exception as e:
        # Best effort: an unreadable snapshot is skipped, but reported instead
        # of being dropped silently.
        print(f"Skipping {current_filename}: {e!r}")
        continue
    current_bike_numbers = set(df_current.bike_number)

    # In the previous snapshot but gone now -> counted as a rent; the rows are
    # taken from the previous snapshot.
    rented_bikes = df_lag[df_lag.bike_number.isin(lag_bike_numbers - current_bike_numbers)]
    rent_list.extend(rented_bikes.to_dict(orient="records"))

    # Absent from the previous snapshot but present now -> counted as a return.
    returned_bikes = df_current[df_current.bike_number.isin(current_bike_numbers - lag_bike_numbers)]
    return_list.extend(returned_bikes.to_dict(orient="records"))

    df_lag = df_current
    lag_bike_numbers = current_bike_numbers

# Passing `columns` keeps the frames well-formed even when no event was found,
# so the sort below cannot fail on a missing column.
df_rent = pd.DataFrame(rent_list, columns=columns_to_keep)
df_rent['rent_or_return'] = "rent"
df_return = pd.DataFrame(return_list, columns=columns_to_keep)
df_return['rent_or_return'] = "returns"
df_recomputed = pd.concat([df_rent, df_return], ignore_index=True)
df_recomputed = df_recomputed.sort_values(["bike_number", "time_stamp"])

# Number of recomputed rents / returns per snapshot time stamp.
df_recomputed.groupby(["time_stamp","rent_or_return"]).size()

In [None]:
# Count rows of the anomaly window per "datetime" value, one column per rent_or_return value.
df_anomaly_counts = df_anomaly.groupby(["datetime"]).rent_or_return.value_counts().unstack(fill_value=0)

In [None]:
df_anomaly

In [None]:
# uid values of the window with missing entries dropped.
df_anomaly.uid.dropna()

In [None]:
# Timestamps in the window with more than 50 rents or more than 50 returns.
df_anomaly_counts.query("rent>50 or returns>50")

In [None]:
# Same counts, additionally split by uid.
df_anomaly_counts_by_uid = df_anomaly.groupby(["datetime", "uid"]).rent_or_return.value_counts().unstack(fill_value=0)

In [None]:
# (datetime, uid) pairs with more than one rent or more than one return.
df_anomaly_counts_by_uid.query("rent>1 or returns>1")

In [None]:
df_anomaly.datetime.unique()


In [None]:
# Number of distinct timestamps that occur in the window.
len(df_anomaly_counts)

# plot of one day

In [None]:
datetime_counts

In [None]:
# Rows of datetime_counts whose timestamp falls on one calendar day (2024-02-14).
datetime_counts[datetime_counts.index.date==pd.to_datetime("2024-02-14").date()]

In [None]:
def plot_day(day_str):
    """Save a line plot of the rent/return counts of one calendar day.

    Parameters
    ----------
    day_str : str
        Day to plot, in a form ``pd.to_datetime`` can parse (the notebook
        passes "YYYY-MM-DD"). The string is also used verbatim in the plot
        title and as the file name.

    Notes
    -----
    Reads the notebook-level ``datetime_counts`` and ``CITY``. The figure is
    written to ``../data/nextbike/images_{CITY}/{day_str}.png`` and then
    closed, so nothing is displayed and nothing is returned.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    # savefig does not create missing directories, so make sure the target exists.
    output_dir = Path(f"../data/nextbike/images_{CITY}")
    output_dir.mkdir(parents=True, exist_ok=True)

    day = pd.to_datetime(day_str).date()
    filtered_data = datetime_counts[datetime_counts.index.date == day]

    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        sns.lineplot(data=filtered_data, ax=ax)
        ax.set_title(f"Rents and returns by minute on {day_str}")
        ax.tick_params(axis="x", labelrotation=90)
        fig.savefig(output_dir / f"{day_str}.png", bbox_inches='tight')
    finally:
        # Always release the figure; this function is called in a long loop.
        plt.close(fig)
    

In [None]:
plot_day("2024-02-11")

In [None]:
plot_day("2024-03-01")

In [None]:
plot_day("2024-10-24")

In [None]:
# Write one plot file per calendar day of the range (both end dates included).
# NOTE(review): the loaded file name suggests data for 06-07.23 and 09-10.24 only;
# confirm this range is intended - days without data presumably still get a plot file.
for pd_date in tqdm(pd.date_range(start="2023-06-01", end="2024-07-31")):
    str_date = pd_date.strftime("%Y-%m-%d")
    plot_day(str_date)
    
    

# date_counts

In [None]:
# Count rows per value of the "date" column, one column per rent_or_return value.
date_counts = df.groupby("date").rent_or_return.value_counts().unstack(fill_value=0)

In [None]:
date_counts

In [None]:
# The ten dates with the largest "rent" count.
date_counts.sort_values("rent", ascending=False).head(10)

In [None]:
# One line per rent_or_return column over the dates.
sns.lineplot(date_counts)

# hour_counts

In [None]:
# Count rows per value of the "datetime_hour" column, one column per rent_or_return value.
hour_counts = df.groupby("datetime_hour").rent_or_return.value_counts().unstack(fill_value=0)

In [None]:
# Rents minus returns per datetime_hour value.
sns.lineplot(hour_counts.rent-hour_counts["returns"])

In [None]:
hourly_diffs = hour_counts.rent-hour_counts["returns"]

In [None]:
# Ascending order: largest surplus of returns first, largest surplus of rents last.
hourly_diffs.sort_values()

In [None]:
hour_counts.query("rent-returns>100")
# suspiciously, the top differences are at minute 00

In [None]:
sns.lineplot(hour_counts)

# unique bikes
Bikes are added to and removed from the data throughout the timeline.

In [None]:
# Distinct bike numbers with a row in month 1 / 2 / 3.
# NOTE(review): the filter looks at the month only, so the same month of different
# years would be pooled - confirm against the date range of the loaded file.
# mar_bikes is not used in the cells shown below.
jan_bikes = set(df[df.datetime.dt.month==1].bike_number.unique())
feb_bikes = set(df[df.datetime.dt.month==2].bike_number.unique())
mar_bikes = set(df[df.datetime.dt.month==3].bike_number.unique())

In [None]:
# Bike numbers seen in February but not in January.
len(feb_bikes-jan_bikes)

In [None]:
len(jan_bikes)

In [None]:
# NOTE(review): same expression as two cells above.
len(feb_bikes-jan_bikes)

In [None]:
# Bike numbers seen in January but not in February.
len(jan_bikes-feb_bikes)

In [None]:
df.datetime.dt.month

In [None]:
# Number of distinct bike numbers in df.
len(df.bike_number.unique())

In [None]:
# Number of rows that are exact duplicates of an earlier row.
len(df)-len(df.drop_duplicates())

In [None]:
df

# individual bike checks

In [None]:
# All rows of df for single bikes, selected by bike number.
df[df.bike_number==931492]

In [None]:
df[df.bike_number==931492]

In [None]:
df[df.bike_number==930494]  

In [None]:
df[df.bike_number==930494]

In [None]:
# Rows of one more bike, ordered by the "datetime" column.
df[df.bike_number==930009].sort_values("datetime")