In [1]:
import pandas as pd

df_march = pd.read_csv('../../data/bicing/processed/months/2023_03_STATIONS.csv')
df_july = pd.read_csv('../../data/bicing/processed/months/2023_07_STATIONS.csv')

# Add usage

In [2]:
#add usage to the dataset
df_march['usage'] = df_march.groupby(
            ['station_id'])['num_bikes_available'].diff()
df_march['usage'] = df_march['usage'].apply(
            lambda x: abs(x))


In [3]:
#add usage to the dataset
df_july['usage'] = df_july.groupby(
            ['station_id'])['num_bikes_available'].diff()
df_july['usage'] = df_july['usage'].apply(
            lambda x: abs(x))


## add datetime column as date

In [4]:
#add date column
df_march['grouped_date'] = pd.to_datetime(df_march['grouped_date'])
df_march['date'] = df_march['grouped_date'].dt.date
#convert date to datetime
df_march['date'] = pd.to_datetime(df_march['date'])


In [5]:
#add date column
df_july['grouped_date'] = pd.to_datetime(df_july['grouped_date'])
df_july['date'] = df_july['grouped_date'].dt.date
#convert date to datetime
df_july['date'] = pd.to_datetime(df_july['date'])

## Usage in a month

### March

In [6]:
import plotly.express as px

df_b = df_march.groupby('date')['usage'].sum().reset_index()
# add day of the week like "Sunday", "Monday", etc.
df_b['day_of_week'] = df_b['date'].dt.day_name()

fig = px.line(df_b, x='date', y='usage', title='Bicing usage over the month of March 2023', markers=True, hover_data=['date', 'usage','day_of_week'])

# Identify weekends
weekends = pd.date_range(start='2023-03-01', end='2023-03-31', freq='W-SAT').tolist() + \
           pd.date_range(start='2023-03-01', end='2023-03-31', freq='W-SUN').tolist()

# Ensure weekends are in pairs
weekends.sort()
weekend_pairs = [(weekends[i], weekends[i + 1]) for i in range(0, len(weekends), 2)]

# Add vertical rectangles for weekends
show_legend_added = False
for start, end in weekend_pairs:
    fig.add_vrect(
        x0=start, x1=end,
        fillcolor="turquoise", opacity=0.5,
        layer="below", line_width=0,
    )

fig.add_scatter(
    x=[None], y=[None],
    mode='markers',
    marker=dict(color='turquoise', size=10),
    showlegend=True,
    name='Weekend Days'
)


fig.show()

### July

In [7]:
import plotly.express as px

df_c = df_july.groupby('date')['usage'].sum().reset_index()
# add day of the week like "Sunday", "Monday", etc.
df_c['day_of_week'] = df_c['date'].dt.day_name()

fig = px.line(df_c, x='date', y='usage', title='Bicing usage over the month of March 2023', markers=True, hover_data=['date', 'usage','day_of_week'])

# Identify weekends
weekends = pd.date_range(start='2023-07-01', end='2023-07-31', freq='W-SAT').tolist() + \
           pd.date_range(start='2023-07-01', end='2023-07-31', freq='W-SUN').tolist()

# Ensure weekends are in pairs
weekends.sort()
weekend_pairs = [(weekends[i], weekends[i + 1]) for i in range(0, len(weekends), 2)]

# Add vertical rectangles for weekends
show_legend_added = False
for start, end in weekend_pairs:
    fig.add_vrect(
        x0=start, x1=end,
        fillcolor="turquoise", opacity=0.5,
        layer="below", line_width=0,
    )

fig.add_scatter(
    x=[None], y=[None],
    mode='markers',
    marker=dict(color='turquoise', size=10),
    showlegend=True,
    name='Weekend Days'
)


fig.show()

## Hourly Usage

### March

In [8]:
import plotly.express as px

df_weekend_march = df_march[df_march['is_weekend'] == 1].groupby('hour')['usage'].sum().reset_index()
df_week_march = df_march[df_march['is_weekend'] == 0].groupby('hour')['usage'].sum().reset_index()
# add day of the week like "Sunday", "Monday", etc.


fig = px.line(df_week_march, x='hour', y='usage', title='Bicing hourly usage of March 2023', markers=True)
#fig.add_scatter(x=df_weekend['hour'], y=df_weekend['usage'], mode='lines')
# Add the weekend line with red markers
fig.add_scatter(
    x=df_weekend_march['hour'],
    y=df_weekend_march['usage'],
    mode='lines+markers',
    name='Weekend',
    line=dict(color='red'),
    marker=dict(symbol='circle', size=6)
)

# Add a dummy scatter trace for legend entry
fig.add_scatter(
    x=[None],
    y=[None],
    mode='lines',
    marker=dict(color='blue', size=10),
    name='Weekdays'
)




fig.show()

In [9]:
import plotly.express as px

df_weekend_july = df_july[df_july['is_weekend'] == 1].groupby('hour')['usage'].sum().reset_index()
df_week_july = df_july[df_july['is_weekend'] == 0].groupby('hour')['usage'].sum().reset_index()
# add day of the week like "Sunday", "Monday", etc.


fig = px.line(df_week_july, x='hour', y='usage', title='Bicing hourly usage of July 2023', markers=True)
#fig.add_scatter(x=df_weekend['hour'], y=df_weekend['usage'], mode='lines')
# Add the weekend line with red markers
fig.add_scatter(
    x=df_weekend_july['hour'],
    y=df_weekend_july['usage'],
    mode='lines+markers',
    name='Weekend',
    line=dict(color='red'),
    marker=dict(symbol='circle', size=6)
)

# Add a dummy scatter trace for legend entry
fig.add_scatter(
    x=[None],
    y=[None],
    mode='lines',
    marker=dict(color='blue', size=10),
    name='Weekdays'
)




fig.show()

## Hourly usave July vs March

In [10]:
import plotly.express as px

df_weekend_july = df_july[df_july['is_weekend'] == 1].groupby('hour')['usage'].sum().reset_index()
df_week_july = df_july[df_july['is_weekend'] == 0].groupby('hour')['usage'].sum().reset_index()
df_weekend_march = df_march[df_march['is_weekend'] == 1].groupby('hour')['usage'].sum().reset_index()
df_week_march = df_march[df_march['is_weekend'] == 0].groupby('hour')['usage'].sum().reset_index()

# add day of the week like "Sunday", "Monday", etc.


fig = px.line(df_week_july, x='hour', y='usage', title='Bicing hourly usage of July vs March 2023', markers=True)
#fig.add_scatter(x=df_weekend['hour'], y=df_weekend['usage'], mode='lines')
# Add the weekend July line with red markers
fig.add_scatter(
    x=df_weekend_july['hour'],
    y=df_weekend_july['usage'],
    mode='lines+markers',
    name='Weekends July',
    line=dict(color='red'),
    marker=dict(symbol='circle', size=6)
)

fig.add_scatter(
    x=df_weekend_march['hour'],
    y=df_weekend_march['usage'],
    mode='lines+markers',
    name='Weekends March',
    line=dict(color='orange'),
    marker=dict(symbol='circle', size=6)
)

fig.add_scatter(
    x=df_week_march['hour'],
    y=df_week_march['usage'],
    mode='lines+markers',
    name='Week Days March',
    line=dict(color='turquoise'),
    marker=dict(symbol='circle', size=6)
)

# Add a dummy scatter trace for legend entry
fig.add_scatter(
    x=[None],
    y=[None],
    mode='lines',
    marker=dict(color='blue', size=10),
    name='Weekdays July'
)




fig.show()

In [11]:
df_march['day_of_week_name'] = df_march['date'].dt.day_name()
df_day_of_week_usage_march = df_march.groupby(['date','day_of_week_name'])['usage'].sum().reset_index(name='usage').groupby(['day_of_week_name'])['usage'].median().reset_index() 
df_day_of_week_usage_march

Unnamed: 0,day_of_week_name,usage
0,Friday,75812.0
1,Monday,70034.5
2,Saturday,63283.5
3,Sunday,58768.0
4,Thursday,77115.0
5,Tuesday,73638.0
6,Wednesday,76658.0


In [12]:
df_july['day_of_week_name'] = df_july['date'].dt.day_name()
df_day_of_week_usage_july = df_july.groupby(['date','day_of_week_name'])['usage'].sum().reset_index(name='usage').groupby(['day_of_week_name'])['usage'].median().reset_index() 
df_day_of_week_usage_july

Unnamed: 0,day_of_week_name,usage
0,Friday,71663.0
1,Monday,69139.0
2,Saturday,59782.0
3,Sunday,56252.0
4,Thursday,73111.5
5,Tuesday,70706.5
6,Wednesday,73780.5


In [13]:
# order dataset weekdays
# Define the correct order of days
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Convert 'day_of_week_name' to a categorical type with the correct order
df_day_of_week_usage_march['day_of_week_name'] = pd.Categorical(df_day_of_week_usage_march['day_of_week_name'], categories=day_order, ordered=True)
df_day_of_week_usage_july['day_of_week_name'] = pd.Categorical(df_day_of_week_usage_july['day_of_week_name'], categories=day_order, ordered=True)

# Sort DataFrame by the ordered categorical
df_day_of_week_usage_march = df_day_of_week_usage_march.sort_values('day_of_week_name')
df_day_of_week_usage_july = df_day_of_week_usage_july.sort_values('day_of_week_name')


df_day_of_week_usage_march['month'] = 'March'
df_day_of_week_usage_july['month'] = 'July'
df_combined = pd.concat([df_day_of_week_usage_march, df_day_of_week_usage_july])

fig = px.line(df_combined, x='day_of_week_name', y='usage', color='month', title='Bicing usage per day of the week in March and July 2023', markers=True,
              category_orders={'day_of_week_name': day_order})

# Find the most used day for March and July
most_used_day_march = df_day_of_week_usage_march.loc[df_day_of_week_usage_march['usage'].idxmax()]
most_used_day_july = df_day_of_week_usage_july.loc[df_day_of_week_usage_july['usage'].idxmax()]

# Add annotations for the most used days
fig.add_annotation(
    x=most_used_day_march['day_of_week_name'],
    y=most_used_day_march['usage'],
    text=f"Most used in March: {most_used_day_march['day_of_week_name']}",
    showarrow=True,
    arrowhead=2,
    ax=20,
    ay=-30,
    bgcolor="turquoise"
)

fig.add_annotation(
    x=most_used_day_july['day_of_week_name'],
    y=most_used_day_july['usage'],
    text=f"Most used in July: {most_used_day_july['day_of_week_name']}",
    showarrow=True,
    arrowhead=2,
    ax=20,
    ay=-30,
    bgcolor="springgreen"
)
fig.show()

In [14]:
df_bikes_available_hour = df_march.groupby(['year', 'month','day', 'hour','grouped_minute'])['num_bikes_available'].sum().reset_index(name='bikes_available')
df_bikes_available_hour.sort_values('bikes_available', ascending=False).head(10)

Unnamed: 0,year,month,day,hour,grouped_minute,bikes_available
2001,2023,3,21,18,15,13885
2002,2023,3,21,18,30,13879
1999,2023,3,21,17,45,13290
843,2023,3,9,16,45,13162
1885,2023,3,20,13,15,12827
2000,2023,3,21,18,0,12800
1997,2023,3,21,17,15,12632
2523,2023,3,28,13,0,12478
847,2023,3,9,17,45,12123
1884,2023,3,20,13,0,12105


In [15]:
df_bikes_available_hour = df_july.groupby(['year', 'month','day', 'hour','grouped_minute'])['num_bikes_available'].median().sum().reset_index(name='bikes_available')
df_bikes_available_hour.sort_values('bikes_available', ascending=False).head(10)

AttributeError: 'numpy.float64' object has no attribute 'reset_index'

In [None]:
df = df_july.groupby(['year', 'month','day', 'hour','grouped_minute','station_id'])['num_bikes_available'].median().reset_index(name='bikes_available')
df.groupby(['year', 'month','day', 'hour','grouped_minute'])['bikes_available'].sum().reset_index(name='bikes_available').sort_values('bikes_available', ascending=False).head(10)


Unnamed: 0,year,month,day,hour,grouped_minute,bikes_available
502,2023,7,6,3,15,5035.0
501,2023,7,6,3,0,5034.0
500,2023,7,6,2,45,5032.5
499,2023,7,6,2,30,5030.5
505,2023,7,6,4,0,5024.0
504,2023,7,6,3,45,5021.5
408,2023,7,5,3,45,5019.0
405,2023,7,5,3,0,5008.5
597,2023,7,7,3,0,5005.5
404,2023,7,5,2,45,5005.0


In [None]:
df = df_march.groupby(['year', 'month','day', 'hour','grouped_minute','station_id'])['num_bikes_available'].median().reset_index(name='bikes_available')
df.groupby(['year', 'month','day', 'hour','grouped_minute'])['bikes_available'].sum().reset_index(name='bikes_available').sort_values('bikes_available', ascending=False).head(10)


Unnamed: 0,year,month,day,hour,grouped_minute,bikes_available
113,2023,3,2,2,15,6003.0
124,2023,3,2,5,0,5997.5
121,2023,3,2,4,15,5995.5
122,2023,3,2,4,30,5995.0
112,2023,3,2,2,0,5988.5
109,2023,3,2,1,15,5988.5
114,2023,3,2,2,30,5988.0
115,2023,3,2,2,45,5985.0
120,2023,3,2,4,0,5983.5
123,2023,3,2,4,45,5981.0


In [None]:
df_2024 = pd.read_csv('../../data/bicing/raw/2024_03_STATIONS.csv')
df_2024


Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.



Unnamed: 0,station_id,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,last_reported,is_charging_station,status,is_installed,is_renting,is_returning,traffic,last_updated,ttl,V1
0,1.0,17.0,6.0,11.0,27.0,1.709247e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.709248e+09,0.0,
1,2.0,8.0,4.0,4.0,17.0,1.709247e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.709248e+09,0.0,
2,3.0,20.0,20.0,0.0,6.0,1.709247e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.709248e+09,0.0,
3,4.0,3.0,1.0,2.0,16.0,1.709247e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.709248e+09,0.0,
4,5.0,9.0,8.0,1.0,30.0,1.709247e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.709248e+09,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4533443,515.0,13.0,12.0,1.0,11.0,1.711923e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.711923e+09,0.0,
4533444,516.0,5.0,2.0,3.0,16.0,1.711923e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.711923e+09,0.0,
4533445,517.0,17.0,7.0,10.0,1.0,1.711923e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.711923e+09,0.0,
4533446,518.0,1.0,0.0,1.0,26.0,1.711923e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.711923e+09,0.0,


In [None]:
df_2024['date'] = pd.to_datetime(df_2024['last_reported'], unit='s')
df_2024['date_updated'] = pd.to_datetime(df_2024['last_updated'], unit='s')
df_2024[df_2024['date_updated'] == '2024-02-29 23:00:02']

Unnamed: 0,station_id,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,last_reported,is_charging_station,status,is_installed,is_renting,is_returning,traffic,last_updated,ttl,V1,date,date_updated
0,1.0,17.0,6.0,11.0,27.0,1.709247e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.709248e+09,0.0,,2024-02-29 22:58:03,2024-02-29 23:00:02
1,2.0,8.0,4.0,4.0,17.0,1.709247e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.709248e+09,0.0,,2024-02-29 22:57:02,2024-02-29 23:00:02
2,3.0,20.0,20.0,0.0,6.0,1.709247e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.709248e+09,0.0,,2024-02-29 22:57:00,2024-02-29 23:00:02
3,4.0,3.0,1.0,2.0,16.0,1.709247e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.709248e+09,0.0,,2024-02-29 22:55:34,2024-02-29 23:00:02
4,5.0,9.0,8.0,1.0,30.0,1.709247e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.709248e+09,0.0,,2024-02-29 22:56:45,2024-02-29 23:00:02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,515.0,18.0,2.0,16.0,6.0,1.709247e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.709248e+09,0.0,,2024-02-29 22:56:20,2024-02-29 23:00:02
506,516.0,5.0,0.0,5.0,15.0,1.709247e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.709248e+09,0.0,,2024-02-29 22:56:40,2024-02-29 23:00:02
507,517.0,19.0,6.0,13.0,1.0,1.709247e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.709248e+09,0.0,,2024-02-29 22:55:59,2024-02-29 23:00:02
508,518.0,3.0,0.0,3.0,24.0,1.709247e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.709248e+09,0.0,,2024-02-29 22:56:09,2024-02-29 23:00:02


In [None]:
df = df_2024.groupby(['date_updated','station_id'])['num_bikes_available'].median().reset_index(name='bikes_available')
df.groupby(['date_updated'])['bikes_available'].sum().reset_index(name='bikes_available').sort_values('bikes_available', ascending=False).head(10)


Unnamed: 0,date_updated,bikes_available
3764,2024-03-14 02:54:59,6253.0
3765,2024-03-14 03:00:04,6253.0
3789,2024-03-14 05:00:02,6250.0
3762,2024-03-14 02:45:01,6241.0
3769,2024-03-14 03:20:01,6240.0
3768,2024-03-14 03:14:59,6239.0
3790,2024-03-14 05:05:00,6239.0
3760,2024-03-14 02:35:01,6238.0
3766,2024-03-14 03:04:59,6237.0
4063,2024-03-15 03:50:01,6236.0


In [None]:
df_2004_jun = pd.read_csv('../../data/other/2024_06_Juny_BicingNou_ESTACIONS.csv')

df_2004_jun


Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.



Unnamed: 0,station_id,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,last_reported,is_charging_station,status,is_installed,is_renting,is_returning,traffic,last_updated,ttl,V1
0,1.0,17.0,12.0,5.0,28.0,1.717193e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.717193e+09,1.0,
1,2.0,14.0,10.0,4.0,14.0,1.717193e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.717193e+09,1.0,
2,3.0,9.0,8.0,1.0,17.0,1.717193e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.717193e+09,1.0,
3,4.0,12.0,11.0,1.0,9.0,1.717193e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.717193e+09,1.0,
4,5.0,3.0,3.0,0.0,36.0,1.717193e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.717193e+09,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4417473,537.0,7.0,0.0,7.0,27.0,1.719786e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.719786e+09,1.0,
4417474,538.0,1.0,0.0,1.0,23.0,1.719785e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.719786e+09,1.0,
4417475,539.0,8.0,2.0,6.0,6.0,1.719786e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.719786e+09,1.0,
4417476,540.0,12.0,7.0,5.0,3.0,1.719786e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.719786e+09,1.0,


In [None]:
df_2004_jun['date'] = pd.to_datetime(df_2004_jun['last_reported'], unit='s')
df_2004_jun['date_updated'] = pd.to_datetime(df_2004_jun['last_updated'], unit='s')

#df = df_2004_jun.groupby(['date_updated','station_id'])['num_bikes_available'].median().reset_index(name='bikes_available')
df_2004_jun.groupby(['date_updated'])['num_bikes_available'].sum().reset_index(name='num_bikes_available').sort_values('num_bikes_available', ascending=False).head(10)

Unnamed: 0,date_updated,num_bikes_available
7525,2024-06-27 02:04:01,52630.0
3234,2024-06-12 03:40:00,6008.0
3214,2024-06-12 02:00:04,6006.0
3233,2024-06-12 03:35:04,6005.0
3238,2024-06-12 04:00:03,6003.0
3221,2024-06-12 02:35:00,5999.0
3213,2024-06-12 01:55:01,5999.0
3235,2024-06-12 03:45:00,5997.0
3239,2024-06-12 04:05:02,5996.0
3237,2024-06-12 03:55:01,5995.0


In [None]:
df_2004_jun[(df_2004_jun['date_updated'] == '2024-06-27 02:04:01'	) & (df_2004_jun['station_id'] == 312)]

Unnamed: 0,station_id,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,last_reported,is_charging_station,status,is_installed,is_renting,is_returning,traffic,last_updated,ttl,V1,date,date_updated
3850401,312.0,16.0,4.0,12.0,19.0,1719454000.0,True,IN_SERVICE,1.0,1.0,1.0,,1719454000.0,0.0,,2024-06-27 02:02:07,2024-06-27 02:04:01
3854505,312.0,16.0,4.0,12.0,19.0,1719454000.0,True,IN_SERVICE,1.0,1.0,1.0,,1719454000.0,0.0,,2024-06-27 02:02:07,2024-06-27 02:04:01
3856044,312.0,16.0,4.0,12.0,19.0,1719454000.0,True,IN_SERVICE,1.0,1.0,1.0,,1719454000.0,0.0,,2024-06-27 02:02:07,2024-06-27 02:04:01
3857583,312.0,16.0,4.0,12.0,19.0,1719454000.0,True,IN_SERVICE,1.0,1.0,1.0,,1719454000.0,0.0,,2024-06-27 02:02:07,2024-06-27 02:04:01
3858096,312.0,16.0,4.0,12.0,19.0,1719454000.0,True,IN_SERVICE,1.0,1.0,1.0,,1719454000.0,0.0,,2024-06-27 02:02:07,2024-06-27 02:04:01
3858609,312.0,16.0,4.0,12.0,19.0,1719454000.0,True,IN_SERVICE,1.0,1.0,1.0,,1719454000.0,0.0,,2024-06-27 02:02:07,2024-06-27 02:04:01
3860148,312.0,16.0,4.0,12.0,19.0,1719454000.0,True,IN_SERVICE,1.0,1.0,1.0,,1719454000.0,0.0,,2024-06-27 02:02:07,2024-06-27 02:04:01
3860661,312.0,16.0,4.0,12.0,19.0,1719454000.0,True,IN_SERVICE,1.0,1.0,1.0,,1719454000.0,0.0,,2024-06-27 02:02:07,2024-06-27 02:04:01
3862200,312.0,16.0,4.0,12.0,19.0,1719454000.0,True,IN_SERVICE,1.0,1.0,1.0,,1719454000.0,0.0,,2024-06-27 02:02:07,2024-06-27 02:04:01
3862713,312.0,16.0,4.0,12.0,19.0,1719454000.0,True,IN_SERVICE,1.0,1.0,1.0,,1719454000.0,0.0,,2024-06-27 02:02:07,2024-06-27 02:04:01


In [None]:
df_2004_jun['station_id'].unique()

array([  1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,
        12.,  13.,  14.,  15.,  17.,  18.,  19.,  20.,  21.,  22.,  23.,
        24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,  33.,  34.,
        35.,  36.,  37.,  39.,  40.,  41.,  42.,  43.,  45.,  46.,  47.,
        48.,  49.,  50.,  51.,  52.,  53.,  54.,  55.,  56.,  57.,  58.,
        60.,  61.,  62.,  63.,  64.,  65.,  66.,  67.,  68.,  69.,  70.,
        71.,  72.,  73.,  74.,  75.,  76.,  77.,  78.,  79.,  80.,  81.,
        82.,  83.,  84.,  85.,  86.,  87.,  88.,  89.,  90.,  91.,  92.,
        94.,  95.,  96.,  97.,  98.,  99., 100., 101., 102., 103., 104.,
       105., 106., 107., 108., 109., 110., 111., 112., 113., 114., 115.,
       116., 117., 118., 119., 120., 121., 122., 123., 124., 125., 126.,
       127., 128., 129., 130., 131., 132., 133., 134., 135., 136., 137.,
       138., 139., 140., 141., 142., 143., 144., 145., 146., 147., 148.,
       149., 150., 151., 152., 153., 154., 155., 15

In [None]:
df_2004_jun['station_id'].nunique()

516

In [None]:
df_2004_jun['station_id'].isna().sum()

6

In [None]:
df_2004_jun.dropna(subset=['station_id'], inplace=True)
df_2004_jun[df_2004_jun['station_id'].isnull()]

Unnamed: 0,station_id,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,last_reported,is_charging_station,status,is_installed,is_renting,is_returning,traffic,last_updated,ttl,V1,date,date_updated


In [None]:
len(df_2004_jun)

4417472

In [None]:
df_2004_jun.drop_duplicates()

Unnamed: 0,station_id,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,last_reported,is_charging_station,status,is_installed,is_renting,is_returning,traffic,last_updated,ttl,V1,date,date_updated
0,1.0,17.0,12.0,5.0,28.0,1.717193e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.717193e+09,1.0,,2024-05-31 21:56:35,2024-05-31 22:00:04
1,2.0,14.0,10.0,4.0,14.0,1.717193e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.717193e+09,1.0,,2024-05-31 21:58:40,2024-05-31 22:00:04
2,3.0,9.0,8.0,1.0,17.0,1.717193e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.717193e+09,1.0,,2024-05-31 21:59:36,2024-05-31 22:00:04
3,4.0,12.0,11.0,1.0,9.0,1.717193e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.717193e+09,1.0,,2024-05-31 21:57:15,2024-05-31 22:00:04
4,5.0,3.0,3.0,0.0,36.0,1.717193e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.717193e+09,1.0,,2024-05-31 21:57:55,2024-05-31 22:00:04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4417473,537.0,7.0,0.0,7.0,27.0,1.719786e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.719786e+09,1.0,,2024-06-30 22:12:45,2024-06-30 22:15:04
4417474,538.0,1.0,0.0,1.0,23.0,1.719785e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.719786e+09,1.0,,2024-06-30 22:10:42,2024-06-30 22:15:04
4417475,539.0,8.0,2.0,6.0,6.0,1.719786e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.719786e+09,1.0,,2024-06-30 22:13:09,2024-06-30 22:15:04
4417476,540.0,12.0,7.0,5.0,3.0,1.719786e+09,True,IN_SERVICE,1.0,1.0,1.0,,1.719786e+09,1.0,,2024-06-30 22:14:17,2024-06-30 22:15:04


In [None]:
df_2004_jun.groupby(['date_updated'])['num_bikes_available'].sum().reset_index(name='num_bikes_available').sort_values('num_bikes_available', ascending=False).head(10)

Unnamed: 0,date_updated,num_bikes_available
7525,2024-06-27 02:04:01,52630.0
3234,2024-06-12 03:40:00,6008.0
3214,2024-06-12 02:00:04,6006.0
3233,2024-06-12 03:35:04,6005.0
3238,2024-06-12 04:00:03,6003.0
3221,2024-06-12 02:35:00,5999.0
3213,2024-06-12 01:55:01,5999.0
3235,2024-06-12 03:45:00,5997.0
3239,2024-06-12 04:05:02,5996.0
3237,2024-06-12 03:55:01,5995.0


In [None]:
df_2004_jun[(df_2004_jun['date_updated'] == '2024-06-27 02:04:01'	) & (df_2004_jun['station_id'] == 312)]

Unnamed: 0,station_id,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,last_reported,is_charging_station,status,is_installed,is_renting,is_returning,traffic,last_updated,ttl,V1,date,date_updated
3850401,312.0,16.0,4.0,12.0,19.0,1719454000.0,True,IN_SERVICE,1.0,1.0,1.0,,1719454000.0,0.0,,2024-06-27 02:02:07,2024-06-27 02:04:01
3854505,312.0,16.0,4.0,12.0,19.0,1719454000.0,True,IN_SERVICE,1.0,1.0,1.0,,1719454000.0,0.0,,2024-06-27 02:02:07,2024-06-27 02:04:01
3856044,312.0,16.0,4.0,12.0,19.0,1719454000.0,True,IN_SERVICE,1.0,1.0,1.0,,1719454000.0,0.0,,2024-06-27 02:02:07,2024-06-27 02:04:01
3857583,312.0,16.0,4.0,12.0,19.0,1719454000.0,True,IN_SERVICE,1.0,1.0,1.0,,1719454000.0,0.0,,2024-06-27 02:02:07,2024-06-27 02:04:01
3858096,312.0,16.0,4.0,12.0,19.0,1719454000.0,True,IN_SERVICE,1.0,1.0,1.0,,1719454000.0,0.0,,2024-06-27 02:02:07,2024-06-27 02:04:01
3858609,312.0,16.0,4.0,12.0,19.0,1719454000.0,True,IN_SERVICE,1.0,1.0,1.0,,1719454000.0,0.0,,2024-06-27 02:02:07,2024-06-27 02:04:01
3860148,312.0,16.0,4.0,12.0,19.0,1719454000.0,True,IN_SERVICE,1.0,1.0,1.0,,1719454000.0,0.0,,2024-06-27 02:02:07,2024-06-27 02:04:01
3860661,312.0,16.0,4.0,12.0,19.0,1719454000.0,True,IN_SERVICE,1.0,1.0,1.0,,1719454000.0,0.0,,2024-06-27 02:02:07,2024-06-27 02:04:01
3862200,312.0,16.0,4.0,12.0,19.0,1719454000.0,True,IN_SERVICE,1.0,1.0,1.0,,1719454000.0,0.0,,2024-06-27 02:02:07,2024-06-27 02:04:01
3862713,312.0,16.0,4.0,12.0,19.0,1719454000.0,True,IN_SERVICE,1.0,1.0,1.0,,1719454000.0,0.0,,2024-06-27 02:02:07,2024-06-27 02:04:01


In [None]:
df_2024_jun_subset = df_2004_jun[['date_updated','station_id','num_bikes_available']]
df_2024_jun_subset #4417472
df_2024_jun_subset = df_2024_jun_subset.drop_duplicates()#4412855

In [None]:
df_2024_jun_subset.groupby(['date_updated'])['num_bikes_available'].sum().reset_index(name='num_bikes_available').sort_values('num_bikes_available', ascending=False).head(10)

Unnamed: 0,date_updated,num_bikes_available
3234,2024-06-12 03:40:00,6008.0
3214,2024-06-12 02:00:04,6006.0
3233,2024-06-12 03:35:04,6005.0
3238,2024-06-12 04:00:03,6003.0
3221,2024-06-12 02:35:00,5999.0
3213,2024-06-12 01:55:01,5999.0
3235,2024-06-12 03:45:00,5997.0
3239,2024-06-12 04:05:02,5996.0
3237,2024-06-12 03:55:01,5995.0
3236,2024-06-12 03:49:59,5995.0


In [None]:
df_2024_jun_subset[(df_2024_jun_subset['date_updated'] == '2024-06-27 02:04:01'	) & (df_2024_jun_subset['station_id'] == 312)]
#drop duplicates 

Unnamed: 0,date_updated,station_id,num_bikes_available
3850401,2024-06-27 02:04:01,312.0,16.0
3854505,2024-06-27 02:04:01,312.0,16.0
3856044,2024-06-27 02:04:01,312.0,16.0
3857583,2024-06-27 02:04:01,312.0,16.0
3858096,2024-06-27 02:04:01,312.0,16.0
3858609,2024-06-27 02:04:01,312.0,16.0
3860148,2024-06-27 02:04:01,312.0,16.0
3860661,2024-06-27 02:04:01,312.0,16.0
3862200,2024-06-27 02:04:01,312.0,16.0
3862713,2024-06-27 02:04:01,312.0,16.0


In [None]:
import pandas as pd

df_raw_march_2023 = pd.read_csv('../../data/bicing/raw/2023_03_STATIONS.csv')

In [None]:

df_raw_march_2023['date'] = pd.to_datetime(df_raw_march_2023['last_reported'], unit='s')
df_raw_march_2023['date_updated'] = pd.to_datetime(df_raw_march_2023['last_updated'], unit='s')
df_raw_march_2023_subset = df_raw_march_2023[['date_updated','station_id','num_bikes_available']]
df_raw_march_2023_subset = df_raw_march_2023_subset.drop_duplicates()
df_raw_march_2023_subset.groupby(['date_updated'])['num_bikes_available'].sum().reset_index(name='num_bikes_available').sort_values('num_bikes_available', ascending=False).head(10)

Unnamed: 0,date_updated,num_bikes_available
335,2023-03-02 02:54:49,6015
334,2023-03-02 02:49:49,6011
333,2023-03-02 02:44:49,6011
329,2023-03-02 02:25:03,6008
328,2023-03-02 02:19:49,6007
332,2023-03-02 02:39:49,6001
363,2023-03-02 05:14:49,6000
354,2023-03-02 04:30:03,5999
360,2023-03-02 04:59:49,5998
336,2023-03-02 02:59:49,5998


In [None]:
date = '2023-03-02 02:54:49'
average_bike_ride = 15
#add 15 minutes to the date
date_threshold = pd.to_datetime(date) + pd.Timedelta(minutes=average_bike_ride)
df_raw_march_2023['usage'] = df_raw_march_2023.groupby(['station_id'])['num_bikes_available'].diff()
df_subset_threshold = df_raw_march_2023[(df_raw_march_2023['date_updated'] >= date) & (df_raw_march_2023['date_updated'] <= date_threshold)]
df_subset_threshold['usage'] = df_subset_threshold['usage'].apply(lambda x: abs(x) if x < 0 else 0)
df_subset_threshold['usage'].sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset_threshold['usage'] = df_subset_threshold['usage'].apply(lambda x: abs(x) if x < 0 else 0)


76.0

In [None]:
#sum the usage if it is negative



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset_threshold['usage'] = df_subset_threshold['usage'].apply(lambda x: abs(x) if x < 0 else 0)


76.0

In [None]:
df_raw_march_2023

Unnamed: 0,station_id,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,last_reported,is_charging_station,status,is_installed,is_renting,is_returning,traffic,last_updated,ttl,date,date_updated,usage
0,1,35,32,3,11,1677625106,True,IN_SERVICE,1,1,1,,1677625186,12,2023-02-28 22:58:26,2023-02-28 22:59:46,
1,2,6,6,0,23,1677625076,True,IN_SERVICE,1,1,1,,1677625186,12,2023-02-28 22:57:56,2023-02-28 22:59:46,
2,3,18,17,1,9,1677625072,True,IN_SERVICE,1,1,1,,1677625186,12,2023-02-28 22:57:52,2023-02-28 22:59:46,
3,4,2,0,2,19,1677624929,True,IN_SERVICE,1,1,1,,1677625186,12,2023-02-28 22:55:29,2023-02-28 22:59:46,
4,5,14,14,0,25,1677625051,True,IN_SERVICE,1,1,1,,1677625186,12,2023-02-28 22:57:31,2023-02-28 22:59:46,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4295368,515,10,6,4,13,1680300804,True,IN_SERVICE,1,1,1,,1680300883,9,2023-03-31 22:13:24,2023-03-31 22:14:43,
4295369,516,2,1,1,19,1680300617,True,IN_SERVICE,1,1,1,,1680300883,9,2023-03-31 22:10:17,2023-03-31 22:14:43,
4295370,517,1,0,1,17,1680300712,True,IN_SERVICE,1,1,1,,1680300883,9,2023-03-31 22:11:52,2023-03-31 22:14:43,
4295371,518,5,0,5,22,1680300730,True,IN_SERVICE,1,1,1,,1680300883,9,2023-03-31 22:12:10,2023-03-31 22:14:43,


In [None]:
df_raw_march_2023_subset = df_raw_march_2023[['date_updated','station_id','num_bikes_available']]

In [None]:
df_raw_march_2023_subset = df_raw_march_2023_subset.drop_duplicates()

In [None]:
df_raw_march_2023_subset.sort()

<class 'pandas.core.frame.DataFrame'>
Index: 4294363 entries, 0 to 4295372
Data columns (total 4 columns):
 #   Column               Dtype         
---  ------               -----         
 0   date_updated         datetime64[ns]
 1   station_id           int64         
 2   num_bikes_available  int64         
 3   usage                float64       
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 163.8 MB


In [None]:
df_raw_march_2023_subset['usage'] = df_raw_march_2023_subset.groupby(['station_id'])['num_bikes_available'].diff()

In [None]:
df_raw_march_2023_subset

Unnamed: 0,date_updated,station_id,num_bikes_available,usage
0,2023-02-28 22:59:46,1,35,
1,2023-02-28 22:59:46,2,6,
2,2023-02-28 22:59:46,3,18,
3,2023-02-28 22:59:46,4,2,
4,2023-02-28 22:59:46,5,14,
...,...,...,...,...
4295368,2023-03-31 22:14:43,515,10,0.0
4295369,2023-03-31 22:14:43,516,2,0.0
4295370,2023-03-31 22:14:43,517,1,0.0
4295371,2023-03-31 22:14:43,518,5,0.0
