In [None]:
import pandas as pd


## Read Dataframes

In [None]:
# Load the raw station-information export of every year into its own DataFrame.
# The paths are listed in the same order as the target names on the left.
df_2019, df_2020, df_2021, df_2022, df_2023, df_2024 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/bicing/raw/2019_INFO.csv',
        '../../data/bicing/raw/2020_INFO.csv',
        '../../data/bicing/raw/2021_INFO.csv',
        '../../data/bicing/raw/2022_INFO.csv',
        '../../data/bicing/raw/2023_INFO.csv',
        '../../data/bicing/raw/2024_INFO.csv',
    )
)


## Prepare the data
### Drop Columns

In [None]:
# Each yearly export carries a slightly different set of extra columns; remove
# the ones listed for that year. A KeyError is raised if a listed column is
# missing (for example when this cell is run twice).
df_2019 = df_2019.drop(columns=['Unnamed: 0', 'cross_street', 'nearby_distance'])
df_2020 = df_2020.drop(columns=['Unnamed: 0', 'cross_street', 'planned_date'])
df_2021 = df_2021.drop(columns=['Unnamed: 0', 'cross_street'])
df_2022 = df_2022.drop(
    columns=['Unnamed: 0', 'cross_street', 'is_charging_station', 'x_ride_code_support']
)
df_2023 = df_2023.drop(columns=['Unnamed: 0', 'cross_street', 'rental_uris'])
df_2024 = df_2024.drop(
    columns=['Unnamed: 0', 'cross_street', 'rental_uris', 'V1',
             'is_valet_station', 'x_valet_station_details']
)


### Clean Data

In [None]:

# Remove the 2019 row whose post_code is the invalid value 'J3y8Y9'.
# .copy() makes each filtered frame an independent object, so the in-place
# drops below (and later column assignments) do not operate on a slice of the
# original frame and do not raise SettingWithCopyWarning.
df_2019 = df_2019[df_2019['post_code'] != 'J3y8Y9'].copy()

# Remove the test station named 'Gera_Testing - 535' from the 2024 data.
df_2024 = df_2024[df_2024['name'] != 'Gera_Testing - 535'].copy()

# Remove the station named 'Planned-527' from the 2022 data.
df_2022 = df_2022[df_2022['name'] != 'Planned-527'].copy()

# Keywords (matched case-insensitively inside 'name') that mark test rows.
key_words = ['prueba', 'test']
# Station ids of temporary stations (e.g. Merced).
temporary_stations = [529, 530]

dfs = [df_2019, df_2020, df_2021, df_2022, df_2023, df_2024]
for df in dfs:
    for word in key_words:
        # na=False: a row with a missing name counts as "no match" and is kept.
        # Without it the mask contains NaN and pandas refuses to index with it.
        is_test_row = df['name'].str.contains(word, case=False, na=False)
        df.drop(df[is_test_row].index, inplace=True)

    # Drop every row that belongs to a temporary station.
    df.drop(df[df['station_id'].isin(temporary_stations)].index, inplace=True)

### Convert Datatypes and filter info

In [None]:
# Normalise the column dtypes of every yearly frame in one pass.
#
# Two things differ from a plain per-year copy of these statements:
# * The integer post code is written back to 'post_code'. Writing it to a new
#   'postcode' column has no lasting effect, because only 'post_code' is kept
#   by the later column selection.
# * No bare `df[df['date'] == 'YYYY-12-31']` expressions are evaluated here;
#   such an expression filters nothing unless its result is assigned.
for df in (df_2019, df_2020, df_2021, df_2022, df_2023, df_2024):
    # Parse the date strings into datetime64 values.
    df['date'] = pd.to_datetime(df['date'])

    # Post code to int (raises if a value cannot be parsed as an integer).
    df['post_code'] = df['post_code'].astype(int)

    # station_id to int.
    df['station_id'] = df['station_id'].astype(int)


#### Add Year

In [None]:
# Tag every row with the year of the export it came from, so the frames can be
# told apart after they are concatenated.
df_2019 = df_2019.assign(year=2019)
df_2020 = df_2020.assign(year=2020)
df_2021 = df_2021.assign(year=2021)
df_2022 = df_2022.assign(year=2022)
df_2023 = df_2023.assign(year=2023)
df_2024 = df_2024.assign(year=2024)


#### Max Date and Median capacity

In [None]:
# For every yearly frame, attach to each row the median 'capacity' of its
# station within that year. transform() returns a value per original row.
for yearly_df in (df_2019, df_2020, df_2021, df_2022, df_2023, df_2024):
    capacity_by_station = yearly_df.groupby('station_id')['capacity']
    yearly_df['median_capacity'] = capacity_by_station.transform('median')



In [None]:
# Columns carried forward into the combined frame, in this order.
columns_to_keep = [
    'station_id', 'date', 'name', 'address', 'capacity', 'year',
    'median_capacity', 'post_code', 'lat', 'lon', 'altitude',
]

# Restrict every yearly frame to exactly those columns.
df_2019 = df_2019.loc[:, columns_to_keep]
df_2020 = df_2020.loc[:, columns_to_keep]
df_2021 = df_2021.loc[:, columns_to_keep]
df_2022 = df_2022.loc[:, columns_to_keep]
df_2023 = df_2023.loc[:, columns_to_keep]
df_2024 = df_2024.loc[:, columns_to_keep]

In [None]:
def _latest_row_per_station(yearly_df):
    """Return, for each station_id, the row of ``yearly_df`` with the greatest 'date'."""
    latest_index = yearly_df.groupby('station_id')['date'].idxmax()
    return yearly_df.loc[latest_index]


# One row per station and year: the most recent record found in that year.
df_2019_max_date = _latest_row_per_station(df_2019)
df_2020_max_date = _latest_row_per_station(df_2020)
df_2021_max_date = _latest_row_per_station(df_2021)
df_2022_max_date = _latest_row_per_station(df_2022)
df_2023_max_date = _latest_row_per_station(df_2023)
df_2024_max_date = _latest_row_per_station(df_2024)

### Concat DF in years_df

In [None]:
# Stack the per-year "latest record" frames, renumber the rows, then order the
# combined frame by date.
yearly_snapshots = [
    df_2019_max_date,
    df_2020_max_date,
    df_2021_max_date,
    df_2022_max_date,
    df_2023_max_date,
    df_2024_max_date,
]
df_years = pd.concat(yearly_snapshots, ignore_index=True).sort_values(by='date')

# Number of rows contributed by each year.
print(df_years['year'].value_counts())


In [None]:
# Preview the first four rows of the combined frame.
df_years.iloc[:4]

### Scatter Map

In [None]:
import plotly.express as px

# Per-year totals, kept indexed by year so that each animation frame can look
# up its own numbers by name instead of relying on positional order.
stations_per_year = df_years.groupby('year')['station_id'].count()
slots_per_year = df_years.groupby('year')['median_capacity'].sum()
# Plain arrays kept under their existing names.
stations_count_by_year = stations_per_year.values
total_bikes = slots_per_year.values

min_capacity = df_years['median_capacity'].min()
max_capacity = df_years['median_capacity'].max()
# Bounds used to clamp the colour scale.
# NOTE(review): markers are coloured by 'capacity' while these bounds come
# from 'median_capacity' -- confirm that this mismatch is intended.
print(min_capacity, max_capacity)

# One animation frame per year; every station is one marker.
fig = px.scatter_mapbox(
    df_years,
    lat="lat",
    lon="lon",
    hover_name="station_id",
    hover_data=["address", "post_code", "capacity", "altitude"],
    color='capacity',
    range_color=[min_capacity, max_capacity],
    animation_frame='year',
    title='Stations by Year',
    zoom=11,
    height=630,
    width=800,
)
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r": 0, "t": 80, "l": 0, "b": 0},
    mapbox={"center": {"lat": 41.40484, "lon": 2.17482}},
)

# Give every frame a title with that year's totals. The frame name is the
# year as a string; it is converted back to int to match the group keys, so
# the title stays correct even if the frame order differs from the sorted
# group order.
for frame in fig.frames:
    year = frame.name
    frame_year = int(year)
    frame.layout.title = (
        f"Total Stations: {stations_per_year.loc[frame_year]}"
        f" Total Slots for Bikes: {int(slots_per_year.loc[frame_year])}"
        f" in  {year}"
    )

# Force a full redraw on every slider step.
for step in fig.layout.sliders[0].steps:
    step["args"][1]["frame"]["redraw"] = True
fig.show()


<img src="../assets/map.png" width="800" />

In [None]:
from IPython.display import Image
# Display the saved PNG of the map inline.
map_snapshot_path = "../../assets/map.png"
Image(filename=map_snapshot_path)

In [None]:
# Rows whose median capacity is exactly 12.
df_years.loc[df_years['median_capacity'].eq(12)]

In [None]:
import pandas as pd
# Processed 2024 station locations; post_code is read as text.
df_station_locations = pd.read_csv(
    '../../data/bicing/processed/2024_STATION_LOCATIONS.csv',
    dtype={'post_code': str},
)
df_station_locations

In [None]:
import plotly.express as px
import pandas as pd

# Re-read the processed station locations (post_code as text) for this map.
df_station_locations = pd.read_csv(
    '../../data/bicing/processed/2024_STATION_LOCATIONS.csv',
    dtype={'post_code': str},
)

# One marker per row: coloured by 'district', sized by 'capacity'.
fig = px.scatter_mapbox(
    df_station_locations,
    lat="lat",
    lon="lon",
    color="district",
    size="capacity",
    size_max=12,
    hover_name="address",
    hover_data=["station_id", "altitude", "post_code", "capacity"],
    title="Stations by District",
    zoom=11.5,
    height=630,
    width=1200,
)
fig.update_layout(
    mapbox_style="open-street-map",
    margin=dict(r=0, t=80, l=0, b=0),
    mapbox=dict(center=dict(lat=41.40484, lon=2.17482)),
)
fig.show()

In [None]:
#total stations per discrict and have column as total stations
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go

# Per-district aggregates: summed capacity and number of stations.
capacity_by_district = df_station_locations.groupby('district')['capacity'].sum()
stations_by_district = (
    df_station_locations
    .groupby('district')['station_id']
    .count()
    .reset_index(name='total_stations')
)

# Stand-alone bar charts; only their traces are reused in the combined figure.
fig1 = px.bar(
    capacity_by_district,
    title='Total Bikes by District',
    labels={'value': 'Total Bikes', 'district': 'District'},
    height=600,
    width=800,
    color_discrete_sequence=px.colors.qualitative.Set3,
)

fig2 = px.bar(
    stations_by_district,
    x='district',
    y='total_stations',
    title='Total Stations by District',
    labels={'total_stations': 'Total Stations', 'district': 'District'},
    color_continuous_scale='thermal',
    height=600,
    width=800,
)

# Combined figure: capacity on the left, station counts on the right.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Total Bikes by District', 'Total Stations by District'),
)

fig.add_trace(fig2.data[0], row=1, col=2)

for trace in fig1.data:
    # Hide the legend entry of the capacity trace.
    trace.update(showlegend=False)
    fig.add_trace(trace, row=1, col=1)

fig.update_layout(
    height=500,
    width=1000,
    title_text="Total Bikes and Stations by District",
    title_font=dict(size=24, family="Arial, sans-serif"),
    title_x=0.5,
)
fig.show()

In [None]:
import pandas as pd
# Re-read the processed station locations and order them from the largest to
# the smallest capacity.
df_station_locations = (
    pd.read_csv(
        '../../data/bicing/processed/2024_STATION_LOCATIONS.csv',
        dtype={'post_code': str},
    )
    .sort_values(by='capacity', ascending=False)
)

In [None]:
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go

# Two panels: a cartesian density heatmap (left) and a map of stations (right).
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Density Heatmap of Stations by Capacity", "Area with Highest Capacity"),
    specs=[[{"type": "xy"}, {"type": "scattermapbox"}]],
)

# Density heatmap of capacity over (lat, lon).
# NOTE(review): only the traces of this figure are copied into the subplot
# below; its layout (where Plotly Express stores the colour-axis settings) is
# not. Confirm whether the 'fall' colour scale is expected to show up.
heatmap = px.density_heatmap(
    df_station_locations,
    x='lat',
    y='lon',
    z='capacity',
    color_continuous_scale='fall',
)

# Normalise capacity to [0, 1] so it can be sampled from a colour scale.
# If every station has the same capacity the range is zero; all stations are
# then mapped to 0 instead of dividing by zero.
capacity = df_station_locations['capacity']
capacity_range = capacity.max() - capacity.min()
if capacity_range > 0:
    norm_capacity = (capacity - capacity.min()) / capacity_range
else:
    norm_capacity = capacity * 0.0
# One colour per station, sampled from the 'thermal' scale.
colorscale = px.colors.sample_colorscale('thermal', norm_capacity)

# Add the heatmap trace(s) to the first subplot.
for trace in heatmap.data:
    fig.add_trace(trace, row=1, col=1)

# Hover text per station: address, coordinates and capacity.
# The latitude is labelled 'lat=' (the label used to read 'lon=' twice).
station_hover_text = (
    df_station_locations['address']
    + ' lat=' + df_station_locations['lat'].astype(str)
    + ' lon=' + df_station_locations['lon'].astype(str)
    + ' ' + df_station_locations['capacity'].astype(str) + ' bikes'
)

fig.add_trace(go.Scattermapbox(
    lat=df_station_locations['lat'],
    lon=df_station_locations['lon'],
    mode='markers',
    marker=go.scattermapbox.Marker(
        size=10,
        color=colorscale,
    ),
    text=station_hover_text,
    hoverinfo='text',
    name='Stations'
), row=1, col=2)

# Hard-coded coordinates of the highlighted "highest density" point.
highest_density_lat = 41.385
highest_density_lon = 2.16999

fig.add_trace(go.Scattermapbox(
    lat=[highest_density_lat],
    lon=[highest_density_lon],
    mode='markers',
    marker=go.scattermapbox.Marker(
        size=30,
        color='green',
        opacity=0.7
    ),
    text='Highest lat=41.385, lon=2.17',
    hoverinfo='text',
    name='Highest Density'
), row=1, col=2)

# Layout: the map occupies the right part of the figure; the legend is placed
# to the right of the plotting area.
fig.update_layout(
    mapbox=dict(
        style="open-street-map",
        domain={'x': [0.55, 1.0], 'y': [0, 1]},
        center=dict(lat=41.38, lon=2.16999),
        zoom=11.5
    ),
    height=600,
    width=1300,
    title_text="Station Capacity Analysis",
    title_font=dict(size=24, family="Arial, sans-serif"),
    title_x=0.5,
    margin={"r": 0, "t": 80, "l": 0, "b": 0},
    legend=dict(
        x=1.1,
        y=1,
        traceorder='normal',
        orientation='v'
    )
)
fig.show()

## Busiest Days

In [None]:
import pandas as pd
# Pre-computed data for the busiest day of May 2023.
busiest_day_may_csv = '../../data/bicing/processed/months/days/0530_2023_05_busiest_day.csv'
df_busiest_day_may = pd.read_csv(busiest_day_may_csv)

In [None]:
import plotly.express as px

# Animated map of the 'bikes_available' flag for May's busiest day, one frame
# per 'grouped_date' value.
# NOTE(review): color_discrete_map / category_orders only apply when Plotly
# Express treats the colour column as categorical -- confirm how
# 'bikes_available' is typed after reading the CSV.
fig = px.scatter_mapbox(
    df_busiest_day_may,
    lat="lat",
    lon="lon",
    color="bikes_available",
    color_discrete_map={0: "red", 1: "green"},
    category_orders={"bikes_available": [0, 1]},  # explicit category order
    hover_name="address",
    hover_data=["station_id", "altitude", "post_code", "capacity", "num_bikes_available"],
    animation_frame='grouped_date',
    title="May's busiest day Bikes Availability",
    zoom=11.5,
    height=630,
    width=800,
)

fig.update_layout(
    mapbox_style="open-street-map",
    margin=dict(r=0, t=80, l=0, b=0),
    mapbox=dict(center=dict(lat=41.40484, lon=2.17482)),
)

# Retitle every frame with the last five characters of its name
# (presumably the HH:MM part of 'grouped_date' -- verify against the data).
for frame in fig.frames:
    time_frame = frame.name[-5:]
    frame.layout.title = f"May's busiest day Bikes Availability At: {time_frame}"

fig.show()

In [None]:
import plotly.express as px

# Animated map of the 'docking_available' flag for May's busiest day, one
# frame per 'grouped_date' value.
docking_map_options = dict(
    lat="lat",
    lon="lon",
    color="docking_available",
    color_discrete_map={0: "red", 1: "green"},
    category_orders={"docking_available": [0, 1]},  # explicit category order
    hover_name="address",
    hover_data=["station_id", "altitude", "post_code", "capacity"],
    animation_frame='grouped_date',
    title="May's busiest day Docking Availability",
    zoom=11.5,
    height=630,
    width=800,
)
fig = px.scatter_mapbox(df_busiest_day_may, **docking_map_options)

fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r": 0, "t": 80, "l": 0, "b": 0},
    mapbox={"center": {"lat": 41.40484, "lon": 2.17482}},
)

# Put the last five characters of each frame name into that frame's title.
for frame in fig.frames:
    time_frame = frame.name[-5:]
    frame.layout.title = f"May's busiest day Docking Availability At: {time_frame}"

fig.show()

### July

In [None]:
import pandas as pd
# Pre-computed data for the busiest day of July 2023.
busiest_day_july_csv = '../../data/bicing/processed/months/days/076_2023_07_busiest_day.csv'
df_busiest_day_july = pd.read_csv(busiest_day_july_csv)

In [None]:
import plotly.express as px

# Animated map of the 'bikes_available' flag for July's busiest day, one frame
# per 'grouped_date' value.
# NOTE(review): color_discrete_map / category_orders only apply when Plotly
# Express treats the colour column as categorical -- confirm how
# 'bikes_available' is typed after reading the CSV.
fig = px.scatter_mapbox(
    df_busiest_day_july,
    lat="lat",
    lon="lon",
    color="bikes_available",
    color_discrete_map={0: "red", 1: "green"},
    category_orders={"bikes_available": [0, 1]},  # explicit category order
    hover_name="address",
    hover_data=["station_id", "altitude", "post_code", "capacity", "num_bikes_available"],
    animation_frame='grouped_date',
    title="July's busiest day Bikes Availability",
    zoom=11.5,
    height=630,
    width=800,
)

fig.update_layout(
    mapbox_style="open-street-map",
    margin=dict(r=0, t=80, l=0, b=0),
    mapbox=dict(center=dict(lat=41.40484, lon=2.17482)),
)

# Retitle every frame with the last five characters of its name
# (presumably the HH:MM part of 'grouped_date' -- verify against the data).
for frame in fig.frames:
    time_frame = frame.name[-5:]
    frame.layout.title = f"July's busiest day Bikes Availability At: {time_frame}"

fig.show()

In [None]:
import plotly.express as px

# Animated map of the 'docking_available' flag for July's busiest day, one
# frame per 'grouped_date' value.
docking_map_options = dict(
    lat="lat",
    lon="lon",
    color="docking_available",
    color_discrete_map={0: "red", 1: "green"},
    category_orders={"docking_available": [0, 1]},  # explicit category order
    hover_name="address",
    hover_data=["station_id", "altitude", "post_code", "capacity"],
    animation_frame='grouped_date',
    title="July's busiest day Docking Availability",
    zoom=11.5,
    height=630,
    width=800,
)
fig = px.scatter_mapbox(df_busiest_day_july, **docking_map_options)

fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r": 0, "t": 80, "l": 0, "b": 0},
    mapbox={"center": {"lat": 41.40484, "lon": 2.17482}},
)

# Put the last five characters of each frame name into that frame's title.
for frame in fig.frames:
    time_frame = frame.name[-5:]
    frame.layout.title = f"July's busiest day Docking Availability At: {time_frame}"

fig.show()