In [None]:
import os

import pandas as pd

# One processed CSV per month of 2023 (file names without the extension).
FILES_NAME = [f"2023_{month:02d}_STATIONS" for month in range(1, 13)]
PROCCESED_FOLDER = "../../data/bicing/processed"

# Read every monthly file first and concatenate once at the end: calling
# pd.concat inside the loop re-copies all rows accumulated so far on each pass.
monthly_frames = []
for file_name in FILES_NAME:
    file_path = os.path.join(PROCCESED_FOLDER, f"{file_name}.csv")

    monthly_frames.append(pd.read_csv(file_path))
    print(f"File: {file_name}")

df_2023 = pd.concat(monthly_frames)

# Non-null count per column for the whole year.
df_2023.count()

In [None]:
from pyspark.sql import SparkSession, functions as F

# NOTE: `from pyspark.sql.functions import *` is deliberately not used. It
# replaces Python built-ins such as abs/sum/min/max with Spark column
# functions, which breaks the plain-Python and pandas code in later cells.
# Spark functions are reached through the `F` alias instead.

# Local Spark session; automatic broadcast joins are disabled (threshold -1).
spark = (
    SparkSession.builder
    .master("local")
    .config("spark.sql.autoBroadcastJoinThreshold", -1)
    .config("spark.executor.memory", "4000mb")
    .appName("Exercise1")
    .getOrCreate()
)

# Read every CSV under the stations folder, letting Spark infer column types.
df = spark.read.csv("../../data/bicing/processed/stations/", header=True, inferSchema=True)
df.count()

In [None]:
# Spark equivalent of pandas' `df.isnull().sum()`: one null count per column.
#
# Every column name is back-tick quoted so names containing '.' are not parsed
# as struct-field access. The value handed to `when` is the literal 1, not the
# column itself: `when(isnull(col), col)` evaluates to NULL exactly where the
# column is NULL, and `count` skips NULLs, so that form always reports 0.
columns = [
    F.count(F.when(F.col("`" + c.replace("`", "``") + "`").isNull(), 1)).alias(c)
    for c in df.columns
]

# Count nulls in each column
null_counts = df.select(columns)
null_counts.show()

In [None]:
# pandas: keep only the rows of station 1 and count non-null values per column.
is_station_1 = df_2023["station_id"].eq(1)
df_2023_station_1 = df_2023.loc[is_station_1]
df_2023_station_1.count()

In [None]:
# Spark: same station-1 subset as the pandas cell, then its row count.
df_station_1 = df.where(df.station_id == 1)
df_station_1.count()

In [None]:
# Spark: rows recorded on 13 January (month == 1 and day == 13).
is_january = df["month"] == 1
is_day_13 = df["day"] == 13
df_jan_13 = df.filter(is_january & is_day_13)


In [None]:
# Row count of the 13 January subset.
df_jan_13.count()

In [None]:
# Write the 13 January subset as CSV. `mode="overwrite"` matches the parquet
# export and lets this cell be re-run: without it the write fails as soon as
# the output path already exists.
df_jan_13.write.csv("../../data/bicing/processed/jan_13.csv", header=True, mode="overwrite")


In [None]:
# Write the 13 January subset as parquet, replacing any previous export.
df_jan_13.write.mode("overwrite").parquet("../../data/bicing/processed/jan_13.parquet")

In [None]:
import tensorflow as tf

# Folder holding the CSV files
csv_file_path = '../../data/bicing/processed/stations/'

# make_csv_dataset expects file names or glob patterns, not a bare directory.
# NOTE(review): assumes the CSV files in the folder end in ".csv" — confirm.
csv_file_pattern = csv_file_path + '*.csv'

# Build a batched, shuffled dataset straight from the CSV files.
# make_csv_dataset already batches and shuffles records, so no extra
# .shuffle()/.batch() is chained afterwards: that would shuffle whole batches
# and then stack 32 of them into one nested batch.
dataset = tf.data.experimental.make_csv_dataset(
    csv_file_pattern,
    batch_size=1024,  # Adjust based on your memory capacity
    label_name='target_column',  # TODO: placeholder — replace with the real label column name
    num_epochs=1,
    shuffle=True,
    shuffle_buffer_size=10000,
    ignore_errors=True
)

# Example: inspect the first batch only, to avoid flooding the cell output
for batch in dataset.take(1):
    print(batch)


In [None]:
import os

import dask.dataframe as dd
import pandas as pd

# One processed CSV per month of 2023 (file names without the extension).
FILES_NAME = [f"2023_{month:02d}_STATIONS" for month in range(1, 13)]
PROCCESED_FOLDER = "../../data/bicing/processed"

# Collect the lazy per-month frames and concatenate once, instead of growing
# the result from an empty, column-less frame inside the loop. The loop also
# no longer assigns to `df`, so the Spark DataFrame bound to that name by an
# earlier cell is still available to the Spark cells below.
monthly_parts = []
for file_name in FILES_NAME:
    file_path = os.path.join(PROCCESED_FOLDER, f"{file_name}.csv")

    monthly_parts.append(dd.read_csv(file_path))
    print(f"File: {file_name}")

dd_2023 = dd.concat(monthly_parts)

# Non-null count per column; .compute() triggers the actual reads.
dd_2023.count().compute()

In [None]:
# dask: station-1 subset, previewing its first rows.
station_1_mask = dd_2023["station_id"] == 1
dd_2023_station_1 = dd_2023[station_1_mask]
dd_2023_station_1.head()

In [None]:
# dask: station-1 subset, counting non-null values per column.
station_1_mask = dd_2023["station_id"] == 1
dd_2023_station_1 = dd_2023[station_1_mask]
dd_2023_station_1.count().compute()

In [None]:
# dask: number of missing values in each column
missing_per_column = dd_2023.isnull().sum()
missing_per_column.compute()

In [None]:
# pandas: number of missing values in each column
df_2023.isnull().sum()

In [None]:
from pyspark.sql import functions as F

# Count NaN-or-null values per column in Spark.
df.head(3)
print(type(df))

# Column names are back-tick quoted so names containing '.' resolve to the
# flat column rather than being parsed as struct-field access.
nan_or_null_counts = []
for name in df.columns:
    column = F.col("`" + name.replace("`", "``") + "`")
    nan_or_null_counts.append(
        F.count(F.when(F.isnan(column) | column.isNull(), 1)).alias(name)
    )

df.select(nan_or_null_counts).show()

# Pandas: Parquet vs CSV

In [None]:

import glob

import pandas as pd

# The Spark CSV export is a directory of part files; read them in a stable
# (sorted) order so the combined frame is the same on every run.
csv_files = sorted(glob.glob("../../data/bicing/processed/jan_13.csv/*.csv"))
if not csv_files:
    # pd.concat([]) would otherwise fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError("No CSV part files found in ../../data/bicing/processed/jan_13.csv/")

# A comprehension keeps the notebook-wide name `df` untouched.
dfs = [pd.read_csv(part_file) for part_file in csv_files]

combined_df = pd.concat(dfs)
combined_df.count()

In [None]:

import glob

import pandas as pd

# The Spark parquet export is a directory of part files; read them in a stable
# (sorted) order so the combined frame is the same on every run.
parquet_files = sorted(glob.glob("../../data/bicing/processed/jan_13.parquet/*.parquet"))
if not parquet_files:
    # pd.concat([]) would otherwise fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError("No parquet part files found in ../../data/bicing/processed/jan_13.parquet/")

# A comprehension keeps the notebook-wide name `df` untouched.
dfs = [pd.read_parquet(part_file) for part_file in parquet_files]

combined_df = pd.concat(dfs)
combined_df.count()

In [None]:
!pip install pyarrow

In [None]:
import pandas as pd

# Load the processed October 2023 station snapshots.
october_csv = "../../data/bicing/processed/months/2023_10_STATIONS.csv"
df_2023_oct = pd.read_csv(october_csv)

In [None]:
# Missing values per column in the October frame.
df_2023_oct.isna().sum()
#df_2023_oct.dropna(inplace=True)

In [None]:
# Re-check missing values per column.
df_2023_oct.isna().sum()

In [None]:
import pandas as pd

# Reload the October 2023 file from disk and preview the first rows.
october_csv = "../../data/bicing/processed/months/2023_10_STATIONS.csv"
df_2023_oct = pd.read_csv(october_csv)
df_2023_oct.head()

In [None]:

def assign_datatypes_month_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of a monthly stations frame with its columns cast.

    `status` is cast to `str`; every other listed column is cast to `int`.
    Columns not listed are left untouched. The cast is delegated to
    `DataFrame.astype`, so the input frame is not modified.
    """
    column_types = dict((
        ('station_id', int),
        ('num_bikes_available', int),
        ('num_bikes_available_types.mechanical', int),
        ('num_bikes_available_types.ebike', int),
        ('num_docks_available', int),
        ('status', str),
        ('is_renting', int),
        ('is_returning', int),
        ('year', int),
        ('month', int),
        ('day', int),
        ('hour', int),
        ('grouped_minute', int),
        ('day_of_week', int),
        ('is_weekend', int),
    ))
    return df.astype(column_types)

In [None]:
# Apply the column dtypes to the October frame and preview the result.
df_2023_oct = df_2023_oct.pipe(assign_datatypes_month_df)
df_2023_oct.head()

In [None]:
# Find the busiest days: sum, per day, the absolute change in available bikes
# between consecutive rows of each station.
# NOTE(review): `diff` works on row order, so this assumes each station's rows
# are already in chronological order — confirm.
# `.abs()` is vectorised and, unlike `apply(lambda x: abs(x))`, does not depend
# on the built-in `abs` being unshadowed in the notebook namespace.
bike_delta = df_2023_oct.groupby(['station_id'])['num_bikes_available'].diff()
df_2023_oct['bike_available_diff'] = bike_delta.abs()
df_month_busiest_days = df_2023_oct.groupby('day').agg({'bike_available_diff': 'sum'}).reset_index()
df_month_busiest_days.sort_values(by='bike_available_diff', ascending=False).head(20)

In [None]:
# Inspect the October rows of station 46.
df_2023_oct.loc[df_2023_oct['station_id'].eq(46)]

In [None]:
# Text label "<year>-<month>-<day> <hour>:<minute>" for each snapshot, with the
# minute left-padded to two digits. `.str.zfill(2)` replaces the per-row lambda
# that did the same padding one value at a time.
date_text = (
    df_2023_oct['year'].astype(str) + '-'
    + df_2023_oct['month'].astype(str) + '-'
    + df_2023_oct['day'].astype(str)
)
minute_text = df_2023_oct['grouped_minute'].astype(str).str.zfill(2)
df_2023_oct['grouped_date'] = date_text + ' ' + df_2023_oct['hour'].astype(str) + ':' + minute_text

In [None]:
# Keep only the snapshots of 6 October.
df_2023_oct_6 = df_2023_oct.loc[df_2023_oct['day'].eq(6)]

In [None]:
# Attach station metadata to every 6 October snapshot. The left join keeps
# snapshots whose station_id has no match in the locations file.
stations_csv = '../../data/bicing/processed/2024_STATION_LOCATIONS.csv'
df_stations = pd.read_csv(stations_csv)
df_month_busiest_days = pd.merge(df_2023_oct_6, df_stations, on='station_id', how='left')
df_month_busiest_days.head(3)

In [None]:
# A station can take a bike back when it has at least one free dock AND it is
# accepting returns. Vectorised comparisons replace the per-row lambdas.
has_free_dock = df_month_busiest_days['num_docks_available'].ne(0)
accepts_returns = df_month_busiest_days['is_returning'].ne(0)
df_month_busiest_days['docking_available'] = has_free_dock & accepts_returns

In [None]:
# A station can rent a bike out when it has at least one bike AND it is renting.
# (The stray trailing `b` that made this cell a syntax error is removed;
# vectorised comparisons replace the per-row lambdas.)
has_bike = df_month_busiest_days['num_bikes_available'].ne(0)
is_renting = df_month_busiest_days['is_renting'].ne(0)
df_month_busiest_days['bikes_available'] = has_bike & is_renting

In [None]:
# FIXME: unfinished statement — `.to` is only the start of a method name
# (perhaps `to_csv(...)` was intended; confirm). Presumably fails with an
# AttributeError as written; complete or delete this cell.
df_month_busiest_days.to

In [None]:
import plotly.express as px

# Animated map of bike availability per station, one frame per `grouped_date`.
fig = px.scatter_mapbox(
    df_month_busiest_days, lat="lat", lon="lon", color="bikes_available",  # size="num_bikes_available",size_max=2,
    color_discrete_map={0: "red", 1: "green"},
    hover_name="address",
    hover_data=["station_id", "altitude", "post_code", "capacity", "num_bikes_available"],
    title="October's busiest day Bikes Availability",
    animation_frame='grouped_date',
    zoom=11.5, height=630, width=600,
    category_orders={"bikes_available": [0, 1]},  # Explicitly setting the order of categories
)

fig.update_layout(mapbox_style="open-street-map",
                  margin={"r": 0, "t": 80, "l": 0, "b": 0},
                  mapbox={"center": {"lat": 41.40484, "lon": 2.17482}})

# Per-frame title showing the time of day. The month now matches the figure
# title (the data is October; the frame titles previously said "January").
# Splitting on the space avoids the leading blank that `[-5:]` left for
# single-digit hours.
for frame in fig.frames:
    time_frame = frame.name.split(" ")[-1]
    frame.layout.title = "October's busiest day Bikes Availability At: {}".format(time_frame)

fig.show()

In [None]:
import plotly.express as px

# Animated map of dock availability per station, one frame per `grouped_date`.
fig = px.scatter_mapbox(
    df_month_busiest_days, lat="lat", lon="lon", color="docking_available",
    color_discrete_map={0: "red", 1: "green"},
    hover_name="address",
    hover_data=["station_id", "altitude", "post_code", "capacity"],
    title="October's busiest day Docking Availability",
    animation_frame='grouped_date',
    zoom=11.5, height=630, width=600,
    category_orders={"docking_available": [0, 1]},  # Explicitly setting the order of categories
)

fig.update_layout(mapbox_style="open-street-map",
                  margin={"r": 0, "t": 80, "l": 0, "b": 0},
                  mapbox={"center": {"lat": 41.40484, "lon": 2.17482}})

# Per-frame title showing the time of day. The month now matches the figure
# title (the data is October; the frame titles previously said "January").
# Splitting on the space avoids the leading blank that `[-5:]` left for
# single-digit hours.
for frame in fig.frames:
    time_frame = frame.name.split(" ")[-1]
    frame.layout.title = "October's busiest day Docking Availability At: {}".format(time_frame)

fig.show()

In [None]:
import plotly.express as px
import plotly.subplots as sp

map_center = {"lat": 41.40484, "lon": 2.17482}

# (colour column, label used in titles, extra hover columns)
panel_specs = [
    ("bikes_available", "Bikes", ["num_bikes_available"]),
    ("docking_available", "Docking", []),
]

# Build the two animated single-map figures from one loop instead of two
# copy-pasted blocks.
panel_figures = []
for color_column, label, extra_hover in panel_specs:
    panel = px.scatter_mapbox(
        df_month_busiest_days, lat="lat", lon="lon", color=color_column,
        color_discrete_map={0: "red", 1: "green"},
        hover_name="address",
        hover_data=["station_id", "altitude", "post_code", "capacity"] + extra_hover,
        title=f"October's busiest day {label} Availability",
        animation_frame='grouped_date',
        zoom=11.5, height=630, width=600,
        category_orders={color_column: [0, 1]},  # Explicitly setting the order of categories
    )
    panel.update_layout(mapbox_style="open-street-map",
                        margin={"r": 0, "t": 80, "l": 0, "b": 0},
                        mapbox={"center": map_center})
    # Frame titles carry the time of day; the month matches the data (October).
    for frame in panel.frames:
        time_frame = frame.name.split(" ")[-1]
        frame.layout.title = f"October's busiest day {label} Availability At: {time_frame}"
    panel_figures.append(panel)

fig1, fig2 = panel_figures

# Create a subplot layout. This now runs once, after the loop: it was indented
# inside the frame loop and so was rebuilt and shown for every frame. Map
# traces also need map-typed subplot cells, hence `specs`.
fig = sp.make_subplots(rows=1, cols=2,
                       specs=[[{"type": "scattermapbox"}, {"type": "scattermapbox"}]])

# Add every colour group of each figure, not just the first trace.
for trace in fig1.data:
    fig.add_trace(trace, row=1, col=1)
for trace in fig2.data:
    fig.add_trace(trace, row=1, col=2)

# Style, centre and zoom are per-map settings; apply them to both subplots.
fig.update_mapboxes(style="open-street-map", center=map_center, zoom=11.5)
fig.update_layout(height=630, width=1200, title_text="October's busiest day Bikes and Docking Availability")

fig.show()



In [None]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

map_center = {"lat": 41.40484, "lon": 2.17482}

# (colour column, label used in titles, extra hover columns)
panel_specs = [
    ("bikes_available", "Bikes", ["num_bikes_available"]),
    ("docking_available", "Docking", []),
]

# Build the two animated single-map figures from one loop instead of two
# copy-pasted blocks.
panel_figures = []
for color_column, label, extra_hover in panel_specs:
    panel = px.scatter_mapbox(
        df_month_busiest_days, lat="lat", lon="lon", color=color_column,
        color_discrete_map={0: "red", 1: "green"},
        hover_name="address",
        hover_data=["station_id", "altitude", "post_code", "capacity"] + extra_hover,
        title=f"October's busiest day {label} Availability",
        animation_frame='grouped_date',
        zoom=11.5, height=630, width=600,
        category_orders={color_column: [0, 1]},  # Explicitly setting the order of categories
    )
    panel.update_layout(mapbox_style="open-street-map",
                        margin={"r": 0, "t": 80, "l": 0, "b": 0},
                        mapbox={"center": map_center})
    # Frame titles carry the time of day; the month matches the data (October).
    for frame in panel.frames:
        time_frame = frame.name.split(" ")[-1]
        frame.layout.title = f"October's busiest day {label} Availability At: {time_frame}"
    panel_figures.append(panel)

fig1, fig2 = panel_figures

# Create a subplot layout
fig = make_subplots(rows=1, cols=2, subplot_titles=("Bikes Availability", "Docking Availability"),
                    specs=[[{"type": "scattermapbox"}, {"type": "scattermapbox"}]])

# Add the two figures to the subplot layout
for trace in fig1.data:
    fig.add_trace(trace, row=1, col=1)

for trace in fig2.data:
    fig.add_trace(trace, row=1, col=2)

# Only traces are copied above; the map style, centre and zoom set on
# fig1/fig2 are not. Configure both map subplots of the combined figure.
fig.update_mapboxes(style="open-street-map", center=map_center, zoom=11.5)

# Update the layout
fig.update_layout(height=700, width=1300, title_text="October's Busiest Day: Bikes and Docking Availability")

# Show the combined figure
fig.show()


In [None]:

import os

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np

# The Mapbox access token is read from the environment rather than being
# hard-coded in the notebook.
mapboxtoken = os.environ.get("MAPBOX_TOKEN")
if not mapboxtoken:
    raise RuntimeError("Set the MAPBOX_TOKEN environment variable to render the custom Mapbox style.")
mapboxstyle = 'mapbox://styles/jpalex123/cl7sp1tou001q14qtcjt5ndhn'

# Example dataset for trying out two map subplots. It gets its own name so the
# notebook-wide `df` is not overwritten.
df_sites = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/Nuclear%20Waste%20Sites%20on%20American%20Campuses.csv')
site_lat = df_sites.lat
site_lon = df_sites.lon
locations_name = df_sites.text


fig = make_subplots(rows=1, cols=2, subplot_titles=("Bikes Availability", "Docking Availability"),
                    specs=[[{"type": "scattermapbox"}, {"type": "scattermapbox"}]])

# The same marker trace is drawn in both columns.
for subplot_col in (1, 2):
    fig.add_trace(go.Scattermapbox(
            lat=site_lat,
            lon=site_lon,
            mode='markers',
            marker=go.scattermapbox.Marker(
                size=17,
                color='rgb(255, 0, 0)',
                opacity=0.7
            ),
            text=locations_name,
            hoverinfo='text',
        ), row=1, col=subplot_col)

# Token, style, zoom and centre are per-map settings; previously only the
# first map received them, leaving the second one unconfigured.
fig.update_mapboxes(accesstoken=mapboxtoken,
                    zoom=1,
                    style=mapboxstyle,
                    center=go.layout.mapbox.Center(lat=np.mean(site_lat), lon=np.mean(site_lon)))

fig.show()

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# `add_scattermapbox` takes graph_objects trace properties (coordinate arrays,
# marker settings), not plotly-express arguments such as `color=`,
# `animation_frame=` or a DataFrame. A graph_objects trace has no animation of
# its own, so this draws one snapshot: the first `grouped_date` in the frame.
# NOTE(review): the choice of snapshot is an assumption — confirm the intent.
first_slot = df_month_busiest_days["grouped_date"].iloc[0]
snapshot = df_month_busiest_days[df_month_busiest_days["grouped_date"] == first_slot]

# Map traces need map-typed subplot cells.
fig = make_subplots(rows=1, cols=2,
                    specs=[[{"type": "scattermapbox"}, {"type": "scattermapbox"}]])

fig.add_scattermapbox(
    lat=snapshot["lat"], lon=snapshot["lon"],
    mode="markers",
    # red = no docking available, green = docking available
    marker={"color": snapshot["docking_available"].map({False: "red", True: "green"})},
    hovertext=snapshot["address"],
    name="docking_available",
    row=1, col=1,
)

fig.update_mapboxes(style="open-street-map",
                    center={"lat": 41.40484, "lon": 2.17482},
                    zoom=11.5)
fig.update_layout(title_text="October's busiest day Docking Availability",
                  height=630, width=600)
fig.show()

In [None]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Create the first scatter_mapbox figure
fig1 = px.scatter_mapbox(df_month_busiest_days, lat="lat", lon="lon", color="docking_available",
                         color_discrete_map={0: "red", 1: "green"},
                         hover_name="address", hover_data=["station_id", "altitude", "post_code", "capacity"],
                         title="October's busiest day Docking Availability",
                         animation_frame='grouped_date',
                         zoom=11.5, height=630, width=600,
                         category_orders={"docking_available": [0, 1]})

# Extract traces from the first figure
traces1 = list(fig1.data)

# Create the second scatter_mapbox figure
fig2 = px.scatter_mapbox(df_month_busiest_days, lat="lat", lon="lon", color="bikes_available",
                         color_discrete_map={0: "red", 1: "green"},
                         hover_name="address", hover_data=["station_id", "altitude", "post_code", "capacity", "num_bikes_available"],
                         title="October's busiest day Bikes Availability",
                         animation_frame='grouped_date',
                         zoom=11.5, height=630, width=600,
                         category_orders={"bikes_available": [0, 1]})

# Extract traces from the second figure
traces2 = list(fig2.data)

# Create a subplot layout
fig = make_subplots(rows=1, cols=2, subplot_titles=("Docking Availability", "Bikes Availability"),
                    specs=[[{"type": "scattermapbox"}, {"type": "scattermapbox"}]])

# Add the first figure traces to the first subplot
for trace in traces1:
    fig.add_trace(trace, row=1, col=1)

# Add the second figure traces to the second subplot
for trace in traces2:
    fig.add_trace(trace, row=1, col=2)

# `mapbox_style=` / `mapbox=` reach only the first map subplot, and the zoom
# given to px is not carried over with the traces. `update_mapboxes` applies
# style, centre and zoom to both maps.
fig.update_mapboxes(style="open-street-map",
                    center=dict(lat=41.40484, lon=2.17482),
                    zoom=11.5)

# Update the layout
fig.update_layout(margin={"r": 0, "t": 80, "l": 0, "b": 0},
                  height=700, width=1200,
                  title_text="October's Busiest Day: Bikes and Docking Availability")

# Show the combined figure
fig.show()

In [None]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Create the first scatter_mapbox figure
fig1 = px.scatter_mapbox(df_month_busiest_days, lat="lat", lon="lon", color="docking_available",
                         color_discrete_map={0: "red", 1: "green"},
                         hover_name="address", hover_data=["station_id", "altitude", "post_code", "capacity"],
                         title="October's busiest day Docking Availability",
                         animation_frame='grouped_date',
                         zoom=11.5, height=630, width=600,
                         category_orders={"docking_available": [0, 1]})

# Create the second scatter_mapbox figure
fig2 = px.scatter_mapbox(df_month_busiest_days, lat="lat", lon="lon", color="bikes_available",
                         color_discrete_map={0: "red", 1: "green"},
                         hover_name="address", hover_data=["station_id", "altitude", "post_code", "capacity", "num_bikes_available"],
                         title="October's busiest day Bikes Availability",
                         animation_frame='grouped_date',
                         zoom=11.5, height=630, width=600,
                         category_orders={"bikes_available": [0, 1]})

# Create a subplot layout
fig = make_subplots(rows=1, cols=2, subplot_titles=("Docking Availability", "Bikes Availability"),
                    specs=[[{"type": "scattermapbox"}, {"type": "scattermapbox"}]])

# Base traces: fig1 goes to the left map, fig2 to the right map.
for trace in fig1.data:
    fig.add_trace(trace, row=1, col=1)

for trace in fig2.data:
    fig.add_trace(trace, row=1, col=2)

# Position of each base trace inside `fig.data`, keyed by trace name.
left_slots = {trace.name: index for index, trace in enumerate(fig1.data)}
right_slots = {trace.name: len(fig1.data) + index for index, trace in enumerate(fig2.data)}


def combine_frames(left_frame, right_frame):
    """Merge one frame of each source figure into a single frame for `fig`.

    Every frame trace is copied, pointed at the map subplot of the base trace
    it animates, and paired with that base trace's index through
    `Frame.traces`. This replaces the earlier approach, which added every
    frame's traces to the figure as extra static traces and then concatenated
    the two frame lists so that both targeted the left map.
    """
    data, targets = [], []
    for frame, slots in ((left_frame, left_slots), (right_frame, right_slots)):
        for trace in frame.data:
            slot = slots.get(trace.name)
            if slot is None:
                # Colour group absent from the first frame: no base trace to animate.
                continue
            data.append(go.Scattermapbox(trace).update(subplot=fig.data[slot].subplot))
            targets.append(slot)
    return go.Frame(data=data, traces=targets, name=left_frame.name)


# Both source figures are animated over the same `grouped_date` column of the
# same DataFrame, so their frames line up pairwise.
fig.frames = [combine_frames(left, right) for left, right in zip(fig1.frames, fig2.frames)]

# Reuse the slider and play/pause controls px built for fig1; their steps refer
# to frames by name, and the combined frames keep those names.
fig.layout.sliders = fig1.layout.sliders
fig.layout.updatemenus = fig1.layout.updatemenus

# Style, centre and zoom are per-map settings; apply them to both subplots.
fig.update_mapboxes(style="open-street-map",
                    center=dict(lat=41.40484, lon=2.17482),
                    zoom=11.5)

# Update the layout
fig.update_layout(margin={"r": 0, "t": 80, "l": 0, "b": 0},
                  height=700, width=1200,
                  title_text="October's Busiest Day: Bikes and Docking Availability")

# Show the combined figure
fig.show()


In [None]:
# Persist the merged 6 October frame (station snapshots + locations) as CSV.
busiest_day_csv = '../../data/bicing/processed/months/days/1006_October_busiest_day.csv'
df_month_busiest_days.to_csv(busiest_day_csv)

In [None]:
#plotly express version
# FIXME: unfinished expression — `px.` has no attribute name after the dot, so
# this cell does not parse. Complete the call or delete the cell.
px.