## This script accesses the large data file containing individual journey information, queries it with SQL via duckdb and saves aggregated dataframes to be used in the further visualisations

In [1]:
import duckdb

In [2]:
# filepath to be referenced in queries
bigfile = 'C:/Data/Citibike_NY_2022/merged/df_weather_duration.parquet'

In [3]:
duckdb.query(f"""
    SELECT *
    FROM '{bigfile}'
    LIMIT 5
""")

┌──────────────────┬───────────────┬─────────────────────────┬─────────────────────────┬────────────────────────┬──────────────────┬─────────────────────────┬────────────────┬───────────────┬─────────────────────┬───────┬───────┬───────┬─────────────┬──────────────┬─────────────┬──────────────┬────────────────────┐
│     ride_id      │ rideable_type │       started_at        │        ended_at         │   start_station_name   │ start_station_id │    end_station_name     │ end_station_id │ member_casual │        date         │ AWND  │ PRCP  │ TAVG  │  start_lat  │  start_lng   │   end_lat   │   end_lng    │   trip_duration    │
│     varchar      │    varchar    │      timestamp_ns       │      timestamp_ns       │        varchar         │     varchar      │         varchar         │    varchar     │    varchar    │    timestamp_ns     │ int64 │ int64 │ int64 │   double    │    double    │   double    │    double    │       double       │
├──────────────────┼───────────────┼─────────────

In [4]:
# Creating table with top 20 stations by number of routes beginning there
duckdb.query(f"""
    COPY (
        SELECT 
            start_station_name,
            COUNT(*) AS num_trips
        FROM '{bigfile}'
        GROUP BY start_station_name
        ORDER BY num_trips DESC
        LIMIT 20
    ) TO 'C:/Data/Citibike_NY_2022/merged/top_20.csv' (FORMAT CSV, HEADER TRUE);
""")

In [None]:
# Checking if start station coordinates are consistent
duckdb.query(f"""
    SELECT 
        start_station_name,
        COUNT(DISTINCT start_lat || ',' || start_lng) AS coord_versions
    FROM '{bigfile}'
    GROUP BY start_station_name
    ORDER BY coord_versions DESC
""").to_df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,start_station_name,coord_versions
0,Rivington St & Ridge St,1
1,Pier 40 - Hudson River Park,1
2,W 4 St & 7 Ave S,1
3,Atlantic Ave & Furman St,1
4,Morningside Dr & Amsterdam Ave,1
...,...,...
1734,Lafayette Ave & St James Pl,1
1735,Hamilton Pl & W 138 St,1
1736,Humboldt St & Varet St,1
1737,E 134 St & Walnut Ave,1


In [9]:
# Creating df at the route level showing number of trips between each route
# Including coordinates so it can be used for maps
routes = duckdb.query(f"""
    SELECT 
        CONCAT(start_station_name, '-', end_station_name) AS route,
        COUNT(*) AS num_trips,
        start_station_name, 
        end_station_name,
        ANY_VALUE(start_lng) AS start_lng,
        ANY_VALUE(start_lat) AS start_lat,
        ANY_VALUE(end_lng) AS end_lng,
        ANY_VALUE(end_lat) AS end_lat
    FROM '{bigfile}'
    GROUP BY 
        start_station_name, 
        end_station_name
    ORDER BY num_trips DESC;
""").to_df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [10]:
routes.head()

Unnamed: 0,route,num_trips,start_station_name,end_station_name,start_lng,start_lat,end_lng,end_lat
0,Central Park S & 6 Ave-Central Park S & 6 Ave,12041,Central Park S & 6 Ave,Central Park S & 6 Ave,-73.976342,40.765909,-73.976342,40.765909
1,7 Ave & Central Park South-7 Ave & Central Par...,8541,7 Ave & Central Park South,7 Ave & Central Park South,-73.979069,40.766741,-73.979069,40.766741
2,Roosevelt Island Tramway-Roosevelt Island Tramway,8213,Roosevelt Island Tramway,Roosevelt Island Tramway,-73.9536,40.757284,-73.9536,40.757284
3,Grand Army Plaza & Central Park S-Grand Army P...,7287,Grand Army Plaza & Central Park S,Grand Army Plaza & Central Park S,-73.973715,40.764397,-73.973715,40.764397
4,Soissons Landing-Soissons Landing,7275,Soissons Landing,Soissons Landing,-74.014866,40.692317,-74.014866,40.692317


In [11]:
routes.shape

(1006566, 8)

In [12]:
# Saving object as df
routes.to_csv("C:/Data/Citibike_NY_2022/merged/routes.csv", index=False)

In [13]:
# Checking format of temperature
duckdb.query(f"""
    SELECT
        MIN(TAVG),
        MAX(TAVG)
    FROM '{bigfile}'
""")

┌───────────┬───────────┐
│ min(TAVG) │ max(TAVG) │
│   int64   │   int64   │
├───────────┼───────────┤
│      -117 │       313 │
└───────────┴───────────┘

In [19]:
# Query for making a df with each day as a row - showing number of trips and weather variables
df_weather = duckdb.query(f"""
    SELECT 
        date,
        ANY_VALUE(TAVG) / 10 AS temperature, --- dividing by 10 so more intuitive
        ANY_VALUE(PRCP) AS precipitation,
        ANY_VALUE(AWND) AS wind,
        COUNT(*) AS trip_count
    FROM '{bigfile}'
    GROUP BY date                       
""").to_df()

In [20]:
df_weather.head()

Unnamed: 0,date,temperature,precipitation,wind,trip_count
0,2022-06-12,20.5,20,42,90122
1,2022-07-19,28.2,0,56,119421
2,2022-01-20,5.1,64,54,33772
3,2022-08-17,23.6,79,30,120309
4,2022-08-18,24.1,0,34,121752


In [21]:
# Saving object as df
df_weather.to_csv("C:/Data/Citibike_NY_2022/merged/df_weather.csv", index=False)

Getting sample of rides for graph of membership and ride type. Excluding outlier trips (100 mins or longer)

In [6]:
df_sample_100 = duckdb.query(f"""
    SELECT *
    FROM '{bigfile}'
    WHERE trip_duration < 100
    ORDER BY hash(ride_id, 1) -- using 1 as random seed for reproducibility
    LIMIT 100000;
""").to_df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [7]:
df_sample_100.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,member_casual,date,AWND,PRCP,TAVG,start_lat,start_lng,end_lat,end_lng,trip_duration
0,3610199C7DD25EE1,electric_bike,2022-03-25 16:23:26.117,2022-03-25 16:35:23.167,W 36 St & 9 Ave,6569.07,West End Ave & W 60 St,7059.08,member,2022-03-25,48,8,96,40.754623,-73.995168,40.77237,-73.99005,11.950833
1,AC3A787C575E83FA,classic_bike,2022-05-26 07:59:23.861,2022-05-26 08:03:57.614,Nassau Ave & Russell St,5581.01,Kingsland Ave & Nassau Ave,5613.04,member,2022-05-26,45,0,164,40.72557,-73.94434,40.72577,-73.94173,4.56255
2,C5A91DE83486770F,classic_bike,2022-08-21 16:48:14.678,2022-08-21 17:15:44.486,8 Ave & W 16 St,6072.11,Broadway & W 58 St,6948.1,casual,2022-08-21,39,0,253,40.740983,-74.001702,40.766953,-73.981693,27.4968
3,95A1FD80EA93D5ED,electric_bike,2022-12-19 08:27:37.252,2022-12-19 08:45:40.088,Flushing Ave & Woodward Ave,5225.02,Franklin St & Dupont St,5944.01,member,2022-12-19,72,0,17,40.71246,-73.91873,40.73564,-73.95866,18.047267
4,FB3E05590259FD43,classic_bike,2022-06-06 12:03:49.637,2022-06-06 12:14:14.689,Broadway & W 25 St,6173.08,W 37 St & 5 Ave,6398.06,member,2022-06-06,30,0,222,40.742869,-73.989186,40.75038,-73.98339,10.417533


In [8]:
# Saving object as df
df_sample_100.to_csv("C:/Data/Citibike_NY_2022/merged/df_sample_100.csv", index=False)