In [1]:
import os

import geopandas as gpd
import matplotlib
import requests

In [32]:
# The download URL is BASE_URL followed by the archive file name for the month.
BASE_URL = "https://s3.amazonaws.com/tripdata/"
month = "202412-citibike-tripdata.zip"

# The (connect, read) timeout keeps the cell from hanging forever on a stalled
# connection. raise_for_status() surfaces an HTTP error here instead of letting
# an error page be handed to the zip reader further down.
latest_data = requests.get(BASE_URL + month, timeout=(10, 300))
latest_data.raise_for_status()

In [60]:
from decimal import Decimal
from io import BytesIO
import pandas as pd
from zipfile import ZipFile

# Read the zipped CSV from the HTTP response body into a DataFrame.

zip_data = BytesIO(latest_data.content)

# Open the zip file
with ZipFile(zip_data) as zip_file:
    # Only the first ".csv" member is loaded; any further CSV members in the
    # archive are ignored.
    csv_filename = next(
        (name for name in zip_file.namelist() if name.endswith(".csv")), None
    )
    if csv_filename is None:
        # Fail loudly instead of silently carrying an empty table into the analysis.
        raise ValueError("No .csv member found in the downloaded archive")
    with zip_file.open(csv_filename) as csv_file:
        origin_table_do_not_modify = pd.read_csv(
            csv_file,
            # Station IDs such as "6498.10" stay strings so the trailing zero survives.
            dtype={"start_station_id": str, "end_station_id": str},
            # Decimal keeps every digit of start_lat exactly as written in the file.
            converters={"start_lat": Decimal},
        )


In [61]:
# Preview the loaded trips table.
origin_table_do_not_modify

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,B44E5B10AEE58AD0,classic_bike,2024-12-14 10:58:18.153,2024-12-14 11:11:11.308,Frederick Douglass Blvd & W 145 St,7954.12,E 138 St & 5 Ave,7809.13,40.823061,-73.941928,40.814490,-73.936153,member
1,BC252DC6A6011556,electric_bike,2024-12-12 14:46:12.473,2024-12-12 16:45:37.777,Madison Ave & E 99 St,7443.01,,,40.78948541553215,-73.952429,40.780000,-73.960000,member
2,6FBE55EF6FE8736D,electric_bike,2024-12-11 07:55:18.770,2024-12-11 08:02:23.460,Columbia St & Kane St,4422.05,,,40.68763155,-74.001626,40.690000,-74.000000,member
3,908890DE7FDCF9FE,electric_bike,2024-12-09 22:51:11.668,2024-12-09 22:57:43.495,E 13 St & 2 Ave,5820.08,E 10 St & 2 Ave,5746.02,40.73153937464073,-73.985302,40.729708,-73.986598,member
4,D5D366379A4DC0A8,classic_bike,2024-12-10 18:48:40.063,2024-12-10 19:10:32.264,11 Ave & W 41 St,6726.01,E 25 St & 1 Ave,6004.07,40.76030096,-73.998842,40.738177,-73.977387,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,4D7A0F3A9B538327,classic_bike,2024-12-06 18:43:51.866,2024-12-06 18:50:29.033,5 Ave & E 30 St,6248.08,10 Ave & W 28 St,6459.04,40.7459849,-73.986295,40.750664,-74.001768,member
999996,93C022D486F87ABC,classic_bike,2024-12-10 10:34:58.071,2024-12-10 10:51:49.151,Lafayette St & Grand St,5422.09,10 Ave & W 28 St,6459.04,40.72028,-73.998790,40.750664,-74.001768,member
999997,20A11C486859F19B,electric_bike,2024-12-03 14:02:29.375,2024-12-03 14:07:51.452,Lenox Ave & W 117 St,7655.22,W 110 St & Amsterdam Ave,7646.04,40.8025566,-73.949078,40.802692,-73.962950,member
999998,4D27B49621858BF9,electric_bike,2024-12-05 07:03:08.210,2024-12-05 07:06:03.572,Watts St & Greenwich St,5578.02,West St & Chambers St,5329.03,40.72405549,-74.009660,40.717548,-74.013221,casual


In [62]:
# Trips per end_station_id, busiest destination first
origin_table_do_not_modify.end_station_id.value_counts()

end_station_id
6140.05    4945
5905.14    4257
5905.12    3944
6450.05    3939
6492.08    3918
           ... 
7922.01       1
3278.07       1
4432.10       1
5523.02       1
4157.15       1
Name: count, Length: 2148, dtype: int64

In [63]:
# Distinct end latitudes recorded for trips that end at station JC005
origin_table_do_not_modify.loc[
    origin_table_do_not_modify["end_station_id"] == "JC005", 'end_lat'
].unique()

array([], dtype=float64)

### Missing start_station_id / end_station_id

In [64]:
# About 0.45% of trips do not have a proper end station.
# Computed straight from the source table so this cell does not depend on
# `null_end`, which is only defined in a later cell.
100 * int(origin_table_do_not_modify["end_station_id"].isna().sum()) / origin_table_do_not_modify.shape[0]

0.4475

In [65]:
# Save the trips that are missing a station ID as CSV files under ./output.
# Create the directory first so to_csv does not fail when it is absent.
os.makedirs("./output", exist_ok=True)

# Entries with null start_station_id
null_start = origin_table_do_not_modify[origin_table_do_not_modify.start_station_id.isna()]
null_start.to_csv("./output/null_start.csv")

# Entries with null end_station_id
null_end = origin_table_do_not_modify[origin_table_do_not_modify.end_station_id.isna()]
null_end.to_csv("./output/null_end.csv")

### Coordinate length

Do coordinates recorded with more than 6 decimal places have any significance? Are they consistent for a given station?

In [66]:
# Distinct start latitudes recorded for station 5470.12.
# Earlier observation: printing here showed only 8 decimals, while the CSV showed 13.
origin_table_do_not_modify.loc[
    origin_table_do_not_modify['start_station_id'] == '5470.12', 'start_lat'
].unique()

array([Decimal('40.72019521437465')], dtype=object)

In [67]:
# Keep only the trips where both the start and the end station ID are present
non_null_station_ids = origin_table_do_not_modify[
    origin_table_do_not_modify['start_station_id'].notna()
    & origin_table_do_not_modify['end_station_id'].notna()
]

In [83]:
# Number of distinct start_lat values per start station, largest first
unique_start_lat_counts = non_null_station_ids.groupby('start_station_id')['start_lat'].nunique().sort_values(ascending=False)
# Only the last expression of a cell is rendered automatically, so the
# stations with more than one latitude are displayed explicitly.
display(unique_start_lat_counts[unique_start_lat_counts > 1])

# How often each start_lat value occurs for station 5788.13
non_null_station_ids[non_null_station_ids.start_station_id == '5788.13'].start_lat.value_counts()

start_lat
40.73020660529954    2965
40.730251551            1
40.730248094            1
40.730335832            1
40.730383039            1
Name: count, dtype: int64

In [81]:
# All trips that start at station 5788.13
non_null_station_ids.loc[non_null_station_ids['start_station_id'] == '5788.13']

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
186616,231646FC94B73EF6,electric_bike,2024-12-11 12:40:27.016,2024-12-11 12:47:00.806,Lafayette St & E 8 St,5788.13,E 30 St & Park Ave S,6206.08,40.73020660529954,-73.991026,40.744449,-73.983035,member
186996,4480B979137BE110,electric_bike,2024-12-05 18:15:42.099,2024-12-05 18:22:48.953,Lafayette St & E 8 St,5788.13,E 30 St & Park Ave S,6206.08,40.73020660529954,-73.991026,40.744449,-73.983035,member
187007,3CD706A0A34DF087,electric_bike,2024-12-13 01:01:27.099,2024-12-13 01:15:42.521,Lafayette St & E 8 St,5788.13,E 56 St & 3 Ave,6691.11,40.73020660529954,-73.991026,40.759345,-73.967597,member
187008,674CC9A4BB2A2848,electric_bike,2024-12-07 21:53:56.683,2024-12-07 22:07:52.691,Lafayette St & E 8 St,5788.13,E 47 St & 2 Ave,6498.10,40.73020660529954,-73.991026,40.753231,-73.970325,member
187051,399464A3B2A7CB64,classic_bike,2024-12-10 09:07:32.414,2024-12-10 09:37:54.115,Lafayette St & E 8 St,5788.13,E 47 St & 2 Ave,6498.10,40.73020660529954,-73.991026,40.753231,-73.970325,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...
991804,AD32FBF5E30E3535,electric_bike,2024-12-09 10:31:22.737,2024-12-09 10:38:04.256,Lafayette St & E 8 St,5788.13,Barrow St & Hudson St,5805.05,40.73020660529954,-73.991026,40.731724,-74.006744,casual
991825,9036374077C360C1,classic_bike,2024-12-06 11:46:59.337,2024-12-06 11:58:18.564,Lafayette St & E 8 St,5788.13,Barrow St & Hudson St,5805.05,40.73020660529954,-73.991026,40.731724,-74.006744,member
991915,3EFC32EA7B03382A,electric_bike,2024-12-06 17:22:26.843,2024-12-06 17:38:58.428,Lafayette St & E 8 St,5788.13,Court St & State St,4488.08,40.73020660529954,-73.991026,40.690238,-73.992031,member
991949,77A6A2343585CF4A,electric_bike,2024-12-08 19:37:34.016,2024-12-08 19:44:21.128,Lafayette St & E 8 St,5788.13,Barrow St & Hudson St,5805.05,40.73020660529954,-73.991026,40.731724,-74.006744,casual


In [8]:
def remove_null_stations(df):
    """Return the rows of ``df`` that have both a start and an end station ID.

    Rows where ``start_station_id`` or ``end_station_id`` is null are dropped;
    ``df`` itself is not changed.
    """
    has_start = df.start_station_id.notna()
    has_end = df.end_station_id.notna()
    return df[has_start & has_end]

In [9]:
def reformat_station_ids(row):
    """Return a copy of ``row`` whose station IDs use ``_`` instead of ``.``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide string values under ``start_station_id`` and
        ``end_station_id`` and support ``.copy()``.

    Returns
    -------
    A copy of ``row`` with every ``.`` in both station IDs replaced by ``_``.

    Raises
    ------
    AttributeError
        If either station ID has no ``replace`` method (e.g. a float NaN).
    """
    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object they are handed.
    formatted = row.copy()
    formatted["start_station_id"] = formatted["start_station_id"].replace(".", "_")
    formatted["end_station_id"] = formatted["end_station_id"].replace(".", "_")
    return formatted

In [10]:
# Drop trips that lack a start or end station, then rewrite both station ID
# columns with "_" in place of ".".
trips_wo_null_stations = remove_null_stations(origin_table_do_not_modify)
# A vectorised literal replacement (regex=False) gives the same IDs as
# rewriting each row in Python, without a per-trip function call.
trips_formatted = trips_wo_null_stations.assign(
    start_station_id=trips_wo_null_stations["start_station_id"].str.replace(
        ".", "_", regex=False
    ),
    end_station_id=trips_wo_null_stations["end_station_id"].str.replace(
        ".", "_", regex=False
    ),
)

In [11]:
# Number of distinct station names recorded for each start_station_id
name_counts = origin_table_do_not_modify.groupby("start_station_id")[
    "start_station_name"
].nunique()
# IDs with the most distinct names come first; a count above 1 means the ID
# appears under several names
print(name_counts.sort_values(ascending=False).head(10))


# Get all station names for start_station_id == 4968.03
# The ID column is read as strings, so compare against the string "4968.03";
# comparing with the float 4968.03 never matches and always yields an empty result.
station_names = origin_table_do_not_modify[origin_table_do_not_modify["start_station_id"] == "4968.03"][
    "start_station_name"
].unique()
station_names

start_station_id
HB101    1
JC075    1
JC072    1
JC066    1
JC065    1
JC063    1
JC059    1
JC057    1
JC056    1
JC055    1
Name: start_station_name, dtype: int64


array([], dtype=object)

In [15]:
# All trips whose start station is named "W 35 St & 9 Ave"
origin_table_do_not_modify.loc[
    origin_table_do_not_modify["start_station_name"] == "W 35 St & 9 Ave"
]

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual


In [16]:
# All trips whose start_station_id equals "6569.09_"
# NOTE(review): the literal ends with an underscore and the recorded output is
# empty; confirm the trailing "_" is intended.
origin_table_do_not_modify.loc[
    origin_table_do_not_modify["start_station_id"] == "6569.09_"
]

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual


In [17]:
# Number of distinct station IDs recorded for each start_station_name
name_counts = origin_table_do_not_modify.groupby("start_station_name")[
    "start_station_id"
].nunique()
# Names with the most distinct IDs come first; a count above 1 means one name
# is shared by several IDs
print(name_counts.sort_values(ascending=False).head(50))


# Station IDs recorded under the name "W 35 St & 9 Ave". Ending the cell with
# the variable displays the result instead of discarding it.
station_names = origin_table_do_not_modify[
    origin_table_do_not_modify["start_station_name"] == "W 35 St & 9 Ave"
]["start_station_id"].unique()
station_names

start_station_name
11 St & Washington St                           1
McGinley Square                                 1
Manila & 1st                                    1
Mama Johnson Field - 4 St & Jackson St          1
Madison St & 10 St                              1
Madison St & 1 St                               1
Lincoln Park                                    1
Liberty Light Rail                              1
Leonard Gordon Park                             1
Lafayette Park                                  1
Journal Square                                  1
Jersey & 6th St                                 1
Jersey & 3rd                                    1
Jackson Square                                  1
JC Medical Center                               1
Hoboken Terminal - River St & Hudson Pl         1
Hoboken Terminal - Hudson St & Hudson Pl        1
Marin Light Rail                                1
Monmouth and 6th                                1
14 St Ferry - 14 St & Shipyard 

In [18]:
# List the column labels of the trips table.
origin_table_do_not_modify.columns

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual'],
      dtype='object')

In [19]:
# Convert all start_station_id values to strings while keeping missing values missing.
# A plain astype(str) turns NaN into the literal string "nan", after which
# isna()-based null checks no longer recognise those rows as missing.
origin_table_do_not_modify["start_station_id"] = (
    origin_table_do_not_modify["start_station_id"]
    .astype(str)
    .where(origin_table_do_not_modify["start_station_id"].notna())
)

In [21]:
# (rows, columns) of the trips that start at station 7293.10
origin_table_do_not_modify.loc[
    origin_table_do_not_modify["start_station_id"] == "7293.10"
].shape

(0, 13)

In [22]:
# One row per start_station_id; the name, latitude and longitude come from the
# first trip encountered for that station
stations = trips_formatted.loc[
    :, ["start_station_id", "start_station_name", "start_lat", "start_lng"]
].drop_duplicates(subset="start_station_id")

In [23]:
# Turn `stations` into a GeoDataFrame with point geometries in EPSG:4326
# (x = start_lng, y = start_lat) and expose the station columns as `id` and `name`
stations_gdf = gpd.GeoDataFrame(
    stations,
    geometry=gpd.points_from_xy(stations.start_lng, stations.start_lat),
    crs="EPSG:4326",
).rename(columns={"start_station_id": "id", "start_station_name": "name"})

In [24]:
# Write the station points as GeoJSON into the frontend project's map_data folder.
# Create the folder first, as is done for station_data, so the export does not
# fail when it is absent.
os.makedirs("../frontend-citi-bike-data/public/map_data", exist_ok=True)
stations_gdf.to_file(
    "../frontend-citi-bike-data/public/map_data/stations.geojson", driver="GeoJSON"
)

In [25]:
import os

# Count the trips for every `start_station_id`/`end_station_id` pair
station_pairs = (
    trips_formatted.groupby(["start_station_id", "end_station_id"])
    .size()
    .reset_index(name="count")
)
# Write one CSV per `start_station_id`, named after the ID, listing each
# `end_station_id` with its trip `count`.
# exist_ok=True replaces the separate existence check.
os.makedirs("../frontend-citi-bike-data/public/station_data", exist_ok=True)
# Iterating the groups walks the table once instead of re-filtering the whole
# frame for every station.
for station_id, pairs_for_station in station_pairs.groupby("start_station_id"):
    pairs_for_station[["end_station_id", "count"]].to_csv(
        f"../frontend-citi-bike-data/public/station_data/{station_id}.csv", index=False
    )

In [26]:
# Given a station_id, fetch the appropriate file and return the contents as a dataframe
def fetch_station_pairs(station_id):
    """Read the per-station CSV for ``station_id`` and return it as a DataFrame.

    The file is looked up under the frontend project's ``station_data``
    folder as ``<station_id>.csv``.
    """
    csv_path = f"../frontend-citi-bike-data/public/station_data/{station_id}.csv"
    return pd.read_csv(csv_path)

In [27]:
# Load the per-station file for station 8841_03.
fetch_station_pairs("8841_03")

Unnamed: 0,end_station_id,count
0,8118_02,1
1,8330_05,1
2,8505_08,1
3,8558_01,1
4,8601_04,2
5,8672_05,1
6,8682_06,1
