In [1]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
import geopandas as gpd
from shapely.geometry import Point


Load CITI Bike Data

In [1]:
#base dir path
base_dir = Path("data")

monthly_dfs = {}

#loop through each month and read all folders in that month
for month in range(1, 13):
    #file name formatted based on citi bike format
    folder_name = f"2024{month:02d}-citibike-tripdata"
    folder_path = base_dir / folder_name
    
    #sanity check
    if not folder_path.exists():
        print(f"Missing folder: {folder_path}")
        continue

    monthly_df_list = []

    #look thorugh all files in folder
    for file in folder_path.glob("*.csv"):
        try:
            df = pd.read_csv(file)
            monthly_df_list.append(df)
        except Exception as e:
            print(f"Failed to read {file}: {e}")

#make month list
    if monthly_df_list:
        full_month_df = pd.concat(monthly_df_list, ignore_index=True)
        monthly_dfs[folder_name] = full_month_df
        #print statement indicating proper month loading
        print(f"{folder_name} loaded with {len(full_month_df)} rows.")
    else:
        print(f"No data loaded for {folder_name}")



  df = pd.read_csv(file)


202401-citibike-tripdata loaded with 1888085 rows.


  df = pd.read_csv(file)


202402-citibike-tripdata loaded with 2121501 rows.


  df = pd.read_csv(file)


202403-citibike-tripdata loaded with 2663295 rows.


  df = pd.read_csv(file)


202404-citibike-tripdata loaded with 3217063 rows.


  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)


202405-citibike-tripdata loaded with 4230360 rows.


  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)


202406-citibike-tripdata loaded with 4783576 rows.


  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)


202407-citibike-tripdata loaded with 4722896 rows.


  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)


202408-citibike-tripdata loaded with 4603575 rows.


  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)


202409-citibike-tripdata loaded with 4997898 rows.


  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)


202410-citibike-tripdata loaded with 5150054 rows.


  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)


202411-citibike-tripdata loaded with 3710134 rows.


  df = pd.read_csv(file)
  df = pd.read_csv(file)


202412-citibike-tripdata loaded with 2311171 rows.


Create monthly ride counts for seasonal trend plot

In [4]:
#ride counts per month
ride_counts = [(month, len(df)) for month, df in monthly_dfs.items()]
monthly_counts_df = pd.DataFrame(ride_counts, columns=["month", "ride_count"])
#sort by month (just in case they're out of order)
monthly_counts_df = monthly_counts_df.sort_values("month").reset_index(drop=True)



In [5]:
#cleaning monthly df to extract month from df
monthly_counts_df['month_clean'] = monthly_counts_df['month'].str.extract(r'(2024\d{2})')
#convert to date time, and create labels
monthly_counts_df['month_dt'] = pd.to_datetime(monthly_counts_df['month_clean'], format='%Y%m')
monthly_counts_df['month_label'] = monthly_counts_df['month_dt'].dt.strftime('%b')
monthly_counts_df = monthly_counts_df.sort_values('month_dt')
monthly_counts_df["count_divide"] = monthly_counts_df["ride_count"]/1000000


In [6]:
#seasonal df use
monthly_counts_df.to_csv("monthly_counts_df.csv", index=False)


Creating Duration and Distance for July Data

In [9]:
#compute distance in km

def haversine(lat1, lon1, lat2, lon2):
    R = 6371 #earths radius
    lat1 = np.radians(lat1)
    lat2 = np.radians(lat2)
    dlat = lat2 - lat1
    dlon = np.radians(lon2 - lon1)

    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c



In [10]:
df_july = monthly_dfs["202407-citibike-tripdata"].copy()

In [11]:
df_july['started_at'] = pd.to_datetime(df_july['started_at'])
df_july['ended_at'] = pd.to_datetime(df_july['ended_at'])

#using datetime finding duration in minutes
df_july['duration_min'] = (df_july['ended_at'] - df_july['started_at']).dt.total_seconds() / 60
#using haversine to calculate distance from start and end, long and lat
df_july['distance_km'] = haversine(df_july['start_lat'], df_july['start_lng'], df_july['end_lat'], df_july['end_lng'])


In [12]:
#filtering out outlier trips, cleans up plot
july_df_filtered = df_july[(df_july['duration_min'] > 1) & (df_july['duration_min'] < 200) & 
                 (df_july['distance_km'] > 0.1) & (df_july['distance_km'] < 20)]

In [13]:
#sampling 200,000
sample_size = 200000
july_sampled = july_df_filtered.sample(n=sample_size, random_state=42)



In [14]:
#print to csv for use
july_sampled.to_csv("july_sampled.csv", index=False)


Network Creation

In [7]:
all_rides_df = pd.concat(monthly_dfs.values(), ignore_index=True)


In [None]:
all_rides_sampled = all_rides_df.sample(n=200000, random_state=42)

In [3]:
#load manhattan neighborhood geojson file
neighborhoods = gpd.read_file("data/nyc_neighborhoods.geojson")


In [19]:
#map[ing to geo points
gdf_start = gpd.GeoDataFrame(
    all_rides_sampled,
    geometry=[Point(xy) for xy in zip(all_rides_sampled.start_lng, all_rides_sampled.start_lat)],
    crs="EPSG:4326"
)
gdf_end = gpd.GeoDataFrame(
    all_rides_sampled.copy(),
    geometry=[Point(xy) for xy in zip(all_rides_sampled.end_lng, all_rides_sampled.end_lat)],
    crs="EPSG:4326"
)

In [20]:
#left joining dfs to the neighborhoods file, based on geometry
gdf_joined = gpd.sjoin(
    gdf_start,
    neighborhoods[['neighborhood', 'boroughCode', 'borough', 'geometry']],
    how="left",
    predicate="within"
)
gdf_end_joined = gpd.sjoin(
    gdf_end[['ride_id', 'geometry']],
    neighborhoods[['neighborhood', 'boroughCode', 'borough', 'geometry']],
    how="left",
    predicate="within"
)


In [21]:
#names were messy
gdf_end_joined = gdf_end_joined.rename(columns={
    'neighborhood': 'end_neighborhood',
    'boroughCode': 'end_boroughCode',
    'borough': 'end_borough'
})

final_df = gdf_joined.merge(
    gdf_end_joined[['ride_id', 'end_neighborhood', 'end_boroughCode', 'end_borough']],
    on='ride_id',
    how='left'
)
#only manhattan
filtered__ride_df = final_df[
    (final_df['boroughCode'] == "1") &
    (final_df['end_boroughCode'] == "1")
]

#getting rid of loops
fil_ride_df = filtered__ride_df[
    (filtered__ride_df['neighborhood'] != filtered__ride_df['end_neighborhood'])
]


In [22]:
fil_ride_df.to_csv("fil_ride_df.csv", index=False)
