<a href="https://colab.research.google.com/github/TheCodingHustler/TheCodingHustler.github.io/blob/main/2025_NYCT_Datathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bus Speeds on ACE (Bus Automated Camera Enforcement Violation Program)

Importing the required libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import io
import requests
import json
from collections import defaultdict
import folium

In [None]:
import altair as alt

Defining the endpoint URLs for the datasets

In [None]:
# CSV endpoints on data.ny.gov for the three datasets used in this notebook.
# NOTE(review): these look like Socrata-style resource URLs, which presumably
# return only a default page of rows unless a `$limit` query parameter is
# supplied -- confirm the full datasets are actually being loaded.

# MTA Bus Automated Camera Enforcement Violations Dataset
endpoint_url_1 = "https://data.ny.gov/resource/kh8p-hcbm.csv"

# MTA Bus Route Segment Speeds: 2023-2024 (Before Congestion Pricing)
endpoint_url_2 = "https://data.ny.gov/resource/58t6-89vi.csv"

# MTA Bus Route Segment Speeds: Beginning 2025 (Post-Congestion Pricing)
endpoint_url_3 = "https://data.ny.gov/resource/kufs-yh3x.csv"

Read the datasets into dataframes

In [None]:
# Download each CSV endpoint into its own DataFrame (three network requests,
# issued in this order).
df_violations = pd.read_csv(filepath_or_buffer=endpoint_url_1)
df_speeds_pre_congestion = pd.read_csv(filepath_or_buffer=endpoint_url_2)
df_speeds_post_congestion = pd.read_csv(filepath_or_buffer=endpoint_url_3)

Printing the first few rows of each dataframe

In [None]:
# Each entry pairs the heading to print with the frame to preview; the text
# and the order match what the cell prints for each dataset.
dataset_previews = [
    ("MTA Bus Automated Camera Enforcement Violations Dataset:", df_violations),
    ("\nMTA Bus Route Segment Speeds: 2023-2024 (Before Congestion Pricing) Dataset:", df_speeds_pre_congestion),
    ("\nMTA Bus Route Segment Speeds: Beginning 2025 (Post-Congestion Pricing) Dataset:", df_speeds_post_congestion),
]

for heading, frame in dataset_previews:
    print(heading)
    print(frame.head())
    print("-" * 50)  # visual separator between datasets

MTA Bus Automated Camera Enforcement Violations Dataset:
   violation_id                                         vehicle_id  \
0     489749182  c5ae1411153b52556a1e648cc80d718aa519a4bdd189ab...   
1     489744714  df9044acf85cf55488aea4cd3ce1d0e17ef050551726b6...   
2     489743631  eb5a337966ba65f66ab1db8e169d2446a4fb429b0efc63...   
3     489741945  3f877f70d9b253515a945be807c9c62d5814949f810310...   
4     489741940  7feac037b62d591ffb1214e356157f3dd197fc22fee5bb...   

          first_occurrence          last_occurrence  \
0  2025-08-20T23:12:08.000  2025-08-21T00:24:08.000   
1  2025-08-20T23:48:59.000  2025-08-20T23:54:47.000   
2  2025-08-20T22:33:13.000  2025-08-20T23:56:02.000   
3  2025-08-20T22:50:45.000  2025-08-20T23:32:43.000   
4  2025-08-20T10:52:57.000  2025-08-20T11:16:57.000   

             violation_status        violation_type bus_route_id  \
0       TECHNICAL ISSUE/OTHER       MOBILE BUS STOP         BX36   
1    EXEMPT - BUS/PARATRANSIT       MOBILE BUS STOP    

Combining bus speed datasets before and after congestion pricing

In [None]:
# Tag every row with the period its source dataset covers, so the two frames
# can still be told apart after they are combined. The column is written in
# place on each frame.
for speeds_frame, period_label in (
    (df_speeds_pre_congestion, 'Pre-Congestion Pricing'),
    (df_speeds_post_congestion, 'Post-Congestion Pricing'),
):
    speeds_frame['time_period'] = period_label

In [None]:
# Concatenate the pre- and post-congestion-pricing speed frames into one table.
# ignore_index=True gives the result a fresh 0..n-1 index; without it both
# inputs keep their own row labels, so labels repeat in the combined frame and
# label-based lookups become ambiguous.
df_all_speeds = pd.concat(
    [df_speeds_pre_congestion, df_speeds_post_congestion],
    ignore_index=True,
)

Identification of camera-enforced routes from violations dataset

In [None]:
# Distinct route IDs that appear in the violations data. Missing route IDs are
# dropped first: a NaN left in this array would make a later `.isin(ace_routes)`
# test also match rows whose route ID is missing.
ace_routes = df_violations['bus_route_id'].dropna().unique()

In [None]:
# Heading and route array on separate lines, in a single print call.
print("Camera-Enforced Routes:", ace_routes, sep="\n")

Camera-Enforced Routes:
['BX36' 'BX28' 'Q53+' 'Q44+' 'M101' 'B46+' 'Q69' 'BX38' 'M42' 'M60+' 'M2'
 'BX6+' 'B35' 'BX35' 'M4' 'B82+' 'M34+' 'M15+' 'BX19' 'BX41+' 'M23+'
 'M100' 'BX12+' 'Q43' 'Q54' 'B41' 'B44+' 'M79+' 'Q58' 'M14+' 'B25' 'B62'
 'M86+' 'B26' 'Q5' 'B42' 'S79+']


Filtering the combined speed data down to ACE routes only

In [None]:
# Keep only the speed rows whose route also appears in the violations data.
is_ace_route = df_all_speeds['route_id'].isin(ace_routes)
df_ace_speeds = df_all_speeds.loc[is_ace_route]

Calculating the average speed for each route and time period

In [None]:
# Mean of average_road_speed for every (route_id, time_period) pair, returned
# as a flat frame with one row per pair.
route_period_groups = df_ace_speeds.groupby(['route_id', 'time_period'])
speed_change = route_period_groups['average_road_speed'].mean().reset_index()

Visualization of Average Bus Speed on ACE Routes Before and After Congestion Pricing

In [None]:
# Distinct route IDs present in the ACE-only speed data, in order of first
# appearance.
all_routes = pd.unique(df_ace_speeds['route_id'])

In [None]:
# Grouped bar chart: one pair of bars per route, offset and coloured by time
# period, so the two periods can be compared side by side.
chart = alt.Chart(speed_change).mark_bar().encode(
    x=alt.X('route_id:N', title='Bus Route ID'),
    y=alt.Y('average_road_speed:Q', title='Average Speed (mph)'),
    color=alt.Color('time_period:N', title='Time Period'),
    xOffset='time_period:N',  # places the two periods next to each other within a route
    tooltip=[
        alt.Tooltip('route_id:N', title='Bus Route ID'),
        alt.Tooltip('time_period:N', title='Time Period'),
        alt.Tooltip('average_road_speed:Q', title='Avg. Speed', format='.2f')
    ]
).properties(
    title='Average Bus Speed on ACE Routes Before and After Congestion Pricing',
    width=400,
    height=400
)

# End the cell with the chart object so the notebook renders it inline, the
# same way the other chart cells in this notebook do. `chart.show()` depends on
# the Altair version and can require an external viewer.
chart

# Exempt Violations and Repeat offenders

In [None]:
# Exemptions are recorded in `violation_status` (values such as
# 'EXEMPT - BUS/PARATRANSIT'), not in `violation_type`, which holds the kind of
# violation (e.g. 'MOBILE BUS STOP'). Comparing violation_type to 'exempt'
# therefore matched nothing and produced an empty frame.
# na=False treats rows with a missing status as not exempt.
is_exempt = df_violations['violation_status'].str.startswith('EXEMPT', na=False)
df_exempt_violations = df_violations[is_exempt].copy()

In [None]:
# A repeat offender is a vehicle that appears on more than one exempt
# violation, so rows are counted per `vehicle_id`. Grouping by
# `violation_status` only measured how common each status was.
# value_counts() ignores missing IDs, so rows without a vehicle_id map to NaN
# and are never flagged as repeats.
exempt_counts_per_vehicle = df_exempt_violations['vehicle_id'].map(
    df_exempt_violations['vehicle_id'].value_counts()
)
repeat_offenders = df_exempt_violations[exempt_counts_per_vehicle > 1]

In [None]:
# Heading followed by the first rows of the repeat-offender frame.
print("Repeat Offenders (Exempt Vehicles):", repeat_offenders.head(), sep="\n")

Repeat Offenders (Exempt Vehicles):
Empty DataFrame
Columns: [violation_id, vehicle_id, first_occurrence, last_occurrence, violation_status, violation_type, bus_route_id, violation_latitude, violation_longitude, stop_id, stop_name, bus_stop_latitude, bus_stop_longitude, violation_georeference, bus_stop_georeference]
Index: []


In [None]:
# Base folium map centred on the given [latitude, longitude] pair.
# NOTE(review): the coordinates look like central New York City -- confirm.
map_center = [40.7128, -74.0060]
m = folium.Map(location=map_center, zoom_start=12)

In [None]:
from vega_datasets import data

In [None]:
import geopandas as gpd

In [None]:
import json

In [None]:
# Download the borough outlines (GeoJSON) used as the map background.
geojson_url = "https://gist.githubusercontent.com/ix4/6f44e559b29a72c4c5d130ac13aad317/raw/a7a3a37f2fe054ebc18871b34b023d312668f035/nyc.geojson"

# Without a timeout, requests waits indefinitely on an unresponsive host and
# would stall the whole notebook run; fail after 30 seconds instead.
response = requests.get(geojson_url, timeout=30)
response.raise_for_status()  # surface HTTP errors instead of parsing an error page
boroughs_geo = response.json()

In [None]:
# Exempt violations with numeric coordinates: values that cannot be parsed
# become NaN and those rows are then dropped.
# NOTE(review): `df` is rebuilt from df_violations in the next cell, so this
# exempt-only frame is overwritten before anything plots it.
df = (
    df_exempt_violations
    .assign(
        latitude=lambda frame: pd.to_numeric(frame['violation_latitude'], errors='coerce'),
        longitude=lambda frame: pd.to_numeric(frame['violation_longitude'], errors='coerce'),
    )
    .dropna(subset=['latitude', 'longitude'])
)

In [None]:
# Plotting frame built from every violation: coordinates are coerced to numbers
# (unparseable values become NaN) and rows without a usable position are dropped.
df = (
    df_violations
    .assign(
        latitude=lambda frame: pd.to_numeric(frame['violation_latitude'], errors='coerce'),
        longitude=lambda frame: pd.to_numeric(frame['violation_longitude'], errors='coerce'),
    )
    .dropna(subset=['latitude', 'longitude'])
)

# Background layer: borough outlines drawn from the downloaded GeoJSON features.
borough_features = alt.Data(values=boroughs_geo['features'])
boroughs = (
    alt.Chart(borough_features)
    .mark_geoshape(fill='lightgray', stroke='black', strokeWidth=0.5)
    .encode(tooltip=alt.Tooltip('properties.borough_name:N', title='Borough'))
    .project(type='mercator')
    .properties(
        width=800,
        height=600,
        title='Geographic Locations of Bus Violations + NYC Boroughs',
    )
)

# Foreground layer: one small circle per violation, coloured by bus route.
violation_tooltip = [
    alt.Tooltip('vehicle_id:N', title='Vehicle ID'),
    alt.Tooltip('bus_route_id:N', title='Bus Route ID'),
    alt.Tooltip('first_occurrence:T', title='First Occurrence'),
]
points = (
    alt.Chart(df)
    .mark_circle(size=10, opacity=0.5)
    .encode(
        longitude='longitude:Q',
        latitude='latitude:Q',
        color=alt.Color('bus_route_id:N', title='Bus Route ID'),
        tooltip=violation_tooltip,
    )
)

# Layer the points over the borough outlines and display the result.
chart = boroughs + points

chart

# Violations and Congestion Pricing

In [None]:
# Routes treated as serving the congestion-pricing zone.
# NOTE(review): the route IDs printed from the violations data carry a '+'
# suffix on some routes (e.g. 'M15+'), and plain 'M15' does not appear in that
# output -- confirm whether 'M15+' should be listed here instead of, or as well
# as, 'M15'.
cbd_routes = ['M1', 'M2', 'M3', 'M4', 'M5', 'M15']

# .copy() makes this an independent frame. Without it, the column assignment in
# the next cell writes onto a slice of df_violations and pandas emits a
# SettingWithCopyWarning.
df_cbd_violations = df_violations[df_violations['bus_route_id'].isin(cbd_routes)].copy()

In [None]:
# Parse the timestamps into real datetimes. Rebinding through .assign() builds
# a new frame rather than writing a column onto what may be a slice of
# df_violations, which is what triggered the SettingWithCopyWarning.
df_cbd_violations = df_cbd_violations.assign(
    first_occurrence=pd.to_datetime(df_cbd_violations['first_occurrence'])
)

# Number of violations per calendar month ('ME' = month-end bins).
monthly_violations = df_cbd_violations.set_index('first_occurrence').resample('ME').size()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cbd_violations['first_occurrence'] = pd.to_datetime(df_cbd_violations['first_occurrence'])


In [None]:
# One-row frame holding a single reference date.
# NOTE(review): presumably the congestion-pricing start date, intended for a
# vertical rule marker on a time-series chart -- confirm.
reference_date = pd.to_datetime('2025-01-05')
rule_data = pd.DataFrame({'start_date': [reference_date]})

In [None]:
# Box plot of average_road_speed for each time period, drawn from the combined
# speed frame (all routes in df_all_speeds, not only the ACE subset).
box_plot = (
    alt.Chart(df_all_speeds)
    .mark_boxplot()
    .encode(
        x=alt.X('time_period:N', title='Time Period'),
        y=alt.Y('average_road_speed:Q', title='Average Road Speed (mph)'),
        color='time_period:N',
        tooltip=['time_period', 'average_road_speed'],
    )
    .properties(
        title='Distribution of Average Bus Speeds Before and After Congestion Pricing',
        width=400,
        height=400,
    )
)

box_plot