In [None]:
# Re-import the local `config` module so that edits made to it after the
# kernel first imported it are picked up without restarting the kernel.
import importlib

import config

config = importlib.reload(config)

<div>
  <h3>Package Setup</h3>
  <p>
    The section below sets up the environment for analysis. It imports all the core libraries we'll need for pulling data from APIs, handling it in tables, and creating visualizations.
  </p>


  <h3>What this section does</h3>
  <ul>
    <li>Imports <strong>Requests</strong> so the code can make web API calls and retrieve data from the internet.</li>
    <li>Loads <strong>Pandas</strong> to manage and clean data in DataFrames (rows and columns like Excel).</li>
    <li>Uses <strong>StringIO</strong> to treat text data (like CSV strings from an API) as if it were a file, which helps load it into Pandas easily.</li>
    <li>Brings in <strong>Seaborn</strong> and <strong>Matplotlib</strong> to make clear, publication-style charts and plots.</li>
    <li>Pulls <strong>APP_TOKEN, USERNAME, PASSWORD</strong> from the <code>config</code> file so credentials aren’t exposed directly in the code.</li>
    <li>Uses <strong>Sodapy</strong> to connect to Socrata open data portals such as NYC Open Data or government APIs.</li>
    <li>Includes <strong>NumPy</strong> for numerical operations like rounding, filling missing values, or computing averages.</li>
    <li>Imports <strong>JSON</strong> to read and parse structured API responses formatted as JSON text.</li>
  </ul>


In [1]:
import requests as r
import pandas as pd
from io import StringIO
import seaborn as sns
from config import APP_TOKEN, USERNAME, PASSWORD
import matplotlib.pyplot as plt
from sodapy import Socrata
import numpy as np
import json

<div>
  <h2>Section: API Data Extraction</h2>
   
   <h3>Overview</h3>
  <p>
    This section establishes a connection to the NYC Open Data API and pulls motor vehicle crash records. It uses the Socrata API to query the dataset and loads the returned rows (1,000 in the run shown below) into a DataFrame, giving you a raw, row-level crash dataset for analysis.
  </p>

   <ul>
    <li>
      <strong>Define Data Source</strong> — sets the domain (<code>data.cityofnewyork.us</code>) and dataset ID (<code>h9gi-nx95</code>) that identify where the data will come from.
    </li>
<li>
  <strong>Connect to API</strong> — uses the <code>Socrata</code> client with my personalized <code>APP_TOKEN</code> to securely access NYC’s open data service.
</li>

<li>
  <strong>Run SoQL Query</strong> — retrieves data using Socrata’s SQL-style language:
  <ul>
    <li><code>select</code> → requests every column (<code>*</code>) of each crash record.</li>
    <li><code>where</code> → keeps only rows whose <code>borough</code> is not <code>'NAN'</code>.</li>
    <li><code>group</code> → not used in this query; rows are returned un-aggregated.</li>
    <li><code>order</code> → not used in this query; rows arrive in the API’s default order.</li>
  </ul>
</li>

<li>
  <strong>Convert and Clean Data</strong> — loads the API results into a Pandas DataFrame. The date column is converted to datetime format later, in the Data Cleaning and Preparation section.
</li>


In [11]:
# API pull: fetch raw crash records from NYC Open Data through the Socrata API.
DOMAIN = "data.cityofnewyork.us"
DATASET = "h9gi-nx95"

# Maximum number of rows to request. When no limit is passed the API hands back
# at most 1,000 rows, which silently truncates the dataset; spelling the cap out
# here makes that visible and tunable (raise it, or page with `offset`, to pull
# more rows).
ROW_LIMIT = 1000

# Client authenticated with the app token imported from config.
client = Socrata(DOMAIN, APP_TOKEN)

# Row-level pull via SoQL: every column, no grouping or aggregation.
# NOTE(review): the filter compares borough with the literal string 'NAN';
# presumably the intent is to drop rows that have no borough -- confirm that
# this is the comparison wanted.
results = client.get(
    DATASET,
    select="*",
    where="borough != 'NAN'",
    limit=ROW_LIMIT,
)

# Build a DataFrame from the list of record dicts returned by the API.
df = pd.DataFrame.from_records(results)


<div>
  <h2>Section: Previewing the Retrieved Data</h2>

   <h3>Overview</h3>
  <p>
    This section inspects the <code>df</code> DataFrame (its column summary, descriptive statistics, and first few rows) to confirm that the API pull worked correctly. It’s a quick quality check before continuing with analysis or visualization.
  </p>

In [16]:
# Column-level summary: row count, non-null count per column, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 29 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   crash_date                     1000 non-null   object
 1   crash_time                     1000 non-null   object
 2   borough                        1000 non-null   object
 3   zip_code                       999 non-null    object
 4   latitude                       983 non-null    object
 5   longitude                      983 non-null    object
 6   location                       983 non-null    object
 7   on_street_name                 598 non-null    object
 8   off_street_name                598 non-null    object
 9   number_of_persons_injured      1000 non-null   object
 10  number_of_persons_killed       1000 non-null   object
 11  number_of_pedestrians_injured  1000 non-null   object
 12  number_of_pedestrians_killed   1000 non-null   object
 13  numb

In [19]:
# Descriptive statistics. The columns are still object dtype at this point,
# so the result lists count / unique / top / freq rather than numeric stats.
df.describe()

Unnamed: 0,crash_date,crash_time,borough,zip_code,latitude,longitude,location,on_street_name,off_street_name,number_of_persons_injured,...,contributing_factor_vehicle_3,collision_id,vehicle_type_code1,vehicle_type_code2,vehicle_type_code_3,cross_street_name,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
count,1000,1000,1000,999,983.0,983.0,983,598,598,1000,...,96,1000,987,655,83,402,28,28,9,8
unique,89,508,5,159,950.0,951.0,962,422,476,8,...,1,1000,31,34,5,399,1,5,1,3
top,2021-09-11T00:00:00.000,0:00,BROOKLYN,11207,0.0,0.0,"{'latitude': '0.0', 'longitude': '0.0', 'human...",BROADWAY,3 AVENUE,0,...,Unspecified,4675373,Sedan,Sedan,Sedan,2820 SNYDER AVENUE,Unspecified,Sedan,Unspecified,Station Wagon/Sport Utility Vehicle
freq,160,25,367,29,4.0,4.0,4,9,7,662,...,96,1,513,292,50,2,28,19,9,6


In [None]:
# Preview the first five rows of the raw pull.
df.head()

Unnamed: 0,crash_date,crash_time,borough,zip_code,latitude,longitude,location,on_street_name,off_street_name,number_of_persons_injured,...,contributing_factor_vehicle_3,collision_id,vehicle_type_code1,vehicle_type_code2,vehicle_type_code_3,cross_street_name,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
0,2023-11-01T00:00:00.000,1:29,BROOKLYN,11230,40.62179,-73.970024,"{'latitude': '40.62179', 'longitude': '-73.970...",OCEAN PARKWAY,AVENUE K,1,...,Unspecified,4675373,Moped,Sedan,Sedan,,,,,
1,2021-09-11T00:00:00.000,9:35,BROOKLYN,11208,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",,,0,...,,4456314,Sedan,,,1211 LORING AVENUE,,,,
2,2021-12-14T00:00:00.000,8:13,BROOKLYN,11233,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",SARATOGA AVENUE,DECATUR STREET,0,...,,4486609,,,,,,,,
3,2021-12-14T00:00:00.000,8:17,BRONX,10475,40.86816,-73.83148,"{'latitude': '40.86816', 'longitude': '-73.831...",,,2,...,,4486660,Sedan,Sedan,,344 BAYCHESTER AVENUE,,,,
4,2021-12-14T00:00:00.000,21:10,BROOKLYN,11207,40.67172,-73.8971,"{'latitude': '40.67172', 'longitude': '-73.897...",,,0,...,,4487074,Sedan,,,2047 PITKIN AVENUE,,,,


<div>
  <h2>Section: Data Cleaning and Preparation</h2>

  <p>
    This section cleans up the date field

In [20]:
# Parse the crash_date strings (e.g. "2023-11-01T00:00:00.000") into datetime
# values; errors="coerce" turns anything unparseable into NaT instead of raising.
crash_dates = pd.to_datetime(df["crash_date"], errors="coerce")
df["crash_date"] = crash_dates

In [21]:
# Preview the first five rows again to check the crash_date column.
df.head()

Unnamed: 0,crash_date,crash_time,borough,zip_code,latitude,longitude,location,on_street_name,off_street_name,number_of_persons_injured,...,contributing_factor_vehicle_3,collision_id,vehicle_type_code1,vehicle_type_code2,vehicle_type_code_3,cross_street_name,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
0,2023-11-01,1:29,BROOKLYN,11230,40.62179,-73.970024,"{'latitude': '40.62179', 'longitude': '-73.970...",OCEAN PARKWAY,AVENUE K,1,...,Unspecified,4675373,Moped,Sedan,Sedan,,,,,
1,2021-09-11,9:35,BROOKLYN,11208,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",,,0,...,,4456314,Sedan,,,1211 LORING AVENUE,,,,
2,2021-12-14,8:13,BROOKLYN,11233,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",SARATOGA AVENUE,DECATUR STREET,0,...,,4486609,,,,,,,,
3,2021-12-14,8:17,BRONX,10475,40.86816,-73.83148,"{'latitude': '40.86816', 'longitude': '-73.831...",,,2,...,,4486660,Sedan,Sedan,,344 BAYCHESTER AVENUE,,,,
4,2021-12-14,21:10,BROOKLYN,11207,40.67172,-73.8971,"{'latitude': '40.67172', 'longitude': '-73.897...",,,0,...,,4487074,Sedan,,,2047 PITKIN AVENUE,,,,


In [22]:
# Write the DataFrame to nyc_crashes.csv in the working directory;
# index=False leaves the row index out of the file.
df.to_csv("nyc_crashes.csv", index=False)