In [13]:
# ==================================================
# NOAA API TOKEN (Authentication)
# ==================================================
# The token is a credential, so it is read from the NOAA_TOKEN environment
# variable rather than stored in the notebook. Any token that has ever been
# saved in a notebook should be treated as leaked: revoke it and request a new one.

import os

NOAA_TOKEN = os.environ.get("NOAA_TOKEN", "").strip()

if not NOAA_TOKEN:
    print("WARNING: NOAA_TOKEN is not set; export it before running the NOAA download cell.")

# Only the length is printed so the secret itself never lands in the cell output.
print("Token length:", len(NOAA_TOKEN))

Token length: 32


In [7]:
# ===============================
# 1. IMPORT LIBRARIES
# ===============================

import pandas as pd
import numpy as np
import os
import zipfile

print("Libraries imported successfully.")

Libraries imported successfully.


In [14]:
import requests

# ==================================================
# NOAA: Download DAILY weather data for LaGuardia (LGA) in 2022
# Dataset: GHCND (Daily Summaries)
# Station: USW00014732 (New York LaGuardia Airport)
# ==================================================

# Request scope: one station, one date range, and the endpoint that serves it.
BASE_URL = "https://www.ncei.noaa.gov/cdo-web/api/v2/data"
STATION_ID = "GHCND:USW00014732"
START_DATE, END_DATE = "2022-01-01", "2022-12-31"

def fetch_noaa_daily(station_id, start_date, end_date, datatypes, token,
                     limit=1000, timeout=30):
    """Download GHCND records for one station and return them as a DataFrame.

    Requests BASE_URL page by page (starting at offset 1 and advancing by
    `limit` each time) until a page comes back with fewer than `limit` records.

    Parameters
    ----------
    station_id : str
        Value sent as `stationid`, e.g. "GHCND:USW00014732".
    start_date, end_date : str
        Values sent as `startdate` / `enddate`.
    datatypes : str or list of str
        Value(s) sent as `datatypeid`; a list is accepted.
    token : str
        API token sent in the `token` request header.
    limit : int, default 1000
        Page size sent as `limit`.
    timeout : float or tuple, default 30
        Timeout in seconds handed to `requests.get`, so a stalled connection
        fails instead of blocking the notebook forever.

    Returns
    -------
    pandas.DataFrame
        One row per record returned by the API. When the API returns no
        records, an empty frame with the columns `date`, `datatype`,
        `station`, `attributes`, `value` is returned so downstream column
        lookups still work.

    Raises
    ------
    ValueError
        If `token` is empty or `limit` is smaller than 1.
    requests.HTTPError
        If any page request returns an error status.
    """
    if not token:
        raise ValueError("token must be a non-empty NOAA API token")
    if limit < 1:
        # A non-positive page size would make the "short page" stop test unreachable.
        raise ValueError("limit must be a positive integer")

    headers = {"token": token}
    all_results = []
    offset = 1

    while True:
        params = {
            "datasetid": "GHCND",
            "stationid": station_id,
            "startdate": start_date,
            "enddate": end_date,
            "units": "metric",
            "limit": limit,
            "offset": offset,
            "datatypeid": datatypes,   # list allowed
        }

        r = requests.get(BASE_URL, headers=headers, params=params, timeout=timeout)
        r.raise_for_status()

        # A response without a "results" key is treated as an empty page.
        results = r.json().get("results", [])
        all_results.extend(results)

        # A short (or empty) page means there is nothing left to fetch.
        if len(results) < limit:
            break

        offset += limit

    if not all_results:
        return pd.DataFrame(columns=["date", "datatype", "station", "attributes", "value"])

    return pd.DataFrame(all_results)

# Daily variables requested: temperature (TAVG/TMIN/TMAX) and precipitation (PRCP)
datatypes = ["TAVG", "TMIN", "TMAX", "PRCP"]

# Arguments are passed positionally, in the order of fetch_noaa_daily's signature
noaa_raw = fetch_noaa_daily(STATION_ID, START_DATE, END_DATE, datatypes, NOAA_TOKEN)

print("Rows downloaded:", len(noaa_raw))
print("Columns:", noaa_raw.columns.tolist())
noaa_raw.head()

Rows downloaded: 1460
Columns: ['date', 'datatype', 'station', 'attributes', 'value']


Unnamed: 0,date,datatype,station,attributes,value
0,2022-01-01T00:00:00,PRCP,GHCND:USW00014732,",,W,2400",19.3
1,2022-01-01T00:00:00,TAVG,GHCND:USW00014732,"H,,S,",11.6
2,2022-01-01T00:00:00,TMAX,GHCND:USW00014732,",,W,2400",13.9
3,2022-01-01T00:00:00,TMIN,GHCND:USW00014732,",,W,2400",10.0
4,2022-01-02T00:00:00,PRCP,GHCND:USW00014732,",,W,2400",1.0


In [15]:
# ==================================================
# CLEAN & RESHAPE NOAA DATA
# ==================================================

# Parse the timestamp strings in "date" (the result is written back onto noaa_raw)
noaa_raw["date"] = pd.to_datetime(noaa_raw["date"])

# Long -> wide: one row per date, one column per datatype, ordered by date.
# pivot() raises if the same (date, datatype) pair appears more than once.
noaa_df = (
    noaa_raw.pivot(index="date", columns="datatype", values="value")
    .reset_index()
    .sort_values("date")
)

print("Weather dataset shape:", noaa_df.shape)
noaa_df.head()

Weather dataset shape: (365, 5)


datatype,date,PRCP,TAVG,TMAX,TMIN
0,2022-01-01,19.3,11.6,13.9,10.0
1,2022-01-02,1.0,11.4,15.6,3.9
2,2022-01-03,0.0,1.4,3.9,-4.3
3,2022-01-04,0.0,-2.7,2.2,-6.0
4,2022-01-05,6.1,3.2,8.9,0.0


In [16]:
# ==================================================
# EXPORT CLEAN WEATHER DATA TO CSV
# ==================================================

# index=False: the row index is not written as a column of the file
noaa_df.to_csv(index=False, path_or_buf="weather_2022_laguardia.csv")

print("Weather CSV exported successfully.")

Weather CSV exported successfully.


In [17]:
# ==================================================
# AGGREGATE CITIBIKE DATA TO DAILY LEVEL
# ==================================================
# Requires citibike_df, which is built by the "COMBINE ALL CSV FILES INTO ONE
# DATAFRAME" cell; run that cell first.
# NOTE: this counts every calendar day present in the data, including the few
# trips dated outside 2022. The 2022-only daily counts are produced by the
# "KEEP ONLY 2022 ROWS" cell.

# Convert started_at to datetime (if not already)
citibike_df["started_at"] = pd.to_datetime(citibike_df["started_at"])

# Truncate each timestamp to midnight. normalize() stays in datetime64, so no
# per-row Python date objects are created and no conversion back is needed.
citibike_df["date"] = citibike_df["started_at"].dt.normalize()

# Count number of rides per day (groupby sorts the dates ascending)
daily_rides = (
    citibike_df
    .groupby("date")
    .size()
    .reset_index(name="daily_ride_count")
)

print("Daily rides dataset shape:", daily_rides.shape)
daily_rides.head()

Daily rides dataset shape: (402, 2)


Unnamed: 0,date,daily_ride_count
0,2021-01-30,1
1,2021-02-15,1
2,2021-03-11,1
3,2021-03-14,1
4,2021-03-31,1


In [18]:
# ==================================================
# FIX: KEEP ONLY 2022 ROWS, THEN RE-AGGREGATE DAILY
# ==================================================

# Unparseable timestamps become NaT and therefore fail the range test below
citibike_df["started_at"] = pd.to_datetime(citibike_df["started_at"], errors="coerce")

# Half-open window [2022-01-01, 2023-01-01); copy() so the new column added
# below is written to citibike_2022 only
citibike_2022 = citibike_df.loc[
    lambda trips: (trips["started_at"] >= "2022-01-01")
    & (trips["started_at"] < "2023-01-01")
].copy()

# The start time truncated to midnight is the grouping key
citibike_2022["date"] = citibike_2022["started_at"].dt.floor("D")

# One row per day with the number of trips started on that day
daily_rides_2022 = (
    citibike_2022
    .groupby("date")
    .size()
    .rename("daily_ride_count")
    .reset_index()
    .sort_values("date")
)

print("Filtered rows (2022 only):", len(citibike_2022))
print("Daily rides dataset shape (2022):", daily_rides_2022.shape)
print("Date range:", daily_rides_2022["date"].min(), "to", daily_rides_2022["date"].max())

daily_rides_2022.head()

Filtered rows (2022 only): 29838166
Daily rides dataset shape (2022): (365, 2)
Date range: 2022-01-01 00:00:00 to 2022-12-31 00:00:00


Unnamed: 0,date,daily_ride_count
0,2022-01-01,20428
1,2022-01-02,43009
2,2022-01-03,33189
3,2022-01-04,36842
4,2022-01-05,34230


In [20]:
# ==================================================
# 8. MERGE DAILY CITIBIKE COUNTS WITH NOAA WEATHER
# ==================================================

# Ensure both join keys are datetime so they compare equal on the same day
daily_rides_2022["date"] = pd.to_datetime(daily_rides_2022["date"])
noaa_df["date"] = pd.to_datetime(noaa_df["date"])

# Inner join keeps only dates present in both frames.
# validate="one_to_one" makes pandas raise MergeError if either side has a
# repeated date, instead of silently producing extra rows.
merged_daily = daily_rides_2022.merge(
    noaa_df,
    on="date",
    how="inner",
    validate="one_to_one",
)

print("Merged dataset shape:", merged_daily.shape)
print("Merged date range:", merged_daily["date"].min(), "to", merged_daily["date"].max())

merged_daily.head()

Merged dataset shape: (365, 6)
Merged date range: 2022-01-01 00:00:00 to 2022-12-31 00:00:00


Unnamed: 0,date,daily_ride_count,PRCP,TAVG,TMAX,TMIN
0,2022-01-01,20428,19.3,11.6,13.9,10.0
1,2022-01-02,43009,1.0,11.4,15.6,3.9
2,2022-01-03,33189,0.0,1.4,3.9,-4.3
3,2022-01-04,36842,0.0,-2.7,2.2,-6.0
4,2022-01-05,34230,6.1,3.2,8.9,0.0


In [21]:
# Write the merged daily table to disk; the row index is left out of the file
merged_daily.to_csv(index=False, path_or_buf="citibike_2022_daily_with_weather.csv")
print("Exported: citibike_2022_daily_with_weather.csv")

Exported: citibike_2022_daily_with_weather.csv


In [8]:
# ===============================
# 2. EXTRACT MAIN 2022 ZIP FILE
# ===============================

main_zip_path = "data/2022-citibike-tripdata.zip"
extract_path = "data/"

# ZipFile opens in read mode by default; the context manager closes it afterwards
with zipfile.ZipFile(main_zip_path) as archive:
    archive.extractall(path=extract_path)

print("Main 2022 zip extracted.")

Main 2022 zip extracted.


In [9]:
# ==========================================
# 3. EXTRACT ALL MONTHLY ZIP FILES (12)
# ==========================================

monthly_folder = os.path.join("data", "2022-citibike-tripdata")

# Collect the path of every entry whose name ends in ".zip" (case-insensitive)
zip_files = []
for entry in os.listdir(monthly_folder):
    if entry.lower().endswith(".zip"):
        zip_files.append(os.path.join(monthly_folder, entry))

print("Monthly zip files found:", len(zip_files))

# Unpack each archive into the same folder it sits in
for zip_path in zip_files:
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(path=monthly_folder)

print("All monthly zip files extracted.")

Monthly zip files found: 12
All monthly zip files extracted.


In [12]:
# ==================================================
# 4. COMBINE ALL CSV FILES INTO ONE DATAFRAME
# ==================================================
# Uses:
# 1) A sorted listing of the CSV paths in monthly_folder. os.listdir() returns
#    entries in arbitrary order, so sorting keeps the row order of the combined
#    frame the same on every run and machine.
# 2) A generator expression inside pd.concat(). This only avoids naming an
#    intermediate list: pd.concat still holds every frame before joining them,
#    so it does not lower peak memory.
#
# Note:
# We set low_memory=False to prevent pandas chunk-based dtype guessing
# (this reduces DtypeWarning spam for mixed-type columns)

csv_files = sorted(
    os.path.join(monthly_folder, f)
    for f in os.listdir(monthly_folder)
    if f.lower().endswith(".csv")
)

print("CSV files found:", len(csv_files))

# Fail with a clear message instead of pd.concat's "No objects to concatenate"
if not csv_files:
    raise FileNotFoundError(
        f"No CSV files found in {monthly_folder!r}; run the zip extraction cells first."
    )

citibike_df = pd.concat(
    (pd.read_csv(file, low_memory=False) for file in csv_files),  # generator expression
    ignore_index=True
)

print("All CSV files combined successfully.")
print("Dataset shape:", citibike_df.shape)

CSV files found: 36
All CSV files combined successfully.
Dataset shape: (29838806, 13)
