Import the dataset and create a GeoJSON file for further spatial analysis

In [None]:
# Core libraries
import os
import numpy as np
import pandas as pd

# Geospatial libraries
import geopandas as gpd
import folium
from folium import Choropleth, GeoJson, GeoJsonTooltip

# Plotting
import matplotlib.pyplot as plt

In [None]:
# Load the taxi trips and the census-tract polygons used by the later cells.
df = pd.read_csv("testing_no_null.csv", low_memory=False)
gdf_tracts = gpd.read_file("tl_2024_17_tract.geojson")

# Parse both trip timestamps so the .dt accessors work downstream.
for ts_col in ("Trip Start Timestamp", "Trip End Timestamp"):
    df[ts_col] = pd.to_datetime(df[ts_col])

# Normalise tract ids to zero-padded 11-character strings so they can be
# joined against GEOID. A column parsed as float would stringify with a
# trailing ".0", which zfill alone does not remove, so strip it first
# (same normalisation the plotting helper and the map cells apply).
for tract_col in ("Pickup Census Tract", "Dropoff Census Tract"):
    df[tract_col] = (
        df[tract_col]
        .astype(str)
        .str.replace(r"\.0$", "", regex=True)
        .str.zfill(11)
    )

gdf_tracts["GEOID"] = gdf_tracts["GEOID"].astype(str).str.zfill(11)



# Create set of all used tract codes
#used_tracts = set(df['Pickup Census Tract']).union(set(df['Dropoff Census Tract']))
#print(used_tracts)

# Path to shapefile directory
#shapefile_dir = "tl_2024_17_tract"

# Find the .shp file
#shapefile_path = next((os.path.join(shapefile_dir, f) for f in os.listdir(shapefile_dir) if f.endswith(".shp")), None)

#if shapefile_path:
#    # Load shapefile
#    gdf = gpd.read_file(shapefile_path)

#    # Ensure GEOID is string
#    gdf['GEOID'] = gdf['GEOID'].astype(str)

#    # Filter to only tracts used in the dataset
#    gdf_filtered = gdf[gdf['GEOID'].isin(used_tracts)]

#    # Save filtered GeoJSON
#    geojson_path = "tl_2024_17_tract.geojson"
#    gdf_filtered.to_file(geojson_path, driver='GeoJSON')

#    print(f"Filtered GeoJSON saved with {len(gdf_filtered)} tracts to {geojson_path}")
#else:
#    print("No .shp file found in the tl_2024_17_tract directory.")



General temporal analysis of total trips

In [None]:
#helper plot function for bar plots of total trips
def plot_by_15min(
        df,
        tract_filter=None,        # None → all tracts | single GEOID | collection of GEOIDs
        show_endpoints=True,      # include drop-offs?
        title=None,
        ax=None,
        tick_step_hours=2         # x-tick spacing in hours (default: every 2 h)
    ):
    """
    Bar chart of pick-ups (and optionally drop-offs) per 15-minute time-of-day
    bucket, collapsed across all days in the data.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "Pickup Census Tract", "Dropoff Census Tract",
        "Trip Start Timestamp" and "Trip End Timestamp"; both timestamp
        columns must already be datetime-typed. The caller's frame is not
        modified.
    tract_filter : None, scalar, or list/set/tuple/array/Series of tract ids
        When given, only trips that start OR end in one of these tracts are
        counted. Ids are normalised like the tract columns (a trailing ".0"
        is removed, then the id is zero-padded to 11 characters).
    show_endpoints : bool
        True  → three stacked layers: the count common to pick-ups and
                drop-offs, plus the drop-off or pick-up surplus on top.
        False → a single series of pick-up counts.
    title : str or None
        Axes title; defaults to "All tracts" or the list of filtered tracts.
    ax : matplotlib Axes or None
        Axes to draw on; a new 14x4 figure is created when omitted.
    tick_step_hours : int or float
        Spacing of the x-axis tick labels, in hours.

    Returns
    -------
    The Axes that was drawn on.

    Raises
    ------
    ValueError
        If no trip is left to plot (empty input or nothing matches
        ``tract_filter``).
    """
    tract_cols = ["Pickup Census Tract", "Dropoff Census Tract"]
    time_cols = ["Trip Start Timestamp", "Trip End Timestamp"]

    # ── 0. Copy only the four columns used here, so the caller's df is unchanged
    trips = df[tract_cols + time_cols].copy()

    # ── 1. Normalise tract columns to 11-digit zero-padded strings
    for col in tract_cols:
        trips[col] = (
            trips[col]
            .astype(str)
            .str.replace(r"\.0$", "", regex=True)
            .str.zfill(11)
        )

    # ── 2. Optional tract filter
    if tract_filter is not None:
        collection_types = (list, set, frozenset, tuple, np.ndarray, pd.Series, pd.Index)
        if not isinstance(tract_filter, collection_types):
            tract_filter = [tract_filter]

        normalised = []
        for tract in tract_filter:
            text = str(tract)
            # Only a *trailing* ".0" is a float artefact; leave the rest alone.
            if text.endswith(".0"):
                text = text[:-2]
            normalised.append(text.zfill(11))
        tract_filter = normalised

        trips = trips[
            trips["Pickup Census Tract"].isin(tract_filter) |
            trips["Dropoff Census Tract"].isin(tract_filter)
        ]

    if trips.empty:
        raise ValueError(f"No trips found for tract_filter={tract_filter}")

    # ── 3. Add 15-minute *time-of-day* buckets ("15min" is the current alias;
    #       the older "15T" spelling is deprecated in recent pandas)
    trips["pickup_15m"]  = trips["Trip Start Timestamp"].dt.floor("15min").dt.time
    trips["dropoff_15m"] = trips["Trip End Timestamp"].dt.floor("15min").dt.time

    # ── 4. Aggregate across all days
    summary = trips.groupby("pickup_15m").size().rename("Pickups").to_frame()

    if show_endpoints:
        dropoffs = trips.groupby("dropoff_15m").size().rename("Drop-offs")
        summary  = summary.join(dropoffs, how="outer").fillna(0)

    # ── 5. Fill every 15-minute slot in the 24-hour clock
    full_range = pd.date_range("00:00", "23:45", freq="15min").time
    summary    = summary.reindex(full_range, fill_value=0)

    # ── 6. Plot
    x = np.arange(len(summary))              # 96 slots
    bar_w = 1.0

    if ax is None:
        _, ax = plt.subplots(figsize=(14, 4))

    if show_endpoints:
        # Shared base = trips "matched" by both a pick-up and a drop-off in the
        # slot; delta = whichever side has more, stacked on top of the base.
        shared = np.minimum(summary["Pickups"], summary["Drop-offs"])
        delta  = summary["Drop-offs"] - summary["Pickups"]

        ax.bar(x, shared, width=bar_w, label="Shared trips", color="lightgray")
        ax.bar(x, np.maximum(delta, 0), width=bar_w, bottom=shared,
               label="Drop-off surplus", color="orange", alpha=0.6)
        ax.bar(x, np.maximum(-delta, 0), width=bar_w, bottom=shared,
               label="Pick-up surplus", color="steelblue", alpha=0.6)
    else:
        # No drop-off column exists in this mode, so draw pick-ups on their own.
        ax.bar(x, summary["Pickups"], width=bar_w, label="Pickups", color="steelblue")

    # Nicely formatted x-axis
    step   = max(1, int(tick_step_hours * 4))   # 4 quarter-hours per hour
    ticks  = x[::step]
    labels = [t.strftime("%H:%M") for t in full_range][::step]

    ax.set_xticks(ticks)
    ax.set_xticklabels(labels, rotation=90, fontsize=8)

    ax.set_ylabel("Trip count")
    ax.set_xlabel("Time of day (15-min bins)")
    ax.set_title(title or ("All tracts" if tract_filter is None
                           else f"Tract(s): {', '.join(tract_filter)}"))
    ax.legend(frameon=False)
    ax.margins(x=0)

    # Tighten the figure that owns *this* Axes rather than whatever figure is
    # current. Guarded because not every Axes parent exposes tight_layout
    # (e.g. sub-figures).
    owner = ax.figure
    if hasattr(owner, "tight_layout"):
        owner.tight_layout()
    return ax

all tracts

In [None]:
# City-wide view: no tract_filter is passed, so every trip in df is counted.
plot_by_15min(df, title="City-wide taxi activity")

Overall, drop-off and pick-up counts are very similar, which suggests that most trips stayed within the city limits. More pick-ups early in the day and more drop-offs later in the day are also plausible. A useful next step would be to determine how many trips are ongoing in each hour.

Create DataFrames for the tract analysis: one keyed by pickup tract and one by dropoff tract. Only tracts with at least 30 trips are kept, because the data for the remaining tracts is too sparse.

In [None]:
# Tracts with fewer trips than this are dropped: their averages rest on too
# few observations to be meaningful.
MIN_TRIPS_PER_TRACT = 30


def _summarise_by_tract(trips, tract_col, min_trips=MIN_TRIPS_PER_TRACT):
    """
    Aggregate trips per census tract.

    Parameters
    ----------
    trips : pandas.DataFrame
        Needs `tract_col`, "Trip Start Timestamp", "Trip End Timestamp" and
        "Trip Seconds".
    tract_col : str
        Column to group by; it is renamed to "GEOID" in the result.
    min_trips : int
        Tracts whose Total_Trips is below this value are removed.

    Returns
    -------
    DataFrame with columns GEOID, Avg_Start, Avg_End, Avg_Seconds and
    Total_Trips (Total_Trips counts non-null trip start timestamps).
    """
    per_tract = (
        trips.groupby(tract_col)
             .agg(
                 Avg_Start   = ('Trip Start Timestamp', 'mean'),
                 Avg_End     = ('Trip End Timestamp',   'mean'),
                 Avg_Seconds = ('Trip Seconds',         'mean'),
                 Total_Trips = ('Trip Start Timestamp', 'count')
             )
             .reset_index()
             .rename(columns={tract_col: 'GEOID'})
    )
    return per_tract[per_tract['Total_Trips'] >= min_trips]


# One summary keyed by where trips start, one keyed by where they end.
pickup_tract_summary = _summarise_by_tract(df, 'Pickup Census Tract')
dropoff_tract_summary = _summarise_by_tract(df, 'Dropoff Census Tract')

Spatial analysis: number of trip starts/ends for each tract

In [None]:
# Work on a private copy of the pick-up summary and bring its GEOID into the
# same 11-character zero-padded form as the tract polygons.
tract_summary = pickup_tract_summary.copy()
geoid_text = tract_summary["GEOID"].astype(str)
geoid_text = geoid_text.str.replace(r"\.0$", "", regex=True)   # drop any trailing ".0"
tract_summary["GEOID"] = geoid_text.str.zfill(11)

# Left merge: every polygon is kept; tracts without statistics carry NaN.
gdf_map = gdf_tracts.merge(tract_summary, on="GEOID", how="left")

center = [41.8781, -87.6298]              # Chicago
fmap = folium.Map(location=center, zoom_start=10, tiles="cartodbpositron")

# Per the original note, folium cannot handle the datetime columns, so both
# frames handed to folium have them removed.
datetime_cols = ["Avg_Start", "Avg_End"]
gdf_for_choro = gdf_map.drop(columns=datetime_cols, errors="ignore")
gdf_clean = gdf_map.drop(columns=datetime_cols, errors="ignore")

# Colour layer: one shade per tract according to its number of trip starts.
choropleth_layer = Choropleth(
    geo_data=gdf_for_choro,
    data=gdf_map,
    columns=["GEOID", "Total_Trips"],
    key_on="feature.properties.GEOID",
    fill_color="YlOrRd",
    fill_opacity=0.7,
    line_opacity=0.2,
    nan_fill_color="lightgrey",
    legend_name="Trip Start Count"
)
choropleth_layer.add_to(fmap)

# Hover layer: tract outlines carrying a tooltip with id, name and count.
tooltip = GeoJsonTooltip(
    fields=["GEOID", "NAMELSAD", "Total_Trips"],
    aliases=["GEOID:", "Tract Name:", "Total Trips:"],
    localize=True,
    sticky=True,
    labels=True,
    style=("background-color: white; color: #333; font-size: 12px; padding: 5px;")
)
tract_layer = GeoJson(gdf_clean, tooltip=tooltip, name="Tracts")
tract_layer.add_to(fmap)

fmap

The tracts with the most trip starts are at the airport and in the city.

In [None]:
# Private copy of the drop-off summary with GEOID in the 11-character
# zero-padded form used by the tract polygons.
tract_summary = dropoff_tract_summary.copy()
geoid_text = tract_summary["GEOID"].astype(str)
geoid_text = geoid_text.str.replace(r"\.0$", "", regex=True)   # drop any trailing ".0"
tract_summary["GEOID"] = geoid_text.str.zfill(11)

# Attach the statistics to the geometry; the left merge keeps every polygon.
gdf_map = gdf_tracts.merge(tract_summary, on="GEOID", how="left")

# Base map centred on Chicago.
center = [41.8781, -87.6298]
fmap = folium.Map(location=center, zoom_start=10, tiles="cartodbpositron")

# Per the original note, folium's Choropleth cannot handle datetime columns,
# so the geometry frame is passed without them.
datetime_cols = ["Avg_Start", "Avg_End"]
gdf_for_choro = gdf_map.drop(columns=datetime_cols, errors="ignore")

# Colour layer: one shade per tract according to its number of trip ends.
choropleth_layer = Choropleth(
    geo_data=gdf_for_choro,
    data=gdf_map,
    columns=["GEOID", "Total_Trips"],
    key_on="feature.properties.GEOID",
    fill_color="YlOrRd",
    fill_opacity=0.7,
    line_opacity=0.2,
    nan_fill_color="lightgrey",
    legend_name="Trip End Count"
)
choropleth_layer.add_to(fmap)

fmap

Temporal analysis of total trips overall and for relevant tracts

In [None]:
# NOTE(review): this cell redefines plot_by_15min, which is already defined in
# the earlier "general temporal analysis" section; the later definition
# silently shadows the earlier one. Keep a single definition once confirmed.
def plot_by_15min(
        df,
        tract_filter=None,        # None → all tracts | single GEOID | collection of GEOIDs
        show_endpoints=True,      # include drop-offs?
        title=None,
        ax=None,
        tick_step_hours=2         # x-tick spacing in hours (default: every 2 h)
    ):
    """
    Bar chart of pick-ups (and optionally drop-offs) per 15-minute time-of-day
    bucket, collapsed across all days in the data.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "Pickup Census Tract", "Dropoff Census Tract",
        "Trip Start Timestamp" and "Trip End Timestamp"; both timestamp
        columns must already be datetime-typed. The caller's frame is not
        modified.
    tract_filter : None, scalar, or list/set/tuple/array/Series of tract ids
        When given, only trips that start OR end in one of these tracts are
        counted. Ids are normalised like the tract columns (a trailing ".0"
        is removed, then the id is zero-padded to 11 characters).
    show_endpoints : bool
        True  → three stacked layers: the count common to pick-ups and
                drop-offs, plus the drop-off or pick-up surplus on top.
        False → a single series of pick-up counts.
    title : str or None
        Axes title; defaults to "All tracts" or the list of filtered tracts.
    ax : matplotlib Axes or None
        Axes to draw on; a new 14x4 figure is created when omitted.
    tick_step_hours : int or float
        Spacing of the x-axis tick labels, in hours.

    Returns
    -------
    The Axes that was drawn on.

    Raises
    ------
    ValueError
        If no trip is left to plot (empty input or nothing matches
        ``tract_filter``).
    """
    tract_cols = ["Pickup Census Tract", "Dropoff Census Tract"]
    time_cols = ["Trip Start Timestamp", "Trip End Timestamp"]

    # ── 0. Copy only the four columns used here, so the caller's df is unchanged
    trips = df[tract_cols + time_cols].copy()

    # ── 1. Normalise tract columns to 11-digit zero-padded strings
    for col in tract_cols:
        trips[col] = (
            trips[col]
            .astype(str)
            .str.replace(r"\.0$", "", regex=True)
            .str.zfill(11)
        )

    # ── 2. Optional tract filter
    if tract_filter is not None:
        collection_types = (list, set, frozenset, tuple, np.ndarray, pd.Series, pd.Index)
        if not isinstance(tract_filter, collection_types):
            tract_filter = [tract_filter]

        normalised = []
        for tract in tract_filter:
            text = str(tract)
            # Only a *trailing* ".0" is a float artefact; leave the rest alone.
            if text.endswith(".0"):
                text = text[:-2]
            normalised.append(text.zfill(11))
        tract_filter = normalised

        trips = trips[
            trips["Pickup Census Tract"].isin(tract_filter) |
            trips["Dropoff Census Tract"].isin(tract_filter)
        ]

    if trips.empty:
        raise ValueError(f"No trips found for tract_filter={tract_filter}")

    # ── 3. Add 15-minute *time-of-day* buckets ("15min" is the current alias;
    #       the older "15T" spelling is deprecated in recent pandas)
    trips["pickup_15m"]  = trips["Trip Start Timestamp"].dt.floor("15min").dt.time
    trips["dropoff_15m"] = trips["Trip End Timestamp"].dt.floor("15min").dt.time

    # ── 4. Aggregate across all days
    summary = trips.groupby("pickup_15m").size().rename("Pickups").to_frame()

    if show_endpoints:
        dropoffs = trips.groupby("dropoff_15m").size().rename("Drop-offs")
        summary  = summary.join(dropoffs, how="outer").fillna(0)

    # ── 5. Fill every 15-minute slot in the 24-hour clock
    full_range = pd.date_range("00:00", "23:45", freq="15min").time
    summary    = summary.reindex(full_range, fill_value=0)

    # ── 6. Plot
    x = np.arange(len(summary))              # 96 slots
    bar_w = 1.0

    if ax is None:
        _, ax = plt.subplots(figsize=(14, 4))

    if show_endpoints:
        # Shared base = trips "matched" by both a pick-up and a drop-off in the
        # slot; delta = whichever side has more, stacked on top of the base.
        shared = np.minimum(summary["Pickups"], summary["Drop-offs"])
        delta  = summary["Drop-offs"] - summary["Pickups"]

        ax.bar(x, shared, width=bar_w, label="Shared trips", color="lightgray")
        ax.bar(x, np.maximum(delta, 0), width=bar_w, bottom=shared,
               label="Drop-off surplus", color="orange", alpha=0.6)
        ax.bar(x, np.maximum(-delta, 0), width=bar_w, bottom=shared,
               label="Pick-up surplus", color="steelblue", alpha=0.6)
    else:
        # No drop-off column exists in this mode, so draw pick-ups on their own.
        ax.bar(x, summary["Pickups"], width=bar_w, label="Pickups", color="steelblue")

    # Nicely formatted x-axis
    step   = max(1, int(tick_step_hours * 4))   # 4 quarter-hours per hour
    ticks  = x[::step]
    labels = [t.strftime("%H:%M") for t in full_range][::step]

    ax.set_xticks(ticks)
    ax.set_xticklabels(labels, rotation=90, fontsize=8)

    ax.set_ylabel("Trip count")
    ax.set_xlabel("Time of day (15-min bins)")
    ax.set_title(title or ("All tracts" if tract_filter is None
                           else f"Tract(s): {', '.join(tract_filter)}"))
    ax.legend(frameon=False)
    ax.margins(x=0)

    # Tighten the figure that owns *this* Axes rather than whatever figure is
    # current. Guarded because not every Axes parent exposes tight_layout
    # (e.g. sub-figures).
    owner = ax.figure
    if hasattr(owner, "tight_layout"):
        owner.tight_layout()
    return ax

In [None]:
# City-wide baseline again (identical call to the one in the earlier temporal-analysis section).
plot_by_15min(df, title="City-wide taxi activity")


In [None]:
# Restrict the plot to trips that start or end in tract 17031980000.
# NOTE(review): the "?" in the title suggests the O'Hare identification is unconfirmed — verify the GEOID.
plot_by_15min(df, tract_filter="17031980000", title="O'Hare?")

In [None]:
# -------------------------------------------------
# 1. Load tract boundaries
# -------------------------------------------------
gdf_tracts = gpd.read_file("tl_2024_17_tract.geojson")

# -------------------------------------------------
# 2. Harmonise GEOID formats in BOTH tables
# -------------------------------------------------
gdf_tracts["GEOID"] = gdf_tracts["GEOID"].astype(str).str.zfill(11)

# Start from an explicitly named summary instead of whatever an earlier cell
# last left in `tract_summary`. When the notebook is run top to bottom that was
# the drop-off summary, so this map shows durations keyed by drop-off tract.
# NOTE(review): swap in pickup_tract_summary if pick-up tracts were intended.
tract_summary = dropoff_tract_summary.copy()
tract_summary["GEOID"] = (
    tract_summary["GEOID"].astype(str)
    .str.replace(r"\.0$", "", regex=True)      # drop any trailing ".0"
    .str.zfill(11)
)

# -------------------------------------------------
# 3. Convert Avg_Seconds → minutes (friendlier legend)
# -------------------------------------------------
tract_summary["Avg_Minutes"] = tract_summary["Avg_Seconds"] / 60

# -------------------------------------------------
# 4. Merge statistics onto geometry
# -------------------------------------------------
# Left merge keeps every polygon; tracts without statistics carry NaN.
gdf_map = gdf_tracts.merge(tract_summary, on="GEOID", how="left")

# Convert datetime columns to strings so GeoJSON serialises cleanly
for col in ["Avg_Start", "Avg_End"]:
    if col in gdf_map.columns:
        gdf_map[col] = gdf_map[col].dt.strftime("%Y-%m-%d %H:%M")

# -------------------------------------------------
# 5. Build Folium map
# -------------------------------------------------
center = [41.8781, -87.6298]              # Chicago
fmap = folium.Map(location=center, zoom_start=10, tiles="cartodbpositron")

# The choropleth only needs GEOID and Avg_Minutes; the (now string-typed)
# average start/end columns are left out of the geometry it receives.
gdf_for_choro = gdf_map.drop(columns=["Avg_Start", "Avg_End"], errors="ignore")

Choropleth(
    geo_data=gdf_for_choro,
    data=gdf_map,
    columns=["GEOID", "Avg_Minutes"],
    key_on="feature.properties.GEOID",
    fill_color="YlOrRd",
    fill_opacity=0.7,
    line_opacity=0.2,
    nan_fill_color="lightgrey",
    legend_name="Average Trip Duration (minutes)"
).add_to(fmap)

# Add tract border layer with tooltip
GeoJson(
    gdf_map,
    tooltip=GeoJsonTooltip(
        fields=["GEOID", "Avg_Minutes", "Avg_Start", "Avg_End"],
        aliases=["Tract", "Avg Min", "Avg Start", "Avg End"],
        localize=True,
        sticky=False
    )
).add_to(fmap)

fmap


In [None]:
# --- 1. Import Required Libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from sklearn.mixture import GaussianMixture
import h3
import folium
from folium.plugins import HeatMap
import warnings
warnings.filterwarnings("ignore")


# Reload the raw trips and parse both timestamp columns.
# NOTE(review): this replaces the `df` prepared in the first cell, so the
# tract columns are back in their raw (un-padded) form from here on.
df = pd.read_csv("testing_no_null.csv", low_memory=False)

df['Trip Start Timestamp'] = pd.to_datetime(df['Trip Start Timestamp'])
df['Trip End Timestamp'] = pd.to_datetime(df['Trip End Timestamp'])

PICKUP_COORD_COLS = ['Pickup Centroid Latitude', 'Pickup Centroid Longitude']
GMM_SEED = 42   # fixed seed so the cluster labels are the same on every run


def lat_lng_to_h3(row, res=8):
    """Return the H3 cell id (resolution `res`) for a row's pickup centroid."""
    return h3.latlng_to_cell(row['Pickup Centroid Latitude'], row['Pickup Centroid Longitude'], res)


df['Hour'] = df['Trip Start Timestamp'].dt.hour
df['Day'] = df['Trip Start Timestamp'].dt.date

# H3 indexing and the aggregation that depends on it live in the same branch,
# so a skipped indexing step no longer leads to a KeyError on 'H3 Index'.
if all(col in df.columns for col in PICKUP_COORD_COLS):
    df['H3 Index'] = df.apply(lat_lng_to_h3, axis=1)

    # Group by spatial + temporal units
    agg = df.groupby(['H3 Index', 'Hour']).size().reset_index(name='Trip Count')
else:
    print("Pickup Centroid Latitude or Longitude not found — skipping H3 indexing.")

# --- 6. Gaussian Mixture Model Clustering ---
# .copy() gives an independent frame, so adding the Cluster column below does
# not write into a slice of df.
coords = df[PICKUP_COORD_COLS].dropna().copy()

# Fit GMM (seeded: the default initialisation is random)
gmm = GaussianMixture(
    n_components=5,
    covariance_type='full',
    random_state=GMM_SEED,
).fit(coords)
coords['Cluster'] = gmm.predict(coords[PICKUP_COORD_COLS])

# Plot clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x='Pickup Centroid Longitude',
    y='Pickup Centroid Latitude',
    hue='Cluster',
    data=coords,
    palette='viridis',
    alpha=0.6
)
plt.title("GMM Clusters of Taxi Pickup Locations")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.show()

# --- 7. Folium Heatmap (Optional) ---
# Centre the map on the mean pickup location, then add one heat point per trip.
map_center = [coords['Pickup Centroid Latitude'].mean(), coords['Pickup Centroid Longitude'].mean()]
fmap = folium.Map(location=map_center, zoom_start=11)
heat_data = coords[PICKUP_COORD_COLS].values.tolist()
HeatMap(heat_data).add_to(fmap)

fmap.save("pickup_heatmap.html")


In [None]:
from folium import Choropleth

# 1. Load GeoJSON and Taxi Data
gdf_tracts = gpd.read_file("tl_2024_17_tract.geojson")
df = pd.read_csv("testing_no_null.csv", low_memory=False)

# 2. Prepare Data
df['Trip Start Timestamp'] = pd.to_datetime(df['Trip Start Timestamp'])

# Show only the first rows; printing the full GeoDataFrame floods the output.
print(gdf_tracts.head())

# Tract codes as 11-character strings for joining
gdf_tracts['GEOID'] = gdf_tracts['GEOID'].astype(str).str.zfill(11)

# 3. Gaussian Mixture Model on pickup coordinates
# Rows without coordinates are left out of the fit and get no cluster label;
# random_state pins the otherwise random initialisation.
coord_cols = ['Pickup Centroid Latitude', 'Pickup Centroid Longitude']
coords = df[coord_cols].dropna()
gmm = GaussianMixture(n_components=5, covariance_type='full', random_state=42).fit(coords)
df['Cluster'] = pd.Series(gmm.predict(coords), index=coords.index)

# 4. Aggregate: Most common cluster per tract
cluster_map = (
    df.dropna(subset=['Cluster'])
      .groupby('Pickup Census Tract')['Cluster']
      .agg(lambda labels: labels.value_counts().index[0])
      .reset_index()
)
cluster_map.columns = ['GEOID', 'Dominant Cluster']

# Same normalisation as the rest of the notebook: a float-parsed tract column
# would otherwise stringify as "<id>.0" and never match a GEOID.
cluster_map['GEOID'] = (
    cluster_map['GEOID'].astype(str)
    .str.replace(r"\.0$", "", regex=True)
    .str.zfill(11)
)

# 5. Merge with GeoDataFrame
gdf_tracts = gdf_tracts.merge(cluster_map, on='GEOID', how='left')

# 6. Create Folium Map
chicago_center = [41.8781, -87.6298]
fmap = folium.Map(location=chicago_center, zoom_start=10)

# 7. Add Colored Census Tracts Based on Cluster
Choropleth(
    geo_data=gdf_tracts,
    data=gdf_tracts,
    columns=['GEOID', 'Dominant Cluster'],
    key_on='feature.properties.GEOID',
    fill_color='Set1',
    fill_opacity=0.6,
    line_opacity=0.2,
    nan_fill_color='lightgrey',      # tracts without a cluster, as on the other maps
    legend_name='Dominant GMM Cluster'
).add_to(fmap)

# Optional: Add tooltip
folium.GeoJson(
    gdf_tracts,
    name="Census Tracts",
    tooltip=folium.GeoJsonTooltip(fields=['GEOID', 'Dominant Cluster'])
).add_to(fmap)

fmap.save("gmm_clusters_by_tract.html")
fmap


In [None]:
from folium.plugins import HeatMap


# Base map for the pick-up density layer.
pickup_map = folium.Map(location=chicago_center, zoom_start=10)


def _outline_only(feature):
    """Style for tract polygons: thin gray border, no fill, no colour scale."""
    return {
        'fillColor': 'transparent',
        'color': 'gray',
        'weight': 0.3,
        'fillOpacity': 0
    }


# Tract outlines with a GEOID tooltip, drawn underneath the heat layer.
tract_outlines = folium.GeoJson(
    gdf_tracts,
    style_function=_outline_only,
    tooltip=folium.GeoJsonTooltip(fields=['GEOID'])
)
tract_outlines.add_to(pickup_map)

# Heat layer: one [lat, lon] point per trip that has a pickup centroid.
pickup_points = df[['Pickup Centroid Latitude', 'Pickup Centroid Longitude']].dropna()
pickup_coords = pickup_points.values.tolist()
pickup_heat = HeatMap(pickup_coords, radius=8, blur=15)
pickup_heat.add_to(pickup_map)

pickup_map


In [None]:
# Separate map object for drop-offs, so the pick-up heat map built in the
# previous cell is not overwritten by drop-off data.
dropoff_map = folium.Map(location=chicago_center, zoom_start=10)

# Add very subtle borders only (no fill, no color scale)
folium.GeoJson(
    gdf_tracts,
    style_function=lambda x: {
        'fillColor': 'transparent',
        'color': 'gray',
        'weight': 0.3,
        'fillOpacity': 0
    },
    tooltip=folium.GeoJsonTooltip(fields=['GEOID'])
).add_to(dropoff_map)

# Add heatmap. Rows without a drop-off centroid are removed first, as the
# pick-up heat map cell does for pick-ups.
dropoff_coords = (
    df[['Dropoff Centroid Latitude', 'Dropoff Centroid Longitude']]
    .dropna()
    .values.tolist()
)
HeatMap(dropoff_coords, radius=8, blur=15).add_to(dropoff_map)

dropoff_map

Average trip length per tract

In [None]:
# Mean trip length in seconds across all trips. `.mean` must be *called*;
# without the parentheses the bound method itself is printed, not the value.
print(df["Trip Seconds"].mean())

In [None]:
# 1. Compute trip durations in minutes
# Both timestamps are parsed here so the cell does not rely on an earlier cell
# having converted them (pd.to_datetime is a no-op on datetime columns).
# NOTE(review): durations come from the timestamp difference, while earlier
# cells average the "Trip Seconds" column — confirm which source is intended.
df['Trip Start Timestamp'] = pd.to_datetime(df['Trip Start Timestamp'])
df['Trip End Timestamp'] = pd.to_datetime(df['Trip End Timestamp'])
df['Trip Duration (min)'] = (df['Trip End Timestamp'] - df['Trip Start Timestamp']).dt.total_seconds() / 60

# 2. Group by pickup tract and compute average trip duration
duration_col = 'Avg Trip Duration (min)'
avg_duration = df.groupby('Pickup Census Tract')['Trip Duration (min)'].mean().reset_index()
avg_duration.columns = ['GEOID', duration_col]
# Same GEOID normalisation as elsewhere: strip a float-style ".0", pad to 11.
avg_duration['GEOID'] = (
    avg_duration['GEOID'].astype(str)
    .str.replace(r"\.0$", "", regex=True)
    .str.zfill(11)
)

# 3. Merge into GeoDataFrame
# Drop a column left by a previous run first; otherwise re-running this cell
# yields "..._x"/"..._y" columns and the choropleth below fails on the name.
gdf_tracts = (
    gdf_tracts
    .drop(columns=[duration_col], errors="ignore")
    .merge(avg_duration, on='GEOID', how='left')
)

# 4. Create the folium map
chicago_center = [41.8781, -87.6298]
fmap = folium.Map(location=chicago_center, zoom_start=10)

# 5. Add average trip duration choropleth
Choropleth(
    geo_data=gdf_tracts,
    data=gdf_tracts,
    columns=['GEOID', duration_col],
    key_on='feature.properties.GEOID',
    fill_color='YlOrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    nan_fill_color='lightgrey',      # tracts without trips, as on the other maps
    legend_name='Avg Trip Duration (min)'
).add_to(fmap)

# 6. Add tooltip for GEOID and trip duration
folium.GeoJson(
    gdf_tracts,
    name="Census Tracts",
    tooltip=folium.GeoJsonTooltip(fields=['GEOID', duration_col],
                                  aliases=["Tract", "Avg Duration (min)"],
                                  localize=True)
).add_to(fmap)

# 7. Save and show
fmap.save("avg_trip_duration_by_tract.html")
fmap
