# Imports + pipeline call

In [1]:
import matplotlib.pyplot as plt

from constants.taxi_c import GeoBounds as gb
from pipelines.taxi_pipeline import build_taxi_dataset

  from numpy.array_api import int32
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the dataset once; the cells below all read from `taxi_df`.
# NOTE(review): save_csv=True presumably also writes the result to disk on every run — confirm in pipelines.taxi_pipeline.
# NOTE(review): later cells use columns such as trip_duration_log and hav_dist_km that are not created in the cells shown here — presumably added by this call; verify.
taxi_df = build_taxi_dataset(save_csv=True)

# EDA Hist / describe

- 1458644 rows
- 11 columns
- object: id, pickup_datetime, dropoff_datetime, store_and_fwd_flag
- int64: vendor_id, passenger_count, trip_duration
- float64: pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude
- no missing values/categories
- no NaNs
- id can be used for unique ID
- two unique vendors, which looks plausible
- ten unique passenger_count values
- two unique store_and_fwd_flag matches with description
- no duplicated rows

In [None]:
# Summary statistics restricted to the object-dtype columns.
taxi_df.select_dtypes(include='object').describe()

## vendor_id

In [None]:
# How often each vendor code occurs.
taxi_df.loc[:, 'vendor_id'].value_counts()

In [None]:
# Number of rows with a missing vendor code.
taxi_df.loc[:, 'vendor_id'].isna().sum()

- Only two unique values: 1 and 2.
- Frequencies are relatively balanced.
- No missing values or anomalies detected.

##  pickup_datetime & dropoff_datetime

In [None]:
# Missing-value count for both timestamp columns, computed in one pass.
missing_timestamps = taxi_df[['pickup_datetime', 'dropoff_datetime']].isna().sum()
print("Missing pickup_datetime:", missing_timestamps['pickup_datetime'])
print("Missing dropoff_datetime:", missing_timestamps['dropoff_datetime'])

In [None]:
# Flag trips whose recorded dropoff timestamp precedes the pickup timestamp.
invalid_times = taxi_df['dropoff_datetime'].lt(taxi_df['pickup_datetime'])
print("Trips with negative duration:", invalid_times.sum())

- no negative durations

In [None]:
# Rides per calendar day, counted once by pickup date and once by dropoff date.
pickup_counts = taxi_df['pickup_datetime'].dt.date.value_counts().sort_index()
dropoff_counts = taxi_df['dropoff_datetime'].dt.date.value_counts().sort_index()

# Both series share one Axes so the two curves can be compared directly.
fig, ax = plt.subplots(figsize=(12, 5))
pickup_counts.plot(ax=ax, label="Pick_ups", color='green', alpha=0.6)
dropoff_counts.plot(ax=ax, label="Drop_offs", color='blue', alpha=0.6)

ax.set(
    title="Daily Pickup and Dropoff Counts",
    xlabel="Date",
    ylabel="Number of Rides",
)
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

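- aligns with expected rush hour effects (caveat: the plot above is aggregated per day, so hour-of-day effects are not directly visible in it)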

##  passenger_count

In [None]:
# Trips per passenger_count value, ordered by the count value itself.
print(taxi_df['passenger_count'].value_counts(sort=False).sort_index())

- 60 trips with passenger_count = 0
- 5 trips with passenger_count > 6

In [None]:
# Bar chart of trips per passenger_count value. Title and axis labels are set so
# the figure can be read on its own; plt.show() keeps the Axes repr out of the output.
ax = taxi_df['passenger_count'].value_counts().sort_index().plot(kind='bar')
ax.set(
    title="Trips per Passenger Count",
    xlabel="passenger_count",
    ylabel="Number of Trips",
)
plt.show()

In [None]:
# Mean trip_duration for each passenger_count value. Title and axis labels are set
# so the figure can be read on its own; plt.show() keeps the Axes repr out of the output.
ax = taxi_df.groupby('passenger_count')['trip_duration'].mean().plot(kind='bar')
ax.set(
    title="Mean Trip Duration by Passenger Count",
    xlabel="passenger_count",
    ylabel="Mean trip_duration [s]",
)
plt.show()

- Passenger count = 0 has the highest average trip duration, which is not plausible, indicating likely data entry or logging errors

- Passenger count = 1 dominates at all hours
- Group trips (2–4 passengers) are more frequent in the evening and late night

##  pickup_longitude, pickup_latitude & dropoff_longitude, dropoff_latitude

In [None]:
# Summary statistics for the columns listed in gb.cols.
# NOTE(review): gb.cols is presumably the four pickup/dropoff coordinate columns — confirm in constants.taxi_c.
print(taxi_df.loc[:, gb.cols].describe())

- Some trip coordinates lie outside NYC bounds
- Some points lie far outside the visible city bounds — likely due to GPS glitches or data corruption.

## Geographic distribution

In [None]:
# Pickup and dropoff coordinates drawn on one Axes; the view is clipped to the
# gb bounding box, so points outside it are simply not shown.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(
    taxi_df['pickup_longitude'], taxi_df['pickup_latitude'],
    s=0.5, alpha=0.1, label='pickup',
)
ax.scatter(
    taxi_df['dropoff_longitude'], taxi_df['dropoff_latitude'],
    s=0.5, alpha=0.1, label='dropoff',
)
ax.set_xlim(gb.min_lon, gb.max_lon)
ax.set_ylim(gb.min_lat, gb.max_lat)
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.legend(loc='upper right')
ax.set_title("Geographic Distribution of Taxi Pickups and Dropoffs")
ax.grid(True)
fig.tight_layout()
plt.show()

- Most pickups and dropoffs are located within the NYC area (Manhattan, Brooklyn, Queens).
- Clear density clusters appear around Midtown and Downtown Manhattan.

##  store_and_fwd_flag

In [None]:
# Frequency of each flag value; dropna=False keeps missing values as their own row.
print(taxi_df.loc[:, 'store_and_fwd_flag'].value_counts(dropna=False))

- store_and_fwd_flag = 'Y' is rare and likely not predictive on its own

##  trip_duration

In [None]:
# Summary statistics of the raw trip_duration column.
taxi_df.loc[:, 'trip_duration'].describe()

In [None]:
# Histogram of trip_duration limited to 0–3600 s (up to 1 hour); trips outside
# that range are left out of this view. Labels are set so the figure stands on
# its own, and plt.show() keeps the Axes repr out of the cell output.
ax = taxi_df['trip_duration'].plot.hist(bins=100, range=(0, 3600))
ax.set(
    title="Trip Duration (up to 1 hour)",
    xlabel="trip_duration [s]",
    ylabel="Frequency",
)
plt.show()

- Several trips have a duration longer than 2 hours, which is highly unlikely for intra-city NYC taxi rides. A few trips are also shorter than 1 minute, which may indicate errors or missing data.

In [None]:
# Histogram of the log-transformed trip duration column.
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(taxi_df['trip_duration_log'], bins=50, edgecolor='black')
ax.set(
    title="Log-Transformed Trip Duration",
    xlabel="log(1 + trip_duration [s])",
    ylabel="Frequency",
)
ax.grid(True)
fig.tight_layout()
plt.show()

## distance feature (Haversine)

In [None]:
# Haversine distance against log trip duration, one point per trip.
taxi_df.plot(kind='scatter', x='hav_dist_km', y='trip_duration_log', alpha=0.3)

In [None]:
# Filter for clearer visualization: keep trips with a positive Haversine
# distance of at most 20 km and a positive duration of at most 60 minutes.
# The distance cap is applied to hav_dist_km — the column that is plotted and
# named in the title — rather than to a different distance column.
filtered_data = taxi_df[
    (taxi_df['hav_dist_km'] <= 20) &
    (taxi_df['hav_dist_km'] > 0.0) &
    (taxi_df['trip_duration_min'] <= 60) &
    (taxi_df['trip_duration_min'] > 0)
]

# Plot; tiny, highly transparent markers keep dense regions readable.
plt.figure(figsize=(10, 5))
plt.scatter(filtered_data['hav_dist_km'], filtered_data['trip_duration_min'], alpha=0.1, s=1)
plt.xlabel("hav_dist_km")
plt.ylabel("Trip Duration (min)")
plt.title("Trip Duration vs. hav_dist_km (Filtered: ≤20 km & ≤60 min)")
plt.grid(True)
plt.tight_layout()
plt.show()

- Long durations at very short distances
- Long distances with short durations