In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from data_loader import load_taxi_data
from features_engineering import add_taxi_time_features, \
  add_taxi_distance_features
from features_engineering import add_trip_duration_features



# Load taxi dataset

In [2]:
# Load the raw taxi trips through the project's data_loader helper.
taxi_data_raw = load_taxi_data()

In [4]:
# Show column names, dtypes, non-null counts and memory usage of the frame.
taxi_data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   id                  1458644 non-null  object 
 1   vendor_id           1458644 non-null  int64  
 2   pickup_datetime     1458644 non-null  object 
 3   dropoff_datetime    1458644 non-null  object 
 4   passenger_count     1458644 non-null  int64  
 5   pickup_longitude    1458644 non-null  float64
 6   pickup_latitude     1458644 non-null  float64
 7   dropoff_longitude   1458644 non-null  float64
 8   dropoff_latitude    1458644 non-null  float64
 9   store_and_fwd_flag  1458644 non-null  object 
 10  trip_duration       1458644 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 122.4+ MB


# Univariate EDA of the Taxi Data

## General Exploration

In [None]:
# Schema overview (dtypes and non-null counts) that the bullet points below summarise.
taxi_data_raw.info()

- 1458644 rows
- 11 columns
- object: id, pickup_datetime, dropoff_datetime, store_and_fwd_flag
- int64: vendor_id, passenger_count, trip_duration
- float64: pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude
- no missing values in any column (every non-null count equals the row count)

In [None]:
# Number of missing values per column.
taxi_data_raw.isna().sum()

- no NaNs

In [None]:
# Number of distinct values per column.
taxi_data_raw.nunique()

- id is unique for every row and can serve as the record identifier
- two unique vendor_id values, which seems plausible
- ten unique passenger_count values
- two unique store_and_fwd_flag values, which matches the data description

In [None]:
# Show every row that is an exact repeat of an earlier row.
duplicate_row_mask = taxi_data_raw.duplicated()
taxi_data_raw.loc[duplicate_row_mask]

- no duplicated rows

## Exploration of vendor_id

In [None]:
# Number of trips recorded per vendor_id.
taxi_data_raw['vendor_id'].value_counts()

In [None]:
# vendor_id is a categorical code, so draw one bar per vendor instead of
# binning the codes on a continuous histogram axis.
ax = taxi_data_raw['vendor_id'].value_counts().sort_index().plot(kind='bar')
ax.set_title("Trips per Vendor")
ax.set_xlabel("vendor_id")
ax.set_ylabel("Number of Rides")
plt.show()

- Only two unique values: 1 and 2.
- Frequencies are relatively balanced.
- No missing values or anomalies detected.

## Exploration of pickup_datetime & dropoff_datetime

In [None]:
# pickup_datetime is still object dtype here (see the info() output above),
# so describe() reports count/unique/top/freq rather than min/max.
taxi_data_raw['pickup_datetime'].describe()

In [None]:
# dropoff_datetime is still object dtype here (see the info() output above),
# so describe() reports count/unique/top/freq rather than min/max.
taxi_data_raw['dropoff_datetime'].describe()

In [None]:
# Count rows whose dropoff timestamp sorts before the pickup timestamp.
# NOTE(review): both columns are object dtype at this point (see the info()
# output above), so this is a string comparison; it matches chronological
# order only if the timestamps are zero-padded, most-significant-field-first
# strings — confirm the format.
(taxi_data_raw['dropoff_datetime'] < taxi_data_raw['pickup_datetime']).sum()

- no negative durations

In [None]:
# Basic datetime features
# NOTE(review): the return value is not assigned, so the following cells
# (`.dt` accessor, pickup_hour_of_day, pickup_day_of_week, pickup_month)
# presumably rely on this helper modifying taxi_data_raw in place — confirm
# against features_engineering.
add_taxi_time_features(taxi_data_raw)

In [None]:
# Rides per calendar day, counted once by pickup date and once by dropoff date.
pickup_counts = taxi_data_raw['pickup_datetime'].dt.date.value_counts().sort_index()
dropoff_counts = taxi_data_raw['dropoff_datetime'].dt.date.value_counts().sort_index()

# Draw both daily series on one shared axis.
fig, ax = plt.subplots(figsize=(12, 5))
for daily_counts, series_label, line_color in (
    (pickup_counts, "Pick_ups", 'green'),
    (dropoff_counts, "Drop_offs", 'blue'),
):
  daily_counts.plot(ax=ax, label=series_label, color=line_color, alpha=0.6)

ax.set(title="Daily Pickup and Dropoff Counts", xlabel="Date", ylabel="Number of Rides")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Pickups per hour of day, with title and axis labels so the figure stands alone.
ax = taxi_data_raw["pickup_hour_of_day"].value_counts().sort_index().plot(kind='bar')
ax.set_title("Pickups per Hour of Day")
ax.set_xlabel("pickup_hour_of_day")
ax.set_ylabel("Number of Rides")
plt.show()

- peak from 18 to 22
- trough from 1 to 6

In [None]:
# Pickups per day of week, with title and axis labels so the figure stands alone.
ax = taxi_data_raw['pickup_day_of_week'].value_counts().sort_index().plot(kind='bar')
ax.set_title("Pickups per Day of Week")
ax.set_xlabel("pickup_day_of_week")
ax.set_ylabel("Number of Rides")
plt.show()

In [None]:
# Exact pickup counts per day of week, ordered by the day-of-week value.
taxi_data_raw['pickup_day_of_week'].value_counts().sort_index()

- trough on Sunday
- peak on Friday

In [None]:
# Pickups per month, with title and axis labels so the figure stands alone.
ax = taxi_data_raw['pickup_month'].value_counts().sort_index().plot(kind='bar')
ax.set_title("Pickups per Month")
ax.set_xlabel("pickup_month")
ax.set_ylabel("Number of Rides")
plt.show()

In [None]:
# Exact pickup counts per month, ordered by the month value.
taxi_data_raw['pickup_month'].value_counts().sort_index()

- trough in January
- peak in March

In [None]:
# Pickup timestamps shared by the most trips (top five).
trips_per_timestamp = taxi_data_raw.groupby(['pickup_datetime']).size()
trips_per_timestamp.sort_values(ascending=False).head()

- could be repeated records

In [None]:
# Average trip duration per pickup hour across a 24-hour day, with labels
# and units so the figure stands alone.
ax = taxi_data_raw.groupby('pickup_hour_of_day')['trip_duration'].mean().plot()
ax.set_title("Mean Trip Duration by Pickup Hour")
ax.set_xlabel("pickup_hour_of_day")
ax.set_ylabel("Mean trip_duration [s]")
plt.show()

- aligns with expected rush hour effects

## Exploration of passenger_count

Findings
- The majority of trips have 1 passenger; distribution is heavily right-skewed
- Edge values observed: 60 trips with 0 passengers, and very few with 7–9 passengers
- These edge values are likely invalid or extremely rare
- Trips with 0 passengers show the highest average trip duration, which is implausible and suggests data entry or system error
- Passenger counts 7–9 are extremely rare and may suffer from sampling bias or logging issues
- These edge cases collectively make up a very small share of the data (<0.01%)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of passenger_count.
taxi_data_raw["passenger_count"].describe()

In [None]:
# Number of trips per passenger_count value, ordered by passenger count.
taxi_data_raw['passenger_count'].value_counts().sort_index()

- 60 with passenger count 0
- 5 with passenger count > 6

In [None]:
# Trips per passenger count, with title and axis labels so the figure stands alone.
ax = taxi_data_raw['passenger_count'].value_counts().sort_index().plot(kind='bar')
ax.set_title("Trips per Passenger Count")
ax.set_xlabel("passenger_count")
ax.set_ylabel("Number of Rides")
plt.show()

In [None]:
# Mean trip duration for each passenger count, with labels and units so the
# figure stands alone.
ax = taxi_data_raw.groupby('passenger_count')['trip_duration'].mean().plot(kind='bar')
ax.set_title("Mean Trip Duration by Passenger Count")
ax.set_xlabel("passenger_count")
ax.set_ylabel("Mean trip_duration [s]")
plt.show()

- Passenger count = 0 has the highest average trip duration, which is not plausible, indicating likely data entry or logging errors

In [None]:
# Stacked bars: trips per pickup hour, split by passenger_count.
hour_by_passengers = pd.crosstab(
    taxi_data_raw['pickup_hour_of_day'], taxi_data_raw['passenger_count'])
hour_by_passengers.plot.bar(stacked=True, figsize=(12, 5))

- Passenger count = 1 dominates at all hours
- Group trips (2–4 passengers) are more frequent in the evening and late night

In [None]:
# Fraction of trips whose passenger_count is one of the suspicious edge values.
suspicious_passenger_counts = [0, 7, 8, 9]
edge_cases = taxi_data_raw['passenger_count'].isin(suspicious_passenger_counts)
edge_cases.mean()

- 0.0045% of trips are likely outliers or errors

In [None]:
# Rows recorded with no passengers, followed by a summary of their timing columns.
zero_passenger_trips = taxi_data_raw.loc[taxi_data_raw['passenger_count'].eq(0)]

timing_columns = ['pickup_datetime', 'dropoff_datetime', 'trip_duration']
zero_passenger_trips[timing_columns].describe()

## Explore pickup_longitude, pickup_latitude & dropoff_longitude, dropoff_latitude

In [None]:
# Latitude/longitude limits of the bounding box used below to flag
# coordinates that fall outside the NYC area.
NYC_LAT_MIN = 40.47
NYC_LAT_MAX = 41.0
NYC_LON_MIN = -74.3
NYC_LON_MAX = -73.6

In [None]:
# Summary statistics for the four coordinate columns.
coordinate_columns = [
  'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
taxi_data_raw[coordinate_columns].describe()

In [None]:
# Rows where any pickup or dropoff coordinate lies outside the NYC bounding box.
latitudes = taxi_data_raw[['pickup_latitude', 'dropoff_latitude']]
longitudes = taxi_data_raw[['pickup_longitude', 'dropoff_longitude']]

latitude_outside = (latitudes.lt(NYC_LAT_MIN) | latitudes.gt(NYC_LAT_MAX)).any(axis=1)
longitude_outside = (longitudes.lt(NYC_LON_MIN) | longitudes.gt(NYC_LON_MAX)).any(axis=1)

taxi_data_raw[latitude_outside | longitude_outside]

- Some trip coordinates lie outside NYC bounds
- Some points lie far outside the visible city bounds — likely due to GPS glitches or data corruption.

In [None]:
# Share of pickup longitudes that sit on a 0.001 grid. Comparing against the
# rounded value catches both sides of a grid point; a bare `% 0.001` can leave
# a floating-point remainder just below 0.001 for an exact multiple, which the
# `< 1e-10` test would miss.
np.isclose(
    taxi_data_raw['pickup_longitude'],
    taxi_data_raw['pickup_longitude'].round(3),
    rtol=0, atol=1e-10,
).mean()

In [None]:
# Share of pickup latitudes that sit on a 0.001 grid. Comparing against the
# rounded value catches both sides of a grid point; a bare `% 0.001` can leave
# a floating-point remainder just below 0.001 for an exact multiple, which the
# `< 1e-10` test would miss.
np.isclose(
    taxi_data_raw['pickup_latitude'],
    taxi_data_raw['pickup_latitude'].round(3),
    rtol=0, atol=1e-10,
).mean()

In [None]:
# Share of dropoff latitudes that sit on a 0.001 grid. Comparing against the
# rounded value catches both sides of a grid point; a bare `% 0.001` can leave
# a floating-point remainder just below 0.001 for an exact multiple, which the
# `< 1e-10` test would miss.
np.isclose(
    taxi_data_raw['dropoff_latitude'],
    taxi_data_raw['dropoff_latitude'].round(3),
    rtol=0, atol=1e-10,
).mean()

In [None]:
# Share of dropoff longitudes that sit on a 0.001 grid. Comparing against the
# rounded value catches both sides of a grid point; a bare `% 0.001` can leave
# a floating-point remainder just below 0.001 for an exact multiple, which the
# `< 1e-10` test would miss.
np.isclose(
    taxi_data_raw['dropoff_longitude'],
    taxi_data_raw['dropoff_longitude'].round(3),
    rtol=0, atol=1e-10,
).mean()

- no rounding issues

### Geographic distribution

In [None]:
# Scatterplot of pickup and dropoff locations, cropped to the NYC bounding box.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(taxi_data_raw['pickup_longitude'], taxi_data_raw['pickup_latitude'], s=0.5, alpha=0.1,
           label='pickup')
ax.scatter(taxi_data_raw['dropoff_longitude'], taxi_data_raw['dropoff_latitude'], s=0.5, alpha=0.1,
           label='dropoff')
# Reuse the NYC_* bounds from the out-of-bounds check so the plot window and
# that filter cannot drift apart.
ax.set_xlim(NYC_LON_MIN, NYC_LON_MAX)
ax.set_ylim(NYC_LAT_MIN, NYC_LAT_MAX)
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.legend(loc='upper right')
ax.set_title("Geographic Distribution of Taxi Pickups and Dropoffs")
ax.grid(True)
fig.tight_layout()
plt.show()

- Most pickups and dropoffs are located within the NYC area (Manhattan, Brooklyn, Queens).
- Clear density clusters appear around Midtown and Downtown Manhattan.

In [None]:
# Count trips that start and end at the same point (to 5 decimals) yet last
# more than 5 minutes.
same_latitude = taxi_data_raw['pickup_latitude'].round(5).eq(
    taxi_data_raw['dropoff_latitude'].round(5))
same_longitude = taxi_data_raw['pickup_longitude'].round(5).eq(
    taxi_data_raw['dropoff_longitude'].round(5))
longer_than_five_minutes = taxi_data_raw['trip_duration'].gt(300)

len(taxi_data_raw[same_latitude & same_longitude & longer_than_five_minutes])

- Trips that start and end at the same coordinates (to 5 decimals) but last more than 5 minutes are implausible

## Explore store_and_fwd_flag

In [None]:
# Relative frequency of each store_and_fwd_flag value.
taxi_data_raw['store_and_fwd_flag'].value_counts(normalize=True)

In [None]:
# Trips per store_and_fwd_flag value, with title and axis labels so the figure stands alone.
ax = taxi_data_raw['store_and_fwd_flag'].value_counts().sort_index().plot(kind='bar')
ax.set_title("Trips per store_and_fwd_flag")
ax.set_xlabel("store_and_fwd_flag")
ax.set_ylabel("Number of Rides")
plt.show()

- store_and_fwd_flag = 'Y' is rare and likely not predictive on its own

## Exploration of trip_duration

In [None]:
# Rebind taxi_data_raw to whatever the helper returns.
# NOTE(review): presumably this adds trip_duration_min, which later cells plot — confirm.
taxi_data_raw = add_trip_duration_features(taxi_data_raw)

In [None]:
# Summary statistics for trip duration (in seconds, as the later checks assume)
taxi_data_raw['trip_duration'].describe()

In [None]:
# Check for extreme trip durations: rows lasting more than three hours.
longer_than_three_hours = taxi_data_raw['trip_duration'].gt(3 * 3600)
taxi_data_raw.loc[longer_than_three_hours]

In [None]:
# Rows lasting less than one minute.
shorter_than_one_minute = taxi_data_raw['trip_duration'].lt(60)
taxi_data_raw.loc[shorter_than_one_minute]

- Several trips have a duration longer than 3 hours, which is highly unlikely for intra-city NYC taxi rides. A few trips are also shorter than 1 minute, which may indicate errors or missing data.

In [None]:
# Boxplot of trip_duration restricted to trips shorter than 5000 s.
durations_below_cap = taxi_data_raw.loc[taxi_data_raw['trip_duration'] < 5000, 'trip_duration']
sns.boxplot(x=durations_below_cap)

## Distance Features

### Haversine

In [None]:
# NOTE(review): the result is not assigned; the hav_dist_km column used below
# presumably comes from this helper modifying taxi_data_raw in place — confirm
# against features_engineering.
add_taxi_distance_features(taxi_data_raw)

In [None]:
# Trip duration (minutes) against haversine distance (km) for all trips.
taxi_data_raw.plot(kind='scatter', x='hav_dist_km', y='trip_duration_min', alpha=0.3)

In [None]:
# Keep only trips of at most 20 km and 60 minutes so the dense region is readable.
within_distance_cap = taxi_data_raw['hav_dist_km'].le(20)
within_duration_cap = taxi_data_raw['trip_duration_min'].le(60)
filtered_data = taxi_data_raw[within_distance_cap & within_duration_cap]

# Scatter of duration against distance for the filtered trips.
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(filtered_data['hav_dist_km'], filtered_data['trip_duration_min'], alpha=0.1, s=1)
ax.set_xlabel("hav_dist_km")
ax.set_ylabel("Trip Duration (min)")
ax.set_title("Trip Duration vs. hav_dist_km (Filtered: ≤20 km & ≤60 min)")
ax.grid(True)
fig.tight_layout()
plt.show()

### Vincenty

In [None]:
# Disabled cell: `vincenty` is not imported in this notebook's import cell.
# # Apply Vincenty distance to the dataset
# taxi_data_raw['vin_dist_km'] = vincenty(
#     taxi_data_raw['pickup_latitude'], taxi_data_raw['pickup_longitude'],
#     taxi_data_raw['dropoff_latitude'], taxi_data_raw['dropoff_longitude']
# )

In [None]:
# taxi_data_raw.plot.scatter(x='vin_dist_km', y='trip_duration_min', alpha=0.3)

- Long durations at very short distances
- Long distances with short durations

In [None]:
# # Filter for clearer visualization
# filtered_data = taxi_data_raw[
#   (taxi_data_raw['vin_dist_km'] <= 20) &
#   (taxi_data_raw['trip_duration_min'] <= 60)
#   ]
#
# # Plot
# plt.figure(figsize=(10, 5))
# plt.scatter(filtered_data['vin_dist_km'], filtered_data['trip_duration_min'], alpha=0.1, s=1)
# plt.xlabel("VINCENTY_DIST_KM")
# plt.ylabel("Trip Duration (min)")
# plt.title("Trip Duration vs. VINCENTY_DIST_KM (Filtered: ≤20 km & ≤60 min)")
# plt.grid(True)
# plt.tight_layout()
# plt.show()

- For trips under ~2 km, durations vary widely
- Some points still lie far above the trend line

In [None]:
# from geopy.distance import geodesic
#
# # Apply geodesic row-wise
# taxi_data_raw['geodesic_km'] = taxi_data_raw.apply(
#     lambda row: geodesic(
#         (row['pickup_latitude'], row['pickup_longitude']),
#         (row['dropoff_latitude'], row['dropoff_longitude'])
#     ).kilometers,
#     axis=1
# )

In [None]:
# Histogram of trip duration in minutes, limited to the 0–100 minute range.
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(taxi_data_raw['trip_duration_min'], bins=100, range=(0, 100), edgecolor='black')
ax.set(
    title="Distribution of Trip Duration (minutes)",
    xlabel="Trip Duration [min]",
    ylabel="Frequency",
)
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Store log(1 + trip_duration) as a new column, then plot its distribution.
taxi_data_raw['trip_duration_log'] = np.log1p(taxi_data_raw['trip_duration'])

fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(taxi_data_raw['trip_duration_log'], bins=100, edgecolor='black')
ax.set(
    title="Log-Transformed Trip Duration",
    xlabel="log(1 + trip_duration [s])",
    ylabel="Frequency",
)
ax.grid(True)
fig.tight_layout()
plt.show()

- log1p(trip_duration) helps stabilize variance, handle skewness, and improve model performance

# Save Taxi Data

In [None]:
# Final schema check before saving; should now list the columns added above
# (e.g. trip_duration_log).
taxi_data_raw.info()

In [None]:
# Write the enriched frame to disk. Create the target directory first, since
# to_csv does not create missing parent directories and would fail on a fresh
# checkout without a data/ folder.
output_path = Path("data/taxi_data_clean.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
taxi_data_raw.to_csv(output_path, index=False)