In [None]:
import pickle
import shutil
from pathlib import Path
from zipfile import ZipFile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from kaggle.api.kaggle_api_extended import KaggleApi

### Load taxi dataset, unzip and save in taxi_data_raw

In [None]:
def load_taxi_data():
  """Return the NYC taxi trip training data as a DataFrame.

  The work is done lazily, each step being skipped when its result already exists:

  0. ``data/train.pkl`` exists -> unpickle and return it.
  1. ``data_packages/nyc-taxi-trip-duration.zip`` is missing -> download it via the Kaggle API.
  2. Any of the inner archives (train/test/sample_submission) is missing -> unpack the outer ZIP.
  3. Copy every ``*.csv`` member of every ZIP in ``data_packages`` into ``data/`` (flat).
  4. Read ``data/train.csv``, cache it as ``data/train.pkl`` and return it.

  All paths are relative to the current working directory.

  Returns:
    pandas.DataFrame: the contents of ``train.csv``.

  Raises:
    FileNotFoundError: if ``data/train.csv`` is still missing after unpacking.
  """
  download_file_name = "nyc-taxi-trip-duration.zip"
  data_dir = Path("data_packages")
  extracted_dir = Path("data")
  csv_path = extracted_dir / "train.csv"
  pkl_path = extracted_dir / "train.pkl"

  zip_path = data_dir / download_file_name

  # Step 0: Fast access via the pickle cache (if available)
  if pkl_path.is_file():
    # NOTE(security): unpickling can execute arbitrary code. This is acceptable only because
    # the cache file is written by this very function; never point it at an untrusted file.
    with open(pkl_path, "rb") as f:
      return pickle.load(f)

  # Step 1: Download, only if the outer ZIP is still missing
  if not zip_path.is_file():
    data_dir.mkdir(parents=True, exist_ok=True)
    api = KaggleApi()
    api.authenticate()
    api.competition_download_files("nyc-taxi-trip-duration", path=data_dir)

  # Step 2: Unpack the outer ZIP only if inner ZIPs are missing
  inner_zip_names = {"train.zip", "test.zip", "sample_submission.zip"}
  existing_inner_zips = {z.name for z in data_dir.glob("*.zip")}
  missing_inner_zips = inner_zip_names - existing_inner_zips

  if missing_inner_zips:
    with ZipFile(zip_path, 'r') as outer_zip:
      outer_zip.extractall(data_dir)

  # Step 3: Extract only missing CSVs from the ZIPs
  extracted_dir.mkdir(parents=True, exist_ok=True)
  for inner_zip in data_dir.glob("*.zip"):
    with ZipFile(inner_zip, 'r') as zip_ref:
      for member in zip_ref.namelist():
        if not member.endswith(".csv"):
          continue
        # Always land the file directly in extracted_dir, even when the archive stores it
        # under a sub-directory; the existence check and csv_path both assume a flat layout.
        target_file = extracted_dir / Path(member).name
        if target_file.is_file():
          continue
        # Stream into a temporary name and rename afterwards, so an interrupted run never
        # leaves a truncated CSV that a later run would mistake for a complete one.
        partial_file = target_file.with_name(target_file.name + ".part")
        with zip_ref.open(member) as src, open(partial_file, "wb") as dst:
          shutil.copyfileobj(src, dst)
        partial_file.replace(target_file)

  # Step 4: Load the CSV and save the pickle cache
  if not csv_path.is_file():
    raise FileNotFoundError(f"'{csv_path}' wurde nicht gefunden – Entpackung fehlgeschlagen.")

  df = pd.read_csv(csv_path)

  # Same write-then-rename pattern: a half-written cache must never be picked up by step 0.
  partial_pkl = pkl_path.with_name(pkl_path.name + ".part")
  with open(partial_pkl, "wb") as f:
    pickle.dump(df, f)
  partial_pkl.replace(pkl_path)

  return df


# Usage: load the raw trips (download/unpack happens only when the local files are missing)
taxi_data_raw = load_taxi_data()

# Univariate EDA of the Taxi Data

## General Exploration

In [None]:
taxi_data_raw.info()

- 1458644 rows
- 11 columns
- object: id, pickup_datetime, dropoff_datetime, store_and_fwd_flag
- int64: vendor_id, passenger_count, trip_duration
- float64: pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude
- no missing values/categories

In [None]:
taxi_data_raw.isna().sum()

- no NaNs

In [None]:
taxi_data_raw.nunique()

- id can be used for unique ID
- two unique vendors, which seems all right
- ten unique passenger_count values
- two unique store_and_fwd_flag matches with description

In [None]:
taxi_data_raw[taxi_data_raw.duplicated()]

- no duplicated rows

## Exploration of vendor_id

In [None]:
# Vendor ID
taxi_data_raw['vendor_id'].value_counts()

In [None]:
# vendor_id is a categorical code, so show one bar per vendor (as done for the other
# categorical columns) instead of a binned histogram over the numeric range.
taxi_data_raw['vendor_id'].value_counts().sort_index().plot(kind='bar')

- Only two unique values: 1 and 2.
- Frequencies are relatively balanced.
- No missing values or anomalies detected.

## Exploration of pickup_datetime & dropoff_datetime

pickup_datetime

In [None]:
taxi_data_raw['pickup_datetime'] = pd.to_datetime(taxi_data_raw['pickup_datetime'])

- transform object dtype into pd_datetime

In [None]:
taxi_data_raw['pickup_datetime'].astype(str).str.match(
    r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$').all()

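- check whether every value matches the `YYYY-MM-DD HH:MM:SS` pattern (note: this runs after the `pd.to_datetime` conversion, so it checks the re-rendered timestamps rather than the raw CSV strings)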

In [None]:
taxi_data_raw['pickup_datetime'].describe()

dropoff_datetime

In [None]:
taxi_data_raw['dropoff_datetime'] = pd.to_datetime(taxi_data_raw['dropoff_datetime'])

- transform object dtype into pd_datetime

In [None]:
taxi_data_raw['dropoff_datetime'].astype(str).str.match(
    r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$').all()

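- check whether every value matches the `YYYY-MM-DD HH:MM:SS` pattern (note: this runs after the `pd.to_datetime` conversion, so it checks the re-rendered timestamps rather than the raw CSV strings)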

In [None]:
taxi_data_raw['dropoff_datetime'].describe()

In [None]:
(taxi_data_raw['dropoff_datetime'] < taxi_data_raw['pickup_datetime']).sum()

- no negative durations

In [None]:
# Basic datetime features, all derived from the pickup timestamp and added as new columns
# (requires pickup_datetime to already be a datetime column, otherwise `.dt` fails).
taxi_data_raw['pickup_hour_of_day'] = taxi_data_raw['pickup_datetime'].dt.hour
# pandas convention for dayofweek: Monday=0 ... Sunday=6
taxi_data_raw['pickup_day_of_week'] = taxi_data_raw['pickup_datetime'].dt.dayofweek
taxi_data_raw['pickup_month'] = taxi_data_raw['pickup_datetime'].dt.month

# Day of year and hour of year (for merging with weather)
taxi_data_raw['day_of_year'] = taxi_data_raw['pickup_datetime'].dt.dayofyear
# hour_of_year is a 0-based hour index within the year: day 1, hour 0 -> 0
taxi_data_raw['hour_of_year'] = ((taxi_data_raw['day_of_year'] - 1) * 24 +
                                 taxi_data_raw['pickup_hour_of_day'])

In [None]:
# Number of rides per calendar day, counted once by pickup date and once by dropoff date
pickup_counts = taxi_data_raw['pickup_datetime'].dt.date.value_counts().sort_index()
dropoff_counts = taxi_data_raw['dropoff_datetime'].dt.date.value_counts().sort_index()

# Draw both daily series on one shared Axes
fig, ax = plt.subplots(figsize=(12, 5))
pickup_counts.plot(ax=ax, label="Pickups", color='green', alpha=0.6)
dropoff_counts.plot(ax=ax, label="Dropoffs", color='blue', alpha=0.6)

ax.set(title="Daily Pickup and Dropoff Counts", xlabel="Date", ylabel="Number of Rides")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
taxi_data_raw["pickup_hour_of_day"].value_counts().sort_index().plot(kind='bar')

- peak from 18 to 22
- trough from 1 to 6

In [None]:
taxi_data_raw['pickup_day_of_week'].value_counts().sort_index().plot(kind='bar')

In [None]:
taxi_data_raw['pickup_day_of_week'].value_counts().sort_index()

- trough on Sunday
- peak on Friday

In [None]:
taxi_data_raw['pickup_month'].value_counts().sort_index().plot(kind='bar')

In [None]:
taxi_data_raw['pickup_month'].value_counts().sort_index()

- trough in January
- peak in March

In [None]:
taxi_data_raw.groupby(['pickup_datetime']).size().sort_values(ascending=False).head()

- could be repeated records

In [None]:
# Mean trip_duration for each pickup hour of the day, drawn as a line plot
taxi_data_raw.groupby('pickup_hour_of_day')['trip_duration'].mean().plot()

- aligns with expected rush hour effects

## Exploration of passenger_count

Findings
- The majority of trips have 1 passenger; distribution is heavily right-skewed
- Edge values observed: 60 trips with 0 passengers, and very few with 7–9 passengers
- These edge values are likely invalid or extremely rare
- Trips with 0 passengers show the highest average trip duration, which is implausible and suggests data entry or system error
- Passenger counts 7–9 are extremely rare and may suffer from sampling bias or logging issues
- These edge cases collectively make up a very small share of the data (<0.01%)

In [None]:
taxi_data_raw["passenger_count"].describe()

In [None]:
# Passenger count
taxi_data_raw['passenger_count'].value_counts().sort_index()

- 60 with passenger count 0
- 5 with passenger count > 6

In [None]:
taxi_data_raw['passenger_count'].value_counts().sort_index().plot(kind='bar')

In [None]:
taxi_data_raw.groupby('passenger_count')['trip_duration'].mean().plot(kind='bar')

- Passenger count = 0 has the highest average trip duration, which is not plausible, indicating likely data entry or logging errors

In [None]:
# Stacked bars: trips per pickup hour, split by passenger_count
pd.crosstab(
    taxi_data_raw['pickup_hour_of_day'],
    taxi_data_raw['passenger_count'],
).plot(kind='bar', stacked=True, figsize=(12, 5))

- Passenger count = 1 dominates at all hours
- Group trips (2–4 passengers) are more frequent in the evening and late night

In [None]:
edge_cases = taxi_data_raw['passenger_count'].isin([0, 7, 8, 9])
edge_cases.mean()

- 0.0045% of trips are likely outliers or errors

In [None]:
# View rows with passenger_count == 0
zero_passenger_trips = taxi_data_raw[taxi_data_raw['passenger_count'] == 0]

zero_passenger_trips[['pickup_datetime', 'dropoff_datetime', 'trip_duration']].describe()

## Explore pickup_longitude, pickup_latitude & dropoff_longitude, dropoff_latitude

In [None]:
# Summary statistics for coordinates
taxi_data_raw[
  ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']].describe()

In [None]:
# Rows where any pickup or dropoff coordinate lies outside the bounding box
# latitude 40.47..41.0 / longitude -74.3..-73.6 (the same limits the map plot zooms to).
taxi_data_raw[
  (taxi_data_raw['pickup_latitude'] < 40.47) | (taxi_data_raw['pickup_latitude'] > 41.0) |
  (taxi_data_raw['dropoff_latitude'] < 40.47) | (taxi_data_raw['dropoff_latitude'] > 41.0) |
  (taxi_data_raw['pickup_longitude'] < -74.3) | (taxi_data_raw['pickup_longitude'] > -73.6) |
  (taxi_data_raw['dropoff_longitude'] < -74.3) | (taxi_data_raw['dropoff_longitude'] > -73.6)]

- Some trip coordinates lie outside NYC bounds
- Some points lie far outside the visible city bounds — likely due to GPS glitches or data corruption.

In [None]:
# Share of values with at most three decimals (i.e. values that look rounded).
# Compared numerically because str(float) never keeps trailing zeros, so a string
# test such as endswith('000') can never match.
(taxi_data_raw['pickup_longitude'] == taxi_data_raw['pickup_longitude'].round(3)).mean()

In [None]:
# Share of values with at most three decimals (i.e. values that look rounded).
# Compared numerically because str(float) never keeps trailing zeros, so a string
# test such as endswith('000') can never match.
(taxi_data_raw['pickup_latitude'] == taxi_data_raw['pickup_latitude'].round(3)).mean()

In [None]:
# Share of values with at most three decimals (i.e. values that look rounded).
# Compared numerically because str(float) never keeps trailing zeros, so a string
# test such as endswith('000') can never match.
(taxi_data_raw['dropoff_latitude'] == taxi_data_raw['dropoff_latitude'].round(3)).mean()

In [None]:
# Share of values with at most three decimals (i.e. values that look rounded).
# Compared numerically because str(float) never keeps trailing zeros, so a string
# test such as endswith('000') can never match.
(taxi_data_raw['dropoff_longitude'] == taxi_data_raw['dropoff_longitude'].round(3)).mean()

- no rounding issues

### Geographic distribution

In [None]:
# Overlay pickup and dropoff points on one map-like scatter, zoomed to the city bounding box
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(
    taxi_data_raw['pickup_longitude'], taxi_data_raw['pickup_latitude'],
    s=0.5, alpha=0.1, label='pickup',
)
ax.scatter(
    taxi_data_raw['dropoff_longitude'], taxi_data_raw['dropoff_latitude'],
    s=0.5, alpha=0.1, label='dropoff',
)
ax.set_xlim(-74.3, -73.6)
ax.set_ylim(40.47, 41.0)
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.legend(loc='upper right')
ax.set_title("Geographic Distribution of Taxi Pickups and Dropoffs")
ax.grid(True)
fig.tight_layout()
plt.show()

- Most pickups and dropoffs are located within the NYC area (Manhattan, Brooklyn, Queens).
- Clear density clusters appear around Midtown and Downtown Manhattan.

In [None]:
taxi_data_raw[
  (taxi_data_raw['pickup_latitude'].round(5) == taxi_data_raw['dropoff_latitude'].round(5)) &
  (taxi_data_raw['pickup_longitude'].round(5) == taxi_data_raw['dropoff_longitude'].round(5)) &
  (taxi_data_raw['trip_duration'] > 300)  # more than 5 minutes
  ]

- implausible values

## Explore store_and_fwd_flag

In [None]:
taxi_data_raw['store_and_fwd_flag'].value_counts(normalize=True)

In [None]:
taxi_data_raw['store_and_fwd_flag'].value_counts().sort_index().plot(kind='bar')

- store_and_fwd_flag = 'Y' is rare and likely not predictive on its own

## Exploration of trip_duration

In [None]:
# Summary statistics for trip duration
taxi_data_raw['trip_duration'].describe()

In [None]:
# Check for extreme trip durations
taxi_data_raw[taxi_data_raw['trip_duration'] > 3 * 3600]

In [None]:
taxi_data_raw[taxi_data_raw['trip_duration'] < 60]  # < 1 minute

- Several trips last longer than 3 hours (the threshold checked above), which is highly unlikely for intra-city NYC taxi rides. A few trips are also shorter than 1 minute, which may indicate errors or missing data.

In [None]:
import seaborn as sns

sns.boxplot(x=taxi_data_raw['trip_duration'])

In [None]:
# Trip duration in minutes
taxi_data_raw['trip_duration_min'] = taxi_data_raw['trip_duration'] / 60

In [None]:
from numpy import arctan2, cos, maximum, minimum, radians, sin, sqrt


def haversine(lat1, lon1, lat2, lon2):
  """Great-circle distance in kilometres between two points given in degrees.

  Works element-wise, so the arguments may be scalars, NumPy arrays or pandas Series.

  Args:
    lat1, lon1: latitude and longitude of the first point, in degrees.
    lat2, lon2: latitude and longitude of the second point, in degrees.

  Returns:
    The distance in km on a sphere of radius 6371 km, with the same shape as the inputs.
  """
  r = 6371  # Earth's radius in km
  lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
  dlat = lat2 - lat1
  dlon = lon2 - lon1

  a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
  # Mathematically 0 <= a <= 1 for valid latitudes, but rounding (near-antipodal points) or
  # out-of-range latitudes can push it just outside; clamp so the square roots stay real.
  a = minimum(1.0, maximum(0.0, a))
  c = 2 * arctan2(sqrt(a), sqrt(1 - a))

  return r * c


# Apply to the dataset
taxi_data_raw['haversine_km'] = haversine(
    taxi_data_raw['pickup_latitude'], taxi_data_raw['pickup_longitude'],
    taxi_data_raw['dropoff_latitude'], taxi_data_raw['dropoff_longitude']
)

In [None]:
taxi_data_raw.plot.scatter(x='haversine_km', y='trip_duration_min', alpha=0.3)

- Long durations at very short distances
- Long distances with short durations

In [None]:
# Filter for clearer visualization
filtered_data = taxi_data_raw[
  (taxi_data_raw['haversine_km'] <= 20) &
  (taxi_data_raw['trip_duration_min'] <= 60)
  ]

# Plot
plt.figure(figsize=(10, 5))
plt.scatter(filtered_data['haversine_km'], filtered_data['trip_duration_min'], alpha=0.1, s=1)
plt.xlabel("Haversine Distance (km)")
plt.ylabel("Trip Duration (min)")
plt.title("Trip Duration vs. Haversine Distance (Filtered: ≤20 km & ≤60 min)")
plt.grid(True)
plt.tight_layout()
plt.show()

- For trips under ~2 km, durations vary widely
- Some points still lie far above the trend line

In [None]:
# Histogram (capped at 100 minutes for clarity)
plt.figure(figsize=(8, 4))
plt.hist(taxi_data_raw['trip_duration_min'], bins=100, range=(0, 100), edgecolor='black')
plt.title("Distribution of Trip Duration (minutes)")
plt.xlabel("Trip Duration [min]")
plt.ylabel("Frequency")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
import numpy as np

# Log-transformed trip duration
taxi_data_raw['trip_duration_log'] = np.log1p(taxi_data_raw['trip_duration'])

plt.figure(figsize=(8, 4))
plt.hist(taxi_data_raw['trip_duration_log'], bins=100, edgecolor='black')
plt.title("Log-Transformed Trip Duration")
plt.xlabel("log(1 + trip_duration [s])")
plt.ylabel("Frequency")
plt.grid(True)
plt.tight_layout()
plt.show()

- log1p(trip_duration) helps stabilize variance, handle skewness, and improve model performance

# Save Taxi Data

In [None]:
taxi_data_raw.to_csv("data/taxi_data_clean.csv", index=False)