# NYC Taxi Data - Exploratory Data Analysis (EDA)

This notebook explores the NYC Taxi dataset to understand demand patterns, seasonality, and anomalies before modeling.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose

# Notebook-wide plotting defaults: wide figures suit long hourly series,
# and the seaborn grid style keeps value axes easy to read.
plt.rcParams.update({'figure.figsize': (15, 6)})
sns.set_style('whitegrid')

## 1. Load Data

In [None]:
# Load the processed hourly demand table, using the parsed `timestamp` column as the index.
try:
    df = pd.read_csv('../data/processed/hourly_demand.csv', index_col='timestamp', parse_dates=True)
except FileNotFoundError as err:
    # Fail here with an actionable message. Merely printing would let the cell
    # "succeed", after which every later cell either dies with a NameError on
    # `df` or silently reuses a stale `df` left over from an earlier run.
    raise FileNotFoundError(
        "Data not found! Please run the download and preprocessing scripts first."
    ) from err

print(f"Loaded {len(df)} hourly records")
# Bare last expression so the notebook renders the preview as a rich table.
df.head()

## 2. Time Series Visualization

In [None]:
# Line plot of the full hourly series, drawn through an explicit Axes handle.
demand_fig, demand_ax = plt.subplots(figsize=(15, 6))
demand_ax.plot(df.index, df['trip_count'], alpha=0.8)
demand_ax.set_title('NYC Taxi Hourly Demand', fontsize=14)
demand_ax.set_xlabel('Time')
demand_ax.set_ylabel('Trips')
plt.show()

## 3. Seasonal Decomposition
We decompose the hourly series into trend, seasonal, and residual components using an additive model with a 24-hour (daily) period.

In [None]:
# Additive decomposition; period=24 treats every 24 observations of the
# hourly series as one seasonal cycle.
decomposition = seasonal_decompose(df['trip_count'], model='additive', period=24)

# (attribute on the decomposition result, panel title), in top-to-bottom order.
component_panels = [
    ('observed', 'Observed'),
    ('trend', 'Trend'),
    ('seasonal', 'Seasonal (Daily)'),
    ('resid', 'Residuals'),
]

fig, axes = plt.subplots(nrows=4, ncols=1, figsize=(15, 12))
for panel_ax, (component_attr, panel_title) in zip(axes, component_panels):
    getattr(decomposition, component_attr).plot(ax=panel_ax, title=panel_title)
plt.tight_layout()
plt.show()

## 4. Demand Distribution

In [None]:
# Histogram (50 bins) of hourly trip counts with a KDE overlay.
plt.figure(figsize=(10, 6))
hist_ax = sns.histplot(df['trip_count'], bins=50, kde=True)
hist_ax.set_title('Distribution of Hourly Trip Counts')
hist_ax.set_xlabel('Trips')
plt.show()

## 5. Weekly Patterns
Analyzing demand by day of the week.

In [None]:
# Fixed Monday-to-Sunday ordering so the boxes follow the calendar week
# rather than the order in which day names first appear in the data.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Derive the weekday label from the datetime index and store it on `df`.
df['day_name'] = df.index.day_name()

plt.figure(figsize=(12, 6))
weekday_ax = sns.boxplot(x='day_name', y='trip_count', data=df, order=day_order)
weekday_ax.set_title('Trip Count Distribution by Day of Week')
plt.show()

## 6. Hourly Patterns
Analyzing demand by hour of the day.

In [None]:
# Derive the hour of day (0-23) from the datetime index and store it on `df`.
df['hour'] = df.index.hour

# One box per hour of day, showing how trip counts vary within each hour.
plt.figure(figsize=(12, 6))
hourly_ax = sns.boxplot(x='hour', y='trip_count', data=df)
hourly_ax.set_title('Trip Count Distribution by Hour of Day')
plt.show()