# Load data

In [None]:
import matplotlib.pyplot as plt
# Load packages
import pandas as pd
from sklearn.model_selection import train_test_split

from features_engineering import add_weather_features


In [None]:
# Load the combined taxi and weather data from a CSV file.
# The path is relative, so it resolves against the kernel's working directory.
taxi_weather_data_raw = pd.read_csv("data/taxi_weather_data_raw.csv")

In [None]:
# Overview of the raw frame: columns, dtypes and non-null counts.
taxi_weather_data_raw.info()

In [None]:
# Peek at the first rows of hour_of_year next to datetime_hour.
taxi_weather_data_raw[['hour_of_year', 'datetime_hour']].head()

In [None]:
# Show datetime_hour next to pickup_datetime for comparison.
# NOTE(review): no .head() here, so the whole two-column frame is handed to the
# display machinery — consider .head() for consistency with the previous cell.
taxi_weather_data_raw[['datetime_hour', 'pickup_datetime']]

In [None]:
# Count missing values per column in the raw frame.
taxi_weather_data_raw.isna().sum()

In [None]:
# NOTE(review): repeats the earlier info() call; no cell in between modifies
# taxi_weather_data_raw, so the output is unchanged — candidate for removal.
taxi_weather_data_raw.info()

In [None]:
# NOTE(review): repeats the earlier isna().sum() call; no cell in between modifies
# taxi_weather_data_raw, so the output is unchanged — candidate for removal.
taxi_weather_data_raw.isna().sum()

# Split train and test set

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(
    taxi_weather_data_raw,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Explore on an independent (deep) copy so train_set itself stays untouched.
taxi_weather_data = train_set.copy(deep=True)

# Seek correlation

## Correlation matrix (trip_duration and trip_duration_log)

In [None]:
# Pairwise correlations between the numeric columns only.
corr_matrix = (
    taxi_weather_data
    .select_dtypes(include='number')
    .corr()
)

In [None]:
# Correlation of every numeric column with trip_duration, strongest positive first.
corr_matrix['trip_duration'].sort_values(ascending=False)

- No strong predictor so far; even distance is only weakly correlated with the raw `trip_duration`
- No weather variable currently shows a significant linear correlation

In [None]:
# Correlation of every numeric column with trip_duration_log, strongest positive first.
corr_matrix['trip_duration_log'].sort_values(ascending=False)

- The log transformation of `trip_duration` reduces its dispersion
- Haversine distance (`hav_dist_km`) – strong correlation: longer distance → longer duration (logarithmically smoothed)
- `pickup_longitude` – possible district effect
- Weather features show hardly any linear effect; they may act non-linearly or only matter at certain times or locations
- Geographical coordinates contribute significantly

## Filter haversine distance in combination with other features

In [None]:
# Restrict to plausible trips before plotting distance against (log) duration:
#   - trip duration 5–120 min, haversine distance 1–20 km
#   - pickup hour 5–18, passenger count 1–6
#   - pickup and dropoff coordinates inside a longitude/latitude bounding box
# NOTE(review): pickup_day_of_week.between(0, 6) spans seven values, so if days
# are coded 0–6 no weekday restriction is applied even though the plot titles
# say "Weekday" — confirm whether between(0, 4) was intended.
filtered = taxi_weather_data[
  (taxi_weather_data['trip_duration_min'].between(5, 120)) &
  (taxi_weather_data['hav_dist_km'].between(1, 20)) &
  (taxi_weather_data['pickup_hour_of_day'].between(5, 18)) &
  (taxi_weather_data['pickup_day_of_week'].between(0, 6)) &
  (taxi_weather_data['passenger_count'].between(1, 6)) &
  (taxi_weather_data['pickup_longitude'].between(-74.3, -73.6)) &
  (taxi_weather_data['dropoff_longitude'].between(-74.3, -73.6)) &
  (taxi_weather_data['pickup_latitude'].between(40.47, 41.0)) &
  (taxi_weather_data['dropoff_latitude'].between(40.47, 41.0))
  ]

# Scatter plot; the y axis is the log-transformed duration, so label it as such.
filtered.plot.scatter(x="hav_dist_km", y="trip_duration_log", alpha=0.7, grid=True)
plt.xlabel("hav_dist_km")
plt.ylabel("trip_duration_log")
plt.title("scatter: Weekday, Distance, Time")
plt.show()

# Hexbin of the same relationship, which shows point density where the scatter overplots.
filtered.plot.hexbin(
    x="hav_dist_km", y="trip_duration_log",
    gridsize=50, cmap='plasma'
)
plt.xlabel("hav_dist_km")
plt.ylabel("trip_duration_log")
plt.title("Hexbin: Weekday, Distance, Time")
plt.grid(True)
plt.show()
# TODO: Add zone-based filtering (e.g. by longitude/latitude clusters or external zone map)

# Imputation of features

In [None]:
# Missing values per column in the training copy, before imputation.
taxi_weather_data.isna().sum()

In [None]:
# Create a new time column from hour_of_year, counted in hours from base_time.
# Vectorised: pd.to_timedelta converts the whole column in one call instead of
# building a pd.Timedelta per row through a Python lambda; missing hours become NaT.
base_time = pd.to_datetime("2016-01-01 00:00:00")
taxi_weather_data['datetime_hour_fallback'] = (
    base_time + pd.to_timedelta(taxi_weather_data['hour_of_year'], unit='h')
)

In [None]:
# Weather columns whose gaps are filled by time-based interpolation.
features_to_impute = [
  'humidity', 'windspeed_kph', 'temp_c', 'pressure_hPa', 'precip_daily_mm', 'daily_snow_mm',
  'precip_mm'
]

# Index by the reconstructed timestamp once, on a separate working frame.
# interpolate(method='time') needs a DatetimeIndex; doing the re-indexing outside
# the loop avoids repeating it per feature and keeps taxi_weather_data intact if
# any single feature raises, so the cell can simply be re-run.
# NOTE(review): rows are in shuffled order after train_test_split; consider
# sort_index() before interpolating if leading gaps stay unfilled — confirm.
time_indexed = taxi_weather_data.set_index('datetime_hour_fallback')

for feature in features_to_impute:
  print(f"\nImputing {feature}...")
  before = time_indexed[feature]
  after = before.interpolate(method='time')
  time_indexed[feature] = after

  # Compare distribution summaries to see how much the imputation shifted the data.
  print("mean before:", before.mean())
  print("mean after:", after.mean())
  print("std deviation before:", before.std())
  print("std deviation after:", after.std())

# Restore datetime_hour_fallback as a regular (first) column with a fresh RangeIndex,
# matching the state the per-feature set_index/reset_index round trip produced.
taxi_weather_data = time_indexed.reset_index()

In [None]:
# Re-check missing values per column after the interpolation step.
taxi_weather_data.isna().sum()

## Classify Weather Conditions

In [None]:
# add_weather_features is imported from the project's features_engineering module;
# its result replaces taxi_weather_data.
# NOTE(review): re-running this cell feeds the already-augmented frame back into
# the helper — confirm that add_weather_features is safe to apply twice.
taxi_weather_data = add_weather_features(taxi_weather_data)

In [None]:
# Missing values per column after add_weather_features has been applied.
taxi_weather_data.isna().sum()