# Load data

In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor

import config as config
import features.taxi_config as tc
import modelling as ml

In [5]:
# Read the merged taxi-trip + weather table from the path configured in `config`.
# NOTE(review): the isna() listing further down shows an 'Unnamed: 0' column, which
# looks like a leftover index from an upstream CSV write — confirm and drop upstream.
taxi_weather_data = pd.read_csv(filepath_or_buffer=config.TAXI_WEATHER_DATA_SAVE)

In [6]:
# Print the column overview of the loaded frame (non-null count and dtype per column).
taxi_weather_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 33 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   id                  1458644 non-null  object 
 1   vendor_id           1458644 non-null  int64  
 2   pickup_datetime     1458644 non-null  object 
 3   dropoff_datetime    1458644 non-null  object 
 4   passenger_count     1458644 non-null  int64  
 5   pickup_longitude    1458644 non-null  float64
 6   pickup_latitude     1458644 non-null  float64
 7   dropoff_longitude   1458644 non-null  float64
 8   dropoff_latitude    1458644 non-null  float64
 9   store_and_fwd_flag  1458644 non-null  object 
 10  trip_duration       1458644 non-null  int64  
 11  pickup_hour         1458644 non-null  int64  
 12  pickup_weekday      1458644 non-null  object 
 13  pickup_month        1458644 non-null  int64  
 14  hour_of_year        1458644 non-null  int64  
 15  trip_duration_m

In [7]:
# Number of missing values in each column of the loaded frame.
missing_per_column = taxi_weather_data.isna().sum()
missing_per_column

id                         0
vendor_id                  0
pickup_datetime            0
dropoff_datetime           0
passenger_count            0
pickup_longitude           0
pickup_latitude            0
dropoff_longitude          0
dropoff_latitude           0
store_and_fwd_flag         0
trip_duration              0
pickup_hour                0
pickup_weekday             0
pickup_month               0
hour_of_year               0
trip_duration_min          0
trip_duration_log          0
hav_dist_km                0
Unnamed: 0                 0
datetime_hour              0
humidity                   0
fog                        0
rain                       0
snow                       0
conditions                 0
datetime                   0
hour_of_day                0
temp_c                     0
windspeed_kph         271618
precip_mm                  0
pressure_hPa            7585
precip_daily_mm            0
daily_snow_mm              0
dtype: int64

# split train and test set

In [None]:
# Hold out a test set; size and seed both come from the shared project config.
train_set, test_set = ml.split_train_test(
    taxi_weather_data,
    test_size=config.TEST_SIZE,
    random_state=config.RANDOM_STATE,
)

# Multi EDA

Explore numerical Rain Features

In [None]:
# Correlation of the numeric rain features with the log-transformed target.
# Computed on the training split only, so the held-out test rows do not inform the EDA
# (the frame was already split above; the full table would include the test rows).
# NOTE(review): 'rain_code' is not among the columns listed by the isna() output
# above — confirm the saved dataset actually contains it before running this cell.
rain_features = ['precip_mm', 'precip_daily_mm', 'rain_code']
(
    train_set[rain_features + ['trip_duration_log']]
    .corr()['trip_duration_log']
    .sort_values()
)

Explore categorical Rain Features

In [None]:
# Mean log trip duration per rain class, as a bar chart.
# Uses the training split only so the held-out test rows stay unseen during EDA.
# NOTE(review): 'rain_class' is not among the columns listed by the isna() output
# above — confirm the saved dataset actually contains it before running this cell.
mean_log_duration_by_rain = train_set.groupby('rain_class')['trip_duration_log'].mean()
ax = mean_log_duration_by_rain.plot.bar(title='trip_duration_log by rain_class')
ax.set_ylabel('mean trip_duration_log');

In [None]:
# Rank the rain features by random-forest importance for predicting the log target.
# RandomForestRegressor is imported in the notebook's top import cell.
# Fitted on the training split only, so the held-out test rows do not leak into the EDA.
features = ['precip_mm', 'precip_daily_mm', 'rain_code']  # + other useful ones
X = train_set[features]
y = train_set['trip_duration_log']

# Seed comes from the shared config (as for the train/test split) instead of a
# hard-coded 42. n_jobs=-1 builds the trees on all cores; the fitted forest is still
# determined by random_state.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=config.RANDOM_STATE,
    n_jobs=-1,
)
model.fit(X, y)

# One importance value per feature, largest first.
importances = pd.Series(model.feature_importances_, index=features)
importances.sort_values(ascending=False).plot(kind='barh', title='Feature Importance');

In [None]:
# How strongly do the raw precipitation amount and the rain code overlap?
# Evaluated on the training split only, like the correlation analysis below,
# so the held-out test rows stay unseen during EDA.
train_set[['precip_mm', 'rain_code']].corr()

# Seek correlation

## correlation matrix (trip_dur and trip_dur_log)

In [None]:
# Pairwise correlations between all numeric columns of the training split.
numeric_train = train_set.select_dtypes(include='number')
corr_matrix = numeric_train.corr()

In [None]:
# Correlation of every numeric column with the raw trip duration, largest first.
duration_corr = corr_matrix['trip_duration']
duration_corr.sort_values(ascending=False)

In [None]:
# Standard deviation of the raw trip duration in the training split.
train_set['trip_duration'].std()

- No strong linear predictor of the raw `trip_duration` so far; even distance is only weakly correlated.
- No weather variable currently shows a significant linear correlation.

In [None]:
# Correlation of every numeric column with the log-transformed duration, largest first.
log_duration_corr = corr_matrix['trip_duration_log']
log_duration_corr.sort_values(ascending=False)

In [None]:
# Standard deviation of the log-transformed trip duration in the training split.
train_set['trip_duration_log'].std()

- The log transformation of `trip_duration` has reduced the dispersion.
- `hav_dist_km` – strong correlation: longer distance → longer duration (logarithmically smoothed).
- `pickup_longitude` – possible district effect.
- Weather features show hardly any linear effect; they may act non-linearly or matter only at certain times or locations.
- Geographical coordinates contribute significantly.

# Filter haversine in combination

In [None]:
# Restrict the training split to "typical" trips; a row is kept only if every
# criterion below holds, then distance is plotted against the log duration.
# NOTE(review): the thresholds were referenced as bare names that no visible cell
# defines (NameError on Restart & Run All). They are assumed to live in
# features.taxi_config, imported as `tc` in the top cell and otherwise unused — confirm.
typical_trip_mask = (
    train_set['trip_duration_min'].between(*tc.TYPICAL_DURATION_MIN)
    & train_set['hav_dist_km'].between(*tc.TYPICAL_DISTANCE_KM)
    & train_set['pickup_hour'].between(*tc.TYPICAL_HOURS)
    & train_set['pickup_weekday'].isin(tc.TYPICAL_WEEKDAYS)
    & train_set['passenger_count'].between(*tc.TYPICAL_PASSENGERS)
    & train_set['pickup_longitude'].between(*tc.LON_RANGE)
    & train_set['dropoff_longitude'].between(*tc.LON_RANGE)
    & train_set['pickup_latitude'].between(*tc.LAT_RANGE)
    & train_set['dropoff_latitude'].between(*tc.LAT_RANGE)
)
filtered = train_set[typical_trip_mask]

# Scatter plot of the filtered trips. The y axis is the log-transformed duration,
# so it is labelled with the plotted column rather than "Trip Duration".
ax = filtered.plot.scatter(x="hav_dist_km", y="trip_duration_log", alpha=0.7, grid=True)
ax.set(
    xlabel="hav_dist_km",
    ylabel="trip_duration_log",
    title="scatter: Weekday, Distance, Time",
)
plt.show()

# Hexbin of the same data: point density stays readable where the scatter overplots.
ax = filtered.plot.hexbin(
    x="hav_dist_km", y="trip_duration_log",
    gridsize=50, cmap='plasma'
)
ax.set(
    xlabel="hav_dist_km",
    ylabel="trip_duration_log",
    title="Hexbin: Weekday, Distance, Time",
)
ax.grid(True)
plt.show()
# TODO: Add zone-based filtering (e.g. by longitude/latitude clusters or external zone map)

# Log transformed features

In [None]:
# log1p-transformed precipitation of the training split (log1p(x) = log(1 + x),
# so the value is defined at x = 0).
precip_log_mm = np.log1p(train_set['precip_mm'])

In [None]:
# Side-by-side histograms of precipitation before and after the log1p transform.
# seaborn (sns) is imported in the notebook's top import cell.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
sns.histplot(train_set['precip_mm'], bins=20, ax=axes[0])
axes[0].set_title("Original")
# precip_log_mm is already log1p-transformed in the previous cell; wrapping it in
# np.log1p again would plot log1p(log1p(x)) under the "log1p" title.
sns.histplot(precip_log_mm, bins=20, ax=axes[1])
axes[1].set_title("log1p");

- The log transform shows no visible impact on the rain (`precip_mm`) distribution.

# Scale haversine km

In [None]:
# Histogram of the raw haversine distance in the training split.
hav_dist_km = train_set['hav_dist_km']
hav_dist_km.plot(kind='hist')

In [None]:
# log1p-transformed haversine distance of the training split (log1p(x) = log(1 + x),
# so the value is defined at x = 0).
dist_km_log = np.log1p(train_set['hav_dist_km'])

In [None]:
# Side-by-side histograms of the haversine distance before and after the log1p transform.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
sns.histplot(train_set['hav_dist_km'], bins=20, ax=axes[0])
axes[0].set_title("Original")
# dist_km_log is already log1p-transformed in the previous cell; wrapping it in
# np.log1p again would plot log1p(log1p(x)) under the "log1p" title.
sns.histplot(dist_km_log, bins=20, ax=axes[1])
axes[1].set_title("log1p");