In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import ElasticNet
import joblib
from tqdm import tqdm
import yaml

In [2]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Step up one directory so the relative paths used below (e.g. 'params.yaml')
# resolve from there.
# NOTE(review): the move is relative to the current directory, so running this
# cell a second time in the same kernel climbs one more level.
%cd ..

C:\Yellow Taxi Project\SimpleOps\project


In [4]:
import yaml
import pprint
# Load the pipeline parameters from params.yaml in the current directory.
# The encoding is pinned explicitly: without it, open() falls back to the
# platform's locale encoding, which is not guaranteed to be UTF-8.
with open('params.yaml', encoding='utf-8') as conf_file:
    config = yaml.safe_load(conf_file)

pprint.pprint(config)

{'base': {'random_seed': 42},
 'data': {'null_handled_data_csv': 'data/processed/yellow_tripdata_2023-01_null_handled.csv',
          'processed_data_csv': 'data/processed/yellow_tripdata_2023-01.csv',
          'raw_data_csv': 'data/raw/yellow_tripdata_2023-01.csv',
          'raw_data_parquet': 'data/raw/yellow_tripdata_2023-01.parquet',
          'test_size': 0.2,
          'train_data_csv': 'data/processed/train_data.csv',
          'val_data_csv': 'data/processed/val_data.csv'},
 'features': {'categorical': ['VendorID',
                              'RatecodeID',
                              'store_and_fwd_flag',
                              'payment_type',
                              'PULocationID',
                              'DOLocationID',
                              'time_of_day'],
              'numerical': ['passenger_count',
                            'trip_distance',
                            'fare_amount',
                            'extra',
                 

In [23]:
# Read the null-handled trips CSV named in the config. Both timestamp columns
# are parsed as datetimes because the cells below do datetime arithmetic on them.
df = pd.read_csv(
    config['data']['null_handled_data_csv'],
    parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'],
)

In [24]:
# Target: trip duration in seconds, from the dropoff and pickup timestamps.
df['trip_duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds()

# Keep only trips lasting between 60 s and 7200 s (both bounds included).
# .copy() makes the result an independent frame: later cells add columns to
# `df`, and assigning into a boolean-mask selection can raise pandas'
# SettingWithCopyWarning.
df = df[df['trip_duration'].between(60, 7200)].copy()


In [25]:
# Summary statistics of the target; with the 60-7200 s filter applied,
# min and max should fall inside that range.
df.loc[:, 'trip_duration'].describe()

count    3.030075e+06
mean     8.753121e+02
std      6.571537e+02
min      6.000000e+01
25%      4.350000e+02
50%      6.970000e+02
75%      1.103000e+03
max      7.199000e+03
Name: trip_duration, dtype: float64

In [19]:
# Calendar features derived from the pickup timestamp.
pickup = df['tpep_pickup_datetime'].dt
df['pickup_hour'] = pickup.hour
df['pickup_weekday'] = pickup.dayofweek  # same values as .weekday: Monday=0 ... Sunday=6
df['is_weekend'] = df['pickup_weekday'].ge(5).astype(int)  # Saturday (5) or Sunday (6)
df['pickup_day'] = pickup.day
df['month'] = pickup.month
# Peak flag: pickup hours 7-9 and 16-18, bounds included.
df['is_peak'] = df['pickup_hour'].isin([7, 8, 9, 16, 17, 18]).astype(int)

def time_of_day(hour):
    """Map an hour of the day to a coarse day-part label.

    Returns 'night' for hours below 6, 'morning' below 12, 'afternoon'
    below 18, and 'evening' for everything else (including any value that
    compares False against all three bounds).
    """
    # Bounds are exclusive upper limits, checked in ascending order.
    for upper_bound, label in ((6, 'night'), (12, 'morning'), (18, 'afternoon')):
        if hour < upper_bound:
            return label
    return 'evening'
# Label each trip with its day-part. This has to run while pickup_hour still
# holds clock hours, i.e. before the numerical columns are standardized below.
df['time_of_day'] = df['pickup_hour'].map(time_of_day)

# Candidate model inputs. Categorical columns are label-encoded and numerical
# columns standardized further down in this cell.
categorical_cols = [
    'VendorID',
    'RatecodeID',
    'store_and_fwd_flag',
    'payment_type',
    'PULocationID',
    'DOLocationID',
    'time_of_day',
]
numerical_cols = [
    # trip and fare columns
    'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax',
    'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount',
    'congestion_surcharge',
    # calendar features
    'pickup_hour', 'pickup_weekday', 'is_weekend', 'pickup_day', 'month', 'is_peak',
    # only kept if the frame actually has it (see the filter just below)
    'manhattan_dist',
]

# Drop numerical candidates that are not columns of the frame.
present_columns = set(df.columns)
numerical_cols = [name for name in numerical_cols if name in present_columns]

# Record the final feature lists in the config, creating the 'features'
# section if it does not exist yet.
config.setdefault('features', {}).update(
    numerical=numerical_cols,
    categorical=categorical_cols,
)
# Label-encode every categorical column in place and keep the fitted encoder
# for each one in `label_encoders`, keyed by column name. Values are cast to
# str before fitting.
# NOTE(review): the columns are overwritten with their integer codes, so
# re-running this cell without reloading `df` fits the encoders on codes
# rather than on the original values for any column not recomputed above.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col].astype(str))

# Standardize the numerical feature columns in place, keeping the fitted
# scaler in `scaler`.
# NOTE(review): the scaler is fitted on every row currently in `df`; if a
# train/validation split happens later, confirm that fitting before the split
# is intended.
scaler = StandardScaler().fit(df[numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])

df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,congestion_surcharge,airport_fee,trip_duration,pickup_hour,pickup_weekday,is_weekend,pickup_day,month,is_peak,time_of_day
0,1,2023-01-01 00:32:10,2023-01-01 00:40:36,-0.409545,-0.011623,0,0,65,44,2,...,0.285490,0.000000,506.0,-2.451946,1.505714,1.579736,-1.771680,-0.003181,-0.673544,3
1,1,2023-01-01 00:55:08,2023-01-01 01:01:27,-0.409545,-0.011105,0,0,197,149,1,...,0.285490,0.000000,379.0,-2.451946,1.505714,1.579736,-1.771680,-0.003181,-0.673544,3
2,1,2023-01-01 00:25:04,2023-01-01 00:37:49,-0.409545,-0.005486,0,0,202,150,1,...,0.285490,0.000000,765.0,-2.451946,1.505714,1.579736,-1.771680,-0.003181,-0.673544,3
3,0,2023-01-01 00:03:48,2023-01-01 00:13:25,-1.538708,-0.007917,0,0,39,228,1,...,-3.117608,1.250000,577.0,-2.451946,1.505714,1.579736,-1.771680,-0.003181,-0.673544,3
4,1,2023-01-01 00:10:29,2023-01-01 00:21:19,-0.409545,-0.009790,0,0,6,238,1,...,0.285490,0.000000,650.0,-2.451946,1.505714,1.579736,-1.771680,-0.003181,-0.673544,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3066761,1,2023-01-31 23:58:34,2023-02-01 00:12:33,-0.000187,-0.003334,1,0,6,204,0,...,-0.021836,0.107409,839.0,1.537105,-1.007317,-0.633017,1.676434,-0.003181,-0.673544,1
3066762,1,2023-01-31 23:31:09,2023-01-31 23:50:36,-0.000187,0.007624,1,0,11,234,0,...,-0.021836,0.107409,1167.0,1.537105,-1.007317,-0.633017,1.676434,-0.003181,-0.673544,1
3066763,1,2023-01-31 23:01:05,2023-01-31 23:25:36,-0.000187,0.003121,1,0,13,151,0,...,-0.021836,0.107409,1471.0,1.537105,-1.007317,-0.633017,1.676434,-0.003181,-0.673544,1
3066764,1,2023-01-31 23:40:00,2023-01-31 23:53:00,-0.000187,-0.002936,1,0,141,238,0,...,-0.021836,0.107409,780.0,1.537105,-1.007317,-0.633017,1.676434,-0.003181,-0.673544,1


In [22]:
# Summary statistics of the trip_duration target (seconds).
df.loc[:, 'trip_duration'].describe()

count    3.033431e+06
mean     9.502748e+02
std      2.567825e+03
min      6.000000e+01
25%      4.350000e+02
50%      6.980000e+02
75%      1.105000e+03
max      6.017510e+05
Name: trip_duration, dtype: float64