# Data Preprocessing: Trip Duration

This notebook defines the code and helper functions that perform data preprocessing (feature engineering and data cleaning).

In [None]:
%%time

# Import Standard Libraries
import os
import sys
import warnings
# warnings.filterwarnings("ignore")

# Import Data Handling Libraries
import pandas as pd
import numpy as np
np.random.seed(42)

# Import Date-Time Handling Libraries
from datetime import timedelta
import datetime as dt

# Import Geodetic Libraries
import pyproj
from pyproj import Geod

# Import Data Visualization Libraries
import matplotlib
matplotlib.rcParams["font.size"] = 12
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = [12, 12]  # Set default figure size
import seaborn as sns

# Import Machine Learning Libraries
from sklearn.decomposition import PCA  # Principal Component Analysis

# Set random seed for reproducibility in scikit-learn
from sklearn.utils import check_random_state
rng = check_random_state(42)

# Import Utilities
import gc
from tqdm import tqdm
import joblib

# Import Custom Modules
from data_loader import *  # Custom data loading functions

In [None]:
%%time

# Load the training dataset.
# NOTE(review): `load_data` is not defined in this notebook; it presumably
# comes from the `from data_loader import *` star import — confirm.
df_train = load_data("train")
df_train.head()  # Preview the first rows

In [None]:
%%time

# Load the test dataset.
# NOTE(review): `load_data` is not defined in this notebook; it presumably
# comes from the `from data_loader import *` star import — confirm.
df_test = load_data("test")
df_test.head()  # Preview the first rows

## Feature Extraction

### PCA in Longitudes & Latitudes

In [None]:
%%time

def apply_pca_to_coords(train, test, random_seed=42):
    """
    Project pickup and dropoff coordinates onto whitened PCA axes.

    A single two-component PCA is fitted on the stacked pickup and dropoff
    (latitude, longitude) pairs of the **training data only**, so no
    information from the test set leaks into the transformation. The same
    fitted model is then applied to both datasets.

    Columns written to each DataFrame:
        pickup_pca0, pickup_pca1, dropoff_pca0, dropoff_pca1

    Parameters:
        train (pd.DataFrame): The training dataset. Must contain the columns
            pickup_latitude, pickup_longitude, dropoff_latitude and
            dropoff_longitude.
        test (pd.DataFrame): The testing dataset, with the same four columns.
        random_seed (int): Seed forwarded to PCA as ``random_state``.

    Returns:
        None: Modifies train and test DataFrames in place.
    """
    pickup_cols = ["pickup_latitude", "pickup_longitude"]
    dropoff_cols = ["dropoff_latitude", "dropoff_longitude"]

    # Fit PCA only on training data (pickups and dropoffs stacked row-wise)
    coords_train = np.vstack((
        train[pickup_cols].values,
        train[dropoff_cols].values,
    ))

    # scikit-learn's keyword is `random_state`; passing `random_seed` to the
    # PCA constructor raises TypeError.
    pca = PCA(whiten=True, random_state=random_seed).fit(coords_train)

    for frame in (train, test):
        for prefix, cols in (("pickup", pickup_cols), ("dropoff", dropoff_cols)):
            # Transform once per coordinate pair and reuse both components.
            # Plain arrays are passed because the model was fitted on a plain
            # array; this avoids scikit-learn's feature-name mismatch warning.
            projected = pca.transform(frame[cols].values)
            frame.loc[:, prefix + "_pca0"] = projected[:, 0]
            frame.loc[:, prefix + "_pca1"] = projected[:, 1]

# Add the PCA coordinate columns to df_train and df_test in place
# (the PCA itself is fitted on df_train only).
apply_pca_to_coords(df_train, df_test, random_seed=42)

### Distance

In [None]:
%%time

# Define WGS84 ellipsoid
geod = Geod(ellps="WGS84")

# Compute the pickup -> dropoff geodesic distance on the WGS84 ellipsoid, in
# kilometers, for both datasets. The column keeps its historical name
# "euclidean_distance" even though the value is a geodesic distance.
for _frame in (df_train, df_test):
    # Geod.inv takes (lons1, lats1, lons2, lats2) and returns
    # (forward azimuth, back azimuth, distance in meters). It accepts whole
    # arrays, so one call per frame replaces a Python-level call per row.
    _distance_m = geod.inv(
        _frame["pickup_longitude"].to_numpy(),
        _frame["pickup_latitude"].to_numpy(),
        _frame["dropoff_longitude"].to_numpy(),
        _frame["dropoff_latitude"].to_numpy(),
    )[2]
    _frame["euclidean_distance"] = _distance_m / 1000

### Datetime Feature

In [None]:
%%time

def generate_datetime_features(df):
    """
    Generate detailed date-time features for pickups and modify the DataFrame in place.

    Columns written:
        pickup_hour_of_day  - fractional hour of the pickup (hour + minute / 60)
        dropoff_hour_of_day - pickup_hour_of_day plus the trip duration in hours;
                              only added when the "trip_duration [min]" column
                              exists, and not wrapped at 24
        day_of_week         - weekday number of the pickup (Monday = 0)
        hour_of_week        - day_of_week * 24 + pickup_hour_of_day
        month_of_year       - calendar month of the pickup
        day_of_year         - day number within the year (1-based)
        week_of_year        - ISO calendar week number
        hour_of_year        - day_of_year * 24 + pickup_hour_of_day

    Parameters:
        df (pd.DataFrame): The DataFrame containing the "pickup_datetime"
            column, in any format ``pd.to_datetime`` can parse. The
            "trip_duration [min]" column is optional.

    Returns:
        None (Modifies df in place)
    """
    # Convert to datetime format
    pickup_times = pd.to_datetime(df["pickup_datetime"])

    # Fractional hour of day, e.g. 17:30 -> 17.5
    df["pickup_hour_of_day"] = (pickup_times.dt.hour * 60.0 + pickup_times.dt.minute) / 60.0

    # The duration column is not guaranteed to be present (e.g. a dataset
    # without the target), so the dropoff feature is skipped instead of
    # raising KeyError.
    # NOTE(review): this feature is derived from the trip duration itself;
    # confirm it is not used as a model input when predicting duration.
    if "trip_duration [min]" in df.columns:
        df["dropoff_hour_of_day"] = df["pickup_hour_of_day"] + df["trip_duration [min]"] / 60.0

    df["day_of_week"] = pickup_times.dt.weekday
    df["hour_of_week"] = df["day_of_week"] * 24.0 + df["pickup_hour_of_day"]

    df["month_of_year"] = pickup_times.dt.month
    df["day_of_year"] = pickup_times.dt.dayofyear
    df["week_of_year"] = pickup_times.dt.isocalendar().week
    df["hour_of_year"] = df["day_of_year"] * 24.0 + df["pickup_hour_of_day"]

# Add the datetime feature columns to both datasets in place.
# NOTE(review): df_test goes through the same function as df_train; confirm
# it provides every column that function reads.
generate_datetime_features(df_train)
generate_datetime_features(df_test)

### Temporal & Geospatial Aggregation

### NYC Weather

## Data Cleaning

### Location Outlier

In [None]:
%%time

def filter_by_nyc_boundary(df, geojson_path):
    """
    Keep only trips whose pickup and dropoff both fall inside the bounding box
    of the GeoJSON feature(s) named "New York".

    The filter is a rectangular bounding-box test (inclusive on all four
    sides), not a point-in-polygon test against the actual boundary.

    NOTE(review): the box covers every feature whose NAME property is
    "New York". If the GeoJSON holds state-level boundaries, that is the whole
    state rather than New York City — confirm the file matches the intent.

    Parameters:
        df (pd.DataFrame): The DataFrame containing pickup and dropoff
            coordinates (pickup_longitude, pickup_latitude,
            dropoff_longitude, dropoff_latitude).
        geojson_path (str): Path to the GeoJSON file defining the boundaries.

    Returns:
        pd.DataFrame: The rows of df whose pickup and dropoff coordinates are
        both inside the bounding box.

    Raises:
        ValueError: If no feature named "New York" with coordinates is found.
    """
    # Local import: `json` is not among the notebook's top-level imports.
    import json

    # Load the GeoJSON file
    with open(geojson_path, "r") as f:
        geojson_data = json.load(f)

    # Collect every boundary point of the features where NAME is "New York"
    boundary_points = []
    for feature in geojson_data["features"]:
        if feature["properties"].get("NAME") != "New York":
            continue
        geometry = feature["geometry"]
        coordinates = geometry["coordinates"]
        # A Polygon is a list of rings, a MultiPolygon a list of polygons;
        # wrap a Polygon so both can be walked with the same two loops.
        polygons = [coordinates] if geometry.get("type") == "Polygon" else coordinates
        for polygon in polygons:
            for ring in polygon:
                boundary_points.extend(ring)

    if not boundary_points:
        raise ValueError(
            'No feature with NAME "New York" and coordinates found in ' + str(geojson_path)
        )

    # GeoJSON positions are [longitude, latitude, (optional altitude)], so
    # index instead of unpacking to tolerate a third element.
    longitudes = [point[0] for point in boundary_points]
    latitudes = [point[1] for point in boundary_points]
    min_long, max_long = min(longitudes), max(longitudes)
    min_lat, max_lat = min(latitudes), max(latitudes)

    # Filter data based on bounding box
    mask = (
        (df["pickup_longitude"].between(min_long, max_long))
        & (df["pickup_latitude"].between(min_lat, max_lat))
        & (df["dropoff_longitude"].between(min_long, max_long))
        & (df["dropoff_latitude"].between(min_lat, max_lat))
    )

    return df[mask]


# Apply the boundary filter to df_train and df_test, rebinding both names to
# the filtered frames.
# NOTE(review): this also drops rows from df_test; confirm that predictions
# are not required for every test row.
df_train = filter_by_nyc_boundary(df_train, "utils/gz_2010_us_040_00_5m.json")
df_test = filter_by_nyc_boundary(df_test, "utils/gz_2010_us_040_00_5m.json")

### Duration Outlier

### Speed Outlier

---