# Data Preprocessing: Trip Duration

Generate code & functions such that conducts data preprocessing(includes feature engineering & data cleaning).

In [1]:
%%time

# Import Standard Libraries
import os
import sys
import warnings
# warnings.filterwarnings("ignore")

# Import Data Handling Libraries
import pandas as pd
import numpy as np
np.random.seed(42)

# Import Date-Time Handling Libraries
from datetime import timedelta
import datetime as dt

# Import Geodetic Libraries
import pyproj
from pyproj import Geod

# Import Data Visualization Libraries
import matplotlib
matplotlib.rcParams["font.size"] = 12
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = [12, 12]  # Set default figure size
import seaborn as sns

# Import Machine Learning Libraries
from sklearn.decomposition import PCA  # Principal Component Analysis

# Set random seed for reproducibility in scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.utils import check_random_state
rng = check_random_state(42)

# Import Utilities
import time
import gc
from tqdm import tqdm
import joblib
from joblib import Parallel, delayed
import multiprocessing
import json

# Import Custom Modules
from data_loader import *  # Custom data loading functions

CPU times: user 1.49 s, sys: 611 ms, total: 2.1 s
Wall time: 1.46 s


In [2]:
%%time

# Load the dataset
df = pd.read_csv(os.path.join(os.getcwd(), "data", "data.csv"))
df.head()

CPU times: user 2.93 s, sys: 1.06 s, total: 3.98 s
Wall time: 4.01 s


Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   id                  1458644 non-null  object 
 1   vendor_id           1458644 non-null  int64  
 2   pickup_datetime     1458644 non-null  object 
 3   dropoff_datetime    1458644 non-null  object 
 4   passenger_count     1458644 non-null  int64  
 5   pickup_longitude    1458644 non-null  float64
 6   pickup_latitude     1458644 non-null  float64
 7   dropoff_longitude   1458644 non-null  float64
 8   dropoff_latitude    1458644 non-null  float64
 9   store_and_fwd_flag  1458644 non-null  object 
 10  trip_duration       1458644 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 122.4+ MB


In [4]:
%%time

# Delete cols that leads to data leakage
del df["dropoff_datetime"]
gc.collect()

CPU times: user 59.3 ms, sys: 39.8 ms, total: 99.1 ms
Wall time: 98.1 ms


0

In [5]:
%%time

# Define helper function formats time seconds into string
def format_time(seconds):
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    seconds = seconds % 60
    return f"{hours} hour {minutes} min {seconds:.2f} sec"

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 15 µs


## Feature Extraction

### PCA in Longitudes & Latitudes

In [6]:
%%time

def apply_pca_to_coords(df, random_seed=42):
    """
    Applies PCA transformation to pickup and dropoff coordinates for train and test datasets.

    The PCA is fitted **only on the training data** to prevent data leakage.

    Parameters:
        df (pd.DataFrame): The dataset.
        random_seed (int): Random seed for reproducibility.

    Returns:
        None: Modifies train and test DataFrames in place.
    """

    # Fit PCA on data
    coords_train = np.vstack((
        df[["pickup_latitude", "pickup_longitude"]].values,
        df[["dropoff_latitude", "dropoff_longitude"]].values
    ))

    pca = PCA(whiten=True, random_state=random_seed).fit(coords_train)

    # Apply transformation to train dataset
    df_coords_pickup = df[["pickup_latitude", "pickup_longitude"]].values
    df_coords_dropoff = df[["dropoff_latitude", "dropoff_longitude"]].values
    df.loc[:, "pickup_pca0"] = pca.transform(df_coords_pickup)[:, 0]
    df.loc[:, "pickup_pca1"] = pca.transform(df_coords_pickup)[:, 1]
    df.loc[:, "dropoff_pca0"] = pca.transform(df_coords_dropoff)[:, 0]
    df.loc[:, "dropoff_pca1"] = pca.transform(df_coords_dropoff)[:, 1]

# Example usage:
apply_pca_to_coords(df, random_seed=42)

CPU times: user 1.1 s, sys: 1.02 s, total: 2.12 s
Wall time: 477 ms


### Distance

In [7]:
%%time

# Define WGS84 ellipsoid
geod = Geod(ellps="WGS84")

# Compute great-circle distance in kilometers
df["geodesic_distance"] = df.apply(
    lambda row: geod.inv(row["pickup_longitude"], row["pickup_latitude"],
                         row["dropoff_longitude"], row["dropoff_latitude"])[2] / 1000, axis=1
)

CPU times: user 17.8 s, sys: 1.1 s, total: 18.9 s
Wall time: 18.4 s


### Datetime Feature

In [8]:
%%time

def generate_datetime_features(df):
    """
    Generate detailed date-time features for pickups and modify the DataFrame in place.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the datetime column.
    
    Returns:
        None (Modifies df in place)
    """
    # Convert to datetime format
    pickup_times = pd.to_datetime(df["pickup_datetime"])

    # Extract relevant time features as integers
    df["pickup_hour_of_day"] = (pickup_times.dt.hour * 60 + pickup_times.dt.minute) // 60  # Integer division

    df["day_of_week"] = pickup_times.dt.weekday.astype(int)
    df["hour_of_week"] = (df["day_of_week"] * 24 + df["pickup_hour_of_day"]).astype(int)

    df["month_of_year"] = pickup_times.dt.month.astype(int)
    df["day_of_year"] = pickup_times.dt.dayofyear.astype(int)
    df["week_of_year"] = pickup_times.dt.isocalendar().week.astype(int)
    df["hour_of_year"] = (df["day_of_year"] * 24 + df["pickup_hour_of_day"]).astype(int)

generate_datetime_features(df)

CPU times: user 1.12 s, sys: 59.6 ms, total: 1.18 s
Wall time: 1.18 s


### NYC Weather

In [9]:
%%time

def merge_weather_data(df):
    """
    Merges weather data with a given dataframe (train or test) based on the pickup date.
    
    Parameters:
        df (pd.DataFrame): The train or test dataframe containing 'pickup_datetime'.
    
    Returns:
        pd.DataFrame: The merged dataframe with only the intermediate weather data columns.
    """

    # Load NYC weather data to enrich information
    weather_data = pd.read_csv(os.path.join("utils", "weather_data_nyc_centralpark_2016.csv"), low_memory=False)
    weather_data["date"] = pd.to_datetime(weather_data["date"], format="%d-%m-%Y")

    # Ensure datetime consistency
    weather_data["date"] = weather_data["date"].dt.date
    df["pickup_date"] = pd.to_datetime(df["pickup_datetime"]).dt.date

    # Handle trace values in precipitation, snow fall, and snow depth columns
    weather_data["r_depth"] = weather_data["precipitation"].apply(lambda x: 0.01 if x == "T" else float(x))  # rain depth
    weather_data["s_fall"] = weather_data["snow fall"].apply(lambda x: 0.01 if x == "T" else float(x))  # snow fall
    weather_data["s_depth"] = weather_data["snow depth"].apply(lambda x: 0.01 if x == "T" else float(x))  # snow depth

    # Calculate total precipitation, and snow/rain indicators
    weather_data["all_precip"] = weather_data["s_fall"] + weather_data["r_depth"]
    weather_data["has_snow"] = (weather_data["s_fall"] > 0) | (weather_data["s_depth"] > 0)
    weather_data["has_rain"] = weather_data["r_depth"] > 0

    # Copy temperature columns
    weather_data["max_temp"] = weather_data["maximum temperature"]
    weather_data["min_temp"] = weather_data["minimum temperature"]

    # Select only the newly created columns
    weather_data = weather_data[["date", "r_depth", "s_fall", "s_depth", "all_precip", "has_snow", "has_rain", "max_temp", "min_temp"]]

    # Merge the datasets on the date
    df = df.merge(weather_data, left_on="pickup_date", right_on="date", how="left")

    return df

# Apply function to train and test datasets
df = merge_weather_data(df)

CPU times: user 982 ms, sys: 344 ms, total: 1.33 s
Wall time: 1.32 s


## Data Cleaning

### Location Outlier

In [10]:
%%time

def filter_by_nyc_boundary(df, geojson_path):
    """
    Filters pickup and dropoff locations to keep only those within the New York City boundary.

    Parameters:
        df (pd.DataFrame): The DataFrame containing pickup and dropoff coordinates.
        geojson_path (str): Path to the GeoJSON file defining NYC boundaries.

    Returns:
        pd.DataFrame: Filtered DataFrame with locations inside the NYC bounding box.
    """
    # Load the GeoJSON file
    with open(geojson_path, "r") as f:
        geojson_data = json.load(f)

    # Extract NYC boundary coordinates where NAME is "New York"
    nyc_coords = []
    for feature in geojson_data["features"]:
        if feature["properties"].get("NAME") == "New York":
            for polygon in feature["geometry"]["coordinates"]:  # Loop through MultiPolygon
                for ring in polygon:  # Each polygon has a ring of coordinates
                    nyc_coords.extend(ring)

    # Compute NYC bounding box (min/max latitudes & longitudes)
    min_long = min(lon for lon, lat in nyc_coords)
    max_long = max(lon for lon, lat in nyc_coords)
    min_lat = min(lat for lon, lat in nyc_coords)
    max_lat = max(lat for lon, lat in nyc_coords)

    # Count records before filtering
    initial_count = len(df)

    # Filter data based on bounding box
    mask = (
        (df["pickup_longitude"].between(min_long, max_long))
        & (df["pickup_latitude"].between(min_lat, max_lat))
        & (df["dropoff_longitude"].between(min_long, max_long))
        & (df["dropoff_latitude"].between(min_lat, max_lat))
    )

    filtered_df = df[mask]

    # Count records after filtering
    final_count = len(filtered_df)
    dropped_count = initial_count - final_count

    print(f"Records before filtering: {initial_count}")
    print(f"Records after filtering: {final_count}")
    print(f"Records dropped: {dropped_count}\n")

    return filtered_df


# Apply function to df
df = filter_by_nyc_boundary(df, "utils/gz_2010_us_040_00_5m.json")

Records before filtering: 1458644
Records after filtering: 1458577
Records dropped: 67

CPU times: user 366 ms, sys: 90.9 ms, total: 456 ms
Wall time: 451 ms


### Distance & Duration Outlier

Some outliers represent natural variations in the population, and they should be left as is in your dataset. These are called true outliers. Other outliers are problematic and should be removed because they represent measurement errors, data entry or processing errors, or poor sampling, e.g. zero-distance trip or trips with too fast speed.. This piece of code filters distance and duration outliers in both train & test dataset.

In [11]:
%%time

# Define WGS84 ellipsoid
geod = Geod(ellps="WGS84")

# Load the GeoJSON file to Calculate the MAX distance possible for a trip in New York
geojson_path = "utils/gz_2010_us_040_00_5m.json"
with open(geojson_path, "r") as f:
    geojson_data = json.load(f)

# Extract NYC boundary coordinates where NAME is "New York"
nyc_coords = []
for feature in geojson_data["features"]:
    if feature["properties"].get("NAME") == "New York":
        for polygon in feature["geometry"]["coordinates"]:  # Loop through MultiPolygon
            for ring in polygon:  # Each polygon has a ring of coordinates
                nyc_coords.extend(ring)

# Find the two farthest points in the boundary
max_distance = 0
max_pair = None

for i in range(len(nyc_coords)):
    for j in range(i + 1, len(nyc_coords)):
        lon1, lat1 = nyc_coords[i]
        lon2, lat2 = nyc_coords[j]

        # Compute geodesic distance in meters
        _, _, dist_m = geod.inv(lon1, lat1, lon2, lat2)

        # Convert to kilometers
        dist_km = dist_m / 1000

        if dist_km > max_distance:
            max_distance = dist_km
            max_pair = ((lon1, lat1), (lon2, lat2))

print(f"Max Distance: {max_distance:.2f} km")
print(f"Farthest Points: {max_pair}")

Max Distance: 671.44 km
Farthest Points: ((-71.856214, 41.070598), (-79.761951, 42.26986))
CPU times: user 2.5 s, sys: 7.95 ms, total: 2.51 s
Wall time: 2.5 s


In [12]:
%%time

def filter_by_bounds(df, column, lower_bound=None, upper_bound=None):
    """
    Filters trips based on a given column (e.g., Euclidean distance or trip duration)
    using absolute lower and upper bounds only.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the column to filter.
        column (str): The column to apply filtering on.
        lower_bound (float, optional): Absolute minimum value to keep. Defaults to None (not applied).
        upper_bound (float, optional): Absolute maximum value to keep. Defaults to None (not applied).

    Returns:
        pd.DataFrame: Filtered DataFrame with values within the specified bounds.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")
    
    initial_count = len(df)
    
    if lower_bound is not None:
        df = df[df[column] > lower_bound]
        print(f"Applied lower bound ({column}): {lower_bound}")
    
    if upper_bound is not None:
        df = df[df[column] < upper_bound]
        print(f"Applied upper bound ({column}): {upper_bound}")
    
    final_count = len(df)
    dropped_count = initial_count - final_count
    print(f"Total records dropped due to {column} outliers: {dropped_count}")
    
    return df

# Apply function to filter both geodesic_distance and trip_duration
# Speed Lim: [1m/s ~ 25m/s] <-> [3.6 km/h ~ 90km/h]
# Duration Lim: [5min ~ 20h] <-> [300s ~ 72000]
df = filter_by_bounds(df, "geodesic_distance", lower_bound=0.1, upper_bound=720)      # [0.1km, 720km]
print() 
df = filter_by_bounds(df, "trip_duration", lower_bound=300, upper_bound=72_000)       # [5min, 20h]

Applied lower bound (geodesic_distance): 0.1
Applied upper bound (geodesic_distance): 720
Total records dropped due to geodesic_distance outliers: 13287

Applied lower bound (trip_duration): 300
Applied upper bound (trip_duration): 72000
Total records dropped due to trip_duration outliers: 215491
CPU times: user 934 ms, sys: 788 ms, total: 1.72 s
Wall time: 1.72 s


### Speed Outlier

In [13]:
%%time

def filter_by_speed(df, distance_col, duration_col, speed_lower_limit, speed_upper_limit):
    """
    Filters trips based on both speed and duration limits, using distance and duration columns.
    
    Parameters:
        df (pd.DataFrame): The DataFrame containing the columns to filter.
        distance_col (str): The column for geodesic distance.
        duration_col (str): The column for trip duration.
        speed_lower_limit (float): Minimum speed limit in m/s.
        speed_upper_limit (float): Maximum speed limit in m/s.
    
    Returns:
        pd.DataFrame: Filtered DataFrame with values within the specified bounds for speed and duration.
    """
    # Calculate speed from distance and duration
    df["speed"] = (df[distance_col] / df[duration_col]) * 1000
    
    # Apply the filtering
    df = filter_by_bounds(df, "speed", lower_bound=speed_lower_limit, upper_bound=speed_upper_limit)
    
    return df

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 13.1 µs


In [14]:
%%time

# Apply the function to filter speed outliers
df = filter_by_speed(
    df,
    "geodesic_distance", "trip_duration",
    speed_lower_limit=1, speed_upper_limit=25
)

Applied lower bound (speed): 1
Applied upper bound (speed): 25
Total records dropped due to speed outliers: 18812
CPU times: user 338 ms, sys: 52.4 ms, total: 391 ms
Wall time: 388 ms


### Spatial & Temporal Aggregation

In [15]:
%%time

def bin_coordinates(df, precision=2):
    """Bins latitude and longitude to a specified precision."""
    df.loc[:, "pickup_lat_bin"] = np.round(df["pickup_latitude"], precision)
    df.loc[:, "pickup_long_bin"] = np.round(df["pickup_longitude"], precision)
    df.loc[:, "dropoff_lat_bin"] = np.round(df["dropoff_latitude"], precision)
    df.loc[:, "dropoff_long_bin"] = np.round(df["dropoff_longitude"], precision)

bin_coordinates(df)

CPU times: user 22 ms, sys: 2.94 ms, total: 24.9 ms
Wall time: 22.5 ms


In [16]:
%%time

def compute_spatial_aggregations(df, min_trips=100):
    """Computes trip counts for different spatial aggregations."""
    groupings = [
        ["pickup_lat_bin", "pickup_long_bin", "dropoff_lat_bin", "dropoff_long_bin"],
        ["pickup_lat_bin", "pickup_long_bin"],
        ["dropoff_lat_bin", "dropoff_long_bin"]
    ]
    
    for groupby_cols in groupings:
        col_name = "cnt_coords_bin_" + "".join(set([col[0] for col in groupby_cols]))
        
        # Compute trip counts and store in a dictionary for fast lookup
        counts = df.groupby(groupby_cols).size().to_dict()
        
        # Apply counts to create a new column in the dataframe
        df[col_name] = df[groupby_cols].apply(lambda row: counts.get(tuple(row), 0), axis=1)
        
        # Apply filtering based on min_trips
        df[col_name] = df[col_name].where(df[col_name] >= min_trips, 0)

compute_spatial_aggregations(df)

CPU times: user 17 s, sys: 184 ms, total: 17.2 s
Wall time: 17.2 s


In [17]:
%%time

def process_chunk(chunk, df_ref):
    """
    Process a chunk of the DataFrame to compute spatial-temporal features.
    
    Args:
        chunk: A subset of the main DataFrame.
        df_ref: Reference DataFrame for aggregation calculations.
    
    Returns:
        DataFrame with computed features for the chunk.
    """
    # Ensure pickup_datetime is in datetime format
    chunk["pickup_datetime"] = pd.to_datetime(chunk["pickup_datetime"])
    df_ref["pickup_datetime"] = pd.to_datetime(df_ref["pickup_datetime"])
    
    # Create temporary columns for time calculations
    df_ref = df_ref.copy()
    df_ref["pickup_hour"] = df_ref["pickup_datetime"].dt.floor("H")
    
    # Add new columns with default values
    chunk["cnt_prev_1h"] = 0
    chunk["cnt_mean_prev_3h_pickups"] = 0.0
    chunk["cnt_mean_prev_3h_dropoffs"] = 0.0
    
    for idx, row in chunk.iterrows():
        # Get current trip attributes
        current_time = row["pickup_datetime"]
        current_hour = current_time.floor("H")
        pl_bin = row["pickup_lat_bin"]
        plon_bin = row["pickup_long_bin"]
        dl_bin = row["dropoff_lat_bin"]
        dlon_bin = row["dropoff_long_bin"]
        
        # Calculate 1-hour window
        t1_start = current_hour - pd.Timedelta(hours=1)
        t1_end = current_hour
        
        # Calculate 3-hour average window (T-4h to T-1h)
        t3_start = current_hour - pd.Timedelta(hours=4)
        t3_end = current_hour - pd.Timedelta(hours=1)
        
        # Get reference data subsets
        ref_1h = df_ref[
            (df_ref["pickup_hour"] >= t1_start) & 
            (df_ref["pickup_hour"] < t1_end)
        ]
        
        ref_3h = df_ref[
            (df_ref["pickup_hour"] >= t3_start) & 
            (df_ref["pickup_hour"] < t3_end)
        ]
        
        # Calculate 1-hour total count
        chunk.at[idx, "cnt_prev_1h"] = len(ref_1h)
        
        # Calculate 3-hour spatial averages
        pickup_count = len(ref_3h[
            (ref_3h["pickup_lat_bin"] == pl_bin) &
            (ref_3h["pickup_long_bin"] == plon_bin)
        ])
        
        dropoff_count = len(ref_3h[
            (ref_3h["dropoff_lat_bin"] == dl_bin) &
            (ref_3h["dropoff_long_bin"] == dlon_bin)
        ])
        
        chunk.at[idx, "cnt_mean_prev_3h_pickups"] = pickup_count / 3
        chunk.at[idx, "cnt_mean_prev_3h_dropoffs"] = dropoff_count / 3
    
    return chunk

def compute_spatial_temporal_aggregation_parallel(df, df_ref, n_jobs):
    """
    Compute spatial-temporal aggregated features for taxi trips in parallel.
    
    Args:
        df: DataFrame to compute features for (must contain pickup/dropoff bins and timestamps).
        df_ref: Reference DataFrame used for aggregation calculations.
        n_jobs: Number of parallel jobs to run. Default is -1 (use all available cores).
    
    Returns:
        DataFrame with computed features.
    """
    
    # Determine chunk size based on available cores
    num_chunks = min(n_jobs, len(df))
    chunk_size = len(df) // num_chunks if num_chunks > 0 else len(df)
    print(f"Available CPU Core: {n_jobs} | Chunk Size: {chunk_size}")
    
    # Split the DataFrame into chunks for parallel processing
    chunks = [df.iloc[i:i + chunk_size] for i in range(0, len(df), chunk_size)]
    
    # Process chunks in parallel
    results = Parallel(n_jobs=n_jobs)(
        delayed(process_chunk)(chunk, df_ref) for chunk in chunks
    )
    
    # Combine results into a single DataFrame
    return pd.concat(results, ignore_index=True)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 16.7 µs


In [18]:
%%time

# Apply Spatial Temporal Aggregation Paralleled Calculation:
df = compute_spatial_temporal_aggregation_parallel(df, df, 8)

Available CPU Core: 8 | Chunk Size: 151373
CPU times: user 42.9 s, sys: 22.7 s, total: 1min 5s
Wall time: 55min 52s


### OSRM Feature

In [19]:
%%time

# Credit: The OSRM-based routing data used here was generated by Oscarleo 
# and is available at: https://www.kaggle.com/datasets/oscarleo/new-york-city-taxi-with-osrm

# Load the Parquet file with selected columns
fastest_routes = pd.read_parquet(
    "data/osrm/fastest_routes.parquet",
    columns=['id', 'total_distance', 'total_travel_time', 'number_of_steps']
)

# Merge on 'id' the key column
df = df.merge(fastest_routes, on="id", how="left")

# Brief overview of fastest routes planed by osrm framework 
fastest_routes.info()

# Delte relevant variables
del fastest_routes
gc.collect()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458643 entries, 0 to 1458642
Data columns (total 4 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   id                 1458643 non-null  object 
 1   total_distance     1458643 non-null  float64
 2   total_travel_time  1458643 non-null  float64
 3   number_of_steps    1458643 non-null  int64  
dtypes: float64(2), int64(1), object(1)
memory usage: 44.5+ MB
CPU times: user 2.87 s, sys: 1.42 s, total: 4.29 s
Wall time: 3.99 s


29

### Drop Redundant Columns

In [20]:
df.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'passenger_count',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'store_and_fwd_flag', 'trip_duration',
       'pickup_pca0', 'pickup_pca1', 'dropoff_pca0', 'dropoff_pca1',
       'geodesic_distance', 'pickup_hour_of_day', 'day_of_week',
       'hour_of_week', 'month_of_year', 'day_of_year', 'week_of_year',
       'hour_of_year', 'pickup_date', 'date', 'r_depth', 's_fall', 's_depth',
       'all_precip', 'has_snow', 'has_rain', 'max_temp', 'min_temp', 'speed',
       'pickup_lat_bin', 'pickup_long_bin', 'dropoff_lat_bin',
       'dropoff_long_bin', 'cnt_coords_bin_pd', 'cnt_coords_bin_p',
       'cnt_coords_bin_d', 'cnt_prev_1h', 'cnt_mean_prev_3h_pickups',
       'cnt_mean_prev_3h_dropoffs', 'total_distance', 'total_travel_time',
       'number_of_steps'],
      dtype='object')

In [21]:
%%time

# Delete redundant, intermediate columns
df.drop(columns=[
    "pickup_datetime", "pickup_date", "date", "speed",
    "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude",
    "pickup_lat_bin", "pickup_long_bin", "dropoff_lat_bin", "dropoff_long_bin"
], inplace=True)

gc.collect()

# Reorganize the columns to make `trip_duration` the target column in the end
df = df[[col for col in df.columns if col != "trip_duration"] + ["trip_duration"]]

df.columns

CPU times: user 350 ms, sys: 0 ns, total: 350 ms
Wall time: 349 ms


Index(['id', 'vendor_id', 'passenger_count', 'store_and_fwd_flag',
       'pickup_pca0', 'pickup_pca1', 'dropoff_pca0', 'dropoff_pca1',
       'geodesic_distance', 'pickup_hour_of_day', 'day_of_week',
       'hour_of_week', 'month_of_year', 'day_of_year', 'week_of_year',
       'hour_of_year', 'r_depth', 's_fall', 's_depth', 'all_precip',
       'has_snow', 'has_rain', 'max_temp', 'min_temp', 'cnt_coords_bin_pd',
       'cnt_coords_bin_p', 'cnt_coords_bin_d', 'cnt_prev_1h',
       'cnt_mean_prev_3h_pickups', 'cnt_mean_prev_3h_dropoffs',
       'total_distance', 'total_travel_time', 'number_of_steps',
       'trip_duration'],
      dtype='object')

### One-Hot Encoding Categorical Data

In [22]:
%%time

# Convert boolean columns to integers
bool_columns = ["has_snow", "has_rain"]
df[bool_columns] = df[bool_columns].astype(int)

# Process the vendor_id column
if "vendor_id" in df.columns:
    df["vendor_id"] = df["vendor_id"] - 1
    
# Progress the flag column
df["store_and_fwd_flag"] = df["store_and_fwd_flag"].apply(lambda x: 0 if x == "Y" else 1)
df["store_and_fwd_flag"] = df["store_and_fwd_flag"].apply(lambda x: 0 if x == "Y" else 1)

CPU times: user 585 ms, sys: 0 ns, total: 585 ms
Wall time: 582 ms


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1210987 entries, 0 to 1210986
Data columns (total 34 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   id                         1210987 non-null  object 
 1   vendor_id                  1210987 non-null  int64  
 2   passenger_count            1210987 non-null  int64  
 3   store_and_fwd_flag         1210987 non-null  int64  
 4   pickup_pca0                1210987 non-null  float64
 5   pickup_pca1                1210987 non-null  float64
 6   dropoff_pca0               1210987 non-null  float64
 7   dropoff_pca1               1210987 non-null  float64
 8   geodesic_distance          1210987 non-null  float64
 9   pickup_hour_of_day         1210987 non-null  int32  
 10  day_of_week                1210987 non-null  int64  
 11  hour_of_week               1210987 non-null  int64  
 12  month_of_year              1210987 non-null  int64  
 13  day_of_year 

In [24]:
df.describe()

Unnamed: 0,vendor_id,passenger_count,store_and_fwd_flag,pickup_pca0,pickup_pca1,dropoff_pca0,dropoff_pca1,geodesic_distance,pickup_hour_of_day,day_of_week,...,cnt_coords_bin_pd,cnt_coords_bin_p,cnt_coords_bin_d,cnt_prev_1h,cnt_mean_prev_3h_pickups,cnt_mean_prev_3h_dropoffs,total_distance,total_travel_time,number_of_steps,trip_duration
count,1210987.0,1210987.0,1210987.0,1210987.0,1210987.0,1210987.0,1210987.0,1210987.0,1210987.0,1210987.0,...,1210987.0,1210987.0,1210987.0,1210987.0,1210987.0,1210987.0,1210987.0,1210987.0,1210987.0,1210987.0
mean,0.5345714,1.669153,1.0,-0.006675423,-0.0441935,-0.007728207,-0.01712308,3.952992,13.70683,3.040424,...,1166.138,36696.27,28937.38,332.7892,9.922165,7.881916,5319.821,441.5067,8.219698,946.5372
std,0.4988036,1.314138,0.0,0.5628665,0.8420306,0.5354075,0.9688054,4.131656,6.36944,1.943954,...,1293.731,22845.63,21141.91,100.8178,8.313414,7.627852,5541.506,318.4298,4.509629,648.9072
min,0.0,0.0,1.0,-6.046855,-6.539942,-18.17656,-6.590078,0.3065172,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,301.0
25%,0.0,1.0,1.0,-0.08788089,-0.4178586,-0.1567154,-0.479929,1.581162,9.0,1.0,...,188.0,19575.0,11467.0,295.0,3.0,1.666667,2118.5,226.7,5.0,508.0
50%,1.0,1.0,1.0,0.1146166,0.05761965,0.08902504,0.06947418,2.499565,14.0,3.0,...,823.0,33772.0,26624.0,344.0,7.666667,6.0,3269.2,337.9,7.0,758.0
75%,1.0,2.0,1.0,0.2712265,0.4323915,0.2596224,0.4865418,4.481081,19.0,5.0,...,1652.0,55752.0,43325.0,397.0,15.0,12.0,6007.2,549.7,10.0,1165.0
max,1.0,6.0,1.0,5.045628,16.75372,10.55469,20.75788,116.6143,23.0,6.0,...,6912.0,76047.0,65339.0,544.0,50.33333,49.0,85064.3,4243.6,46.0,20400.0


### Save Data in parquet

In [25]:
%%time

# Ensure the 'prep' directory exists
os.makedirs(os.path.join("data", "prep"), exist_ok=True)

# Save to preprocessed df as Parquet files
df.to_parquet("data/prep/data.parquet", index=False)

# Perform 80-20 train-validation-test split (80% train, 10% validation, 10% test)
df_train, df_temp = train_test_split(df, test_size=0.2, random_state=42)
df_valid, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)

# Save the train, validation, and test datasets as Parquet files
df_train.to_parquet("data/prep/data_train.parquet", index=False)
df_valid.to_parquet("data/prep/data_valid.parquet", index=False)
df_test.to_parquet("data/prep/data_test.parquet", index=False)

# Print the number of records in each set
print(f"Number of records in train set: {len(df_train)}")
print(f"Number of records in validation set: {len(df_valid)}")
print(f"Number of records in test set: {len(df_test)}")

Number of records in train set: 968789
Number of records in validation set: 121099
Number of records in test set: 121099
CPU times: user 3.4 s, sys: 448 ms, total: 3.85 s
Wall time: 3.74 s


---