In [1]:
import numpy as np
import pandas as pd

# Load dataset
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, which can
# execute code embedded in the file; only load this archive from a trusted source.
dataset = np.load("Datasets/nyc_taxi_data.npy", allow_pickle=True).item()
X_train, y_train, X_test, y_test = (
    dataset[key] for key in ("X_train", "y_train", "X_test", "y_test")
)

# Convert to Pandas DataFrame
df_train = pd.DataFrame(X_train)
df_train["trip_duration"] = y_train  # Add target column

print(df_train.head())  # View first few rows
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df_train.info()  # Check data types


                id  vendor_id      pickup_datetime     dropoff_datetime  \
879655   id2425795          1  2016-01-08 23:55:11  2016-01-09 00:04:32   
646838   id0767831          2  2016-03-05 09:52:06  2016-03-05 10:00:12   
1138713  id0449104          1  2016-04-09 16:03:53  2016-04-09 16:21:22   
864716   id3030157          1  2016-01-06 11:12:44  2016-01-06 11:19:49   
434927   id1584885          1  2016-06-26 09:10:56  2016-06-26 09:17:44   

         passenger_count  pickup_longitude  pickup_latitude  \
879655                 1        -73.955551        40.773346   
646838                 1        -73.962181        40.763599   
1138713                1        -73.977486        40.751842   
864716                 1        -73.970001        40.762363   
434927                 1        -73.950348        40.771561   

         dropoff_longitude  dropoff_latitude store_and_fwd_flag  trip_duration  
879655          -73.973640         40.763500                  N            561  
646838  

Keys are:  
  - X_train  
  - X_test  
  - y_train  
  - y_test  

Extract our input_file into: X_train, y_train, X_test, y_test.

Create a dataframe, df_train, of the X_train dataset for feature engineering and cleaning.  
Add the target data, y_train, to df_train.
Print the first 5 entries of df_train.  

Features are:  
  - id \<obj\>
  - vendor_id \<int64\>
  - pickup_datetime \<obj\>
  - dropoff_datetime \<obj\>
  - passenger_count \<int64\>
  - pickup_longitude \<float64\>
  - pickup_latitude \<float64\>
  - dropoff_longitude \<float64\>
  - dropoff_latitude \<float64\>
  - store_and_fwd_flag \<obj\>
  - trip_duration (target) \<int64\>

Extract time features for training.

In [2]:
# Derive calendar features from the pickup and dropoff timestamps.
# NOTE(review): the dropoff_* features are computed from dropoff_datetime, which
# appears to be pickup time + trip_duration (the target) — possible target
# leakage; confirm these features are available at prediction time.
trip_ends = ("pickup", "dropoff")

# Parse the timestamp columns in place so the .dt accessor can be used on them
for end in trip_ends:
    stamp_col = f"{end}_datetime"
    df_train[stamp_col] = pd.to_datetime(df_train[stamp_col])

# Day of week for both ends of the trip, then calendar month for both ends
for end in trip_ends:
    df_train[f"{end}_day_of_week"] = df_train[f"{end}_datetime"].dt.weekday
for end in trip_ends:
    df_train[f"{end}_month"] = df_train[f"{end}_datetime"].dt.month

# Encode hour-of-day as a point on the unit circle (sin and cos of the 24h
# angle) so that hour 23 and hour 0 end up close together.
for suffix, trig in (("sin", np.sin), ("cos", np.cos)):
    for end in trip_ends:
        hour_of_day = df_train[f"{end}_datetime"].dt.hour
        df_train[f"{end}_hour_{suffix}"] = trig(2 * np.pi * hour_of_day / 24)

# The timestamps themselves are not kept as features; the raw hour is never
# stored as a column because only its sin/cos encoding is retained.
df_train.drop(columns=[f"{end}_datetime" for end in trip_ends], inplace=True)

Import MinMaxScaler to normalize the latitude/longitude of the pickup and dropoff locations.

In [3]:
from sklearn.preprocessing import MinMaxScaler

# Coordinate columns to rescale with min-max scaling
columns_to_normalize = [
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
]

# Learn each column's min/max, then overwrite the columns with the scaled values.
# NOTE(review): the scaler is fitted before the train/validation split performed
# later in the notebook, so validation rows influence the learned min/max —
# confirm this is acceptable.
scaler = MinMaxScaler().fit(df_train[columns_to_normalize])
df_train[columns_to_normalize] = scaler.transform(df_train[columns_to_normalize])

Remove the columns "id", "vendor_id", and "store_and_fwd_flag", which are treated as irrelevant here. It is possible that vendor_id has some predictive power, since some vendors may have quicker or slower trip times than others. However, my intuition is that the location and time of day are more predictive of trip times than which company provides the service.

In [4]:
# Columns that are not used as model inputs
unused_columns = ["id", "vendor_id", "store_and_fwd_flag"]
df_train.drop(columns=unused_columns, inplace=True)

Sanity Check.

In [None]:
print(df_train.head())
# DataFrame.info() prints its own report and returns None; calling it bare
# avoids the extra "None" line that print(df_train.info()) produced.
df_train.info()  # Ensure all columns are numerical

In [5]:
# Log-transform the target. log1p(x) = log(1 + x), so a duration of 0 maps to 0
# instead of -inf.
# NOTE: this overwrites the column, so re-running the cell applies the log twice.
log_trip_duration = np.log1p(df_train["trip_duration"])
df_train["trip_duration"] = log_trip_duration

Visualize normalization of trip_duration (target).

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Side-by-side histograms of the target before and after the log transform
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# df_train already holds the log1p-transformed target, so the original scale is
# recovered with expm1 (the inverse of log1p) for the left-hand panel.
log_duration = df_train["trip_duration"]
panels = (
    (np.expm1(log_duration), "Original Trip Duration Distribution"),
    (log_duration, "Log-Normalized Trip Duration Distribution"),
)

for axis, (values, panel_title) in zip(axes, panels):
    sns.histplot(values, bins=50, kde=True, ax=axis)
    axis.set_title(panel_title)

plt.show()

In [None]:
# Troubleshoot crash due to memory
import sys

# Two views of the frame's footprint: what sys.getsizeof reports for the
# DataFrame object, and the summed per-column usage with deep=True.
getsizeof_bytes = sys.getsizeof(df_train)
column_bytes_total = df_train.memory_usage(deep=True).sum()

print("DataFrame size in memory (bytes):", getsizeof_bytes)
print("Expected NumPy array size (bytes):", column_bytes_total)

In [None]:
# List each column's dtype; any remaining object column would fail the later
# conversion to a float32 array.
column_dtypes = df_train.dtypes
print(column_dtypes)

Split data for training.

In [6]:
from sklearn.model_selection import train_test_split

def _features_and_target(frame, target_column="trip_duration"):
    """Return (features, target) float32 arrays taken from ``frame``."""
    features = frame.drop(columns=[target_column]).values.astype(np.float32)
    target = frame[target_column].values.astype(np.float32)
    return features, target


# Hold out 20% of the rows for validation (fixed seed for a repeatable split).
# NOTE: df_train is rebound to the 80% subset, so re-running this cell shrinks it again.
df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=42)

# Convert each subset on its own rather than materialising one large array
X_train, y_train = _features_and_target(df_train)
X_val, y_val = _features_and_target(df_val)


Define train_model function.

In [7]:
def train_model(model, X_train, y_train, X_val, y_val, epochs, batch_size, lr, patience, l2_reg, loss_function):
    """Train ``model`` with mini-batch gradient descent and early stopping.

    Every epoch walks the training arrays in order in slices of ``batch_size``.
    For each slice it runs a forward and backward pass, clips each parameter
    gradient to [-1, 1] in place, and applies a gradient-descent step (L2 weight
    decay is applied to the weights only, not the bias). After the epoch, the
    validation loss is computed with a single forward pass over ``X_val``.

    Args:
        model: Object exposing ``forward(X)``, ``backward(grad)`` and an iterable
            ``layers``. Layers that have both ``weights`` and ``bias`` attributes
            must also expose ``grad_weights`` and ``grad_bias`` after ``backward``.
        X_train: Training inputs (sliceable, supports ``len``).
        y_train: Training targets, sliced in step with ``X_train``.
        X_val: Validation inputs.
        y_val: Validation targets.
        epochs: Maximum number of passes over the training data.
        batch_size: Number of samples per mini-batch.
        lr: Learning rate.
        patience: Number of consecutive epochs without a new best validation
            loss after which training stops.
        l2_reg: L2 regularisation coefficient.
        loss_function: Object with ``forward(y_true, y_pred)`` returning a scalar
            loss and ``backward()`` returning the gradient passed to
            ``model.backward``.

    Returns:
        Tuple ``(training_losses, validation_losses)`` with one entry per
        completed epoch.

    Note:
        Early stopping only ends the loop; the weights of the best epoch are not
        restored.
    """
    training_losses = []
    validation_losses = []
    best_val_loss = float("inf")
    patience_counter = 0
    n_samples = len(X_train)

    for epoch in range(epochs):
        # Sum of batch losses, each weighted by the number of samples in the batch.
        weighted_loss_sum = 0.0

        for i in range(0, n_samples, batch_size):
            X_batch = X_train[i:i + batch_size]
            y_batch = y_train[i:i + batch_size]

            # Forward pass
            predictions = model.forward(X_batch)

            # Compute loss
            loss = loss_function.forward(y_batch, predictions)

            # Accumulate the epoch loss.
            # Assumes loss_function.forward returns the mean over the batch (TODO
            # confirm against MSELoss). Weighting by the batch size and dividing
            # by the sample count below then gives the per-sample mean for the
            # epoch, on the same scale as the validation loss. Previously the
            # unweighted batch losses were summed and divided by the sample
            # count, which under-reported the training loss by roughly a factor
            # of batch_size.
            weighted_loss_sum += loss * len(X_batch)

            # Backward pass
            grad_loss = loss_function.backward()
            model.backward(grad_loss)

            # Update weights
            for layer in model.layers:
                if hasattr(layer, "weights") and hasattr(layer, "bias"):
                    # Clip gradients in place to limit the size of a single step
                    np.clip(layer.grad_weights, -1, 1, out=layer.grad_weights)
                    np.clip(layer.grad_bias, -1, 1, out=layer.grad_bias)

                    layer.weights -= lr * (layer.grad_weights + l2_reg * layer.weights)
                    layer.bias -= lr * layer.grad_bias

        # Compute validation loss
        val_predictions = model.forward(X_val)

        val_loss = loss_function.forward(y_val, val_predictions)

        training_losses.append(weighted_loss_sum / n_samples)
        validation_losses.append(val_loss)

        print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {training_losses[-1]:.6f}, Validation Loss: {val_loss:.6f}")

        # Early stopping logic
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print("Early stopping triggered! Stopping training.")
            break

    return training_losses, validation_losses

Define 3 models.

In [None]:
# Import Models
from Layer_Implementations.Sequential import Sequential
from Layer_Implementations.Linear import Linear
from Layer_Implementations.Relu import Relu
from Layer_Implementations.MeanSquaredError import MSELoss

# Number of input features, read off the training matrix
input_size = X_train.shape[1]


def _build_mlp(n_inputs, hidden_sizes):
    """Build a Sequential of Linear+Relu pairs, one per hidden width, ending in a 1-unit Linear layer."""
    network = Sequential()
    fan_in = n_inputs
    for width in hidden_sizes:
        network.add(Linear(fan_in, width))
        network.add(Relu())
        fan_in = width
    network.add(Linear(fan_in, 1))
    return network


# Three networks of increasing depth/width, built in the same order as before
model_1 = _build_mlp(input_size, [32])
model_2 = _build_mlp(input_size, [32, 16])
model_3 = _build_mlp(input_size, [64, 32, 16])

models = [model_1, model_2, model_3]
model_names = ["Small Model", "Medium Model", "Large Model"]

Set hyperparameters and run models.

In [None]:
# Hyperparameters shared by every model
epochs = 50
batch_size = 32
learning_rate = 0.01
patience = 3
l2_reg = 0.01
loss_function = MSELoss()

# Per-model loss histories, keyed by the model's display name
training_results = {}

# Train each model in turn and record its per-epoch training/validation losses
for model_name, model in zip(model_names, models):
    print(f"\nTraining {model_name}:\n")

    train_loss, val_loss = train_model(
        model=model,
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
        epochs=epochs,
        batch_size=batch_size,
        lr=learning_rate,
        patience=patience,
        l2_reg=l2_reg,
        loss_function=loss_function,
    )

    training_results[model_name] = dict(train_loss=train_loss, val_loss=val_loss)

Plot the validation loss of each model for comparison (the training loss is stored in training_results but is not plotted here).

In [None]:
import matplotlib.pyplot as plt

# One validation-loss curve per trained model on a shared set of axes
loss_fig, loss_ax = plt.subplots(figsize=(10, 6))

for model_name in training_results:
    val_curve = training_results[model_name]["val_loss"]
    loss_ax.plot(val_curve, label=f"{model_name} (Validation Loss)")

loss_ax.set_xlabel("Epochs")
loss_ax.set_ylabel("Loss")
loss_ax.legend()
loss_ax.set_title("Validation Loss Comparison")
plt.show()