# PART - 3
---------
## Model - 2 (PassiveAggressiveRegressor)

In [0]:
# Import warnings to ignore UserWarnings
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


In [0]:
# Use the Preprocessor that is already saved
import joblib
import os

save_path = "/Volumes/workspace/bde/assignment2"

# Ensure directory exists
os.makedirs(save_path, exist_ok=True)

# Load preprocessor
preprocessor = joblib.load(os.path.join(save_path, "preprocessor.joblib"))

print("✅ Model and preprocessor loaded successfully.")


✅ Model and preprocessor loaded successfully.


In [0]:
# Import Packages
import numpy as np
import pandas as pd
from pyspark.sql import functions as F
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline as sk_pipeline
import joblib
import time

# Set up dataset and columns
df = spark.table("train_df_2024_delta")  # PySpark DataFrame with 25+ million rows

# Categorical and numerical columns
categorical_cols = ["pickup_borough", "dropoff_borough", "RatecodeID", "payment_type", "trip_type", "taxi_colour"]
numerical_cols = [
    "passenger_count", "trip_distance", "trip_time", "speed_mph", "extra", "mta_tax",
    "tip_amount", "ehail_fee", "improvement_surcharge", "congestion_surcharge", "airport_fee",
    "month", "day_of_week", "hour"
]
target_col = "total_amount"
all_cols_for_preprocessor = categorical_cols + numerical_cols

# Initialize SGDRegressor for partial_fit
model = PassiveAggressiveRegressor()

BATCH_SIZE = 200_000
offset = 0
total_rows = df.count()
num_batches = int(np.ceil(total_rows / BATCH_SIZE))

print(f"Total rows: {total_rows}")
print(f"Training in {num_batches} batches of up to {BATCH_SIZE} rows.")

# Batch-wise training
while offset < total_rows:
    start_time = time.time()

    # Use Spark SQL row_number for batching (avoid limit/offset issues)
    batch_df = spark.sql(f"""
        SELECT *
        FROM (
            SELECT *, ROW_NUMBER() OVER (ORDER BY total_amount) AS rn
            FROM train_df_2024_delta
        ) tmp
        WHERE rn > {offset} AND rn <= {offset + BATCH_SIZE}
    """)

    batch_pandas_df = batch_df.toPandas()
    if batch_pandas_df.empty:
        break

    # Split features and target
    X_batch = batch_pandas_df[all_cols_for_preprocessor]
    y_batch = batch_pandas_df[target_col]

    # Preprocess
    X_batch_processed = preprocessor.transform(X_batch)

    # Partial fit
    model.partial_fit(X_batch_processed, y_batch)

    offset += len(X_batch)
    end_time = time.time()
    print(f"✅ Batch trained: {len(X_batch)} rows, Time: {end_time - start_time:.2f}s, Total rows processed: {offset}")

print("🎉 Model trained on full dataset using partial_fit.")

Total rows: 25689161
Training in 129 batches of up to 200000 rows.
✅ Batch trained: 200000 rows, Time: 14.77s, Total rows processed: 200000
✅ Batch trained: 200000 rows, Time: 14.94s, Total rows processed: 400000
✅ Batch trained: 200000 rows, Time: 14.90s, Total rows processed: 600000
✅ Batch trained: 200000 rows, Time: 14.55s, Total rows processed: 800000
✅ Batch trained: 200000 rows, Time: 15.01s, Total rows processed: 1000000
✅ Batch trained: 200000 rows, Time: 15.03s, Total rows processed: 1200000
✅ Batch trained: 200000 rows, Time: 15.19s, Total rows processed: 1400000
✅ Batch trained: 200000 rows, Time: 14.85s, Total rows processed: 1600000
✅ Batch trained: 200000 rows, Time: 14.74s, Total rows processed: 1800000
✅ Batch trained: 200000 rows, Time: 15.02s, Total rows processed: 2000000
✅ Batch trained: 200000 rows, Time: 14.67s, Total rows processed: 2200000
✅ Batch trained: 200000 rows, Time: 14.74s, Total rows processed: 2400000
✅ Batch trained: 200000 rows, Time: 14.73s, Total

In [0]:
# Save preprocessor and model
import joblib
import os

# Define the path
save_path = "/Volumes/workspace/bde/assignment2"

# Ensure directory exists
os.makedirs(save_path, exist_ok=True)

# Save them
joblib.dump(model, os.path.join(save_path, "passive_aggressive_model.joblib"))
joblib.dump(preprocessor, os.path.join(save_path, "preprocessor.joblib"))

print("✅ Model and preprocessor saved successfully.")

✅ Model and preprocessor saved successfully.


In [0]:
# Due to long running time sometimes it shows there is no packages imported
# So for safer side import numpy and os again
import numpy as np
import os

save_path = "/Volumes/workspace/bde/assignment2"

X_test = np.load(os.path.join(save_path, "X_test_processed.npy"))
y_test = np.load(os.path.join(save_path, "y_test.npy"))

print("✅ Reloaded preprocessed test set")


✅ Reloaded preprocessed test set


In [0]:
# Calculate RMSE for Test Set
from sklearn.metrics import root_mean_squared_error
y_pred = model.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE: {rmse:.2f}")

RMSE: 72.73
