# PART - 3
-----------
## Models 3 & 4 (RandomForestRegressor & DecisionTreeRegressor trained on a sample of the 2024 data)

In [0]:
# Import required libraries
from pyspark.sql import functions as F

In [0]:
# Read the 2024 training data back from the Delta table saved earlier
train_df = spark.table("train_df_2024_delta")

In [0]:
%sql
-- Row count of the 2024 training Delta table
SELECT count(*)
FROM train_df_2024_delta;

count(*)
25689161


In [0]:
%sql
-- Row count of the 2024 test Delta table
SELECT count(*)
FROM test_df_2024_delta;

count(*)
9518622


In [0]:
# Create a Function for Sampling the data
def stratified_sample(df, frac=0.04, seed=42, strata_cols=None):
    """
    Draw a stratified sample of a Spark DataFrame across several categorical columns.

    A helper key column is built by joining the string form of every
    stratification column with "_". Each distinct key is one stratum, and every
    stratum is sampled with the same fraction via ``DataFrame.sampleBy``.

    Parameters:
        df          : input Spark DataFrame
        frac        : fraction of rows to sample from each stratum, in [0, 1]
        seed        : random seed passed to the sampler
        strata_cols : optional iterable of column names that define the strata;
                      when omitted, the nine trip attributes listed below are used

    Returns:
        sample_df : stratified sample DataFrame (helper key column removed)

    Raises:
        ValueError : if ``frac`` is outside [0, 1] or ``strata_cols`` is empty
    """
    # Fail fast so an invalid fraction is caught before the distinct-strata
    # job below is executed.
    if not 0.0 <= frac <= 1.0:
        raise ValueError(f"frac must be between 0 and 1, got {frac!r}")

    if strata_cols is None:
        strata_cols = (
            "pickup_borough",
            "dropoff_borough",
            "payment_type",
            "taxi_colour",
            "month",
            "day_of_week",
            "hour",
            "trip_type",
            "RatecodeID",
        )
    strata_cols = list(strata_cols)
    if not strata_cols:
        raise ValueError("strata_cols must name at least one column")

    # Pick a helper column name that cannot overwrite (and later drop) a real
    # column that already happens to be called "strata".
    strata_key = "strata"
    while strata_key in df.columns:
        strata_key = f"_{strata_key}"

    # Create a combined key for stratification. Every part is cast to string so
    # numeric and string columns are handled uniformly.
    # NOTE(review): concat_ws skips NULL inputs per the Spark docs, so rows with
    # NULLs may share a key with other strata - confirm the inputs are non-null.
    keyed_df = df.withColumn(
        strata_key,
        F.concat_ws("_", *(df[name].cast("string") for name in strata_cols)),
    )

    # Build fractions dictionary (one entry per unique stratum). All strata use
    # the same fraction, so only the distinct keys need to be collected.
    fractions_dict = {
        row[strata_key]: frac
        for row in keyed_df.select(strata_key).distinct().collect()
    }

    # Apply stratified sampling
    sample_df = keyed_df.sampleBy(strata_key, fractions=fractions_dict, seed=seed)

    # Drop helper column
    return sample_df.drop(strata_key)


In [0]:
# Draw a 4% stratified sample of the training data with a fixed seed
sample_df = stratified_sample(train_df, seed=42, frac=0.04)

# Report the shape of the sample (columns come from the schema, rows need a count job)
num_cols = len(sample_df.columns)
num_rows = sample_df.count()

print(f"Rows: {num_rows}, Columns: {num_cols}")

Rows: 1027780, Columns: 21


In [0]:
# Bring the sampled Spark DataFrame to the driver as a pandas DataFrame
train_df_sample = sample_df.toPandas()

In [0]:
# Split the sample into the target (total_amount) and the remaining feature columns
y_train = train_df_sample["total_amount"]
X_train = train_df_sample.drop(columns=["total_amount"])

In [0]:
# Load the saved Preprocessor
import joblib
import os

save_path = "/Volumes/workspace/bde/assignment2"

# No directory creation here: a freshly created, empty directory could not
# contain the file anyway, so a wrong or missing path should fail loudly on load.
# NOTE: joblib.load unpickles the file - only load artifacts from a trusted location.
preprocessor = joblib.load(os.path.join(save_path, "preprocessor.joblib"))

# Only the preprocessor is loaded in this cell, so the message says exactly that
print("✅ Preprocessor loaded successfully.")


✅ Model and preprocessor loaded successfully.


In [0]:
# Apply the already-fitted preprocessor to the training features (transform only, no refit)
X_train_preprocessed = preprocessor.transform(X_train)

In [0]:
# Train the RandomForestRegressor model
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor  # used by the decision-tree cell further down
from sklearn.metrics import mean_squared_error

# n_jobs=-1 fits the 100 trees in parallel on all available cores instead of
# one at a time; random_state=42 keeps the run repeatable.
model_1 = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model_1.fit(X_train_preprocessed, y_train)

In [0]:
# Persist the fitted random forest to the volume
import joblib
import os

# Destination directory
save_path = "/Volumes/workspace/bde/assignment2"

# Create the destination if it is missing, then write the model file into it
os.makedirs(save_path, exist_ok=True)
joblib.dump(
    model_1,
    os.path.join(save_path, "randomforest_2024_model.joblib"),
)

print("✅ Model saved successfully.")

✅ Model saved successfully.


In [0]:
# Reload the saved test features and targets
import numpy as np
import os

save_path = "/Volumes/workspace/bde/assignment2"

# Both .npy files sit in the same directory; features are loaded first, then targets
X_test, y_test = (
    np.load(os.path.join(save_path, file_name))
    for file_name in ("X_test_processed.npy", "y_test.npy")
)

print("✅ Reloaded preprocessed test set")


✅ Reloaded preprocessed test set


In [0]:
# Score the random forest on the test set using RMSE
from sklearn.metrics import root_mean_squared_error

y_pred = model_1.predict(X_test)
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)

print(f"Random Forest Regressor RMSE: {rmse:.2f}")

Random Forest Regressor RMSE: 1.66


In [0]:
# Train the DecisionTreeRegressor model
# random_state is pinned (same seed as the random forest above) so that repeated
# runs build the same tree and the reported RMSE is reproducible.
model_2 = DecisionTreeRegressor(random_state=42)
model_2.fit(X_train_preprocessed, y_train)

In [0]:
# Score the decision tree on the test set using RMSE
from sklearn.metrics import root_mean_squared_error

y_pred = model_2.predict(X_test)
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)

print(f"Decision Tree Regressor RMSE: {rmse:.2f}")

Decision Tree Regressor RMSE: 2.33
