In [2]:
import pandas as pd

In [3]:
# Display the installed pandas version so the environment is recorded next to the results.
pd.__version__

'1.4.2'

In [4]:
# Yellow-taxi trip file for 2024-01 (per the file name); every training step below starts from `df`.
train_data_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet'
df = pd.read_parquet(train_data_url)

In [5]:
# Preview the first rows to sanity-check the loaded columns and values.
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,2,2024-01-01 00:57:55,2024-01-01 01:17:43,1.0,1.72,1.0,N,186,79,2,17.7,1.0,0.5,0.0,0.0,1.0,22.7,2.5,0.0
1,1,2024-01-01 00:03:00,2024-01-01 00:09:36,1.0,1.8,1.0,N,140,236,1,10.0,3.5,0.5,3.75,0.0,1.0,18.75,2.5,0.0
2,1,2024-01-01 00:17:06,2024-01-01 00:35:01,1.0,4.7,1.0,N,236,79,1,23.3,3.5,0.5,3.0,0.0,1.0,31.3,2.5,0.0
3,1,2024-01-01 00:36:38,2024-01-01 00:44:56,1.0,1.4,1.0,N,79,211,1,10.0,3.5,0.5,2.0,0.0,1.0,17.0,2.5,0.0
4,1,2024-01-01 00:46:51,2024-01-01 00:52:57,1.0,0.8,1.0,N,211,148,1,7.9,3.5,0.5,3.2,0.0,1.0,16.1,2.5,0.0


In [6]:
import sklearn

In [7]:
# Display the installed scikit-learn version so the environment is recorded next to the results.
sklearn.__version__

'1.0.2'

In [8]:
# (rows, columns) of the raw frame, before any derived column is added.
print(df.shape)

(2964624, 19)


In [9]:
# Number of columns, read from the second entry of the frame's shape.
print(df.shape[1])

19


In [10]:
# Column labels as a plain Python list.
print(df.columns.tolist())

['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'Airport_fee']


In [11]:
# Elapsed time per trip (drop-off minus pickup); converted to minutes in the next cell.
df['duration'] = df['tpep_dropoff_datetime'].sub(df['tpep_pickup_datetime'])

In [12]:
# Trip duration in minutes, derived directly from the two timestamp columns.
# Reading the current contents of df['duration'] instead made this cell fail on a
# second run: once the column holds floats it no longer has a `.dt` accessor.
df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60


In [13]:
# Spread of the trip durations (minutes) before any outlier filtering.
std_dev_duration = df['duration'].std()

# A single print call with a newline separator emits the same two report lines.
print(
    f"The data type of the 'duration' column is: {df['duration'].dtype}",
    f"Standard deviation of trip duration in minutes: {std_dev_duration}",
    sep="\n",
)

The data type of the 'duration' column is: float64
Standard deviation of trip duration in minutes: 34.851053592192876


In [14]:
# Trips are kept only when their duration lies in the closed interval [1, 60] minutes.
lower_bound_duration = 1
upper_bound_duration = 60

print(f"Number of records before dropping outliers: {len(df)}")

# Series.between includes both endpoints by default, i.e. `>= lower` and `<= upper`.
df_filtered = df.loc[
    df['duration'].between(lower_bound_duration, upper_bound_duration)
].copy()
print(f"Number of records after dropping outliers (1 to 60 min duration): {len(df_filtered)}")

# Share of the original rows that survive the filter.
percentage_remaining = len(df_filtered) / len(df) * 100
print(f"Percentage of records remaining: {percentage_remaining:.2f}%")

Number of records before dropping outliers: 2964624
Number of records after dropping outliers (1 to 60 min duration): 2898906
Percentage of records remaining: 97.78%


In [15]:
from sklearn.feature_extraction import DictVectorizer

# Pickup and drop-off location IDs are the only model inputs. They are cast to strings
# so that DictVectorizer one-hot encodes them; missing IDs become the string "-1".
categorical_cols = ['PULocationID', 'DOLocationID']
df_filtered[categorical_cols] = df_filtered[categorical_cols].fillna(-1).astype(str)

# One {column: value} dict per trip.
records_for_dv = df_filtered[categorical_cols].to_dict(orient='records')

# Fit the vectorizer on the training records and encode them in the same call.
dv = DictVectorizer()
X_train = dv.fit_transform(records_for_dv)

num_features = X_train.shape[1]

print(
    f"The shape of the feature matrix is: {X_train.shape}",
    f"The dimensionality (number of features) of the matrix after one-hot encoding is: {num_features}",
    sep="\n",
)


The shape of the feature matrix is: (2898906, 518)
The dimensionality (number of features) of the matrix after one-hot encoding is: 518


In [16]:
# Regression target: the duration column of the filtered training frame as a NumPy array.
target = 'duration'
y_train = df_filtered[target].to_numpy()
print(f"Shape of y_train: {y_train.shape}")

Shape of y_train: (2898906,)


In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np


target = 'duration'

# Rebuild the target from the same frame that produced X_train so rows stay aligned.
# The earlier fallback to the unfiltered `df` could never be valid: X_train is built
# from df_filtered only, so a target taken from `df` would have a different row count.
y_train = df_filtered[target].values
assert X_train.shape[0] == y_train.shape[0], (
    f"X_train has {X_train.shape[0]} rows but y_train has {y_train.shape[0]}"
)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")

# Fit a linear regression on the one-hot encoded location features.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
print("Linear Regression model trained.")

# Score the model on the rows it was trained on (in-sample error only).
y_pred_train = lr_model.predict(X_train)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = np.sqrt(mse_train)

print(f"Mean Squared Error (MSE) on training data: {mse_train}")
print(f"Root Mean Squared Error (RMSE) on training data: {rmse_train}")

Shape of X_train: (2898906, 518)
Shape of y_train: (2898906,)
Linear Regression model trained.
Mean Squared Error (MSE) on training data: 63.141671060223224
Root Mean Squared Error (RMSE) on training data: 7.946173359562653


In [18]:
# CELL TO RUN BEFORE STARTING Q6 DATA LOADING
#
# Releases the large training-only objects so the validation month fits in memory.
# 'dv' (fitted DictVectorizer), 'lr_model' (trained LinearRegression), 'categorical_cols',
# 'target', 'lower_bound_duration' and 'upper_bound_duration' MUST remain in memory and
# are deliberately not touched here: the validation cell reads all of them.
# Once this cell has run, the training cells cannot be re-executed without first
# re-running the outlier-filter cell that creates df_filtered.
import gc


def release_variables(namespace, names):
    """Remove each of `names` from `namespace` when it is present.

    Args:
        namespace: Mutable mapping of variable names to objects, e.g. ``globals()``.
        names: Iterable of variable names to drop.

    Returns:
        list: The names that were actually removed, in the order they were given.
    """
    released = []
    for name in names:
        if name in namespace:
            del namespace[name]
            released.append(name)
    return released


print("Attempting to free up memory from training data artifacts...")

# Objects to release, mapped to the line reported once each one is gone.
# Deletion is keyed on the variable name only; the old check against a hard-coded
# row count silently skipped the delete whenever the input data changed.
# records_for_dv (one dict per training row) and y_pred_train are training-only
# leftovers that were previously never released.
# NOTE(review): the raw frame `df` is also still in memory; it is left in place because
# the exploratory cells above read it. Consider releasing it too if memory stays tight.
cleanup_messages = {
    'X_train': "Deleted X_train.",
    'y_train': "Deleted y_train.",
    'df_filtered': "Deleted the main filtered training DataFrame (assumed name: df_filtered).",
    'records_for_dv': "Deleted records_for_dv.",
    'y_pred_train': "Deleted y_pred_train.",
}

for released_name in release_variables(globals(), cleanup_messages):
    print(cleanup_messages[released_name])

gc.collect()  # Trigger garbage collection
print("Memory cleanup attempted.")

Attempting to free up memory from training data artifacts...
Deleted X_train.
Deleted y_train.
Deleted the main filtered training DataFrame (assumed name: df_filtered).
Memory cleanup attempted.


In [19]:
# --- Question 6: RMSE on Validation Data (Revised for Memory) ---
#
# Prerequisites already in the kernel: the fitted 'dv', the trained 'lr_model', and
# 'categorical_cols', 'target', 'lower_bound_duration', 'upper_bound_duration'.
# Run the memory-cleanup cell before this one.

print("Starting Question 6 processing (memory optimized)...")
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import gc  # explicit collections after each large object is dropped

# 1. Load the validation month.
# NOTE: confirm this URL points at the intended validation file (2024-02 here).
validation_data_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet'

print(f"Loading validation data from: {validation_data_url}")
df_val = pd.read_parquet(validation_data_url)
print(f"Loaded validation data with {len(df_val)} records.")

# 2. Repeat the preprocessing that was applied to the training data.

# a) Trip duration in minutes.
df_val['duration'] = (
    df_val['tpep_dropoff_datetime'] - df_val['tpep_pickup_datetime']
).dt.total_seconds() / 60
print("Calculated duration for validation data.")

# b) Keep trips inside the same inclusive duration bounds used for training.
df_val_filtered = df_val.loc[
    df_val['duration'].between(lower_bound_duration, upper_bound_duration)
].copy()
del df_val  # the unfiltered validation frame is not needed past this point
gc.collect()
print(f"Validation data after filtering outliers: {len(df_val_filtered)} records.")

# Placeholder reported at the end when no RMSE could be computed.
rmse_val = "Error: Could not compute."

if len(df_val_filtered) != 0:
    # c) Same string casting of the location IDs as on the training side.
    df_val_filtered[categorical_cols] = df_val_filtered[categorical_cols].fillna(-1).astype(str)

    def records_generator_for_dv(df, Pcols):
        """Yield one {column: value} dict per row of df[Pcols], one row at a time."""
        for row_values in df[Pcols].itertuples(index=False, name=None):
            yield dict(zip(Pcols, row_values))

    # d) Encode with the vectorizer that was fitted on the training data; feeding it a
    #    generator avoids building the full list of per-row dicts first.
    print("Transforming validation data with DictVectorizer (using generator)...")
    X_val = dv.transform(records_generator_for_dv(df_val_filtered, categorical_cols))
    gc.collect()
    print(f"Shape of X_val: {X_val.shape}")

    # 3. Target vector; the filtered frame can be dropped once it has been extracted.
    y_val = df_val_filtered[target].to_numpy()
    del df_val_filtered
    gc.collect()
    print(f"Shape of y_val: {y_val.shape}")

    if 0 < X_val.shape[0] == y_val.shape[0]:
        # 4. Predict with the model trained earlier.
        print("Making predictions on validation data...")
        y_pred_val = lr_model.predict(X_val)

        # 5. Error metrics on the validation rows.
        mse_val = mean_squared_error(y_val, y_pred_val)
        rmse_val = np.sqrt(mse_val)

        print(f"Mean Squared Error (MSE) on validation data: {mse_val}")
        print(f"Root Mean Squared Error (RMSE) on validation data: {rmse_val}")
    else:
        print("Skipping RMSE calculation: Shape mismatch or empty data for X_val/y_val.")
else:
    print("No data remains after filtering the validation set. Cannot proceed.")

# Final output for the question
print(f"\nFinal RMSE on validation (for Question 6): {rmse_val}")

Starting Question 6 processing (memory optimized)...
Loading validation data from: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet
Loaded validation data with 3007526 records.
Calculated duration for validation data.
Validation data after filtering outliers: 2938060 records.
Transforming validation data with DictVectorizer (using generator)...
Shape of X_val: (2938060, 518)
Shape of y_val: (2938060,)
Making predictions on validation data...
Mean Squared Error (MSE) on validation data: 65.98935055442578
Root Mean Squared Error (RMSE) on validation data: 8.123382950127723

Final RMSE on validation (for Question 6): 8.123382950127723
