In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error
import warnings

warnings.filterwarnings('ignore')
print("Libraries imported successfully.")

Libraries imported successfully.


In [2]:
# --- 1. Load Data and Engineer Features ---
# --- THIS IS THE CHANGE: Using your new expanded file ---
input_filename = 'passenger_train_data_expanded.csv'

try:
    df_train = pd.read_csv(input_filename, sep=',')
    print(f"Successfully loaded '{input_filename}'. Found {len(df_train)} rows.")

    # --- 2. Engineer Features ---
    df_train['timeStamp'] = pd.to_datetime(df_train['timeStamp'])
    df_train['booking_month'] = df_train['timeStamp'].dt.month
    df_train['booking_day_of_week'] = df_train['timeStamp'].dt.dayofweek
    df_train['booking_hour'] = df_train['timeStamp'].dt.hour
    
    print("Feature engineering complete.")

except FileNotFoundError:
    print(f"Error: '{input_filename}' not found.")
except Exception as e:
    print(f"An error occurred: {e}")

Successfully loaded 'passenger_train_data_expanded.csv'. Found 180839 rows.
Feature engineering complete.


In [3]:
# --- 3. Define Feature Set (X) and Target (y) ---
feature_cols = [
    'fromStnCode', 
    'toStnCode', 
    'classCode', 
    'distance', 
    'duration', 
    'booking_month', 
    'booking_day_of_week',
    'booking_hour'
]

df_train = df_train.dropna(subset=['totalFare'] + feature_cols)
print(f"Rows after dropping any missing values: {len(df_train)}")

X = df_train[feature_cols]
y = df_train['totalFare']

print("--- 'Passenger' Feature Set (X) ---")
print(X.head())

# --- 4. Define Preprocessing ---
categorical_features = ['fromStnCode', 'toStnCode', 'classCode']
numerical_features = ['distance', 'duration', 'booking_month', 
                      'booking_day_of_week', 'booking_hour']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# --- 5. Split the Data ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=37)

print(f"Data split into {len(X_train)} training rows and {len(X_test)} test rows.")

Rows after dropping any missing values: 180839
--- 'Passenger' Feature Set (X) ---
  fromStnCode toStnCode classCode  distance  duration  booking_month  \
0         JBP      SRID        1A        54      33.0             10   
1         JBP      SRID        2A        54      33.0             10   
2         JBP      SRID        3A        54      33.0             10   
3         JBP      SRID        SL        54      33.0             10   
4         JBP       KKB        1A        69      49.0             10   

   booking_day_of_week  booking_hour  
0                    1            22  
1                    1            22  
2                    1            22  
3                    1            22  
4                    1            22  
Data split into 126587 training rows and 54252 test rows.


In [4]:
# --- 6. Create and Train the Pipeline ---
rf_model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', rf_model)])

# --- NOTE ---
# This will take longer to train than before, as the dataset is
# now much larger (180k rows vs 52k). Please be patient.
print("Training the RandomForest model on 180,839 rows...")
pipeline.fit(X_train, y_train)
print("Training complete.")

Training the RandomForest model on 180,839 rows...
Training complete.


In [5]:
# --- 7. Evaluate the New Model ---
y_pred = pipeline.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"\n--- 'Passenger' Model Performance (Expanded Dataset) ---")
print(f"R-squared (R²): {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")


--- 'Passenger' Model Performance (Expanded Dataset) ---
R-squared (R²): 0.9879
Mean Absolute Error (MAE): 38.50


In [6]:
# --- 8. Save the New Model ---
# We save it with the SAME name. This will overwrite the old,
# smaller model, and the app will automatically use this new one.
output_model_name = 'train_price_pipeline.pkl'
joblib.dump(pipeline, output_model_name)
print(f"\nNew, expanded train pipeline saved as '{output_model_name}'")


New, expanded train pipeline saved as 'train_price_pipeline.pkl'
