In [None]:
# Import necessary libraries

import pandas as pd
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
# Load the dataset
df = pd.read_csv('train_tickets_fare_ds/price_data.csv')

# Select relevant columns
df = df[["classCode", "distance", "duration", "totalFare"]]

In [None]:
# Feature engineering function with enhanced features
def featuring(df):
    df_eng = df.copy()
    
    # 1. CLASS ENCODING (Most Important)
    class_hierarchy = ['1A', '2A', '3A', 'SL', 'CC', '2S']
    class_mapping = {class_name: i for i, class_name in enumerate(class_hierarchy, 1)}
    df_eng['class_encoded'] = df_eng['classCode'].map(class_mapping)
    
    # Luxury score (reverse of hierarchy)
    luxury_mapping = {'1A': 6, '2A': 5, '3A': 4, 'CC': 2, '2S': 1, 'SL': 3}
    df_eng['luxury_score'] = df_eng['classCode'].map(luxury_mapping)
    
    # 2. SPEED & EFFICIENCY FEATURES (ENHANCED)
    # Speed calculation
    df_eng['speed_kmh'] = df_eng['distance'] / (df_eng['duration'] / 60)
    
    # Efficiency metrics - Higher efficiency = higher cost
    df_eng['km_per_minute'] = df_eng['distance'] / df_eng['duration']
    
    # Speed premium features
    df_eng['speed_premium'] = df_eng['speed_kmh'] * df_eng['luxury_score']  # Fast + luxury = expensive
    df_eng['efficiency_score'] = (df_eng['distance'] / df_eng['duration']) * df_eng['class_encoded']
    
    # Express train indicator (high speed)
    df_eng['is_express'] = (df_eng['speed_kmh'] > 60).astype(int)  # Above 60 km/h = express train
    
    # 3. JOURNEY TYPE CATEGORIZATION (UPDATED)
    df_eng['is_short_journey'] = (df_eng['distance'] <= 200).astype(int)
    df_eng['is_medium_journey'] = ((df_eng['distance'] > 200) & (df_eng['distance'] <= 500)).astype(int)
    df_eng['is_long_journey'] = (df_eng['distance'] > 500).astype(int)
    
    # Journey intensity (distance/duration ratio)
    df_eng['journey_intensity'] = df_eng['distance'] / df_eng['duration']  # Higher = more intense/faster
    
    # 4. INTERACTION FEATURES (ENHANCED)
    # Class-distance interactions
    df_eng['distance_class'] = df_eng['distance'] * df_eng['class_encoded']
    df_eng['distance_luxury'] = df_eng['distance'] * df_eng['luxury_score']
    
    # Speed-class interactions
    df_eng['speed_class'] = df_eng['speed_kmh'] * df_eng['class_encoded']
    df_eng['speed_luxury'] = df_eng['speed_kmh'] * df_eng['luxury_score']
    
    # Combined interaction
    df_eng['distance_duration_ratio'] = df_eng['distance'] / df_eng['duration']
    df_eng['distance_duration_class'] = df_eng['distance'] * df_eng['duration'] * df_eng['class_encoded']
    
    # Express premium (express trains in higher classes cost more)
    df_eng['express_premium'] = df_eng['is_express'] * df_eng['luxury_score']
    
    # 5. POLYNOMIAL FEATURES
    df_eng['distance_squared'] = df_eng['distance'] ** 2
    df_eng['duration_squared'] = df_eng['duration'] ** 2
    df_eng['speed_squared'] = df_eng['speed_kmh'] ** 2  # NEW
    
    # 6. BINNING FEATURES
    df_eng['distance_bin'] = pd.cut(df_eng['distance'], bins=5, labels=False)
    df_eng['duration_bin'] = pd.cut(df_eng['duration'], bins=5, labels=False)
    df_eng['speed_bin'] = pd.cut(df_eng['speed_kmh'], bins=5, labels=False)  # NEW
    
    return df_eng

# Apply feature engineering
df_enhanced = featuring(df)

In [None]:
# Define final features for modeling:
final_features = [
    # Original features
    'distance', 'duration', 'class_encoded',
    
    # Class features
    'luxury_score',
    
    # Speed & Efficiency features (ENHANCED)
    'speed_kmh', 'km_per_minute', 'speed_premium', 'efficiency_score',
    'is_express', 'journey_intensity',
    
    # Journey type
    'is_short_journey', 'is_medium_journey', 'is_long_journey',
    
    # Interaction features
    'distance_class', 'distance_luxury', 'speed_class', 'speed_luxury',
    'distance_duration_ratio', 'distance_duration_class', 'express_premium',
    
    # Polynomial features
    'distance_squared', 'duration_squared', 'speed_squared',
    
    # Binning features
    'distance_bin', 'duration_bin', 'speed_bin'
]

In [92]:
# Prepare your enhanced features
X = df_enhanced[final_features]
y = df_enhanced['totalFare']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

Training set: 261314 samples
Test set: 65329 samples


In [None]:
# Define and train the Bagging Regressor with Decision Tree base estimator
model = BaggingRegressor(
    estimator=DecisionTreeRegressor(max_depth=15, random_state=42),
    n_estimators=120, 
    random_state=42
)

# Retrain and confirm performance
model.fit(X_train, y_train)

0,1,2
,estimator,DecisionTreeR...ndom_state=42)
,n_estimators,120
,max_samples,1.0
,max_features,1.0
,bootstrap,True
,bootstrap_features,False
,oob_score,False
,warm_start,False
,n_jobs,
,random_state,42

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,15
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [128]:
# Evaluate the model
pred = model.predict(X_test)

r2 = r2_score(y_test, pred)
mae = mean_absolute_error(y_test, pred)
print(f"Model: R²: {r2:.4f} | MAE: ₹{mae:.2f}")

Model: R²: 0.9759 | MAE: ₹50.24


In [None]:
def predict_fare_best(distance, duration, class_code):
    """
    Updated prediction function with speed premium features
    """
    # Class encoding
    class_hierarchy = ['1A', '2A', '3A', 'SL', 'CC', '2S']
    class_mapping = {class_name: i for i, class_name in enumerate(class_hierarchy, 1)}
    luxury_mapping = {'1A': 6, '2A': 5, '3A': 4, 'CC': 2, '2S': 1, 'SL': 3}
    
    class_encoded = class_mapping[class_code]
    luxury_score = luxury_mapping[class_code]
    
    # Calculate speed and efficiency features
    speed_kmh = distance / (duration / 60) if duration > 0 else 0
    km_per_minute = distance / duration if duration > 0 else 0
    speed_premium = speed_kmh * luxury_score
    efficiency_score = (distance / duration) * class_encoded if duration > 0 else 0
    is_express = 1 if speed_kmh > 60 else 0
    journey_intensity = distance / duration if duration > 0 else 0
    
    # Journey type
    is_short_journey = 1 if distance <= 200 else 0
    is_medium_journey = 1 if 200 < distance <= 500 else 0
    is_long_journey = 1 if distance > 500 else 0
    
    # Interaction features
    distance_class = distance * class_encoded
    distance_luxury = distance * luxury_score
    speed_class = speed_kmh * class_encoded
    speed_luxury = speed_kmh * luxury_score
    distance_duration_ratio = distance / duration if duration > 0 else 0
    distance_duration_class = distance * duration * class_encoded
    express_premium = is_express * luxury_score
    
    # Polynomial features
    distance_squared = distance ** 2
    duration_squared = duration ** 2
    speed_squared = speed_kmh ** 2
    
    # Binning features (placeholders)
    distance_bin = 0
    duration_bin = 0
    speed_bin = 0
    
    # Create input array
    input_data = [[
        distance, duration, class_encoded,
        luxury_score,
        speed_kmh, km_per_minute, speed_premium, efficiency_score,
        is_express, journey_intensity,
        is_short_journey, is_medium_journey, is_long_journey,
        distance_class, distance_luxury, speed_class, speed_luxury,
        distance_duration_ratio, distance_duration_class, express_premium,
        distance_squared, duration_squared, speed_squared,
        distance_bin, duration_bin, speed_bin
    ]]
    
    predicted_fare = model.predict(input_data)[0]
    return predicted_fare

In [None]:
# Testing the model with Custom inputs
print(f"{predict_fare_best(1860.5, 1790, "SL") * 1.8:.2f}")
print(f"{predict_fare_best(1860.5, 2015, "SL") * 1.8:.2f}")

1351.69
1362.31


