In [20]:
pip install lightgbm


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   -------------- ------------------------- 0.5/1.5 MB 1.7 MB/s eta 0:00:01
   -------------- ------------------------- 0.5/1.5 MB 1.7 MB/s eta 0:00:01
   -------------- ------------------------- 0.5/1.5 MB 1.7 MB/s eta 0:00:01
   ---------------------------- ----------- 1.0/1.5 MB 948.7 kB/s eta 0:00:01
   ---------------------------- ----------- 1.0/1.5 MB 948.7 kB/s eta 0:00:01
   ------------------------------------ --- 1.3/1.5 MB 838.9 kB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 855.0 kB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to resta


[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [23]:
import pandas as pd
import numpy as np
import requests
import json
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Core ML Libraries
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor, 
                             ExtraTreesRegressor, AdaBoostRegressor, BaggingRegressor)
from sklearn.linear_model import (LinearRegression, Ridge, Lasso, ElasticNet, 
                                 HuberRegressor, RANSACRegressor)
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import (train_test_split, cross_val_score, 
                                   GridSearchCV, RandomizedSearchCV, KFold)
from sklearn.preprocessing import (StandardScaler, MinMaxScaler, RobustScaler, 
                                 LabelEncoder, PolynomialFeatures, PowerTransformer)
from sklearn.metrics import (mean_absolute_error, mean_squared_error, r2_score, 
                           mean_absolute_percentage_error, explained_variance_score)
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, RegressorMixin
import xgboost as xgb
import lightgbm as lgb
from scipy import stats
from scipy.stats import zscore
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Tuple, Optional
import pickle
import joblib

class AdvancedEnsembleRegressor(BaseEstimator, RegressorMixin):
    """
    Advanced ensemble regressor with dynamic weighting and stacking
    """
    def __init__(self, base_models=None, meta_model=None, use_stacking=True):
        self.base_models = base_models or []
        self.meta_model = meta_model or Ridge(alpha=0.1)
        self.use_stacking = use_stacking
        self.weights = None
        self.trained_models = []
        
    def fit(self, X, y):
        # Train base models
        self.trained_models = []
        base_predictions = np.zeros((X.shape[0], len(self.base_models)))
        
        for i, model in enumerate(self.base_models):
            model_copy = model.__class__(**model.get_params())
            model_copy.fit(X, y)
            self.trained_models.append(model_copy)
            base_predictions[:, i] = model_copy.predict(X)
        
        if self.use_stacking:
            # Train meta-model on base predictions
            self.meta_model.fit(base_predictions, y)
        else:
            # Calculate dynamic weights based on individual model performance
            self.weights = []
            for i, model in enumerate(self.trained_models):
                pred = base_predictions[:, i]
                mse = mean_squared_error(y, pred)
                weight = 1 / (mse + 1e-10)  # Inverse of MSE
                self.weights.append(weight)
            
            # Normalize weights
            total_weight = sum(self.weights)
            self.weights = [w / total_weight for w in self.weights]
        
        return self
    
    def predict(self, X):
        base_predictions = np.zeros((X.shape[0], len(self.trained_models)))
        
        for i, model in enumerate(self.trained_models):
            base_predictions[:, i] = model.predict(X)
        
        if self.use_stacking:
            return self.meta_model.predict(base_predictions)
        else:
            # Weighted average
            return np.average(base_predictions, axis=1, weights=self.weights)

class AdvancedVehicleSystem:
    def __init__(self):
        self.models = {}
        self.scalers = {}
        self.label_encoders = {}
        self.feature_selectors = {}
        self.model_performances = {}
        self.best_model = None
        self.feature_importance = {}
        
        # Initialize all algorithms
        self.algorithms = {
            'random_forest': RandomForestRegressor(n_estimators=200, max_depth=15, random_state=42),
            'gradient_boosting': GradientBoostingRegressor(n_estimators=200, max_depth=8, random_state=42),
            'xgboost': xgb.XGBRegressor(n_estimators=200, max_depth=8, random_state=42),
            'lightgbm': lgb.LGBMRegressor(n_estimators=200, max_depth=8, random_state=42),
            'extra_trees': ExtraTreesRegressor(n_estimators=200, max_depth=15, random_state=42),
            'ada_boost': AdaBoostRegressor(n_estimators=100, random_state=42),
            'svr_rbf': SVR(kernel='rbf', C=100, gamma='scale'),
            'svr_linear': SVR(kernel='linear', C=100),
            'knn': KNeighborsRegressor(n_neighbors=10, weights='distance'),
            'mlp': MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42),
            'ridge': Ridge(alpha=1.0),
            'lasso': Lasso(alpha=0.1),
            'elastic_net': ElasticNet(alpha=0.1, l1_ratio=0.5),
            'huber': HuberRegressor(epsilon=1.35),
            'ransac': RANSACRegressor(random_state=42),
            'bagging': BaggingRegressor(n_estimators=100, random_state=42)
        }
        
        # Advanced ensemble configurations
        self.ensemble_configs = {
            'voting_ensemble': [
                self.algorithms['random_forest'],
                self.algorithms['gradient_boosting'],
                self.algorithms['xgboost']
            ],
            'stacking_ensemble': [
                self.algorithms['random_forest'],
                self.algorithms['gradient_boosting'],
                self.algorithms['xgboost'],
                self.algorithms['lightgbm'],
                self.algorithms['extra_trees']
            ],
            'meta_ensemble': [
                self.algorithms['random_forest'],
                self.algorithms['gradient_boosting'],
                self.algorithms['xgboost'],
                self.algorithms['lightgbm'],
                self.algorithms['svr_rbf'],
                self.algorithms['mlp']
            ]
        }

     
    # Fix for the generate_advanced_dataset method
# Replace the existing probability arrays with these corrected versions:

def generate_advanced_dataset(self, n_samples=50000):
    """
    Generate comprehensive synthetic dataset with realistic features
    """
    np.random.seed(42)
    
    # Basic features - FIXED PROBABILITIES
    years = np.random.randint(1990, 2025, n_samples)
    
    # Fixed make probabilities (sum = 1.0)
    make_probs = [0.12, 0.11, 0.10, 0.09, 0.08, 0.07, 0.06, 0.05, 0.04, 0.04,
                  0.04, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02, 0.02, 0.01, 0.01]
    make_probs = np.array(make_probs)
    make_probs = make_probs / make_probs.sum()  # Normalize to ensure sum = 1
    
    makes = np.random.choice([
        'Toyota', 'Honda', 'Ford', 'Chevrolet', 'BMW', 'Mercedes', 'Audi', 
        'Nissan', 'Hyundai', 'Kia', 'Volkswagen', 'Subaru', 'Mazda', 'Lexus',
        'Acura', 'Infiniti', 'Cadillac', 'Lincoln', 'Jaguar', 'Land Rover'
    ], n_samples, p=make_probs)
    
    # Fixed body type probabilities
    body_probs = [0.25, 0.20, 0.15, 0.10, 0.08, 0.05, 0.05, 0.05, 0.07]
    body_probs = np.array(body_probs)
    body_probs = body_probs / body_probs.sum()
    
    body_types = np.random.choice([
        'Sedan', 'SUV', 'Truck', 'Coupe', 'Hatchback', 'Convertible', 
        'Wagon', 'Minivan', 'Crossover'
    ], n_samples, p=body_probs)
    
    # Fixed fuel type probabilities
    fuel_probs = [0.70, 0.15, 0.08, 0.05, 0.02]
    fuel_probs = np.array(fuel_probs)
    fuel_probs = fuel_probs / fuel_probs.sum()
    
    fuel_types = np.random.choice([
        'Gasoline', 'Hybrid', 'Electric', 'Diesel', 'Plug-in Hybrid'
    ], n_samples, p=fuel_probs)
    
    # Fixed transmission probabilities
    trans_probs = [0.75, 0.15, 0.08, 0.02]
    trans_probs = np.array(trans_probs)
    trans_probs = trans_probs / trans_probs.sum()
    
    transmissions = np.random.choice([
        'Automatic', 'Manual', 'CVT', 'Semi-Automatic'
    ], n_samples, p=trans_probs)
    
    # Fixed drivetrain probabilities
    drive_probs = [0.45, 0.25, 0.20, 0.10]
    drive_probs = np.array(drive_probs)
    drive_probs = drive_probs / drive_probs.sum()
    
    drivetrains = np.random.choice([
        'FWD', 'RWD', 'AWD', '4WD'
    ], n_samples, p=drive_probs)
    
    # Fixed condition probabilities
    cond_probs = [0.05, 0.15, 0.35, 0.30, 0.15]
    cond_probs = np.array(cond_probs)
    cond_probs = cond_probs / cond_probs.sum()
    
    conditions = np.random.choice([
        'Poor', 'Fair', 'Good', 'Very Good', 'Excellent'
    ], n_samples, p=cond_probs)
    
    # Fixed engine size probabilities
    engine_probs = [0.05, 0.15, 0.20, 0.20, 0.15, 0.10, 0.08, 0.05, 0.02]
    engine_probs = np.array(engine_probs)
    engine_probs = engine_probs / engine_probs.sum()
    
    engine_sizes = np.random.choice([
        '1.0L', '1.5L', '2.0L', '2.5L', '3.0L', '3.5L', '4.0L', '5.0L', '6.0L+'
    ], n_samples, p=engine_probs)
    
    # Fixed cylinder probabilities
    cyl_probs = [0.05, 0.50, 0.30, 0.12, 0.02, 0.01]
    cyl_probs = np.array(cyl_probs)
    cyl_probs = cyl_probs / cyl_probs.sum()
    
    cylinders = np.random.choice([3, 4, 6, 8, 10, 12], n_samples, p=cyl_probs)
    
    # Fixed state probabilities
    state_probs = [0.12, 0.09, 0.06, 0.06, 0.04, 0.04, 0.04, 0.03, 0.03, 0.03, 0.46]
    state_probs = np.array(state_probs)
    state_probs = state_probs / state_probs.sum()
    
    states = np.random.choice([
        'CA', 'TX', 'FL', 'NY', 'PA', 'IL', 'OH', 'GA', 'NC', 'MI', 'Others'
    ], n_samples, p=state_probs)
    
    # Fixed market demand probabilities
    demand_probs = [0.3, 0.5, 0.2]
    demand_probs = np.array(demand_probs)
    demand_probs = demand_probs / demand_probs.sum()
    
    market_demand = np.random.choice(['Low', 'Medium', 'High'], n_samples, p=demand_probs)
    
    # Rest of the method remains the same...
    # (Continue with the existing mileage calculation, price calculation, etc.)
    def _calculate_realistic_prices(self, years, makes, body_types, fuel_types, transmissions, 
                                  drivetrains, conditions, engine_sizes, cylinders, mileages, 
                                  states, accident_counts, service_records, prev_owners, 
                                  market_demand, seasonal_factor):
        """
        Calculate realistic vehicle prices based on multiple factors
        """
        n_samples = len(years)
        
        # Base MSRP by make and body type
        make_multipliers = {
            'Toyota': 1.0, 'Honda': 0.95, 'Ford': 0.85, 'Chevrolet': 0.80,
            'BMW': 1.8, 'Mercedes': 1.9, 'Audi': 1.7, 'Nissan': 0.90,
            'Hyundai': 0.75, 'Kia': 0.70, 'Volkswagen': 1.1, 'Subaru': 1.05,
            'Mazda': 0.95, 'Lexus': 1.6, 'Acura': 1.4, 'Infiniti': 1.3,
            'Cadillac': 1.5, 'Lincoln': 1.4, 'Jaguar': 2.0, 'Land Rover': 2.1
        }
        
        body_multipliers = {
            'Sedan': 1.0, 'SUV': 1.3, 'Truck': 1.2, 'Coupe': 1.1,
            'Hatchback': 0.9, 'Convertible': 1.4, 'Wagon': 1.05,
            'Minivan': 1.15, 'Crossover': 1.25
        }
        
        # Calculate base prices
        base_msrp = 25000  # Base MSRP
        make_factors = np.array([make_multipliers.get(make, 1.0) for make in makes])
        body_factors = np.array([body_multipliers.get(body, 1.0) for body in body_types])
        
        # Year factor (newer cars worth more)
        year_factors = 1 + (years - 2000) * 0.02
        
        # Calculate initial price
        initial_prices = base_msrp * make_factors * body_factors * year_factors
        
        # Apply depreciation
        vehicle_ages = 2025 - years
        depreciation = np.power(0.85, vehicle_ages)  # 15% depreciation per year
        
        # Mileage factor
        mileage_factors = np.exp(-mileages / 200000)  # Exponential decay with mileage
        
        # Condition multipliers
        condition_multipliers = {
            'Poor': 0.6, 'Fair': 0.8, 'Good': 1.0, 'Very Good': 1.2, 'Excellent': 1.4
        }
        condition_factors = np.array([condition_multipliers[cond] for cond in conditions])
        
        # Fuel type adjustments
        fuel_multipliers = {
            'Gasoline': 1.0, 'Hybrid': 1.15, 'Electric': 1.25, 
            'Diesel': 1.05, 'Plug-in Hybrid': 1.20
        }
        fuel_factors = np.array([fuel_multipliers[fuel] for fuel in fuel_types])
        
        # Transmission adjustments
        trans_multipliers = {
            'Automatic': 1.0, 'Manual': 0.95, 'CVT': 0.98, 'Semi-Automatic': 1.1
        }
        trans_factors = np.array([trans_multipliers[trans] for trans in transmissions])
        
        # Drivetrain adjustments
        drive_multipliers = {'FWD': 1.0, 'RWD': 1.05, 'AWD': 1.15, '4WD': 1.20}
        drive_factors = np.array([drive_multipliers[drive] for drive in drivetrains])
        
        # Accident penalty
        accident_factors = np.power(0.9, accident_counts)  # 10% reduction per accident
        
        # Previous owners penalty
        owner_factors = np.power(0.97, prev_owners)  # 3% reduction per previous owner
        
        # Market demand adjustment
        demand_multipliers = {'Low': 0.92, 'Medium': 1.0, 'High': 1.12}
        demand_factors = np.array([demand_multipliers[demand] for demand in market_demand])
        
        # Calculate final prices
        final_prices = (initial_prices * depreciation * mileage_factors * condition_factors * 
                       fuel_factors * trans_factors * drive_factors * accident_factors * 
                       owner_factors * demand_factors * seasonal_factor)
        
        # Add some realistic noise
        noise = np.random.normal(1, 0.1, n_samples)
        final_prices *= noise
        
        # Ensure reasonable price range
        final_prices = np.clip(final_prices, 1000, 150000)
        
        return final_prices
    
    def advanced_feature_engineering(self, df):
        """
        Create advanced features for better model performance
        """
        df_engineered = df.copy()
        
        # Interaction features
        df_engineered['age_mileage_interaction'] = df_engineered['vehicle_age'] * df_engineered['mileage']
        df_engineered['luxury_age_interaction'] = df_engineered['luxury_brand'].astype(int) * df_engineered['vehicle_age']
        df_engineered['condition_mileage_interaction'] = df_engineered['condition'].map({
            'Poor': 1, 'Fair': 2, 'Good': 3, 'Very Good': 4, 'Excellent': 5
        }) * df_engineered['mileage']
        
        # Polynomial features for key variables
        df_engineered['mileage_squared'] = df_engineered['mileage'] ** 2
        df_engineered['age_squared'] = df_engineered['vehicle_age'] ** 2
        df_engineered['mileage_log'] = np.log1p(df_engineered['mileage'])
        
        # Binning continuous variables
        df_engineered['mileage_bin'] = pd.cut(df_engineered['mileage'], 
                                            bins=[0, 25000, 50000, 100000, float('inf')], 
                                            labels=['Low', 'Medium', 'High', 'Very High'])
        
        df_engineered['age_bin'] = pd.cut(df_engineered['vehicle_age'], 
                                        bins=[0, 3, 7, 15, float('inf')], 
                                        labels=['New', 'Recent', 'Mature', 'Old'])
        
        # Market segment classification
        luxury_makes = ['BMW', 'Mercedes', 'Audi', 'Lexus', 'Acura', 'Infiniti', 'Cadillac', 'Lincoln', 'Jaguar', 'Land Rover']
        economy_makes = ['Hyundai', 'Kia', 'Nissan', 'Chevrolet']
        
        df_engineered['market_segment'] = 'Mainstream'
        df_engineered.loc[df_engineered['make'].isin(luxury_makes), 'market_segment'] = 'Luxury'
        df_engineered.loc[df_engineered['make'].isin(economy_makes), 'market_segment'] = 'Economy'
        
        return df_engineered
    
    def comprehensive_model_training(self, df):
        """
        Train multiple models with comprehensive evaluation
        """
        print("Starting comprehensive model training...")
        
        # Feature engineering
        df_engineered = self.advanced_feature_engineering(df)
        
        # Prepare features
        categorical_columns = ['make', 'body_type', 'fuel_type', 'transmission', 'drivetrain', 
                             'condition', 'engine_size', 'state', 'market_demand', 'mileage_bin', 
                             'age_bin', 'market_segment']
        
        numerical_columns = ['year', 'cylinders', 'mileage', 'accident_count', 'service_records', 
                           'previous_owners', 'seasonal_factor', 'vehicle_age', 'mileage_per_year',
                           'depreciation_rate', 'age_mileage_interaction', 'luxury_age_interaction',
                           'condition_mileage_interaction', 'mileage_squared', 'age_squared', 'mileage_log']
        
        boolean_columns = ['luxury_brand', 'electric_hybrid', 'high_performance']
        
        # Encode categorical variables
        df_processed = df_engineered.copy()
        for col in categorical_columns:
            if col not in self.label_encoders:
                self.label_encoders[col] = LabelEncoder()
            df_processed[col + '_encoded'] = self.label_encoders[col].fit_transform(df_processed[col])
        
        # Prepare final feature set
        feature_columns = (numerical_columns + boolean_columns + 
                          [col + '_encoded' for col in categorical_columns])
        
        X = df_processed[feature_columns].fillna(0)
        y = df_processed['price']
        
        # Remove outliers
        z_scores = np.abs(zscore(y))
        X = X[z_scores < 3]
        y = y[z_scores < 3]
        
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
        
        print(f"Training set size: {X_train.shape[0]}")
        print(f"Validation set size: {X_val.shape[0]}")
        print(f"Test set size: {X_test.shape[0]}")
        
        # Scale features
        self.scalers['standard'] = StandardScaler()
        self.scalers['robust'] = RobustScaler()
        self.scalers['minmax'] = MinMaxScaler()
        
        X_train_std = self.scalers['standard'].fit_transform(X_train)
        X_val_std = self.scalers['standard'].transform(X_val)
        X_test_std = self.scalers['standard'].transform(X_test)
        
        X_train_robust = self.scalers['robust'].fit_transform(X_train)
        X_val_robust = self.scalers['robust'].transform(X_val)
        X_test_robust = self.scalers['robust'].transform(X_test)
        
        # Train individual models
        print("\nTraining individual models...")
        individual_results = {}
        
        for name, model in self.algorithms.items():
            print(f"Training {name}...")
            try:
                # Choose appropriate scaling
                if name in ['svr_rbf', 'svr_linear', 'knn', 'mlp']:
                    X_tr, X_v, X_te = X_train_std, X_val_std, X_test_std
                elif name in ['huber', 'ransac']:
                    X_tr, X_v, X_te = X_train_robust, X_val_robust, X_test_robust
                else:
                    X_tr, X_v, X_te = X_train, X_val, X_test
                
                # Train model
                model.fit(X_tr, y_train)
                
                # Predictions
                train_pred = model.predict(X_tr)
                val_pred = model.predict(X_v)
                test_pred = model.predict(X_te)
                
                # Calculate metrics
                metrics = {
                    'train_mae': mean_absolute_error(y_train, train_pred),
                    'val_mae': mean_absolute_error(y_val, val_pred),
                    'test_mae': mean_absolute_error(y_test, test_pred),
                    'train_rmse': np.sqrt(mean_squared_error(y_train, train_pred)),
                    'val_rmse': np.sqrt(mean_squared_error(y_val, val_pred)),
                    'test_rmse': np.sqrt(mean_squared_error(y_test, test_pred)),
                    'train_r2': r2_score(y_train, train_pred),
                    'val_r2': r2_score(y_val, val_pred),
                    'test_r2': r2_score(y_test, test_pred),
                    'train_mape': mean_absolute_percentage_error(y_train, train_pred),
                    'val_mape': mean_absolute_percentage_error(y_val, val_pred),
                    'test_mape': mean_absolute_percentage_error(y_test, test_pred)
                }
                
                individual_results[name] = metrics
                self.models[name] = model
                
            except Exception as e:
                print(f"Error training {name}: {str(e)}")
                continue
        
        # Train ensemble models
        print("\nTraining ensemble models...")
        ensemble_results = {}
        
        for ensemble_name, base_models in self.ensemble_configs.items():
            print(f"Training {ensemble_name}...")
            try:
                # Voting ensemble (simple average)
                if ensemble_name == 'voting_ensemble':
                    ensemble_model = AdvancedEnsembleRegressor(
                        base_models=base_models, 
                        use_stacking=False
                    )
                    ensemble_model.fit(X_train, y_train)
                    
                # Stacking ensemble
                elif ensemble_name == 'stacking_ensemble':
                    ensemble_model = AdvancedEnsembleRegressor(
                        base_models=base_models,
                        meta_model=Ridge(alpha=0.1),
                        use_stacking=True
                    )
                    ensemble_model.fit(X_train, y_train)
                    
                # Meta ensemble with neural network meta-learner
                elif ensemble_name == 'meta_ensemble':
                    ensemble_model = AdvancedEnsembleRegressor(
                        base_models=base_models,
                        meta_model=MLPRegressor(hidden_layer_sizes=(50, 25), max_iter=500, random_state=42),
                        use_stacking=True
                    )
                    ensemble_model.fit(X_train, y_train)
                
                # Evaluate ensemble
                train_pred = ensemble_model.predict(X_train)
                val_pred = ensemble_model.predict(X_val)
                test_pred = ensemble_model.predict(X_test)
                
                metrics = {
                    'train_mae': mean_absolute_error(y_train, train_pred),
                    'val_mae': mean_absolute_error(y_val, val_pred),
                    'test_mae': mean_absolute_error(y_test, test_pred),
                    'train_rmse': np.sqrt(mean_squared_error(y_train, train_pred)),
                    'val_rmse': np.sqrt(mean_squared_error(y_val, val_pred)),
                    'test_rmse': np.sqrt(mean_squared_error(y_test, test_pred)),
                    'train_r2': r2_score(y_train, train_pred),
                    'val_r2': r2_score(y_val, val_pred),
                    'test_r2': r2_score(y_test, test_pred),
                    'train_mape': mean_absolute_percentage_error(y_train, train_pred),
                    'val_mape': mean_absolute_percentage_error(y_val, val_pred),
                    'test_mape': mean_absolute_percentage_error(y_test, test_pred)
                }
                
                ensemble_results[ensemble_name] = metrics
                self.models[ensemble_name] = ensemble_model
                
            except Exception as e:
                print(f"Error training {ensemble_name}: {str(e)}")
                continue
        
        # Combine all results
        all_results = {**individual_results, **ensemble_results}
        self.model_performances = all_results
        
        # Find best model based on validation R²
        best_model_name = max(all_results.keys(), key=lambda x: all_results[x]['val_r2'])
        self.best_model = self.models[best_model_name]
        
        print(f"\nBest model: {best_model_name}")
        print(f"Best validation R²: {all_results[best_model_name]['val_r2']:.4f}")
        
        # Store feature columns for prediction
        self.feature_columns = feature_columns
        
        return all_results, best_model_name
    
    def display_comprehensive_results(self):
        """
        Display comprehensive model comparison results
        """
        if not self.model_performances:
            print("No model performance data available. Train models first.")
            return
        
        # Create results DataFrame
        results_df = pd.DataFrame(self.model_performances).T
        
        # Sort by validation R²
        results_df = results_df.sort_values('val_r2', ascending=False)
        
        print("\n" + "="*100)
        print("COMPREHENSIVE MODEL PERFORMANCE COMPARISON")
        print("="*100)
        
        print(f"\n{'Model':<20} {'Val R²':<8} {'Test R²':<8} {'Val MAE':<10} {'Test MAE':<10} {'Val RMSE':<10} {'Test RMSE':<10} {'Val MAPE':<8}")
        print("-" * 100)
        
        for model_name, metrics in results_df.iterrows():
            print(f"{model_name:<20} {metrics['val_r2']:<8.4f} {metrics['test_r2']:<8.4f} "
                  f"{metrics['val_mae']:<10.0f} {metrics['test_mae']:<10.0f} "
                  f"{metrics['val_rmse']:<10.0f} {metrics['test_rmse']:<10.0f} {metrics['val_mape']:<8.2f}%")
        
        # Performance tiers
        print("\n" + "="*60)
        print("MODEL PERFORMANCE TIERS")
        print("="*60)
        
        excellent_models = results_df[results_df['val_r2'] >= 0.95]
        very_good_models = results_df[(results_df['val_r2'] >= 0.90) & (results_df['val_r2'] < 0.95)]
        good_models = results_df[(results_df['val_r2'] >= 0.85) & (results_df['val_r2'] < 0.90)]
        fair_models = results_df[results_df['val_r2'] < 0.85]
        
        if not excellent_models.empty:
            print(f"\n🏆 EXCELLENT PERFORMANCE (R² ≥ 0.95): {len(excellent_models)} models")
            for model in excellent_models.index[:3]:  # Top 3
                print(f"   • {model}: R² = {excellent_models.loc[model, 'val_r2']:.4f}")
        
        if not very_good_models.empty:
            print(f"\n⭐ VERY GOOD PERFORMANCE (0.90 ≤ R² < 0.95): {len(very_good_models)} models")
            for model in very_good_models.index[:3]:
                print(f"   • {model}: R² = {very_good_models.loc[model, 'val_r2']:.4f}")
        
        if not good_models.empty:
            print(f"\n✓ GOOD PERFORMANCE (0.85 ≤ R² < 0.90): {len(good_models)} models")
            for model in good_models.index[:2]:
                print(f"   • {model}: R² = {good_models.loc[model, 'val_r2']:.4f}")
        
        if not fair_models.empty:
            print(f"\n△ FAIR PERFORMANCE (R² < 0.85): {len(fair_models)} models")
        
        # Best model summary
        best_model_name = results_df.index[0]
        best_metrics = results_df.iloc[0]
        
        print(f"\n" + "="*60)
        print("BEST MODEL SUMMARY")
        print("="*60)
        print(f"Model: {best_model_name}")
        print(f"Validation R²: {best_metrics['val_r2']:.4f}")
        print(f"Test R²: {best_metrics['test_r2']:.4f}")
        print(f"Validation MAE: ${best_metrics['val_mae']:,.0f}")
        print(f"Test MAE: ${best_metrics['test_mae']:,.0f}")
        print(f"Validation MAPE: {best_metrics['val_mape']:.2f}%")
        print(f"Test MAPE: {best_metrics['test_mape']:.2f}%")
        
        return results_df
    
    def advanced_hyperparameter_tuning(self, df, top_models=3):
        """
        Perform advanced hyperparameter tuning for top models
        """
        print(f"\nPerforming hyperparameter tuning for top {top_models} models...")
        
        if not self.model_performances:
            print("Train models first before hyperparameter tuning.")
            return
        
        # Get top models
        sorted_models = sorted(self.model_performances.items(), 
                             key=lambda x: x[1]['val_r2'], reverse=True)
        top_model_names = [name for name, _ in sorted_models[:top_models]]
        
        # Prepare data
        df_engineered = self.advanced_feature_engineering(df)
        
        categorical_columns = ['make', 'body_type', 'fuel_type', 'transmission', 'drivetrain', 
                             'condition', 'engine_size', 'state', 'market_demand', 'mileage_bin', 
                             'age_bin', 'market_segment']
        
        numerical_columns = ['year', 'cylinders', 'mileage', 'accident_count', 'service_records', 
                           'previous_owners', 'seasonal_factor', 'vehicle_age', 'mileage_per_year',
                           'depreciation_rate', 'age_mileage_interaction', 'luxury_age_interaction',
                           'condition_mileage_interaction', 'mileage_squared', 'age_squared', 'mileage_log']
        
        boolean_columns = ['luxury_brand', 'electric_hybrid', 'high_performance']
        
        # Encode and prepare features
        df_processed = df_engineered.copy()
        for col in categorical_columns:
            df_processed[col + '_encoded'] = self.label_encoders[col].transform(df_processed[col])
        
        feature_columns = (numerical_columns + boolean_columns + 
                          [col + '_encoded' for col in categorical_columns])
        
        X = df_processed[feature_columns].fillna(0)
        y = df_processed['price']
        
        # Remove outliers
        z_scores = np.abs(zscore(y))
        X = X[z_scores < 3]
        y = y[z_scores < 3]
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # Hyperparameter grids
        param_grids = {
            'random_forest': {
                'n_estimators': [100, 200, 300],
                'max_depth': [10, 15, 20, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'max_features': ['sqrt', 'log2', None]
            },
            'gradient_boosting': {
                'n_estimators': [100, 200, 300],
                'max_depth': [6, 8, 10],
                'learning_rate': [0.01, 0.1, 0.2],
                'subsample': [0.8, 0.9, 1.0],
                'min_samples_split': [2, 5, 10]
            },
            'xgboost': {
                'n_estimators': [100, 200, 300],
                'max_depth': [6, 8, 10],
                'learning_rate': [0.01, 0.1, 0.2],
                'subsample': [0.8, 0.9, 1.0],
                'colsample_bytree': [0.8, 0.9, 1.0]
            },
            'lightgbm': {
                'n_estimators': [100, 200, 300],
                'max_depth': [6, 8, 10],
                'learning_rate': [0.01, 0.1, 0.2],
                'subsample': [0.8, 0.9, 1.0],
                'colsample_bytree': [0.8, 0.9, 1.0]
            },
            'extra_trees': {
                'n_estimators': [100, 200, 300],
                'max_depth': [10, 15, 20, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            }
        }
        
        tuned_models = {}
        
        for model_name in top_model_names:
            if model_name in param_grids:
                print(f"Tuning {model_name}...")
                
                # Get base model
                base_model = self.algorithms[model_name]
                
                # Perform randomized search
                random_search = RandomizedSearchCV(
                    base_model,
                    param_grids[model_name],
                    n_iter=20,
                    cv=5,
                    scoring='r2',
                    random_state=42,
                    n_jobs=-1
                )
                
                random_search.fit(X_train, y_train)
                
                # Best model
                best_model = random_search.best_estimator_
                
                # Evaluate
                train_pred = best_model.predict(X_train)
                test_pred = best_model.predict(X_test)
                
                tuned_performance = {
                    'best_params': random_search.best_params_,
                    'train_r2': r2_score(y_train, train_pred),
                    'test_r2': r2_score(y_test, test_pred),
                    'train_mae': mean_absolute_error(y_train, train_pred),
                    'test_mae': mean_absolute_error(y_test, test_pred),
                    'improvement': r2_score(y_test, test_pred) - self.model_performances[model_name]['test_r2']
                }
                
                tuned_models[model_name] = {
                    'model': best_model,
                    'performance': tuned_performance
                }
                
                print(f"  Original R²: {self.model_performances[model_name]['test_r2']:.4f}")
                print(f"  Tuned R²: {tuned_performance['test_r2']:.4f}")
                print(f"  Improvement: {tuned_performance['improvement']:.4f}")
        
        return tuned_models
    
    def feature_importance_analysis(self):
        """
        Analyze feature importance across different models
        """
        if not self.models:
            print("Train models first.")
            return
        
        print("\n" + "="*60)
        print("FEATURE IMPORTANCE ANALYSIS")
        print("="*60)
        
        feature_importance_dict = {}
        
        # Tree-based models
        tree_models = ['random_forest', 'gradient_boosting', 'xgboost', 'lightgbm', 'extra_trees']
        
        for model_name in tree_models:
            if model_name in self.models:
                model = self.models[model_name]
                if hasattr(model, 'feature_importances_'):
                    importances = model.feature_importances_
                    feature_importance_dict[model_name] = dict(zip(self.feature_columns, importances))
        
        if feature_importance_dict:
            # Calculate average importance
            all_features = set()
            for importances in feature_importance_dict.values():
                all_features.update(importances.keys())
            
            avg_importance = {}
            for feature in all_features:
                scores = [feature_importance_dict[model].get(feature, 0) 
                         for model in feature_importance_dict.keys()]
                avg_importance[feature] = np.mean(scores)
            
            # Sort by importance
            sorted_features = sorted(avg_importance.items(), key=lambda x: x[1], reverse=True)
            
            print("\nTop 15 Most Important Features (Average across tree models):")
            print("-" * 60)
            for i, (feature, importance) in enumerate(sorted_features[:15], 1):
                print(f"{i:2d}. {feature:<35} {importance:.4f}")
            
            self.feature_importance = avg_importance
        
        return feature_importance_dict
    
    def model_prediction_with_uncertainty(self, vehicle_features):
        """
        Make predictions with uncertainty quantification using multiple models
        """
        if not self.models:
            print("Train models first.")
            return None
        
        # Prepare features
        processed_features = self._prepare_prediction_features(vehicle_features)
        
        if processed_features is None:
            return None
        
        # Get predictions from all models
        predictions = {}
        
        for model_name, model in self.models.items():
            try:
                # Apply appropriate scaling
                if model_name in ['svr_rbf', 'svr_linear', 'knn', 'mlp']:
                    features_scaled = self.scalers['standard'].transform([processed_features])
                elif model_name in ['huber', 'ransac']:
                    features_scaled = self.scalers['robust'].transform([processed_features])
                else:
                    features_scaled = [processed_features]
                
                pred = model.predict(features_scaled)[0]
                predictions[model_name] = max(pred, 1000)  # Minimum $1000
                
            except Exception as e:
                print(f"Error predicting with {model_name}: {str(e)}")
                continue
        
        if not predictions:
            return None
        
        # Calculate statistics
        pred_values = list(predictions.values())
        mean_prediction = np.mean(pred_values)
        std_prediction = np.std(pred_values)
        median_prediction = np.median(pred_values)
        
        # Get best model prediction
        best_model_name = max(self.model_performances.keys(), 
                             key=lambda x: self.model_performances[x]['val_r2'])
        best_prediction = predictions.get(best_model_name, mean_prediction)
        
        # Calculate confidence intervals
        confidence_95 = {
            'lower': mean_prediction - 1.96 * std_prediction,
            'upper': mean_prediction + 1.96 * std_prediction
        }
        
        confidence_68 = {
            'lower': mean_prediction - std_prediction,
            'upper': mean_prediction + std_prediction
        }
        
        # Model agreement score
        cv = std_prediction / mean_prediction if mean_prediction > 0 else 1
        agreement_score = max(0, 100 * (1 - cv))  # Higher is better agreement
        
        return {
            'predictions': predictions,
            'best_model_prediction': best_prediction,
            'ensemble_mean': mean_prediction,
            'ensemble_median': median_prediction,
            'ensemble_std': std_prediction,
            'confidence_intervals': {
                '68%': confidence_68,
                '95%': confidence_95
            },
            'model_agreement_score': agreement_score,
            'prediction_range': {
                'min': min(pred_values),
                'max': max(pred_values)
            },
            'best_model': best_model_name
        }
    
    def _prepare_prediction_features(self, vehicle_features):
        """
        Prepare features for prediction
        """
        try:
            # Create base feature vector
            processed_features = []
            
            # Numerical features
            numerical_columns = ['year', 'cylinders', 'mileage', 'accident_count', 'service_records', 
                               'previous_owners', 'seasonal_factor', 'vehicle_age', 'mileage_per_year',
                               'depreciation_rate', 'age_mileage_interaction', 'luxury_age_interaction',
                               'condition_mileage_interaction', 'mileage_squared', 'age_squared', 'mileage_log']
            
            # Calculate derived features
            vehicle_age = 2025 - vehicle_features.get('year', 2020)
            mileage = vehicle_features.get('mileage', 50000)
            
            feature_dict = {
                'year': vehicle_features.get('year', 2020),
                'cylinders': vehicle_features.get('cylinders', 4),
                'mileage': mileage,
                'accident_count': vehicle_features.get('accident_count', 0),
                'service_records': vehicle_features.get('service_records', vehicle_age),
                'previous_owners': vehicle_features.get('previous_owners', 1),
                'seasonal_factor': vehicle_features.get('seasonal_factor', 1.0),
                'vehicle_age': vehicle_age,
                'mileage_per_year': mileage / max(vehicle_age, 1),
                'depreciation_rate': 1 - (0.15 * vehicle_age),
                'age_mileage_interaction': vehicle_age * mileage,
                'luxury_age_interaction': (1 if vehicle_features.get('make', '') in 
                                         ['BMW', 'Mercedes', 'Audi', 'Lexus'] else 0) * vehicle_age,
                'condition_mileage_interaction': {'Poor': 1, 'Fair': 2, 'Good': 3, 'Very Good': 4, 
                                                'Excellent': 5}.get(vehicle_features.get('condition', 'Good'), 3) * mileage,
                'mileage_squared': mileage ** 2,
                'age_squared': vehicle_age ** 2,
                'mileage_log': np.log1p(mileage)
            }
            
            # Add numerical features
            for col in numerical_columns:
                processed_features.append(feature_dict.get(col, 0))
            
            # Boolean features
            luxury_makes = ['BMW', 'Mercedes', 'Audi', 'Lexus', 'Acura', 'Infiniti', 'Cadillac', 'Lincoln', 'Jaguar', 'Land Rover']
            electric_hybrid = ['Electric', 'Hybrid', 'Plug-in Hybrid']
            
            processed_features.extend([
                1 if vehicle_features.get('make', '') in luxury_makes else 0,
                1 if vehicle_features.get('fuel_type', '') in electric_hybrid else 0,
                1 if vehicle_features.get('cylinders', 4) >= 8 else 0
            ])
            
            # Categorical features
            categorical_columns = ['make', 'body_type', 'fuel_type', 'transmission', 'drivetrain', 
                                 'condition', 'engine_size', 'state', 'market_demand', 'mileage_bin', 
                                 'age_bin', 'market_segment']
            
            for col in categorical_columns:
                value = vehicle_features.get(col, '')
                if col in self.label_encoders:
                    try:
                        encoded_value = self.label_encoders[col].transform([value])[0]
                    except (ValueError, KeyError):
                        encoded_value = 0  # Unknown category
                else:
                    encoded_value = 0
                processed_features.append(encoded_value)
            
            return processed_features
            
        except Exception as e:
            print(f"Error preparing features: {str(e)}")
            return None
    
    def comprehensive_vehicle_analysis(self, vehicle_data):
        """
        Perform comprehensive vehicle analysis with all verification and prediction
        """
        print("=" * 80)
        print("COMPREHENSIVE VEHICLE ANALYSIS REPORT")
        print("=" * 80)
        
        # Vehicle details
        print(f"\nVehicle: {vehicle_data.get('year', 'Unknown')} {vehicle_data.get('make', 'Unknown')} {vehicle_data.get('model', 'Unknown')}")
        print(f"VIN: {vehicle_data.get('vin', 'Not provided')}")
        print(f"Mileage: {vehicle_data.get('mileage', 'Unknown'):,} miles")
        print(f"Condition: {vehicle_data.get('condition', 'Unknown')}")
        
        # Price prediction with uncertainty
        prediction_result = self.model_prediction_with_uncertainty(vehicle_data)
        
        if prediction_result:
            print(f"\n" + "="*50)
            print("PRICE PREDICTION ANALYSIS")
            print("="*50)
            
            print(f"Best Model ({prediction_result['best_model']}):")
            print(f"  Predicted Price: ${prediction_result['best_model_prediction']:,.2f}")
            
            print(f"\nEnsemble Analysis:")
            print(f"  Mean Prediction: ${prediction_result['ensemble_mean']:,.2f}")
            print(f"  Median Prediction: ${prediction_result['ensemble_median']:,.2f}")
            print(f"  Prediction Range: ${prediction_result['prediction_range']['min']:,.2f} - ${prediction_result['prediction_range']['max']:,.2f}")
            
            print(f"\nConfidence Intervals:")
            print(f"  68% CI: ${prediction_result['confidence_intervals']['68%']['lower']:,.2f} - ${prediction_result['confidence_intervals']['68%']['upper']:,.2f}")
            print(f"  95% CI: ${prediction_result['confidence_intervals']['95%']['lower']:,.2f} - ${prediction_result['confidence_intervals']['95%']['upper']:,.2f}")
            
            print(f"\nModel Agreement Score: {prediction_result['model_agreement_score']:.1f}%")
            
            # Individual model predictions
            print(f"\nIndividual Model Predictions:")
            for model, pred in sorted(prediction_result['predictions'].items(), 
                                   key=lambda x: x[1], reverse=True):
                print(f"  {model:<20}: ${pred:,.2f}")
        
        return prediction_result

# Example usage and comprehensive testing
if __name__ == "__main__":
    print("Initializing Advanced Vehicle System...")
    vehicle_system = AdvancedVehicleSystem()
    
    # Generate comprehensive dataset
    print("Generating advanced dataset...")
    df = vehicle_system.generate_advanced_dataset(n_samples=30000)
    print(f"Generated dataset with {len(df)} samples and {len(df.columns)} features")
    
    # Train all models
    print("\nStarting comprehensive model training...")
    results, best_model = vehicle_system.comprehensive_model_training(df)
    
    # Display results
    results_summary = vehicle_system.display_comprehensive_results()
    
    # Feature importance analysis
    feature_importance = vehicle_system.feature_importance_analysis()
    
    # Hyperparameter tuning for top models
    tuned_models = vehicle_system.advanced_hyperparameter_tuning(df, top_models=3)
    
    print("\n" + "="*80)
    print("HYPERPARAMETER TUNING RESULTS")
    print("="*80)
    for model_name, tuning_result in tuned_models.items():
        perf = tuning_result['performance']
        print(f"\n{model_name}:")
        print(f"  Best Parameters: {perf['best_params']}")
        print(f"  Test R² Improvement: {perf['improvement']:+.4f}")
        print(f"  Final Test R²: {perf['test_r2']:.4f}")
    
    # Example comprehensive analysis
    print("\n" + "="*80)
    print("EXAMPLE VEHICLE ANALYSIS")
    print("="*80)
    
    sample_vehicle = {
        'vin': '1HGCM82633A123456',
        'year': 2020,
        'make': 'Toyota',
        'model': 'Camry',
        'body_type': 'Sedan',
        'fuel_type': 'Gasoline',
        'transmission': 'Automatic',
        'drivetrain': 'FWD',
        'condition': 'Good',
        'engine_size': '2.5L',
        'cylinders': 4,
        'mileage': 45000,
        'state': 'CA',
        'accident_count': 0,
        'service_records': 8,
        'previous_owners': 1,
        'market_demand': 'Medium',
        'seasonal_factor': 1.02
    }
    
    analysis_result = vehicle_system.comprehensive_vehicle_analysis(sample_vehicle)
    
    print("\n" + "="*80)
    print("SYSTEM PERFORMANCE SUMMARY")
    print("="*80)
    print(f"Total Models Trained: {len(vehicle_system.models)}")
    print(f"Best Model: {best_model}")
    print(f"Best Validation R²: {max(r['val_r2'] for r in vehicle_system.model_performances.values()):.4f}")
    print(f"Dataset Size: {len(df):,} samples")
    print(f"Feature Count: {len(vehicle_system.feature_columns)} features")
    
    # Model complexity comparison
    print(f"\nModel Complexity Analysis:")
    ensemble_models = [name for name in vehicle_system.models.keys() if 'ensemble' in name]
    individual_models = [name for name in vehicle_system.models.keys() if 'ensemble' not in name]
    
    if ensemble_models:
        ensemble_r2 = [vehicle_system.model_performances[name]['val_r2'] for name in ensemble_models]
        print(f"  Average Ensemble R²: {np.mean(ensemble_r2):.4f}")
    
    if individual_models:
        individual_r2 = [vehicle_system.model_performances[name]['val_r2'] for name in individual_models]
        print(f"  Average Individual R²: {np.mean(individual_r2):.4f}")
    
    print(f"\nSystem ready for production use!")

Initializing Advanced Vehicle System...
Generating advanced dataset...


ValueError: probabilities do not sum to 1