In [None]:
# Vehicle Price Prediction Model
# Complete ML pipeline for predicting vehicle prices

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
# Install a global "ignore" filter so no warning is shown for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas/seaborn calls
# further down the notebook -- confirm that blanket suppression is intended.
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn "whitegrid" theme and a 12x6 default figure size
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})


In [None]:
# ==============================================================================
# 1. LOAD AND EXPLORE DATA
# ==============================================================================

# Read the raw vehicle listings; the path is relative to the working directory.
df = pd.read_csv('dataset.csv')

print("=" * 80)
print("VEHICLE PRICE PREDICTION - DATA EXPLORATION")
print("=" * 80)

# One-line facts first: frame dimensions and the column names.
print(f"\nDataset Shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")

# Then the tabular overviews, each printed under its own heading:
# a preview of the rows, per-column dtypes, per-column null counts,
# and the describe() summary.
overview_sections = (
    ("\nFirst few rows:", df.head()),
    ("\nData Types:", df.dtypes),
    ("\nMissing Values:", df.isnull().sum()),
    ("\nBasic Statistics:", df.describe()),
)
for section_heading, section_table in overview_sections:
    print(section_heading)
    print(section_table)


FileNotFoundError: [Errno 2] No such file or directory: 'dataset.csv'

In [None]:
# ==============================================================================
# 2. DATA CLEANING AND PREPROCESSING
# ==============================================================================

print("\n" + "="*80)
print("DATA CLEANING")
print("="*80)

# Remove rows with missing or zero prices.
# `.copy()` gives an independent frame so the column assignments below write
# into `df` itself rather than into a slice of the previously loaded frame.
df = df[df['price'].notna() & (df['price'] > 0)].copy()

# Handle missing values.
# The filled column is assigned back explicitly. The previous form,
# `df[col].fillna(..., inplace=True)`, is chained in-place assignment: it is
# deprecated in pandas 2.x and, with Copy-on-Write enabled, modifies a temporary
# Series and leaves `df` unchanged -- so the NaNs would silently survive.
df['cylinders'] = df['cylinders'].fillna(df['cylinders'].median())
df['mileage'] = df['mileage'].fillna(df['mileage'].median())
df['doors'] = df['doors'].fillna(4)
df['year'] = df['year'].fillna(df['year'].median())

# Fill categorical missing values with an explicit 'Unknown' label.
# Columns absent from the dataset are skipped.
categorical_cols = ['make', 'model', 'fuel', 'transmission', 'body', 'drivetrain',
                    'exterior_color', 'interior_color', 'trim']
for col in categorical_cols:
    if col in df.columns:
        df[col] = df[col].fillna('Unknown')

# Remove outliers: keep only rows whose price, model year and mileage all fall
# inside the accepted ranges. The three conditions are row-wise, so a single
# combined mask selects the same rows as applying them one after another.
price_in_range = (df['price'] >= 1000) & (df['price'] <= 500000)
year_in_range = (df['year'] >= 1990) & (df['year'] <= 2025)
mileage_in_range = df['mileage'] <= 500000
# `.copy()` again so later cells can add columns to `df` without writing
# into a view of the unfiltered frame.
df = df[price_in_range & year_in_range & mileage_in_range].copy()

print(f"\nCleaned Dataset Shape: {df.shape}")
print(f"Price Range: ${df['price'].min():.2f} - ${df['price'].max():.2f}")
print(f"Average Price: ${df['price'].mean():.2f}")


In [None]:
# ==============================================================================
# 3. FEATURE ENGINEERING
# ==============================================================================

print("\n" + "="*80)
print("FEATURE ENGINEERING")
print("="*80)

# Reference values used by the derived features
current_year = 2025
luxury_brands = ['BMW', 'Mercedes-Benz', 'Audi', 'Lexus', 'Porsche', 'Tesla',
                 'Land Rover', 'Jaguar', 'Bentley', 'Maserati', 'Cadillac']

# Age in years relative to `current_year`, and mileage spread over that age.
# The +1 in the denominator keeps the ratio finite when the age is 0.
age_in_years = current_year - df['year']
df['vehicle_age'] = age_in_years
df['mileage_per_year'] = df['mileage'] / (age_in_years + 1)

# 0/1 flag: make is one of the listed luxury brands (exact match)
make_is_luxury = df['make'].isin(luxury_brands)
df['is_luxury'] = make_is_luxury.astype(int)

# 0/1 flag: fuel is exactly 'Electric'
fuel_is_electric = df['fuel'] == 'Electric'
df['is_electric'] = fuel_is_electric.astype(int)

# 0/1 flag: drivetrain text mentions all-wheel or four-wheel drive
# (case-insensitive; missing values count as no match)
drivetrain_is_awd_4wd = df['drivetrain'].str.contains(
    'All-wheel|Four-wheel', case=False, na=False
)
df['is_awd_4wd'] = drivetrain_is_awd_4wd.astype(int)

# Body type flags: SUV and Sedan are exact matches, Truck is a
# case-insensitive substring match
body = df['body']
df['is_suv'] = (body == 'SUV').astype(int)
df['is_truck'] = body.str.contains('Truck', case=False, na=False).astype(int)
df['is_sedan'] = (body == 'Sedan').astype(int)

# Report what was added
print("\n".join([
    "New Features Created:",
    "  - vehicle_age",
    "  - mileage_per_year",
    "  - is_luxury",
    "  - is_electric",
    "  - is_awd_4wd",
    "  - Body type indicators",
]))

In [None]:
# ==============================================================================
# 4. EXPLORATORY DATA ANALYSIS
# ==============================================================================

print("\n" + "="*80)
print("EXPLORATORY DATA ANALYSIS")
print("="*80)

# --- Figure 1: price histogram, price vs age, price vs mileage (1 row x 3 panels)
price_fig, (hist_ax, age_ax, mileage_ax) = plt.subplots(1, 3, figsize=(14, 5))

hist_ax.hist(df['price'], bins=50, color='skyblue', edgecolor='black')
hist_ax.set_xlabel('Price ($)')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Price Distribution')

age_ax.scatter(df['vehicle_age'], df['price'], alpha=0.3, color='coral')
age_ax.set_xlabel('Vehicle Age (years)')
age_ax.set_ylabel('Price ($)')
age_ax.set_title('Price vs Vehicle Age')

mileage_ax.scatter(df['mileage'], df['price'], alpha=0.3, color='lightgreen')
mileage_ax.set_xlabel('Mileage')
mileage_ax.set_ylabel('Price ($)')
mileage_ax.set_title('Price vs Mileage')

plt.tight_layout()
plt.show()

# --- Mean price and listing count per make, restricted to the 10 makes
# with the most listings
top_makes = (
    df.groupby('make')['price']
    .agg(['mean', 'count'])
    .sort_values('count', ascending=False)
    .head(10)
)
print("\nTop 10 Makes by Volume:")
print(top_makes)

# --- Figure 2: bar chart of the mean price for those makes
makes_fig, makes_ax = plt.subplots(figsize=(12, 6))
top_makes['mean'].plot(kind='bar', color='steelblue', ax=makes_ax)
makes_ax.set_xlabel('Make')
makes_ax.set_ylabel('Average Price ($)')
makes_ax.set_title('Average Price by Top 10 Makes')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# --- Figure 3: pairwise correlations between price and the numeric features
numeric_features = ['year', 'price', 'cylinders', 'mileage', 'doors', 'vehicle_age',
                    'mileage_per_year', 'is_luxury', 'is_electric', 'is_awd_4wd']
correlation_matrix = df[numeric_features].corr()

corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f', ax=corr_ax)
corr_ax.set_title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# Rank the features by their correlation with price, strongest positive first
price_correlations = correlation_matrix['price'].sort_values(ascending=False)
print("\nCorrelation with Price:")
print(price_correlations)

In [None]:
# ==============================================================================
# 5. PREPARE DATA FOR MODELING
# ==============================================================================

print("\n" + "="*80)
print("PREPARING DATA FOR MODELING")
print("="*80)

# Numeric and indicator columns that go into the model as-is
feature_cols = ['year', 'cylinders', 'mileage', 'doors', 'vehicle_age',
                'mileage_per_year', 'is_luxury', 'is_electric', 'is_awd_4wd',
                'is_suv', 'is_truck', 'is_sedan']

# Text columns that are integer-encoded; the fitted encoder for each column is
# kept in `label_encoders`, keyed by the original column name
categorical_features = ['make', 'fuel', 'transmission', 'body', 'drivetrain']
label_encoders = {}

for cat_col in categorical_features:
    # Columns missing from the dataset are skipped
    if cat_col not in df.columns:
        continue
    encoded_col = f'{cat_col}_encoded'
    encoder = LabelEncoder()
    # Values are cast to str before fitting the encoder
    df[encoded_col] = encoder.fit_transform(df[cat_col].astype(str))
    label_encoders[cat_col] = encoder
    feature_cols.append(encoded_col)

# Feature matrix and target vector
y = df['price']
X = df[feature_cols]

print(f"\nFeatures used: {len(feature_cols)}")
print(f"Total samples: {len(X)}")

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

# Standardize features: the scaler is fitted on the training rows only and the
# same fitted transform is then applied to the test rows
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# ==============================================================================
# 6. MODEL TRAINING AND EVALUATION
# ==============================================================================

print("\n" + "="*80)
print("MODEL TRAINING AND EVALUATION")
print("="*80)

# Candidate regressors, keyed by the display name used in all later reports
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(
        n_estimators=100, max_depth=15, random_state=42, n_jobs=-1
    ),
    'Gradient Boosting': GradientBoostingRegressor(
        n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42
    ),
}

# Per-model metrics and test-set predictions, filled in by the loop below
results = {}

for name, model in models.items():
    print(f"\n{'='*60}")
    print(f"Training {name}...")

    # The linear model is fitted on the standardized matrices; the two
    # tree ensembles are fitted on the unscaled features
    if name == 'Linear Regression':
        fit_inputs, eval_inputs = X_train_scaled, X_test_scaled
    else:
        fit_inputs, eval_inputs = X_train, X_test

    model.fit(fit_inputs, y_train)
    y_pred = model.predict(eval_inputs)

    # Error metrics on the held-out test set
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    relative_errors = (y_test - y_pred) / y_test
    mape = np.mean(np.abs(relative_errors)) * 100

    results[name] = dict(MAE=mae, RMSE=rmse, R2=r2, MAPE=mape, predictions=y_pred)

    print("\n".join([
        f"\n{name} Results:",
        f"  Mean Absolute Error (MAE): ${mae:,.2f}",
        f"  Root Mean Squared Error (RMSE): ${rmse:,.2f}",
        f"  R² Score: {r2:.4f} ({r2*100:.2f}%)",
        f"  Mean Absolute Percentage Error (MAPE): {mape:.2f}%",
    ]))


In [None]:
# ==============================================================================
# 7. BEST MODEL SELECTION AND FEATURE IMPORTANCE
# ==============================================================================

print("\n" + "="*80)
print("MODEL COMPARISON")
print("="*80)

# Compare models: one row per trained model, one column per metric.
# The model names are materialised as a list so the 'Model' column is built
# from a plain sequence rather than a dict view.
model_names = list(results)
comparison_df = pd.DataFrame({
    'Model': model_names,
    'MAE': [results[m]['MAE'] for m in model_names],
    'RMSE': [results[m]['RMSE'] for m in model_names],
    'R² Score': [results[m]['R2'] for m in model_names],
    'MAPE (%)': [results[m]['MAPE'] for m in model_names]
})

print("\n" + comparison_df.to_string(index=False))

# Visualize model comparison: one bar chart per metric, side by side.
# Each spec is (metric column, bar colour, panel title, y-axis label).
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
panel_specs = [
    ('MAE', 'coral', 'Mean Absolute Error', 'MAE ($)'),
    ('R² Score', 'skyblue', 'R² Score', 'R² Score'),
    ('MAPE (%)', 'lightgreen', 'Mean Absolute Percentage Error', 'MAPE (%)'),
]
for panel_ax, (metric, bar_color, panel_title, y_label) in zip(axes, panel_specs):
    comparison_df.plot(x='Model', y=metric, kind='bar', ax=panel_ax,
                       color=bar_color, legend=False)
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()

# Feature importance from Random Forest
print("\n" + "="*80)
print("FEATURE IMPORTANCE (Random Forest)")
print("="*80)

# The forest was fitted on the columns in `feature_cols`, so the importances
# are paired with those names position by position.
rf_model = models['Random Forest']
feature_importance = pd.DataFrame({
    'Feature': feature_cols,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

top_features = feature_importance.head(10)
print("\n" + top_features.to_string(index=False))

plt.figure(figsize=(10, 6))
# `hue` is assigned explicitly: seaborn 0.13 deprecates passing `palette`
# without `hue`. Mapping hue to the same column as `y` keeps one palette colour
# per bar, and `dodge=False` keeps each bar centred on its own row.
importance_ax = sns.barplot(data=top_features, x='Importance', y='Feature',
                            hue='Feature', palette='viridis', dodge=False)
# The hue legend would only repeat the y-axis labels, so drop it if one was
# drawn; checking first keeps this working whether or not seaborn adds it.
importance_legend = importance_ax.get_legend()
if importance_legend is not None:
    importance_legend.remove()
plt.title('Top 10 Most Important Features')
plt.xlabel('Importance Score')
plt.tight_layout()
plt.show()


In [None]:
# ==============================================================================
# 8. PREDICTION VISUALIZATION
# ==============================================================================

print("\n" + "="*80)
print("PREDICTION VISUALIZATION")
print("="*80)

# The "best" model is the comparison row with the highest R² score
best_row_label = comparison_df['R² Score'].idxmax()
best_model_name = comparison_df.loc[best_row_label, 'Model']
best_predictions = results[best_model_name]['predictions']

pred_fig, (fit_ax, resid_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: predicted vs actual price, with the y = x line as the
# perfect-prediction reference
price_lo, price_hi = y_test.min(), y_test.max()
fit_ax.scatter(y_test, best_predictions, alpha=0.5, color='steelblue')
fit_ax.plot([price_lo, price_hi], [price_lo, price_hi], 'r--', lw=2)
fit_ax.set_xlabel('Actual Price ($)')
fit_ax.set_ylabel('Predicted Price ($)')
fit_ax.set_title(f'Actual vs Predicted Prices\n({best_model_name})')

# Right panel: residuals (actual minus predicted) against the prediction,
# with a zero line for reference
residuals = y_test - best_predictions
resid_ax.scatter(best_predictions, residuals, alpha=0.5, color='coral')
resid_ax.axhline(y=0, color='r', linestyle='--', lw=2)
resid_ax.set_xlabel('Predicted Price ($)')
resid_ax.set_ylabel('Residuals ($)')
resid_ax.set_title('Residual Plot')

plt.tight_layout()
plt.show()


In [None]:
# ==============================================================================
# 9. SAMPLE PREDICTIONS
# ==============================================================================

print("\n" + "="*80)
print("SAMPLE PREDICTIONS")
print("="*80)

# First ten test rows: actual price, the best model's prediction, and the
# absolute error in dollars and as a percentage of the actual price
actual_head = y_test.values[:10]
predicted_head = best_predictions[:10]
signed_error_head = actual_head - predicted_head

sample_results = pd.DataFrame({
    'Actual Price': actual_head,
    'Predicted Price': predicted_head,
    'Error': np.abs(signed_error_head),
    'Error %': np.abs(signed_error_head / actual_head * 100)
})

print("\nFirst 10 Test Set Predictions:")
print(sample_results.to_string(index=False))

In [None]:
# ==============================================================================
# 10. FINAL SUMMARY
# ==============================================================================

print("\n" + "="*80)
print("FINAL SUMMARY")
print("="*80)

# Headline metrics of the model selected by R² score
best_model_lines = [
    f"\nBest Model: {best_model_name}",
    f"R² Score: {results[best_model_name]['R2']:.4f} ({results[best_model_name]['R2']*100:.2f}%)",
    f"Mean Absolute Error: ${results[best_model_name]['MAE']:,.2f}",
    f"RMSE: ${results[best_model_name]['RMSE']:,.2f}",
    f"MAPE: {results[best_model_name]['MAPE']:.2f}%",
]
print("\n".join(best_model_lines))

# Size of the cleaned dataset and of the feature set used for modeling
run_context_lines = [
    f"\nDataset Size: {len(df)} vehicles",
    f"Features Used: {len(feature_cols)}",
    "Training/Test Split: 80/20",
]
print("\n".join(run_context_lines))

print("\n" + "="*80)
print("MODEL TRAINING COMPLETE")
print("="*80)