In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import re
import warnings
warnings.filterwarnings('ignore')

In [2]:
print("=== LOADING DATA ===")
df = pd.read_csv("Cardetails.csv")
print(f"Dataset shape: {df.shape}")
print(df.head())

=== LOADING DATA ===
Dataset shape: (8128, 13)
                           name  year  selling_price  km_driven    fuel  \
0        Maruti Swift Dzire VDI  2014         450000   145500.0  Diesel   
1  Skoda Rapid 1.5 TDI Ambition  2014         370000   120000.0  Diesel   
2      Honda City 2017-2020 EXi  2006         158000   140000.0  Petrol   
3     Hyundai i20 Sportz Diesel  2010         225000   127000.0  Diesel   
4        Maruti Swift VXI BSIII  2007         130000   120000.0  Petrol   

  seller_type transmission         owner     mileage   engine   max_power  \
0  Individual       Manual   First Owner         NaN  1248 CC      74 bhp   
1  Individual       Manual  Second Owner  21.14 kmpl  1498 CC  103.52 bhp   
2  Individual       Manual   Third Owner   17.7 kmpl  1497 CC      78 bhp   
3  Individual       Manual   First Owner   23.0 kmpl  1396 CC      90 bhp   
4  Individual       Manual   First Owner   16.1 kmpl  1298 CC    88.2 bhp   

                     torque  seats  
0 

In [3]:
if 'engine' in df.columns:
    df['engine'] = df['engine'].str.replace('CC', '', regex=False).str.strip()
    df['engine'] = pd.to_numeric(df['engine'], errors='coerce')

if 'max_power' in df.columns:
    df['max_power'] = df['max_power'].str.replace('bhp', '', regex=False).str.strip()
    df['max_power'] = pd.to_numeric(df['max_power'], errors='coerce')

if 'mileage' in df.columns:
    df['mileage'] = df['mileage'].str.replace('kmpl', '', regex=False).str.strip()
    df['mileage'] = pd.to_numeric(df['mileage'], errors='coerce')

df['engine'] = df['engine'].fillna(df['engine'].median())
df['max_power'] = df['max_power'].fillna(df['max_power'].median())
df['mileage'] = df['mileage'].fillna(df['mileage'].median())


=== DATA CLEANING ===


In [4]:
# Torque extraction function
def extract_torque_and_mean_rpm(torque_str):
    if pd.isna(torque_str) or torque_str == "":
        return None, None
    
    torque_str = str(torque_str)
    torque_nm = None
    mean_rpm = None
    
    torque_match = re.search(r'(\d+\.?\d*)\s*(Nm|kgm|KGM)', torque_str, re.IGNORECASE)
    if torque_match:
        torque_value = float(torque_match.group(1))
        unit = torque_match.group(2).lower()
        
        if unit in ['kgm', 'kgm']:
            torque_nm = round(torque_value * 9.80665, 1)
        else:  # Nm
            torque_nm = torque_value
    
    range_match = re.search(r'(\d+)\s*-\s*(\d+)\s*rpm', torque_str, re.IGNORECASE)
    if range_match:
        rpm_low = int(range_match.group(1))
        rpm_high = int(range_match.group(2))
        mean_rpm = (rpm_low + rpm_high) / 2
    else:
        single_match = re.search(r'(\d+)\s*rpm', torque_str, re.IGNORECASE)
        if single_match:
            mean_rpm = float(single_match.group(1))
    
    return torque_nm, mean_rpm

def apply_simple_torque_extraction(df):
    results = df['torque'].apply(extract_torque_and_mean_rpm)
    df['torque_nm'] = results.apply(lambda x: x[0] if x else None)
    df['mean_rpm'] = results.apply(lambda x: x[1] if x else None)
    return df

df = apply_simple_torque_extraction(df)

In [5]:
# Preprocess categorical features
def preprocess_car_data(df):
    df_processed = df.copy()
    
    df_processed['transmission_encoded'] = df_processed['transmission'].map({'Manual': 0, 'Automatic': 1})
    
    owner_mapping = {
        'First Owner': 0,
        'Second Owner': 1, 
        'Third Owner': 2,
        'Fourth & Above Owner': 3
    }
    df_processed['owner_encoded'] = df_processed['owner'].map(owner_mapping)
    
    seller_mapping = {
        'Individual': 0,
        'Dealer': 1,
        'Trustmark Dealer': 2
    }
    df_processed['seller_type_encoded'] = df_processed['seller_type'].map(seller_mapping)
    
    fuel_dummies = pd.get_dummies(df_processed['fuel'], prefix='fuel')
    fuel_dummies = fuel_dummies.astype(int)
    
    df_processed = pd.concat([df_processed, fuel_dummies], axis=1)
    df_processed = df_processed.drop(['transmission', 'owner', 'seller_type', 'fuel'], axis=1)
    
    return df_processed

df_processed = preprocess_car_data(df)

In [6]:
# Prepare features and target
Y = df_processed['selling_price']
X = df_processed.drop(['name', 'torque', 'selling_price'], axis=1)

print(f"Features: {X.shape[1]}")
print(f"Target variable range: {Y.min():,.0f} to {Y.max():,.0f}")

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")

for column in X_train.columns:
    if X_train[column].isna().any():
        median_val = X_train[column].median()
        X_train[column] = X_train[column].fillna(median_val)
        X_test[column] = X_test[column].fillna(median_val)

print(f"Remaining NaN in X_train: {X_train.isna().sum().sum()}")
print(f"Remaining NaN in X_test: {X_test.isna().sum().sum()}")

Features: 15
Target variable range: 29,999 to 10,000,000
Training set: (6502, 15), Test set: (1626, 15)
Remaining NaN in X_train: 0
Remaining NaN in X_test: 0


In [7]:
# SVR MODEL 1: Standard Scaling with Log Transform
print("\n" + "="*50)
print("SVR MODEL 1: Log Transform (Recommended)")
print("="*50)

Y_train_log = np.log1p(Y_train)
Y_test_log = np.log1p(Y_test)

svr_log_model = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('svr', SVR(
        kernel='rbf',
        C=10.0,
        epsilon=0.1,
        gamma='scale',
        cache_size=1000,
        max_iter=10000,
        verbose=False
    ))
])

svr_log_model.fit(X_train, Y_train_log)
y_pred_log = svr_log_model.predict(X_test)

# Transform predictions back to original scale
y_pred_log_original = np.expm1(y_pred_log)

r2_log = r2_score(Y_test, y_pred_log_original)
mae_log = mean_absolute_error(Y_test, y_pred_log_original)
rmse_log = np.sqrt(mean_squared_error(Y_test, y_pred_log_original))

print(f"R² Score: {r2_log:.4f}")
print(f"MAE: {mae_log:.2f}")
print(f"RMSE: {rmse_log:.2f}")


SVR MODEL 1: Log Transform (Recommended)
R² Score: 0.9542
MAE: 98106.60
RMSE: 173353.06


In [8]:
# SVR MODEL 2: Standard Scaling with Target Scaling
print("\n" + "="*50)
print("SVR MODEL 2: Target Scaling")
print("="*50)

# Scale the target variable
y_scaler = StandardScaler()
Y_train_scaled = y_scaler.fit_transform(Y_train.values.reshape(-1, 1)).ravel()

svr_scaled_model = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('svr', SVR(
        kernel='rbf',
        C=100.0,
        epsilon=0.01,
        gamma='scale',
        cache_size=1000,
        max_iter=10000,
        verbose=False
    ))
])

svr_scaled_model.fit(X_train, Y_train_scaled)
y_pred_scaled = svr_scaled_model.predict(X_test)

# Transform predictions back to original scale
y_pred_scaled_original = y_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

r2_scaled = r2_score(Y_test, y_pred_scaled_original)
mae_scaled = mean_absolute_error(Y_test, y_pred_scaled_original)
rmse_scaled = np.sqrt(mean_squared_error(Y_test, y_pred_scaled_original))

print(f"R² Score: {r2_scaled:.4f}")
print(f"MAE: {mae_scaled:.2f}")
print(f"RMSE: {rmse_scaled:.2f}")


SVR MODEL 2: Target Scaling
R² Score: 0.9126
MAE: 166546.77
RMSE: 239341.25


In [9]:
# SVR MODEL 3: Different Kernels Comparison
print("\n" + "="*50)
print("SVR MODEL 3: Kernel Comparison")
print("="*50)

kernels = {
    'rbf_tuned': {'kernel': 'rbf', 'C': 10.0, 'gamma': 0.1},
    'linear': {'kernel': 'linear', 'C': 1.0},
    'poly': {'kernel': 'poly', 'C': 10.0, 'degree': 2, 'gamma': 'scale'}
}

kernel_results = {}

for kernel_name, params in kernels.items():
    svr_kernel_model = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('svr', SVR(**params, cache_size=500, max_iter=10000, verbose=False))
    ])
    
    svr_kernel_model.fit(X_train, Y_train_log)
    y_pred_kernel = svr_kernel_model.predict(X_test)
    y_pred_kernel_original = np.expm1(y_pred_kernel)
    
    r2_kernel = r2_score(Y_test, y_pred_kernel_original)
    mae_kernel = mean_absolute_error(Y_test, y_pred_kernel_original)
    rmse_kernel = np.sqrt(mean_squared_error(Y_test, y_pred_kernel_original))
    
    kernel_results[kernel_name] = {
        'r2': r2_kernel,
        'mae': mae_kernel,
        'rmse': rmse_kernel
    }
    
    print(f"{kernel_name.upper():<12} - R²: {r2_kernel:.4f}, MAE: {mae_kernel:.2f}, RMSE: {rmse_kernel:.2f}")


SVR MODEL 3: Kernel Comparison
RBF_TUNED    - R²: 0.9585, MAE: 93097.50, RMSE: 164897.08
LINEAR       - R²: 0.7169, MAE: 187398.53, RMSE: 430794.21
POLY         - R²: -2.1044, MAE: 379310.16, RMSE: 1426504.60


In [10]:
# SVR MODEL 4: High-Performance SVR
print("\n" + "="*50)
print("SVR MODEL 4: High-Performance Tuned")
print("="*50)

svr_high_perf = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('svr', SVR(
        kernel='rbf',
        C=50.0,
        epsilon=0.05,
        gamma=0.01,
        cache_size=2000,
        max_iter=20000,
        verbose=False
    ))
])

svr_high_perf.fit(X_train, Y_train_log)
y_pred_high = svr_high_perf.predict(X_test)
y_pred_high_original = np.expm1(y_pred_high)

r2_high = r2_score(Y_test, y_pred_high_original)
mae_high = mean_absolute_error(Y_test, y_pred_high_original)
rmse_high = np.sqrt(mean_squared_error(Y_test, y_pred_high_original))

print(f"R² Score: {r2_high:.4f}")
print(f"MAE: {mae_high:.2f}")
print(f"RMSE: {rmse_high:.2f}")


SVR MODEL 4: High-Performance Tuned
R² Score: 0.9460
MAE: 100062.58
RMSE: 188175.08


In [11]:
print("\n" + "="*60)
print("FINAL SVR MODEL COMPARISON")
print("="*60)

comparison_data = {
    'Log Transform': {'r2': r2_log, 'mae': mae_log, 'rmse': rmse_log},
    'Target Scaling': {'r2': r2_scaled, 'mae': mae_scaled, 'rmse': rmse_scaled},
    'High Performance': {'r2': r2_high, 'mae': mae_high, 'rmse': rmse_high}
}

for kernel, metrics in kernel_results.items():
    comparison_data[f'Kernel_{kernel}'] = metrics

print(f"{'MODEL':<20} {'R² Score':<10} {'MAE':<12} {'RMSE':<12}")
print("-" * 60)
for model_name, metrics in comparison_data.items():
    print(f"{model_name:<20} {metrics['r2']:<10.4f} {metrics['mae']:<12.2f} {metrics['rmse']:<12.2f}")

print("\n" + "="*50)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*50)

best_model = svr_log_model.named_steps['svr']
scaler = svr_log_model.named_steps['scaler']

if hasattr(best_model, 'coef_') and best_model.coef_ is not None:
    feature_importance = pd.DataFrame({
        'feature': X_train.columns,
        'importance': np.abs(best_model.coef_[0])  # Use absolute values
    }).sort_values('importance', ascending=False)
    
    print("Top 10 Most Important Features:")
    print(feature_importance.head(10))
else:
    print("Feature importance available only for linear kernel")

print("\n" + "="*50)
print("PREDICTION EXAMPLES")
print("="*50)

sample_indices = np.random.choice(len(X_test), min(5, len(X_test)), replace=False)

print(f"{'Actual':<12} {'Predicted':<12} {'Error':<12} {'Error %':<10}")
print("-" * 50)
for idx in sample_indices:
    actual = Y_test.iloc[idx]
    predicted = y_pred_log_original[idx]
    error = actual - predicted
    error_pct = (error / actual) * 100
    
    print(f"{actual:<12,.0f} {predicted:<12,.0f} {error:<12,.0f} {error_pct:<10.1f}%")

print(f"\nBest SVR Model R² Score: {max(r2_log, r2_scaled, r2_high):.4f}")


FINAL SVR MODEL COMPARISON
MODEL                R² Score   MAE          RMSE        
------------------------------------------------------------
Log Transform        0.9542     98106.60     173353.06   
Target Scaling       0.9126     166546.77    239341.25   
High Performance     0.9460     100062.58    188175.08   
Kernel_rbf_tuned     0.9585     93097.50     164897.08   
Kernel_linear        0.7169     187398.53    430794.21   
Kernel_poly          -2.1044    379310.16    1426504.60  

FEATURE IMPORTANCE ANALYSIS
Feature importance available only for linear kernel

PREDICTION EXAMPLES
Actual       Predicted    Error        Error %   
--------------------------------------------------
325,000      298,968      26,032       8.0       %
550,000      416,375      133,625      24.3      %
70,000       113,421      -43,421      -62.0     %
390,000      576,810      -186,810     -47.9     %
660,000      492,217      167,783      25.4      %

Best SVR Model R² Score: 0.9542
