In [7]:
import pandas as pd

CSV_URL = "https://raw.githubusercontent.com/SurajChouhan14/PriceMyRide-Used-Car-Price-Prediction/main/Cardetailsv3.csv"

# Load the raw listings from GitHub into df_raw. Only the download is guarded,
# so an unrelated bug cannot be misreported as a network failure.
try:
    df_raw = pd.read_csv(CSV_URL)
except Exception as e:
    print("Failed to load from GitHub raw. Error:", e)
    print("Fallback: upload the CSV or mount Drive as described below.")
else:
    # Report the size of the frame that was just loaded. This used to read
    # len(df), but `df` is only created in the cleaning cell, so on a fresh
    # kernel it raised NameError and printed the "Failed to load" message.
    print("Loaded from GitHub raw:", CSV_URL, "| Rows:", len(df_raw))
# Core

import numpy as np
import pandas as pd

# Modeling
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_absolute_error, make_scorer
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer

# Plots (interactive)
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd

# Reproducibility
# One seed reused by the train/test split, the CV folds and the estimators below.
RANDOM_STATE = 42
# Also seed NumPy's legacy global RNG for any code that draws from np.random directly.
np.random.seed(RANDOM_STATE)

Loaded from GitHub raw: https://raw.githubusercontent.com/SurajChouhan14/PriceMyRide-Used-Car-Price-Prediction/main/Cardetailsv3.csv | Rows: 8128


In [8]:


# Build the working frame from the raw data in one pass:
#   1. drop the 'torque' column (silently skipped if it is absent),
#   2. drop every row with a missing value in ANY remaining column,
#   3. drop exact duplicate rows,
#   4. renumber the index from 0.
df = (
    df_raw
    .drop(columns=['torque'], errors='ignore')
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

# Clean string numerics to floats (e.g., "23.4 kmpl" -> 23.4)
def clean_number_prefix(val):
    """Return the leading whitespace-delimited token of ``val`` as a float.

    Parameters
    ----------
    val : scalar
        A value such as ``"23.4 kmpl"``, ``"1248 CC"`` or a plain number.

    Returns
    -------
    float
        The parsed leading number, or ``np.nan`` when ``val`` is missing,
        empty/blank, or its first token is not a valid float literal.
    """
    # Missing values pass straight through as NaN.
    if pd.isna(val):
        return np.nan
    # split(None, 1) separates on any run of whitespace (space, tab, NBSP, ...)
    # and ignores leading whitespace, so only the first token is kept.
    tokens = str(val).split(None, 1)
    if not tokens:
        return np.nan
    try:
        return float(tokens[0])
    except ValueError:
        # Only a malformed number is expected here; a bare `except:` would
        # also swallow KeyboardInterrupt and genuine programming errors.
        return np.nan

# Convert the unit-suffixed text columns to floats, keeping only the leading number.
for col in ('mileage', 'engine', 'max_power'):
    df[col] = df[col].apply(clean_number_prefix)

# Simplify brand from 'name' (first word)
def get_brand(name):
    """Return the first whitespace-delimited word of ``name`` as the brand.

    Parameters
    ----------
    name : scalar
        Listing title such as ``"Maruti Swift Dzire VDI"``.

    Returns
    -------
    str
        The first word of ``name``, or ``'Unknown'`` when ``name`` is
        missing (None/NaN) or contains no non-whitespace characters.
    """
    # Without this guard a missing name would be stringified into the
    # bogus brands 'None' / 'nan'.
    if name is None or (pd.api.types.is_scalar(name) and pd.isna(name)):
        return 'Unknown'
    tokens = str(name).split()
    # An empty or blank name has no tokens; checking explicitly replaces
    # the former bare `except:` around the [0] lookup.
    return tokens[0] if tokens else 'Unknown'
# Columns that must be present for a row to be usable: the target plus every model input.
needed_cols = ['selling_price','year','km_driven','fuel','seller_type','transmission','owner',
               'mileage','engine','max_power','seats','brand']

# Derive the brand from the listing name, then keep only rows where all needed fields are set
# (the numeric cleaning can introduce NaN for values that failed to parse).
df = (
    df.assign(brand=df['name'].apply(get_brand))
      .dropna(subset=needed_cols)
      .reset_index(drop=True)
)

print("Final rows:", len(df))
df.head(3)


Final rows: 6717


Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats,brand
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0,Maruti
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0,Skoda
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0,Honda


In [9]:
# Interactive (Plotly) exploratory plots of the cleaned data.

# 1) Distribution of the target. The axis is linear here; a log-scaled x axis can help
#    when the distribution is heavily skewed.
fig = px.histogram(
    df,
    x='selling_price',
    nbins=60,
    title='Selling Price Distribution',
)
fig.update_layout(bargap=0.02)
fig.show()

# 2) Target against each numeric feature, with an OLS trendline overlaid
#    (Plotly's trendline support relies on statsmodels being installed).
numeric_features = ['year', 'km_driven', 'mileage', 'engine', 'max_power']
for feature in numeric_features:
    fig = px.scatter(
        df,
        x=feature,
        y='selling_price',
        trendline='ols',
        opacity=0.6,
        title=f'Selling Price vs {feature}',
    )
    fig.show()

# 3) Target distribution within each categorical group.
categorical_features = ['fuel', 'seller_type', 'transmission', 'owner', 'brand']
for group in categorical_features:
    fig = px.box(
        df,
        x=group,
        y='selling_price',
        points='suspectedoutliers',
        title=f'Selling Price by {group}',
    )
    fig.update_xaxes(categoryorder='total descending')
    fig.show()


In [10]:
# Column roles: what we predict, and which inputs are categorical vs numeric.
target_col = 'selling_price'
cat_cols = ['fuel', 'seller_type', 'transmission', 'owner', 'brand']   # one-hot encoded (no integer coding)
num_cols = ['year', 'km_driven', 'mileage', 'engine', 'max_power', 'seats']  # standardized

# Feature matrix (categoricals first, then numerics) and float target.
X = df.loc[:, [*cat_cols, *num_cols]].copy()
y = df[target_col].astype(float)

# One-hot encode categoricals and scale numerics; any other column is dropped.
# Categories unseen during fit are ignored at transform time instead of raising.
preprocess = ColumnTransformer(
    [
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
        ('num', StandardScaler(), num_cols),
    ],
    remainder='drop',
)


In [11]:
#  apply a log1p transform to selling_price (target) and optionally to heavily skewed numerics.
# This often improves linear model fit and stabilizes errors.

# Forward and inverse target transforms, as plain functions.
def log1p_array(a):
    """Return ``log(1 + a)``, computed element-wise by ``np.log1p``."""
    return np.log1p(a)


def expm1_array(a):
    """Return ``exp(a) - 1``, computed element-wise by ``np.expm1`` (inverse of ``log1p_array``)."""
    return np.expm1(a)

# Models are trained on log1p(y); metrics are reported on the original scale
# by inverse-transforming the predictions.
y_log = np.log1p(y).rename('selling_price_log')

# Optional: skewed numeric features that get a log1p transform before scaling.
skewed_cols = ['km_driven', 'engine', 'max_power']  # adjust if needed


def log_transform_df(Xin):
    """Return a copy of ``Xin`` with each present ``skewed_cols`` column replaced by log1p.

    Values are clipped at 0 before the transform, so negatives map to 0.
    Columns listed in ``skewed_cols`` but absent from ``Xin`` are skipped,
    and ``Xin`` itself is left unmodified.
    """
    replacements = {
        name: np.log1p(Xin[name].clip(lower=0))
        for name in skewed_cols
        if name in Xin.columns
    }
    # assign() works on a copy and overwrites existing columns in place,
    # so column order is preserved.
    return Xin.assign(**replacements)

# Stateless step that applies log_transform_df; validate=False passes the DataFrame
# through untouched so the function can select columns by name.
log_transformer = FunctionTransformer(func=log_transform_df, validate=False)

# Full preprocessing: log-transform skewed numerics -> OHE + Scale
preprocess_full = Pipeline([
    ('log_skew', log_transformer),
    ('ct', preprocess),
])


In [12]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
# The target passed in is y_log, so both y_* outputs are in log1p(price) space.
X_train, X_test, y_train_log, y_test_log = train_test_split(
    X, y_log, test_size=0.2, random_state=RANDOM_STATE
)


In [13]:
# Candidate regressors, keyed by the display name used in the printed reports and plot titles.
# All of them are fit on the log1p(price) target.
models = {
    # Unregularized linear baseline.
    'LinearRegression': LinearRegression(),
    # L2-penalized linear model.
    'Ridge': Ridge(alpha=10.0, random_state=RANDOM_STATE),
    # L1-penalized linear model; max_iter is raised to 10000
    # (presumably to help convergence at this small alpha — confirm).
    'Lasso': Lasso(alpha=0.001, random_state=RANDOM_STATE, max_iter=10000),
    # 400 trees with no depth cap; n_jobs=-1 lets scikit-learn use all available cores.
    'RandomForest': RandomForestRegressor(
        n_estimators=400, max_depth=None, random_state=RANDOM_STATE, n_jobs=-1
    ),
}


In [14]:
# We will:
# - Cross-validate on y_log with R² (on log-space).
# - Fit on train, predict on test, inverse-transform to original scale, then compute R² and MAE on original target.

# Shuffled 5-fold splitter, seeded so every model is scored on the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

def evaluate_model(name, estimator):
    """Cross-validate, fit and test one estimator, printing a short report.

    Reads the notebook-level ``preprocess_full``, ``kf``, ``X_train``,
    ``X_test``, ``y_train_log`` and ``y_test_log``.

    Parameters
    ----------
    name : str
        Label used in the printed report.
    estimator : scikit-learn regressor
        Unfitted model. It is cloned, so the object passed in is not fitted.

    Returns
    -------
    tuple
        ``(pipe, y_true, y_pred)``: the pipeline fitted on the training
        split, and the test targets and predictions on the original price
        scale (both inverse-transformed with ``np.expm1``).
    """
    # Give every pipeline its OWN preprocessor and model. Sharing the single
    # `preprocess_full` instance meant each later fit() silently refit the
    # preprocessor inside the pipelines already stored for earlier models.
    pipe = Pipeline(steps=[
        ('prep', clone(preprocess_full)),
        ('model', clone(estimator)),
    ])

    # CV R² in log space (proxy for relative fit); cross_val_score fits
    # clones, so `pipe` itself is still unfitted after this call.
    cv_scores = cross_val_score(pipe, X_train, y_train_log, cv=kf, scoring='r2', n_jobs=-1)

    # Fit on the full training split.
    pipe.fit(X_train, y_train_log)

    # Predict in log space, then map predictions and targets back to prices.
    y_pred = np.expm1(pipe.predict(X_test))
    y_true = np.expm1(y_test_log)

    # Metrics on the original price scale.
    r2_orig = r2_score(y_true, y_pred)
    mae_orig = mean_absolute_error(y_true, y_pred)

    print(f"\n=== {name} ===")
    print(f"CV R² (log-space): {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")
    print(f"Test R² (original): {r2_orig:.3f}")
    print(f"Test MAE (original): {mae_orig:,.0f}")

    return pipe, y_true, y_pred

# Evaluate every candidate and keep its (pipeline, y_true, y_pred) triple under its name.
fitted = {}
for model_name, candidate in models.items():
    fitted[model_name] = evaluate_model(model_name, candidate)



=== LinearRegression ===
CV R² (log-space): 0.894 ± 0.008
Test R² (original): 0.891
Test MAE (original): 88,957

=== Ridge ===
CV R² (log-space): 0.892 ± 0.008
Test R² (original): 0.883
Test MAE (original): 89,874

=== Lasso ===
CV R² (log-space): 0.889 ± 0.008
Test R² (original): 0.868
Test MAE (original): 93,042

=== RandomForest ===
CV R² (log-space): 0.913 ± 0.004
Test R² (original): 0.929
Test MAE (original): 73,063


In [15]:
# Pick the best model by Test R² (original price scale).
# R² has no lower bound, so the former `best_r2 = -1` sentinel could leave
# best_name as None (and crash on fitted[None]) if every model scored <= -1.
# Taking the max over the actual scores has no such blind spot; ties still
# resolve to the first model in `fitted`.
# NOTE(review): selecting on the test split makes the reported test score
# optimistic; prefer the CV score for selection if this is used for more than a demo.
test_r2 = {
    name: r2_score(y_true, y_pred)
    for name, (_, y_true, y_pred) in fitted.items()
}
best_name = max(test_r2, key=test_r2.get)
best_r2 = test_r2[best_name]

print("\nBest model by Test R²:", best_name, f"(R²={best_r2:.3f})")
y_true = fitted[best_name][1]
y_pred = fitted[best_name][2]

# Actual vs predicted prices (interactive), with the y = x line a perfect model would follow.
price_lo, price_hi = y_true.min(), y_true.max()
fig = px.scatter(
    x=y_true,
    y=y_pred,
    opacity=0.6,
    title=f'Actual vs Predicted — {best_name}',
    labels={'x':'Actual Price', 'y':'Predicted Price'},
)
fig.add_trace(
    go.Scatter(
        x=[price_lo, price_hi],
        y=[price_lo, price_hi],
        mode='lines',
        name='Ideal',
        line=dict(dash='dash'),
    )
)
fig.show()

# Residuals on the original price scale (positive = model under-predicted).
residuals = y_true - y_pred

# How the residuals are distributed ...
fig = px.histogram(residuals, nbins=60, title=f'Residual Distribution — {best_name}')
fig.update_layout(bargap=0.02)
fig.show()

# ... and whether they vary with the predicted price; the dashed line marks zero error.
fig = px.scatter(
    x=y_pred,
    y=residuals,
    opacity=0.6,
    title=f'Residuals vs Predicted — {best_name}',
    labels={'x':'Predicted Price', 'y':'Residual (Actual - Pred)'},
)
fig.add_hline(y=0, line_dash='dash')
fig.show()



Best model by Test R²: RandomForest (R²=0.929)


In [16]:
# If the best model is RandomForest, we can inspect feature importances
if best_name == 'RandomForest':
    pipe = fitted[best_name][0]
    # Fitted ColumnTransformer inside the preprocessing sub-pipeline.
    ct = pipe.named_steps['prep'].named_steps['ct']

    # Rebuild the model's input column names. This relies on the transformer
    # order declared for `preprocess`: one-hot columns first, then the scaled
    # numeric columns (one output per input column).
    cat_feat_names = ct.named_transformers_['cat'].get_feature_names_out(cat_cols)
    num_feat_names = np.array(num_cols, dtype=object)
    feature_names = np.concatenate([cat_feat_names, num_feat_names])

    # The fitted scaler is not needed here, so it is no longer looked up.
    importances = pipe.named_steps['model'].feature_importances_

    # Keep the 25 most important features.
    imp_df = (
        pd.DataFrame({'feature': feature_names, 'importance': importances})
        .sort_values('importance', ascending=False)
        .head(25)
    )
    fig = px.bar(imp_df.sort_values('importance'),
                 x='importance', y='feature', orientation='h',
                 title='Top Feature Importances (RandomForest)')
    fig.update_layout(yaxis={'categoryorder':'total ascending'})
    fig.show()
else:
    print("Feature importance plot is only shown for RandomForest in this template.")


In [17]:
import joblib

# Persist the winning pipeline (preprocessing + model) so it can be reused without retraining.
best_pipe = fitted[best_name][0]
# NOTE(review): the pipeline wraps log_transform_df in a FunctionTransformer, and pickle
# stores plain functions by reference. Presumably any process that loads this file must
# first define/import log_transform_df (and its skewed_cols global) — confirm before deploying.
joblib.dump(best_pipe, 'car_price_pipeline.pkl')
print("Saved: car_price_pipeline.pkl")

def predict_price(pipeline, raw_row: dict):
    """
    Predict the selling price for a single listing.

    - pipeline: fitted object whose ``predict`` returns log1p(price) for a
      one-row DataFrame with the training feature columns.
    - raw_row is a dict with the same raw fields we used (brand optional; if not present, inferred from name).
      String values for mileage / engine / max_power (e.g. '12.9 kmpl') are reduced
      to their leading number. Fields that are absent are passed to the pipeline as NaN.
      raw_row itself is not modified.
    - Returns predicted price in original currency units.
    """
    row = dict(raw_row)
    # Ensure brand. This mirrors get_brand used at training time: first word
    # of the name, or 'Unknown' when the name is missing or blank. The former
    # row['name'].split()[0] raised on an empty or non-string name.
    if 'brand' not in row and 'name' in row:
        name = row['name']
        name_missing = name is None or (pd.api.types.is_scalar(name) and pd.isna(name))
        tokens = [] if name_missing else str(name).split()
        row['brand'] = tokens[0] if tokens else 'Unknown'
    # Clean numeric-like strings (same logic as training)
    for c in ['mileage','engine','max_power']:
        if isinstance(row.get(c), str):
            row[c] = clean_number_prefix(row[c])
    # Build a one-row DataFrame with exactly the model's input columns.
    cols = ['fuel','seller_type','transmission','owner','brand',
            'year','km_driven','mileage','engine','max_power','seats']
    X_one = pd.DataFrame([{k: row.get(k, np.nan) for k in cols}])
    # Predict in log space then invert
    y_log_pred = pipeline.predict(X_one)[0]
    return float(np.expm1(y_log_pred))

# Example usage: a raw listing as it would arrive, with unit-suffixed strings
# for mileage / engine / max_power and no explicit brand (inferred from name).
example = dict(
    name='Toyota Innova',
    year=2020,
    km_driven=12000,
    fuel='Diesel',
    seller_type='Individual',
    transmission='Manual',
    owner='First Owner',
    mileage='12.9 kmpl',
    engine='2494 CC',
    max_power='100.6 bhp',
    seats=7.0,
)
print("Predicted price:", f"{predict_price(best_pipe, example):,.0f}")


Saved: car_price_pipeline.pkl
Predicted price: 906,393


CONCLUSIONS


*   Best model: RandomForest with OHE + log transforms achieved Test R² ≈ 0.929 and MAE ≈ 73k, outperforming Linear/Ridge/Lasso (R² ≈ 0.87–0.89; MAE ≈ 89–93k).

*   Diagnostics: The Plotly actual-vs-predicted and residual plots look healthy, with no strong residual patterns, and the log-space CV R² is tight across folds (standard deviation ≤ 0.008 for every model), which indicates a stable fit.

