In [64]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

In [40]:
df = pd.read_csv("car data.csv")


In [41]:
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [42]:
df.isnull().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

In [43]:
df['car_age'] = 2025 - df['year']


In [44]:
# Outlier removal
df = df[(df['selling_price'] < df['selling_price'].quantile(0.99)) &
        (df['selling_price'] > df['selling_price'].quantile(0.01))]
df = df[df['km_driven'] < df['km_driven'].quantile(0.99)]

In [45]:
df.drop(['name', 'year'], axis=1, inplace=True)

In [46]:
# Log transformation
df['selling_price_log'] = np.log1p(df['selling_price'])     # target
df['km_driven'] = np.log1p(df['km_driven']) 

In [47]:
X = df.drop(columns=['selling_price', 'selling_price_log'])
y = df['selling_price_log']

In [48]:
df.head()

Unnamed: 0,selling_price,km_driven,fuel,seller_type,transmission,owner,car_age,selling_price_log
0,60000,11.156265,Petrol,Individual,Manual,First Owner,18,11.002117
1,135000,10.819798,Petrol,Individual,Manual,First Owner,18,11.813037
2,600000,11.512935,Diesel,Individual,Manual,First Owner,13,13.304687
3,250000,10.736418,Petrol,Individual,Manual,First Owner,8,12.42922
4,450000,11.856522,Diesel,Individual,Manual,Second Owner,11,13.017005


In [49]:
# Categorical and numerical columns
categorical_cols = ['fuel', 'seller_type', 'transmission', 'owner']
numerical_cols = ['km_driven', 'car_age']

In [50]:
# Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(drop='first'), categorical_cols)
], remainder='passthrough')

In [51]:
models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Linear Regression': LinearRegression()
}

In [52]:
# Split & Train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [53]:
# Results storage
results = []

In [54]:
best_model = None
best_r2 = float('-inf')

for name, reg in models.items():
    
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', reg)
    ])
    
    pipe.fit(X_train, y_train)
    y_pred_log = pipe.predict(X_test)
    y_pred = np.expm1(y_pred_log)    # Convert back to original price
    y_test_actual = np.expm1(y_test) # Convert actual test values back

    r2 = r2_score(y_test_actual, y_pred)
    mae = mean_absolute_error(y_test_actual, y_pred)
    mse = mean_squared_error(y_test_actual, y_pred)
    rmse = np.sqrt(mse)
    
    results.append({
        'Model': name,
        'R²': round(r2, 4),
        'MAE': round(mae, 2),
        'MSE': round(mse, 2),
        'RMSE': round(rmse, 2)
    })

In [56]:
    # Save best model
    if r2 > best_r2:
        best_r2 = r2
        best_model = pipe
        best_model_name = name

# Create results DataFrame
results_df = pd.DataFrame(results)
print("\n🔍 Model Comparison :\n")
print(results_df)



🔍 Model Comparison :

               Model      R²        MAE           MSE       RMSE
0      Random Forest  0.6256  149342.66  6.983361e+10  264260.50
1  Gradient Boosting  0.5977  161783.79  7.503014e+10  273916.31
2      Decision Tree  0.4831  158273.40  9.640887e+10  310497.78
3  Linear Regression  0.5305  175002.48  8.756440e+10  295912.83


In [66]:
# Save best model to file
import joblib
model_filename = f"{best_model_name.lower().replace(' ', '_')}_best_model.pkl"
joblib.dump(best_model, model_filename)
print(f"\n✅ Best model '{best_model_name}' saved as: {model_filename}")



✅ Best model 'Linear Regression' saved as: linear_regression_best_model.pkl
