In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle

In [2]:
# Load the dataset
df = pd.read_csv("CAR DETAILS.csv")
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [3]:
df.shape

(4340, 8)

In [4]:
# Drop duplicates
df.drop_duplicates(inplace=True)
df.shape

(3577, 8)

In [5]:
# Calculate car age
df["car_age"] = 2023 - df["year"]
df.columns


Index(['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner', 'car_age'],
      dtype='object')

In [6]:
name = df["name"].str.split(" ", expand = True)
df["car_maker"] = name[0]
df["car_model"] = name[1]

In [7]:
df.drop(["name"], axis=1, inplace=True)

In [8]:
# Encode categorical columns
encoder = LabelEncoder()
df_encoded = df.apply(encoder.fit_transform)
df_encoded.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner,car_age,car_maker,car_model
0,13,17,474,4,1,1,0,13,18,6
1,13,60,335,4,1,1,0,13,18,164
2,18,236,607,1,1,1,0,8,10,161
3,23,114,309,4,1,1,0,3,5,125
4,20,193,681,1,1,1,2,6,9,17


In [9]:
# Split data into features and target
X = df_encoded.drop("selling_price", axis=1)
y = df_encoded["selling_price"]

In [10]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [11]:
# Define models
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "KNeighbors Regressor": KNeighborsRegressor(),
    "Random Forest Regressor": RandomForestRegressor()
}

In [12]:
df_encoded.columns

Index(['year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner', 'car_age', 'car_maker', 'car_model'],
      dtype='object')

In [14]:
# Train and evaluate models
for name, model in models.items():
    pipeline = Pipeline([
        ("preprocessor", ColumnTransformer([
            ("encoder", OneHotEncoder(handle_unknown='ignore'), ["fuel", "seller_type", "transmission", "owner", "car_maker", "car_model"])
        ], remainder="passthrough")),
        ("model", model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(f"Model: {name}")
    print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred)}")
    print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred)}")
    print(f"R-squared: {r2_score(y_test, y_pred)}")
    print("--------------------------")

Model: Linear Regression
Mean Absolute Error: 24.775989533556956
Mean Squared Error: 1103.6906297171251
R-squared: 0.8840296352814718
--------------------------
Model: Ridge Regression
Mean Absolute Error: 25.00606751197837
Mean Squared Error: 1096.3203913240343
R-squared: 0.8848040635600994
--------------------------
Model: Lasso Regression
Mean Absolute Error: 42.18645601036093
Mean Squared Error: 2905.3379453977523
R-squared: 0.6947214263794987
--------------------------
Model: KNeighbors Regressor
Mean Absolute Error: 48.59581005586592
Mean Squared Error: 4512.514134078212
R-squared: 0.5258472837984594
--------------------------
Model: Random Forest Regressor
Mean Absolute Error: 27.15528758202536
Mean Squared Error: 1410.7428727085082
R-squared: 0.8517661008737551
--------------------------


In [27]:
# Save the best model
best_model = Pipeline([
    ("preprocessor", ColumnTransformer([
        ("encoder", OneHotEncoder(handle_unknown='ignore'), ["fuel", "seller_type", "transmission", "owner", "car_maker", "car_model"])
    ], remainder="passthrough")),
    ("model", Lasso())
])

best_model.fit(X, y)
with open('best_model1.pkl', 'wb') as file:
    pickle.dump(best_model, file)