In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle

In [2]:
# Read the data
df = pd.read_csv("CAR DETAILS.csv")

In [3]:
# Drop duplicates
df.drop_duplicates(inplace=True)

In [4]:
# Feature engineering
df["car_age"] = 2023 - df["year"]
name = df["name"].str.split(" ", expand=True)
df["car_maker"] = name[0]
df["car_model"] = name[1]
df.drop(["name"], axis=1, inplace=True)

In [5]:
# Encoding categorical variables
df = pd.get_dummies(df, drop_first=True)

In [6]:
# Encoding target variable
encoder = LabelEncoder()
df["selling_price_encoded"] = encoder.fit_transform(df["selling_price"])

In [7]:
# Define features and target
X = df.drop(["selling_price", "selling_price_encoded"], axis=1)
y = df["selling_price_encoded"]

In [8]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Define regression models
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "KNeighbors Regressor": KNeighborsRegressor(),
    "Random Forest Regressor": RandomForestRegressor()
}

In [10]:
# Evaluate models
best_model = None
best_score = -np.inf
for name, model in models.items():
    print(f"Evaluating {name}:")
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    print("Training Score:", train_score)
    print("Testing Score:", test_score)
    if test_score > best_score:
        best_model = model
        best_score = test_score
    print()

Evaluating Linear Regression:
Training Score: 0.8982205264647569
Testing Score: 0.8597588068552894

Evaluating Ridge Regression:
Training Score: 0.8943175753802686
Testing Score: 0.8648793076897621

Evaluating Lasso Regression:
Training Score: 0.6933451809912715
Testing Score: 0.7000773476563515

Evaluating KNeighbors Regressor:
Training Score: 0.5934847108358561
Testing Score: 0.4235616941128246

Evaluating Random Forest Regressor:
Training Score: 0.9768760437142111
Testing Score: 0.8449970159894954



In [11]:
# Save the best model
with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

In [12]:
# Create a sample random dataframe
df_sample = df.sample(20, random_state=42)

In [13]:
# Split sample data into features and target
X_sample = df_sample.drop(["selling_price", "selling_price_encoded"], axis=1)
y_sample = df_sample["selling_price_encoded"]

In [14]:
# Load the best model
with open('best_model.pkl', 'rb') as file:
    best_model = pickle.load(file)

In [15]:
# Predict on sample data
y_pred_sample = best_model.predict(X_sample)

In [16]:
# Print scores for main data
print("Scores for main data:")
print("Training Score:", best_model.score(X_train, y_train))
print("Testing Score:", best_model.score(X_test, y_test))
print("R2 Score:", r2_score(y_test, best_model.predict(X_test)))
print("MAE:", mean_absolute_error(y_test, best_model.predict(X_test)))
print("MSE:", mean_squared_error(y_test, best_model.predict(X_test)))
print()

Scores for main data:
Training Score: 0.8943175753802686
Testing Score: 0.8648793076897621
R2 Score: 0.8648793076897621
MAE: 27.0891986534017
MSE: 1371.0038364436639



In [17]:
# Print scores for sample data
print("Scores for sample data:")
print("Training Score:", best_model.score(X_sample, y_sample))
print("Testing Score:", best_model.score(X_sample, y_pred_sample))
print("R2 Score:", r2_score(y_sample, y_pred_sample))
print("MAE:", mean_absolute_error(y_sample, y_pred_sample))
print("MSE:", mean_squared_error(y_sample, y_pred_sample))

Scores for sample data:
Training Score: 0.881743728340207
Testing Score: 1.0
R2 Score: 0.881743728340207
MAE: 25.07485299621758
MSE: 1035.6987746202374
