In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
import pickle

# Load the dataset
df = pd.read_csv("CAR DETAILS.csv")

# Drop the "Unnamed: 0" column before further preprocessing

# Drop duplicates
df.drop_duplicates(inplace=True)

# Calculate car age
df["car_age"] = 2023 - df["year"]

# Extract car maker and model from 'name' column
name = df["name"].str.split(" ", expand=True)
df["car_maker"] = name[0]
df["car_model"] = name[1]

# Drop unnecessary columns
df.drop(["name"], axis=1, inplace=True)

# One-hot encode categorical variables
df = pd.get_dummies(df, drop_first=True)

# Encode categorical columns
encoder = LabelEncoder()
df = df.apply(encoder.fit_transform)

# Split data into features and target
X = df.drop("selling_price", axis=1)
y = df["selling_price"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

# Define preprocessing steps
preprocessor = Pipeline([
    ('scaler', StandardScaler()),  # Scale features
])

# Define models
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "KNeighbors Regressor": KNeighborsRegressor(),
    "Random Forest Regressor": RandomForestRegressor(max_depth=15, random_state=7)
}

# Create a pipeline for each model
pipelines = {}
for name, model in models.items():
    pipelines[name] = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

# Train and evaluate models
def evaluate_models(pipelines, X_train, y_train, X_test, y_test):
    for name, pipeline in pipelines.items():
        print(f"Evaluating {name}:")
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print("MAE:", mae)
        print("MSE:", mse)
        print("R2 Score:", r2)
        print("Testing Score:", pipeline.score(X_test, y_test))
        print("Training Score: ", pipeline.score(X_train, y_train))
        print()

evaluate_models(pipelines, X_train, y_train, X_test, y_test)

# Select the best model
best_model = pipelines["Random Forest Regressor"]

# Save the best model using Pickle
with open('rfmodel_pipeline.pkl', 'wb') as file:
    pickle.dump(best_model, file)


Evaluating Linear Regression:
MAE: 60091537933690.99
MSE: 3.0595361955372445e+29
R2 Score: -3.000231662956363e+25
Testing Score: -3.000231662956363e+25
Training Score:  0.8268180599646024

Evaluating Ridge Regression:
MAE: 27.71866621560572
MSE: 1412.9450814355096
R2 Score: 0.8614442745431586
Testing Score: 0.8614442745431586
Training Score:  0.8989289804986564

Evaluating Lasso Regression:
MAE: 29.53420094767817
MSE: 1524.8586455662316
R2 Score: 0.850469845833701
Testing Score: 0.850469845833701
Training Score:  0.8827473823833063

Evaluating KNeighbors Regressor:
MAE: 31.22653631284916
MSE: 1815.9545810055865
R2 Score: 0.821924498217387
Testing Score: 0.821924498217387
Training Score:  0.8785677854101691

Evaluating Random Forest Regressor:
MAE: 31.597177879517922
MSE: 1901.9997462025267
R2 Score: 0.813486767379467
Testing Score: 0.813486767379467
Training Score:  0.9406781314068298

