In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the dataset
df = pd.read_csv('Downloads/CAR DETAILS.csv')

In [3]:
# Identify numerical and categorical columns
numerical_cols = ['year', 'km_driven']  # Add other numerical columns if present
categorical_cols = ['name', 'fuel', 'seller_type', 'transmission', 'owner']

In [4]:
# Handle missing values for numerical columns
imputer = SimpleImputer(strategy='mean')
df[numerical_cols] = imputer.fit_transform(df[numerical_cols])

In [5]:
# One-hot encode categorical variables
df = pd.get_dummies(df, columns=categorical_cols)

In [6]:
# Separate features and target variable
features = df.drop('selling_price', axis=1)
target = df['selling_price']

In [7]:
# Scale the numerical features
scaler = StandardScaler()
features[numerical_cols] = scaler.fit_transform(features[numerical_cols])

In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [9]:
# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

In [10]:
# Decision Tree Regressor
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
mse_dt = mean_squared_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)

In [11]:
# Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

In [23]:
# Gradient Boosting Regressor
gb_model = GradientBoostingRegressor(random_state=42, n_estimators=100)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

In [24]:
# Print evaluation metrics for all models
print(f'Linear Regression MSE: {mse_lr}, R2: {r2_lr}')
print(f'Decision Tree MSE: {mse_dt}, R2: {r2_dt}')
print(f'Random Forest MSE: {mse_rf}, R2: {r2_rf}')
print(f'Gradient Boosting MSE: {mse_gb}, R2: {r2_gb}')

Linear Regression MSE: 16404871.906838669, R2: 0.9964276563387626
Decision Tree MSE: 10000000000.0, R2: -1.1776114324600204
Random Forest MSE: 2154685000.0, R2: 0.5307933310649882
Gradient Boosting MSE: 692120.0283443156, R2: 0.9998492831513642


In [25]:
# Find the best model
model_performance = {
    'Linear Regression': {'MSE': mse_lr, 'R2': r2_lr},
    'Decision Tree': {'MSE': mse_dt, 'R2': r2_dt},
    'Random Forest': {'MSE': mse_rf, 'R2': r2_rf},
    'Gradient Boosting': {'MSE': mse_gb, 'R2': r2_gb}
}

In [26]:
best_model = min(model_performance, key=lambda x: model_performance[x]['MSE'])
print(f'The best model is {best_model} with MSE: {model_performance[best_model]["MSE"]} and R2: {model_performance[best_model]["R2"]}')

The best model is Gradient Boosting with MSE: 692120.0283443156 and R2: 0.9998492831513642


In [27]:
best_model_name = best_model  # Name of the best model
best_model_instance = None

if best_model_name == 'Linear Regression':
    best_model_instance = lr_model
elif best_model_name == 'Decision Tree':
    best_model_instance = dt_model
elif best_model_name == 'Random Forest':
    best_model_instance = rf_model
elif best_model_name == 'Gradient Boosting':
    best_model_instance = gb_model

In [28]:
import pickle
# Save the best model to a file
with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_model_instance, file)

print(f'{best_model_name} model saved as best_model.pkl')

Gradient Boosting model saved as best_model.pkl


In [29]:
# Load the saved model from the file
with open('best_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

print(f'{best_model_name} model loaded from best_model.pkl')

Gradient Boosting model loaded from best_model.pkl


In [30]:
# Use the loaded model to make predictions
# Example prediction on test data
y_pred_loaded = loaded_model.predict(X_test)

In [31]:
# Evaluate the loaded model
mse_loaded = mean_squared_error(y_test, y_pred_loaded)
r2_loaded = r2_score(y_test, y_pred_loaded)

In [32]:
print(f'Loaded {best_model_name} MSE: {mse_loaded}')
print(f'Loaded {best_model_name} R2: {r2_loaded}')

Loaded Gradient Boosting MSE: 692120.0283443156
Loaded Gradient Boosting R2: 0.9998492831513642


In [33]:
# Evaluate the model on the sampled data
from sklearn.metrics import mean_squared_error, r2_score

mse_sampled = mean_squared_error(target_sampled, y_pred_sampled)
r2_sampled = r2_score(target_sampled, y_pred_sampled)

print(f'MSE on sampled data: {mse_sampled}')
print(f'R2 on sampled data: {r2_sampled}')

NameError: name 'target_sampled' is not defined

In [34]:
import streamlit as st
import joblib

In [36]:
# Load the model
model = joblib.load("best_model.pkl")

In [37]:
streamlit run Data-Science-Capstone-Project.py

SyntaxError: invalid syntax (595297795.py, line 1)