In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [4]:
# Read the car listings CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer="car.csv")

# Preview the top of the table (five rows)
df.head(n=5)


Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [5]:
# Summarise column dtypes and non-null counts
df.info()

# Count missing values per column (isna is the same method as isnull)
df.isna().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [6]:
# Car_Name is a string column that is not encoded in the later get_dummies
# step, so it is removed before modelling.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after the column is gone is a no-op rather
# than a KeyError.
df = df.drop(columns=["Car_Name"], errors="ignore")


In [7]:
# Reference year used to convert the manufacturing year into an age feature.
# It is a fixed constant (not "today") so the engineered feature, and the
# model trained on it, stay reproducible; reuse the same value when building
# inputs for prediction.
REFERENCE_YEAR = 2025

# Guarded so that re-running this cell after "Year" has already been replaced
# does nothing instead of raising KeyError on df["Year"].
# assign() appends Car_Age as the last column, matching the original column order.
if "Year" in df.columns:
    df = df.assign(Car_Age=REFERENCE_YEAR - df["Year"]).drop(columns=["Year"])


In [8]:
# One-hot encode the three categorical columns; drop_first=True omits the
# first level of each column from the generated dummies.
df = pd.get_dummies(
    data=df,
    columns=["Fuel_Type", "Seller_Type", "Transmission"],
    drop_first=True,
)


In [9]:
# Target is the selling price; the features are every remaining column
y = df["Selling_Price"]
X = df.drop("Selling_Price", axis="columns")

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [10]:
# Baseline model: a 100-tree random forest with a fixed seed,
# fitted on the training split
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X=X_train, y=y_train)


In [11]:
# Score the held-out rows with the baseline forest
y_pred = rf.predict(X=X_test)


In [12]:
# Baseline test-set metrics: R² and RMSE (the square root of the MSE)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")


RMSE: 0.96
R² Score: 0.96


In [13]:
# Candidate hyperparameter values the randomized search samples from
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Randomized search: 10 sampled combinations, each evaluated with 3-fold
# cross-validation on the training split only (the test split stays untouched).
# No `scoring` is passed, so the estimator's default score is used.
random_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_distributions=param_grid,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Best parameters
print("Best Hyperparameters:", random_search.best_params_)

# RandomizedSearchCV defaults to refit=True, so best_estimator_ has already
# been retrained on the whole of X_train with the winning parameters.
# Calling .fit() on it again would only repeat that training, so it is not done here.
best_rf = random_search.best_estimator_

# Predict using best model
y_best_pred = best_rf.predict(X_test)

# Evaluate tuned model on the held-out test split
# NOTE(review): in the recorded run the tuned model's test RMSE (0.99) was
# slightly worse than the untuned baseline (0.96); the search did not improve
# on the defaults for this split.
mse_best = mean_squared_error(y_test, y_best_pred)
rmse_best = np.sqrt(mse_best)
r2_best = r2_score(y_test, y_best_pred)

print(f"Tuned Model RMSE: {rmse_best:.2f}")
print(f"Tuned Model R² Score: {r2_best:.2f}")


Best Hyperparameters: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': None}
Tuned Model RMSE: 0.99
Tuned Model R² Score: 0.96


In [14]:
import joblib

# Persist the tuned forest to disk so it can be reloaded later
joblib.dump(value=best_rf, filename="car_price_model.pkl")


['car_price_model.pkl']

In [16]:
# Sanity check: (rows, feature columns) of the training matrix. The second
# number is how many values a new sample must supply at prediction time.
print(X_train.shape)  # Check number of features used in training


(240, 8)


In [18]:
# `model` was never defined in this notebook, so this cell failed with
# NameError on a fresh kernel. Bind it explicitly to the tuned forest, which is
# the same estimator that was saved to car_price_model.pkl.
model = best_rf

# Ensure the feature names match those used during training
feature_names = X_train.columns  # Get column names from training data

# Define the new car data: one row with values in the same order as feature_names.
# NOTE(review): presumably Present_Price, Kms_Driven, Owner, Car_Age followed by
# the one-hot dummy columns; verify against list(X_train.columns) before
# changing any of these values.
new_data = [[5.59, 27000, 0, 5, 1, 0, 1, 0]]  # Adjust the features accordingly

# Convert new data into a DataFrame with proper column names.
# pandas raises ValueError here if the row length does not match the column count.
new_data_df = pd.DataFrame(new_data, columns=feature_names)

# Predict selling price
predicted_price = model.predict(new_data_df)

# Display result
print(f"Predicted Selling Price: {predicted_price[0]:.2f} Lakhs")




Predicted Selling Price: 4.20 Lakhs
