In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Read the cars dataset (the path is relative to the notebook's working
# directory) and display the first five rows as a quick sanity check.
csv_path = r"Cars_dataset.csv"
data = pd.read_csv(csv_path)
data.head(n=5)

Unnamed: 0,v.id,on road old,on road now,years,km,rating,condition,economy,top speed,hp,torque,current price
0,1,535651,798186,3,78945,1,2,14,177,73,123,351318.0
1,2,591911,861056,6,117220,5,9,9,148,74,95,285001.5
2,3,686990,770762,2,132538,2,8,15,181,53,97,215386.0
3,4,573999,722381,4,101065,4,3,11,197,54,116,244295.5
4,5,691388,811335,6,61559,3,9,12,160,53,105,531114.5


In [3]:
# The CSV header carries stray whitespace around several column names, so trim
# it and print the header before and after for comparison.
original_columns = data.columns
print(original_columns)

data.columns = original_columns.str.strip()  # trims both leading and trailing whitespace
print(data.columns)

Index(['v.id', ' on road old', ' on road now', ' years', ' km    ', ' rating',
       ' condition', ' economy', ' top speed', ' hp ', ' torque',
       ' current price'],
      dtype='object')
Index(['v.id', 'on road old', 'on road now', 'years', 'km', 'rating',
       'condition', 'economy', 'top speed', 'hp', 'torque', 'current price'],
      dtype='object')


In [13]:
# Train and evaluate a Random Forest regressor that predicts "current price".
# Relies on `data` (with whitespace-stripped column names) from the cells above.

# One seed shared by the split and the model so the run is reproducible.
SEED = 42

# Select relevant features and target variable
features = ["on road old", "on road now", "years", "km", "rating", "condition",
            "economy", "top speed", "hp", "torque"]
target = "current price"

# Fail early with a readable message if the header-cleaning cell was skipped
# (the raw CSV column names carry surrounding whitespace).
missing_columns = [col for col in features + [target] if col not in data.columns]
if missing_columns:
    raise KeyError(
        f"Columns not found in data (were the headers stripped?): {missing_columns}"
    )

# Handle missing values (if any): only rows lacking a value the model actually
# uses are dropped, so a gap in an unused column (e.g. "v.id") does not discard
# training data. The result gets its own name; `data` itself is left untouched.
model_data = data.dropna(subset=features + [target])

# Split data into training and testing sets (80% train, 20% test)
X = model_data[features]
y = model_data[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Initialize Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=SEED)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on test data
y_pred = rf_model.predict(X_test)

# Evaluate the model on the held-out 20%
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the target, unlike MSE
r2 = r2_score(y_test, y_pred)

# Print results
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R² Score: {r2:.3f}")

# Feature importance, labelled by feature name and listed from most to least important
importances = pd.Series(rf_model.feature_importances_, index=features)
print("\nFeature Importance:")
print(importances.sort_values(ascending=False))


Mean Absolute Error (MAE): 14981.78
Mean Squared Error (MSE): 376172162.70
Root Mean Squared Error (RMSE): 19395.16
R² Score: 0.978

Feature Importance:
km             0.880075
on road now    0.055649
on road old    0.047959
condition      0.007023
top speed      0.002050
torque         0.001886
hp             0.001824
economy        0.001303
years          0.001164
rating         0.001066
dtype: float64


In [12]:
# Example: score hypothetical cars with the fitted model (fill with real values).
# Relies on `rf_model` and `features` from the training cell.
# NOTE(review): the hp/torque values below are far above those visible in the
# data preview (hp 53-74, torque 95-123), and the ratings are fractional while
# the preview shows integers. Tree ensembles cannot extrapolate beyond the
# range they were trained on, so confirm these inputs against the full dataset
# before trusting the predictions.
new_cars = pd.DataFrame({
    "on road old": [700000, 1100000],
    "on road now": [800000, 1200000],
    "years": [5, 3],
    "km": [40000, 25000],
    "rating": [4.5, 4.7],
    "condition": [8, 9],
    "economy": [18, 15],
    "top speed": [180, 200],
    "hp": [120, 150],
    "torque": [250, 300]
})

# Select the columns explicitly, in fit-time order: scikit-learn validates the
# feature names (and their order) against those seen during fit, so this keeps
# the cell working even if the dict above is edited or reordered.
new_predictions = rf_model.predict(new_cars[features])
print("\nPredicted Resale Prices:", new_predictions)

for car_number, price in enumerate(new_predictions, start=1):
    print(f"Car {car_number} predicted resale price: ₹ {price:.2f}")




Predicted Resale Prices: [545819.005 560994.265]
Car 1 predicted resale price: ₹ 545819.01
Car 2 predicted resale price: ₹ 560994.27
