In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# Load the data from CSV
data = pd.read_csv('car.csv')

# Calculate Age (years of service) based on current year 2026
data['Age'] = 2026 - data['Year']

# Drop unnecessary columns
data = data.drop(['Car_Name', 'Year'], axis=1)

# Define features and target
X = data.drop('Selling_Price', axis=1)
y = data['Selling_Price']

# Identify categorical columns
categorical_features = ['Fuel_Type', 'Seller_Type', 'Transmission']

# Preprocessing pipeline: One-hot encode categoricals
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'  # Keep numerical features as is
)

# Model pipeline: Preprocessor + Random Forest Regressor
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.2f}")
print(f"R² Score: {r2:.2f}")

# Feature importances
# Get the feature names after preprocessing
ohe = model.named_steps['preprocessor'].named_transformers_['cat']
feature_names = ohe.get_feature_names_out(categorical_features)
remainder_features = [col for col in X.columns if col not in categorical_features]
all_feature_names = np.concatenate([feature_names, remainder_features])

importances = model.named_steps['regressor'].feature_importances_
importance_df = pd.DataFrame({'Feature': all_feature_names, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print("\nFeature Importances:")
print(importance_df)

# Example prediction
# Sample input: Present_Price=5.59, Kms_Driven=27000, Fuel_Type='Petrol', Seller_Type='Dealer', Transmission='Manual', Owner=0, Age=12
sample_input = pd.DataFrame({
    'Present_Price': [5.59],
    'Kms_Driven': [27000],
    'Fuel_Type': ['Petrol'],
    'Seller_Type': ['Dealer'],
    'Transmission': ['Manual'],
    'Owner': [0],
    'Age': [12]
})
predicted_price = model.predict(sample_input)
print(f"\nPredicted Selling Price for sample input: {predicted_price[0]:.2f}")

Mean Absolute Error: 0.64
R² Score: 0.96

Feature Importances:
                  Feature  Importance
4           Present_Price    0.886416
7                     Age    0.059439
5              Kms_Driven    0.035097
3     Transmission_Manual    0.008867
1        Fuel_Type_Petrol    0.003702
0        Fuel_Type_Diesel    0.003175
2  Seller_Type_Individual    0.001943
6                   Owner    0.001360

Predicted Selling Price for sample input: 3.82
