In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset (replace 'car_data.csv' with your actual file name)
df = pd.read_csv('cardekho_dataset.csv')

# Display the first few rows and check data information
print(df.head())
print(df.info())


   Unnamed: 0       car_name    brand     model  vehicle_age  km_driven  \
0           0    Maruti Alto   Maruti      Alto            9     120000   
1           1  Hyundai Grand  Hyundai     Grand            5      20000   
2           2    Hyundai i20  Hyundai       i20           11      60000   
3           3    Maruti Alto   Maruti      Alto            9      37000   
4           4  Ford Ecosport     Ford  Ecosport            6      30000   

  seller_type fuel_type transmission_type  mileage  engine  max_power  seats  \
0  Individual    Petrol            Manual    19.70     796      46.30      5   
1  Individual    Petrol            Manual    18.90    1197      82.00      5   
2  Individual    Petrol            Manual    17.00    1197      80.00      5   
3  Individual    Petrol            Manual    20.92     998      67.10      5   
4      Dealer    Diesel            Manual    22.77    1498      98.59      5   

   selling_price  
0         120000  
1         550000  
2         2

In [None]:
 # Remove redundant/irrelevant columns

In [None]:

df.dropna(inplace=True)


In [None]:
# Convert categorical features into dummy variables
df = pd.get_dummies(df, columns=['fuel_type'], drop_first=True)
# drop_first=True is used to avoid the dummy variable trap (multicollinearity)

In [None]:
# X = Independent features, y = Target variable
X = df.drop(['selling_price', 'car_name', 'brand', 'model', 'seller_type', 'transmission_type'], axis=1)
y = df['selling_price']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Initialize the model
rf_model = RandomForestRegressor()

# --- Hyperparameter Tuning (Optional but recommended for better results) ---
# Define a grid of hyperparameters to search
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]
max_features = ['sqrt']
max_depth = [int(x) for x in np.linspace(5, 30, num=6)]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

# Use Randomized Search CV to find the best hyperparameters
rf_random = RandomizedSearchCV(estimator=rf_model, param_distributions=random_grid,
                               scoring='neg_mean_squared_error', n_iter=10, cv=5,
                               verbose=2, random_state=42, n_jobs=1)

# Fit the model (this will train and tune the model)
rf_random.fit(X_train, y_train)

# Select the best model
best_rf_model = rf_random.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=25, max_features=sqrt, min_samples_leaf=5, min_samples_split=15, n_estimators=1100; total time=  18.3s
[CV] END max_depth=25, max_features=sqrt, min_samples_leaf=5, min_samples_split=15, n_estimators=1100; total time=  17.8s
[CV] END max_depth=25, max_features=sqrt, min_samples_leaf=5, min_samples_split=15, n_estimators=1100; total time=  18.4s
[CV] END max_depth=25, max_features=sqrt, min_samples_leaf=5, min_samples_split=15, n_estimators=1100; total time=  19.1s
[CV] END max_depth=25, max_features=sqrt, min_samples_leaf=5, min_samples_split=15, n_estimators=1100; total time=  19.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=  15.4s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=  15.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_esti

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import pickle

predictions = best_rf_model.predict(X_test)

print(f'R-squared Score: {r2_score(y_test, predictions):.2f}')
print(f'Mean Absolute Error (MAE): {mean_absolute_error(y_test, predictions):.2f}')
print(f'Root Mean Squared Error (RMSE): {np.sqrt(mean_squared_error(y_test, predictions)):.2f}')

# Save the list of features (column names of X_train)
model_features = list(X_train.columns)
with open('model_features.pkl', 'wb') as file:
    pickle.dump(model_features, file)

print("\n--- Saved Features ---")
print(model_features)

R-squared Score: 0.92
Mean Absolute Error (MAE): 110031.92
Root Mean Squared Error (RMSE): 250567.07

--- Saved Features ---
['Unnamed: 0', 'vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power', 'seats', 'fuel_type_Diesel', 'fuel_type_Electric', 'fuel_type_LPG', 'fuel_type_Petrol']


In [None]:
import pickle

# Load the feature list object
with open('model_features.pkl', 'rb') as file:
    loaded_features = pickle.load(file)

# 'loaded_features' is now a standard Python list of strings
print("--- Loaded Features ---")
print(loaded_features)
print(f"Total features: {len(loaded_features)}")
# Example Output: ['Present_Price', 'Kms_Driven', 'Owner', 'Car_Age', 'Fuel_Type_Diesel', ...]


--- Loaded Features ---
['Unnamed: 0', 'vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power', 'seats', 'fuel_type_Diesel', 'fuel_type_Electric', 'fuel_type_LPG', 'fuel_type_Petrol']
Total features: 11


In [None]:
import pickle

# Load the trained model object
with open('random_forest_regression_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Now, 'loaded_model' is your Random Forest Regressor instance.
# You can use it to make predictions:
# loaded_model.predict(new_data)

FileNotFoundError: [Errno 2] No such file or directory: 'random_forest_regression_model.pkl'

In [None]:
def predict_price(features):
    """
    Predicts car price based on input features.

    Args:
        features: A dictionary containing car features.

    Returns:
        The predicted car price.
    """
    # Create a DataFrame from the input features
    features_df = pd.DataFrame([features])

    # Ensure the columns match the trained model's features
    # Add missing dummy columns with value 0 if they are not present in the input
    for col in loaded_features:
        if col not in features_df.columns:
            features_df[col] = 0

    # Reorder columns to match the order used during training
    features_df = features_df[loaded_features]

    # Make prediction using the loaded model
    predicted_price = loaded_model.predict(features_df)

    # Return the predicted price (extract the single value from the array)
    return predicted_price[0]

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Create input widgets for numerical features
unnamed_0_widget = widgets.IntText(description='Unnamed: 0:', value=0)
vehicle_age_widget = widgets.IntSlider(description='Vehicle Age:', min=0, max=30, value=5)
km_driven_widget = widgets.IntText(description='Km Driven:', value=50000)
mileage_widget = widgets.FloatText(description='Mileage:', value=18.0)
engine_widget = widgets.IntText(description='Engine (CC):', value=1200)
max_power_widget = widgets.FloatText(description='Max Power (BHP):', value=80.0)
seats_widget = widgets.IntSlider(description='Seats:', min=2, max=10, value=5)

# Create a dropdown for fuel type
fuel_type_widget = widgets.Dropdown(
    options=['Petrol', 'Diesel', 'CNG', 'LPG', 'Electric'],
    value='Petrol',
    description='Fuel Type:',
)

# Arrange widgets in a VBox for better layout
input_widgets = widgets.VBox([
    unnamed_0_widget,
    vehicle_age_widget,
    km_driven_widget,
    mileage_widget,
    engine_widget,
    max_power_widget,
    seats_widget,
    fuel_type_widget
])

# Display the input widgets
display(input_widgets)

VBox(children=(IntText(value=0, description='Unnamed: 0:'), IntSlider(value=5, description='Vehicle Age:', max…