In [61]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [62]:
df = pd.read_csv('../Data/cardekho_dataset.csv')
df.shape

(15411, 14)

In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15411 entries, 0 to 15410
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         15411 non-null  int64  
 1   car_name           15411 non-null  object 
 2   brand              15411 non-null  object 
 3   model              15411 non-null  object 
 4   vehicle_age        15411 non-null  int64  
 5   km_driven          15411 non-null  int64  
 6   seller_type        15411 non-null  object 
 7   fuel_type          15411 non-null  object 
 8   transmission_type  15411 non-null  object 
 9   mileage            15411 non-null  float64
 10  engine             15411 non-null  int64  
 11  max_power          15411 non-null  float64
 12  seats              15411 non-null  int64  
 13  selling_price      15411 non-null  int64  
dtypes: float64(2), int64(6), object(6)
memory usage: 1.6+ MB


In [64]:
num_features = df.select_dtypes(include=[np.number]).columns
print('Numerical Features:',num_features)
print(num_features)
cat_features = df.select_dtypes(include=['object']).columns
print('Categorical Features:',cat_features)
print(cat_features)
discrete_features = [feature for feature in num_features if len(df[feature].unique())<25]
print('Discrete Features Count:',len(discrete_features))
print(discrete_features)
continuous_features = [feature for feature in num_features if feature not in discrete_features]
print('Continuous Features Count:',len(continuous_features))
print(continuous_features)

Numerical Features: Index(['Unnamed: 0', 'vehicle_age', 'km_driven', 'mileage', 'engine',
       'max_power', 'seats', 'selling_price'],
      dtype='object')
Index(['Unnamed: 0', 'vehicle_age', 'km_driven', 'mileage', 'engine',
       'max_power', 'seats', 'selling_price'],
      dtype='object')
Categorical Features: Index(['car_name', 'brand', 'model', 'seller_type', 'fuel_type',
       'transmission_type'],
      dtype='object')
Index(['car_name', 'brand', 'model', 'seller_type', 'fuel_type',
       'transmission_type'],
      dtype='object')
Discrete Features Count: 2
['vehicle_age', 'seats']
Continuous Features Count: 6
['Unnamed: 0', 'km_driven', 'mileage', 'engine', 'max_power', 'selling_price']


In [65]:
df.drop(['Unnamed: 0', 'car_name'], axis=1, inplace=True)

In [66]:
X = df.drop(['selling_price'], axis=1)
y = df['selling_price']

In [67]:
X.shape, y.shape

((15411, 11), (15411,))

In [68]:
y.head()

0    120000
1    550000
2    215000
3    226000
4    570000
Name: selling_price, dtype: int64

In [69]:
len(df['model'].unique())

120

In [70]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X['model'] = le.fit_transform(X['model'])
X['brand'] = le.fit_transform(X['brand'])

In [71]:
X.head()

Unnamed: 0,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,18,7,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,8,54,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,8,118,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,18,7,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,6,38,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [72]:
X.head()

Unnamed: 0,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,18,7,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,8,54,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,8,118,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,18,7,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,6,38,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [79]:
num_features = num_features.delete(0)
num_features = num_features.delete(-1)
num_features

Index(['mileage', 'engine', 'max_power', 'seats'], dtype='object')

In [87]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

one_hot_columns = ['seller_type', 'fuel_type', 'transmission_type']

standard_scaler = Pipeline(steps=[('scaler', StandardScaler())])
one_hot_encoder = Pipeline(steps=[('onehot', OneHotEncoder(drop='first'))])

preprocessor = ColumnTransformer(
    [
        ('num', standard_scaler, num_features),
        ('cat', one_hot_encoder, one_hot_columns)
    ],
    remainder='passthrough'
)

In [88]:
X_processed = preprocessor.fit_transform(X)

In [89]:
X_processed
pd.DataFrame(X_processed).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,-0.000276,-1.324259,-1.263352,-0.403022,1.0,0.0,0.0,0.0,0.0,1.0,1.0,18.0,7.0,9.0,120000.0
1,-0.192071,-0.554718,-0.432571,-0.403022,1.0,0.0,0.0,0.0,0.0,1.0,1.0,8.0,54.0,5.0,20000.0
2,-0.647583,-0.554718,-0.479113,-0.403022,1.0,0.0,0.0,0.0,0.0,1.0,1.0,8.0,118.0,11.0,60000.0
3,0.292211,-0.93661,-0.779312,-0.403022,1.0,0.0,0.0,0.0,0.0,1.0,1.0,18.0,7.0,9.0,37000.0
4,0.735736,0.022918,-0.046502,-0.403022,0.0,0.0,1.0,0.0,0.0,0.0,1.0,6.0,38.0,6.0,30000.0


In [90]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train.shape, X_test.shape

((12328, 11), (3083, 11))

In [91]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestRegressor()

param_dist = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [3, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'min_samples_split': [2, 5, 10, 15, 100],
    'min_samples_leaf': [1, 2, 5, 10]
}

rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=10, cv=5, verbose=2, random_state=42, n_jobs=-1)

rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   0.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   0.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   0.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time=   0.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time=   0.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   0.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   0.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimator

ValueError: 
All the 50 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/satviksawhney/Downloads/AIML/Machine Learning/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/satviksawhney/Downloads/AIML/Machine Learning/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/satviksawhney/Downloads/AIML/Machine Learning/.venv/lib/python3.12/site-packages/sklearn/ensemble/_forest.py", line 360, in fit
    X, y = validate_data(
           ^^^^^^^^^^^^^^
  File "/Users/satviksawhney/Downloads/AIML/Machine Learning/.venv/lib/python3.12/site-packages/sklearn/utils/validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/satviksawhney/Downloads/AIML/Machine Learning/.venv/lib/python3.12/site-packages/sklearn/utils/validation.py", line 1370, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "/Users/satviksawhney/Downloads/AIML/Machine Learning/.venv/lib/python3.12/site-packages/sklearn/utils/validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/satviksawhney/Downloads/AIML/Machine Learning/.venv/lib/python3.12/site-packages/sklearn/utils/_array_api.py", line 839, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/satviksawhney/Downloads/AIML/Machine Learning/.venv/lib/python3.12/site-packages/pandas/core/generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'Dealer'

--------------------------------------------------------------------------------
11 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/satviksawhney/Downloads/AIML/Machine Learning/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/satviksawhney/Downloads/AIML/Machine Learning/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/Users/satviksawhney/Downloads/AIML/Machine Learning/.venv/lib/python3.12/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/Users/satviksawhney/Downloads/AIML/Machine Learning/.venv/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestRegressor must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto' instead.

--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/satviksawhney/Downloads/AIML/Machine Learning/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/satviksawhney/Downloads/AIML/Machine Learning/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/Users/satviksawhney/Downloads/AIML/Machine Learning/.venv/lib/python3.12/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/Users/satviksawhney/Downloads/AIML/Machine Learning/.venv/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestRegressor must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'log2', 'sqrt'} or None. Got 'auto' instead.
