In [295]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [296]:
df = pd.read_csv('cleanedData.csv', low_memory=False)

In [297]:
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.sample(3)

Unnamed: 0,Body Type,Driven(KM),No of Owner,Brand(Model),Variant Name,Price(Lakhs),City,Fuel Type,RTO,Transmission,...,Steering Type,Turning Radius,Front Brake Type,Rear Brake Type,Top Speed(KMPH),Acceleration,No Door Numbers,Cargo Volumn,Wheel Size,Age(Months)
4607,Hatchback,40724.0,3,Hyundai i10,Magna,3.11,Hyderabad,Petrol,AP28,Manual,...,Power,,Ventilated Disc,Drum,,,4.0,,13.0,12.0
7250,Sedan,69000.0,1,Hyundai Verna,1.6 CRDI,2.5,Kolkata,Diesel,WB02,Manual,...,Power,5.2,Disc,Disc,,,4.0,,15.0,11.0
1713,Sedan,143185.0,1,Honda Amaze,S i-Dtech,3.7,Chennai,Diesel,TN14,Manual,...,Power,4.7,Disc,Drum,170.0,17.0,4.0,400.0,,10.0


In [298]:
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

In [299]:
X = df.drop(columns=['Price(Lakhs)'])
y = df['Price(Lakhs)']

In [300]:
y_log = np.log(y)

In [301]:
X_train, X_test, y_train_log, y_test_log = train_test_split(X, y_log, test_size = 0.2, random_state = 42)
X_train.shape, X_test.shape

((6695, 54), (1674, 54))

In [302]:
# The columns with more than 80% same values has no proper contribution to the model.
# So, the columns have to be dropped

X_train_same_value_threshold = 0.8
X_train_same_value_drop_list = [col for col in X_train.columns if X_train[col].value_counts(normalize=True).values[0] > X_train_same_value_threshold]
X_train_same_value_drop_list

['Steering',
 'Front',
 'Lights',
 'Rear',
 'Opener',
 'Beams',
 'Bag',
 'Seats',
 'Drive Type']

In [303]:
X_train = X_train.drop(columns=X_train_same_value_drop_list, axis=1)
X_test = X_test.drop(columns=X_train_same_value_drop_list, axis=1)

In [304]:
# The columns with more than 50% missing values have no proper contribution to the model.
# So, the columns have to be dropped

X_train_missing_value_threshold = 0.5
X_train_missing_value_drop_list = [col for col in X_train.columns if X_train[col].isna().mean() > X_train_missing_value_threshold]
X_train_missing_value_drop_list

[]

In [305]:
# IMPUTING MISSING VALUES

In [306]:
num_col_list = X_train.select_dtypes(include=['float64']).columns.to_list()

In [307]:
# Filling missing values with the KNNImputing method
kimputer = KNNImputer(weights='distance')

X_train[num_col_list] = kimputer.fit_transform(X_train[num_col_list])
X_test[num_col_list] = kimputer.transform(X_test[num_col_list])

In [308]:
cat_col_list = X_train.select_dtypes(include=['object']).columns.to_list()

In [309]:
for col in cat_col_list:
    X_train.loc[X_train[col].isna(), col] = X_train[col].dropna().sample(X_train[col].isna().sum()).values
    X_test.loc[X_test[col].isna(), col] = X_train[col].dropna().sample(X_test[col].isna().sum()).values

In [310]:
# DETECTING AND CAPPING OUTLIERS

In [311]:
for col in num_col_list:

    # Quantiles range calculation
    Q1 = X_train[col].quantile(0.25)
    Q3 = X_train[col].quantile(0.75)

    # Inter Quartile Range calculation
    IQR = Q3 - Q1

    # Minimum and Maximum values calculations for outliers
    lower_value = Q1 - 1.5 * IQR
    upper_value = Q3 + 1.5 * IQR

    # Capping outliers with the max and min values for Training data
    X_train.loc[X_train[col] > upper_value, col] = upper_value
    X_train.loc[X_train[col] < lower_value, col] = lower_value

    # Capping outliers with the max and min values for Testing data
    X_test.loc[X_test[col] > upper_value, col] = upper_value
    X_test.loc[X_test[col] < lower_value, col] = lower_value

In [312]:
# CORRELATION ANALYSIS

In [313]:
corr_matrix = pd.concat([X_train[num_col_list], y_train_log], axis=1).corr().abs()
threshold = 0.8

# Find columns that are highly correlated
high_corr_pairs = np.where(corr_matrix > threshold)
high_corr_pairs = [(corr_matrix.index[x], corr_matrix.columns[y]) for x, y in zip(*high_corr_pairs) if x != y and x < y]
high_corr_pairs

[('Max Power', 'Torque'),
 ('Max Power', 'Displacement'),
 ('Max Power', 'Length(mm)'),
 ('Max Power', 'Width(mm)'),
 ('Max Power', 'Wheel Base(mm)'),
 ('Max Power', 'Kerb Weight(KG)'),
 ('Max Power', 'Turning Radius'),
 ('Max Power', 'Top Speed(KMPH)'),
 ('Max Power', 'Acceleration'),
 ('Max Power', 'Wheel Size'),
 ('Torque', 'Displacement'),
 ('Torque', 'Kerb Weight(KG)'),
 ('Displacement', 'Kerb Weight(KG)'),
 ('Displacement', 'Turning Radius'),
 ('Length(mm)', 'Width(mm)'),
 ('Length(mm)', 'Wheel Base(mm)'),
 ('Length(mm)', 'Turning Radius'),
 ('Width(mm)', 'Wheel Base(mm)'),
 ('Width(mm)', 'Front Tread'),
 ('Width(mm)', 'Rear Tread'),
 ('Width(mm)', 'Wheel Size'),
 ('Wheel Base(mm)', 'Turning Radius'),
 ('Front Tread', 'Rear Tread'),
 ('Kerb Weight(KG)', 'Turning Radius'),
 ('Top Speed(KMPH)', 'Acceleration')]

In [314]:
# By observing the above highly correlating columns have to be dropped.
# No of Cylinder and No Door Number columns not relating to the Target column. So, the columns have to be dropped. 

cols_drop_list = ['Turning Radius', 'Kerb Weight(KG)', 'Front Tread', 'Rear Tread', 'Wheel Size', 
                  'No Door Numbers', 'No of Cylinder', 'Wheel Base(mm)']
X_train = X_train.drop(columns = cols_drop_list, axis=1)
X_test = X_test.drop(columns = cols_drop_list, axis=1)

In [315]:
X_train['Area'] = 2 * (X_train['Length(mm)'] + X_train['Width(mm)'])
X_test['Area'] = 2 * (X_test['Length(mm)'] + X_test['Width(mm)'])

X_train = X_train.drop(columns = ['Length(mm)', 'Width(mm)'], axis=1)
X_test = X_test.drop(columns = ['Length(mm)', 'Width(mm)'], axis=1)

In [316]:
cat_cols_drop_list = ['Front Brake Type', 'Rear Brake Type']

In [317]:
# By domain knowledge, the above categorical columns will not affect the price much. So, the columns have to be dropped.

X_train = X_train.drop(columns = cat_cols_drop_list, axis=1)
X_test = X_test.drop(columns = cat_cols_drop_list, axis=1)

In [318]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [319]:
# SCALING NUMERICAL COLUMNS

In [320]:
scaling_list = X_train.select_dtypes(include=['float64']).columns.to_list()

scaler = StandardScaler()
X_train[scaling_list] = scaler.fit_transform(X_train[scaling_list])
X_test[scaling_list] = scaler.transform(X_test[scaling_list])

In [321]:
# ENCODING THE CATEGORICAL COLUMNS

In [322]:
import category_encoders as ce

target_en_cols = ['Brand(Model)', 'Variant Name', 'RTO', 'Color', 'Engine Type', 'Fuel Suppy System']
for col in target_en_cols:
    mean_target = df.groupby(col)['Price(Lakhs)'].mean()
    X_train[col] = X_train[col].map(mean_target)
    X_test[col] = X_test[col].map(mean_target)

In [323]:
ohe_en_cols = ['Body Type', 'Fuel Type', 'Transmission', 'City', 'Value Configuration', 'Light', 'Headrest', 'Antenna', 'Headlamps', 'System', 
               'Wheel', 'Control', 'Warning', 'Turbo Charger', 'Steering Type']

In [324]:
ohencoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

X_train_encoded = ohencoder.fit_transform(X_train[ohe_en_cols])
X_test_encoded = ohencoder.transform(X_test[ohe_en_cols])

encoded_columns = ohencoder.get_feature_names_out(ohe_en_cols)

X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_columns, index=X_train.index)
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_columns, index=X_test.index)

X_train = pd.concat([X_train.drop(ohe_en_cols, axis=1), X_train_encoded_df], axis=1)
X_test = pd.concat([X_test.drop(ohe_en_cols, axis=1), X_test_encoded_df], axis=1)

In [325]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

rfregressor = RandomForestRegressor(n_estimators=150, max_depth=10, min_samples_leaf=4, random_state=42)
rfregressor.fit(X_train, y_train_log)

In [326]:
y_rfregress_pred_log = rfregressor.predict(X_test)
y_train_rfregress_pred_log = rfregressor.predict(X_train)

In [327]:
y_rfregress_pred = np.exp(y_rfregress_pred_log)
y_train_rfregress_pred = np.exp(y_train_rfregress_pred_log)
y_test = np.exp(y_test_log)
y_train = np.exp(y_train_log)

In [328]:
print('MAE: ', mean_absolute_error(y_test, y_rfregress_pred))
print('MSE: ', mean_squared_error(y_test, y_rfregress_pred))
print('R2 Score: ', r2_score(y_test, y_rfregress_pred))

MAE:  1.0886367728432988
MSE:  8.882072630592045
R2 Score:  0.9342912888209275


In [329]:
print(mean_squared_error(y_train, y_train_rfregress_pred))
print(mean_squared_error(y_test, y_rfregress_pred))

print(r2_score(y_train, y_train_rfregress_pred))
print(r2_score(y_test, y_rfregress_pred))

4.153116867619554
8.882072630592045
0.9611859153062712
0.9342912888209275


In [330]:
# GRID SEARCH 

In [352]:
from scipy.stats import randint
param_dist = {
    'n_estimators': randint(100, 200),     
    'max_depth': [None, 10, 20, 30],              
    'bootstrap': [True, False]             
    }

In [353]:
from sklearn.model_selection import RandomizedSearchCV
rf = RandomForestRegressor()
random_search = RandomizedSearchCV(estimator=rf, 
                                   param_distributions=param_dist, 
                                   n_iter=50,  
                                   cv=5,       
                                   verbose=2, 
                                   random_state=42, 
                                   n_jobs=2)

random_search.fit(X_train, y_train)

print(f"Best Parameters: {random_search.best_params_}")
print(f"Best Score: {random_search.best_score_}")

Fitting 5 folds for each of 50 candidates, totalling 250 fits


exception calling callback for <Future at 0x1fcb814ab70 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "C:\Users\HP\Documents\Python\Lib\site-packages\joblib\externals\loky\_base.py", line 625, in _invoke_callbacks
  File "C:\Users\HP\Documents\Python\Lib\site-packages\joblib\parallel.py", line 360, in __call__
    n_jobs=default_parallel_config["n_jobs"],
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\HP\Documents\Python\Lib\site-packages\joblib\parallel.py", line 797, in dispatch_next
    # main thread when calling `get_result`.
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\HP\Documents\Python\Lib\site-packages\joblib\parallel.py", line 864, in dispatch_one_batch
    #            This method can be called either in the main thread         #
                ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\HP\Documents\Python\Lib\site-packages\joblib\parallel.py", line 782, in _dispatch
  File "C:\Users\HP\Documents\Python\Lib\

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.


In [349]:
# Feature Selection

In [350]:
cat_cols = ['Light', 'Headrest', 'Antenna', 'Headlamps', 'System', 'Variant Name', 'RTO', 'Engine Type', 'Fuel Suppy System', 
            'Wheel', 'Control', 'Warning', 'Turbo Charger', 'Steering Type', 'Color', 'Value Configuration', 'Body Type']
num_cols = ['Mileage', 'Torque', 'Displacement', 'Acceleration', 'Cargo Volumn', 'No of Owner', 'Top Speed', 'Height']

In [351]:
cat_pattern = '|'.join(cat_cols)
num_pattern = '|'.join(num_cols)

X_train = X_train.drop(X_train.filter(regex=cat_pattern).columns, axis=1)
X_test = X_test.drop(X_test.filter(regex=cat_pattern).columns, axis=1)

X_train = X_train.drop(X_train.filter(regex=num_pattern).columns, axis=1)
X_test = X_test.drop(X_test.filter(regex=num_pattern).columns, axis=1)
