In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
df = pd.read_csv('cleanedData.csv', low_memory=False)

In [3]:
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.sample(3)

Unnamed: 0,Body Type,Driven(KM),No of Owner,Brand(Model),Variant Name,Price(Lakhs),City,Fuel Type,RTO,Transmission,...,Steering Type,Turning Radius,Front Brake Type,Rear Brake Type,Top Speed(KMPH),Acceleration,No Door Numbers,Cargo Volumn,Wheel Size,Age(Months)
2472,SUV,49000.0,1,BMW X1,sDrive 20D xLine,27.0,Chennai,Diesel,TN02,Automatic,...,Power,5.8,Ventilated Discs,Ventilated Discs,205.0,7.9,5.0,420.0,17.0,6.0
1373,MUV,73000.0,1,Mahindra Xylo,H8,7.75,Bangalore,Diesel,KA05,Manual,...,Power,5.5,Disc,Drum,160.0,16.0,5.0,376.0,,8.0
3381,Hatchback,20755.0,2,Maruti Baleno,1.2 Delta,5.03,Delhi,Petrol,DL2C,Manual,...,Power,4.9,Disc,Drum,180.0,12.36,5.0,339.0,,8.0


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

In [5]:
X = df.drop(columns=['Price(Lakhs)'])
y = df['Price(Lakhs)']

In [6]:
y_log = np.log(y)

In [7]:
X_train, X_test, y_train_log, y_test_log = train_test_split(X, y_log, test_size = 0.2, random_state = 42)
X_train.shape, X_test.shape

((6695, 54), (1674, 54))

In [8]:
# The columns with more than 80% same values has no proper contribution to the model.
# So, the columns have to be dropped

X_train_same_value_threshold = 0.8
X_train_same_value_drop_list = [col for col in X_train.columns if X_train[col].value_counts(normalize=True).values[0] > X_train_same_value_threshold]
X_train_same_value_drop_list

['Steering',
 'Front',
 'Lights',
 'Rear',
 'Opener',
 'Beams',
 'Bag',
 'Seats',
 'Drive Type']

In [9]:
X_train = X_train.drop(columns=X_train_same_value_drop_list, axis=1)
X_test = X_test.drop(columns=X_train_same_value_drop_list, axis=1)

In [10]:
# The columns with more than 50% missing values have no proper contribution to the model.
# So, the columns have to be dropped

X_train_missing_value_threshold = 0.5
X_train_missing_value_drop_list = [col for col in X_train.columns if X_train[col].isna().mean() > X_train_missing_value_threshold]
X_train_missing_value_drop_list

[]

In [11]:
# IMPUTING MISSING VALUES

In [12]:
num_col_list = X_train.select_dtypes(include=['float64']).columns.to_list()

In [13]:
# Filling missing values with the KNNImputing method
kimputer = KNNImputer(weights='distance')

X_train[num_col_list] = kimputer.fit_transform(X_train[num_col_list])
X_test[num_col_list] = kimputer.transform(X_test[num_col_list])

In [14]:
cat_col_list = X_train.select_dtypes(include=['object']).columns.to_list()

In [15]:
for col in cat_col_list:
    X_train.loc[X_train[col].isna(), col] = X_train[col].dropna().sample(X_train[col].isna().sum()).values
    X_test.loc[X_test[col].isna(), col] = X_train[col].dropna().sample(X_test[col].isna().sum()).values

In [16]:
# DETECTING AND CAPPING OUTLIERS

In [17]:
for col in num_col_list:

    # Quantiles range calculation
    Q1 = X_train[col].quantile(0.25)
    Q3 = X_train[col].quantile(0.75)

    # Inter Quartile Range calculation
    IQR = Q3 - Q1

    # Minimum and Maximum values calculations for outliers
    lower_value = Q1 - 1.5 * IQR
    upper_value = Q3 + 1.5 * IQR

    # Capping outliers with the max and min values for Training data
    X_train.loc[X_train[col] > upper_value, col] = upper_value
    X_train.loc[X_train[col] < lower_value, col] = lower_value

    # Capping outliers with the max and min values for Testing data
    X_test.loc[X_test[col] > upper_value, col] = upper_value
    X_test.loc[X_test[col] < lower_value, col] = lower_value

In [18]:
# CORRELATION ANALYSIS

In [19]:
corr_matrix = pd.concat([X_train[num_col_list], y_train_log], axis=1).corr().abs()
threshold = 0.8

# Find columns that are highly correlated
high_corr_pairs = np.where(corr_matrix > threshold)
high_corr_pairs = [(corr_matrix.index[x], corr_matrix.columns[y]) for x, y in zip(*high_corr_pairs) if x != y and x < y]
high_corr_pairs

[('Max Power', 'Torque'),
 ('Max Power', 'Displacement'),
 ('Max Power', 'Length(mm)'),
 ('Max Power', 'Width(mm)'),
 ('Max Power', 'Wheel Base(mm)'),
 ('Max Power', 'Kerb Weight(KG)'),
 ('Max Power', 'Turning Radius'),
 ('Max Power', 'Top Speed(KMPH)'),
 ('Max Power', 'Acceleration'),
 ('Max Power', 'Wheel Size'),
 ('Torque', 'Displacement'),
 ('Torque', 'Kerb Weight(KG)'),
 ('Displacement', 'Kerb Weight(KG)'),
 ('Displacement', 'Turning Radius'),
 ('Length(mm)', 'Width(mm)'),
 ('Length(mm)', 'Wheel Base(mm)'),
 ('Length(mm)', 'Turning Radius'),
 ('Width(mm)', 'Wheel Base(mm)'),
 ('Width(mm)', 'Front Tread'),
 ('Width(mm)', 'Rear Tread'),
 ('Width(mm)', 'Wheel Size'),
 ('Wheel Base(mm)', 'Turning Radius'),
 ('Front Tread', 'Rear Tread'),
 ('Kerb Weight(KG)', 'Turning Radius'),
 ('Top Speed(KMPH)', 'Acceleration')]

In [20]:
# By observing the above highly correlating columns have to be dropped.
# No of Cylinder and No Door Number columns not relating to the Target column. So, the columns have to be dropped. 

cols_drop_list = ['Turning Radius', 'Kerb Weight(KG)', 'Front Tread', 'Rear Tread', 'Wheel Size', 
                  'No Door Numbers', 'No of Cylinder', 'Wheel Base(mm)']
X_train = X_train.drop(columns = cols_drop_list, axis=1)
X_test = X_test.drop(columns = cols_drop_list, axis=1)

In [21]:
X_train['Area'] = 2 * (X_train['Length(mm)'] + X_train['Width(mm)'])
X_test['Area'] = 2 * (X_test['Length(mm)'] + X_test['Width(mm)'])

X_train = X_train.drop(columns = ['Length(mm)', 'Width(mm)'], axis=1)
X_test = X_test.drop(columns = ['Length(mm)', 'Width(mm)'], axis=1)

In [22]:
cat_cols_drop_list = ['Front Brake Type', 'Rear Brake Type']

In [23]:
# By domain knowledge, the above categorical columns will not affect the price much. So, the columns have to be dropped.

X_train = X_train.drop(columns = cat_cols_drop_list, axis=1)
X_test = X_test.drop(columns = cat_cols_drop_list, axis=1)

In [24]:
# SCALING THE NUMERICAL COLUMNS

In [25]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [26]:
scaling_list = X_train.select_dtypes(include=['float64']).columns.to_list()

scaler = StandardScaler()
X_train[scaling_list] = scaler.fit_transform(X_train[scaling_list])
X_test[scaling_list] = scaler.transform(X_test[scaling_list])

In [27]:
# ENCODING THE CATEGORICAL COLUMNS

In [28]:
target_en_cols = ['Brand(Model)', 'Variant Name', 'RTO', 'Color', 'Engine Type', 'Fuel Suppy System']
for col in target_en_cols:
    mean_target = df.groupby(col)['Price(Lakhs)'].mean()
    X_train[col] = X_train[col].map(mean_target)
    X_test[col] = X_test[col].map(mean_target)

In [29]:
ohe_en_cols = ['Body Type', 'Fuel Type', 'Transmission', 'City', 'Value Configuration', 'Light', 'Headrest', 'Antenna', 'Headlamps', 'System', 
               'Wheel', 'Control', 'Warning', 'Turbo Charger', 'Steering Type']

In [30]:
ohencoder = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)

X_train_encoded = ohencoder.fit_transform(X_train[ohe_en_cols])
X_test_encoded = ohencoder.transform(X_test[ohe_en_cols])

encoded_columns = ohencoder.get_feature_names_out(ohe_en_cols)

X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_columns, index=X_train.index)
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_columns, index=X_test.index)

X_train = pd.concat([X_train.drop(ohe_en_cols, axis=1), X_train_encoded_df], axis=1)
X_test = pd.concat([X_test.drop(ohe_en_cols, axis=1), X_test_encoded_df], axis=1)



In [75]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

gbregressor = GradientBoostingRegressor(n_estimators=100, random_state=42)
gbregressor.fit(X_train, y_train_log)

In [76]:
y_gbregress_pred_log = gbregressor.predict(X_test)
y_train_gbregress_pred_log = gbregressor.predict(X_train)

In [77]:
y_gbregress_pred = np.exp(y_gbregress_pred_log)
y_train_gbregress_pred = np.exp(y_train_gbregress_pred_log)
y_test = np.exp(y_test_log)
y_train = np.exp(y_train_log)

In [78]:
print('MAE: ', mean_absolute_error(y_test, y_gbregress_pred))
print('MSE: ', mean_squared_error(y_test, y_gbregress_pred))
print('R2 Score: ', r2_score(y_test, y_gbregress_pred))

MAE:  1.5445573914017365
MSE:  18.528719376942515
R2 Score:  0.8629263325471765


In [79]:
print(mean_squared_error(y_train, y_train_gbregress_pred))
print(mean_squared_error(y_test, y_gbregress_pred))

print(r2_score(y_train, y_train_gbregress_pred))
print(r2_score(y_test, y_gbregress_pred))

8.54461263042148
18.528719376942515
0.9201439957305182
0.8629263325471765


In [36]:
# Feature Selection

In [37]:
cat_cols = ['Light', 'Headrest', 'Antenna', 'Headlamps', 'System', 'Variant Name', 'RTO', 'Engine Type', 'Fuel Suppy System', 
            'Wheel', 'Control', 'Warning', 'Turbo Charger', 'Steering Type', 'Color', 'Value Configuration', 'Body Type']
num_cols = ['Mileage', 'Torque', 'Displacement', 'Acceleration', 'Cargo Volumn', 'No of Owner', 'Top Speed', 'Height']

In [38]:
cat_pattern = '|'.join(cat_cols)
num_pattern = '|'.join(num_cols)

X_train = X_train.drop(X_train.filter(regex=cat_pattern).columns, axis=1)
X_test = X_test.drop(X_test.filter(regex=cat_pattern).columns, axis=1)

X_train = X_train.drop(X_train.filter(regex=num_pattern).columns, axis=1)
X_test = X_test.drop(X_test.filter(regex=num_pattern).columns, axis=1)
