Model Training

Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,OrdinalEncoder,StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV


In [None]:
!pip install catboost



In [None]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor,VotingRegressor,BaggingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Load the cleaned car listings and preview the first rows.
DATA_PATH = '/content/Cleaned_car_data (1).csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,brand,model,color,year,price_in_euro,power_kw,power_ps,transmission_type,fuel_type,fuel_consumption_l_100km,fuel_consumption_g_km,mileage_in_km
0,hyundai,Hyundai i10,black,2018,11555.0,49.0,67.0,Manual,Petrol,4.6,106.0,27782.0
1,honda,Honda CR-V,red,2018,24490.0,114.0,155.0,Automatic,Petrol,7.5,175.0,57000.0
2,kia,Kia Sportage,black,2023,34990.0,110.0,150.0,Manual,Petrol,5.9,150.0,7500.0
3,honda,Honda Civic,black,2009,5800.0,103.0,140.0,Manual,Petrol,6.7,155.0,160000.0
4,hyundai,Hyundai KONA,red,2019,18690.0,88.0,120.0,Manual,Petrol,5.4,125.0,64300.0


In [None]:
# Target is the listing price; the predictors are every remaining column.
y = df['price_in_euro']
x = df.drop(columns=['price_in_euro'])

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=30,
)

Encoding

In [None]:
# Column groups, named after the preprocessing each group receives.
numeric_features = ['mileage_in_km', 'power_kw', 'fuel_consumption_l_100km', 'fuel_consumption_g_km']
categorical_features_one = ['transmission_type']
categorical_features_lb = ['fuel_type', 'brand', 'model', 'color']
# NOTE(review): `year` and `power_ps` show up in the data preview but are in none
# of these lists, and the ColumnTransformer below is built without a `remainder`
# argument, so they are presumably dropped before modelling -- confirm that is intended.

# Numeric columns: standardise only.
numeric_scaled = Pipeline(steps=[
    ('scaler', StandardScaler()),
])

# Integer-code the categories (categories unseen during fit become -1), then standardise.
ordinal_scaled = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')),
    ('scaler', StandardScaler()),
])

# One-hot encode (categories unseen during fit are ignored), then scale without
# centring: with_mean=False is required because the encoder output is sparse.
onehot_scaled = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('scaler', StandardScaler(with_mean=False)),
])

# Output columns follow the order of this list: one-hot, ordinal, numeric.
column_steps = [
    ('onehot_scaled', onehot_scaled, categorical_features_one),
    ('ordinal_scaled', ordinal_scaled, categorical_features_lb),
    ('numeric_scaled', numeric_scaled, numeric_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Fit on the training split only, then apply the fitted transform to the test split.
X_train_encoded = preprocessor.fit_transform(x_train)
X_test_encoded = preprocessor.transform(x_test)

In [None]:
# Show the encoded training matrix as a table for a quick sanity check.
pd.DataFrame(data=X_train_encoded)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.000000,2.00176,0.0,0.0,0.889714,0.852642,1.075880,-0.985331,-1.296038,-0.253490,-0.445467,-0.597745
1,0.000000,2.00176,0.0,0.0,0.889714,0.600391,0.716315,0.665774,0.249249,-1.182104,-0.611001,-0.705905
2,2.000831,0.00000,0.0,0.0,-1.159102,-0.786988,-0.801169,0.194029,0.525356,0.744423,-0.445467,-0.208368
3,0.000000,2.00176,0.0,0.0,0.889714,0.600391,0.661466,0.194029,1.056539,-1.057365,-0.335111,-0.489585
4,2.000831,0.00000,0.0,0.0,-0.134694,1.483269,1.374501,-0.513587,-0.769306,-0.821746,-1.383496,-1.419763
...,...,...,...,...,...,...,...,...,...,...,...,...
59675,0.000000,2.00176,0.0,0.0,-1.159102,0.600391,0.826013,1.609262,-0.828041,-0.821746,0.382206,0.635282
59676,2.000831,0.00000,0.0,0.0,-1.159102,-0.786988,-0.484265,-1.221204,3.776901,-0.017872,1.320234,1.651988
59677,2.000831,0.00000,0.0,0.0,0.889714,-1.039239,-1.233866,-1.221204,0.859804,-0.710867,-0.555823,-0.727537
59678,0.000000,2.00176,0.0,0.0,-1.159102,0.600391,0.612712,-1.221204,-0.515979,-0.974205,-0.886892,-0.684273


Model Building

In [None]:
def evaluate_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Predicted target values, aligned element-wise with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error, and the R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # The MSE is only needed as the input to the square root, so compute it once.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [None]:
# Base learners passed to the voting and stacking ensembles.
estimators = [
    ('rf', RandomForestRegressor()),
    ('xg', XGBRegressor()),
    ('cat', CatBoostRegressor(verbose=False)),
    ('lgbm', LGBMRegressor()),
]

In [None]:
# Candidate regressors, keyed by the label printed in the comparison loop.
# Insertion order is the order in which they are trained and reported.
models = {
    # Distance- and kernel-based
    'K-Neighbors Regressor': KNeighborsRegressor(),
    'SVR': SVR(kernel='linear'),

    # Linear family
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),

    # Single tree
    'Decision Tree': DecisionTreeRegressor(),

    # Averaging ensembles
    'Random Forest Regressor': RandomForestRegressor(),
    'VotingRegressor': VotingRegressor(estimators=estimators),
    'BaggingRegressor': BaggingRegressor(estimator=LinearRegression(), bootstrap=True),

    # Boosting
    'AdaBoost Regressor': AdaBoostRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'XGBRegressor': XGBRegressor(),
    'CatBoosting Regressor': CatBoostRegressor(verbose=False),
    'LGBMRegressor': LGBMRegressor(),

    # Stacked ensemble with an XGBoost meta-learner
    'StackingRegressor': StackingRegressor(estimators=estimators, final_estimator=XGBRegressor()),
}

In [None]:

# Fit every candidate on the encoded training data and score it on the
# held-out test split, collecting names, fitted models and R2 scores.
trained_model_list = []
model_list = []
r2_list = []

for model_name, model in models.items():
    model.fit(X_train_encoded, y_train)
    # Keep the fitted estimator so it can be reused without retraining.
    trained_model_list.append(model)

    y_pred = model.predict(X_test_encoded)
    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    # Despite the label, these metrics come from predictions on the test split.
    print('Model Training Performance')
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R2 score", r2_square * 100)

    r2_list.append(r2_square)

    print('=' * 35)
    print('\n')

# Top three by R2 in the recorded run: CatBoost, Voting, Stacking.

K-Neighbors Regressor
Model Training Performance
RMSE: 8077.497169999624
MAE: 4598.474509343967
R2 score 82.70686767136745


SVR
Model Training Performance
RMSE: 11592.212056741993
MAE: 6340.4069431393755
R2 score 64.38338269485625


LinearRegression
Model Training Performance
RMSE: 10183.671978036566
MAE: 6421.866381358076
R2 score 72.5129052382445


Lasso
Model Training Performance
RMSE: 10183.457786996281
MAE: 6420.793355017151
R2 score 72.51406148670162


Ridge
Model Training Performance
RMSE: 10183.64593706382
MAE: 6421.736530988527
R2 score 72.51304581421196


ElasticNet
Model Training Performance
RMSE: 11455.649157499267
MAE: 6907.401935100278
R2 score 65.21760806655959


Decision Tree
Model Training Performance
RMSE: 9141.300291842614
MAE: 4928.209527346789
R2 score 77.85192491298126


Random Forest Regressor
Model Training Performance
RMSE: 6566.5395618654
MAE: 3632.121413232581
R2 score 88.57139649389877


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead



VotingRegressor
Model Training Performance
RMSE: 6270.939360448801
MAE: 3559.316566238315
R2 score 89.57717998969225


BaggingRegressor
Model Training Performance
RMSE: 10183.882280876263
MAE: 6418.706310027856
R2 score 72.51176995545386


AdaBoost Regressor
Model Training Performance
RMSE: 13927.20535919072
MAE: 11343.604185753387
R2 score 48.58995376174945


GradientBoostingRegressor
Model Training Performance
RMSE: 7664.980352674648
MAE: 4566.665360185917
R2 score 84.42808129913195


XGBRegressor
Model Training Performance
RMSE: 6451.6918598810225
MAE: 3685.2863282339317
R2 score 88.96766935726987


CatBoosting Regressor
Model Training Performance
RMSE: 6363.693849906428
MAE: 3705.6953221328686
R2 score 89.26656843050796


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001357 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1203
[Lig



LGBMRegressor
Model Training Performance
RMSE: 6610.465308108871
MAE: 3903.467970005954
R2 score 88.41798570951178


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001356 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1203
[LightGBM] [Info] Number of data points in the train set: 59680, number of used features: 12
[LightGBM] [Info] Start training from score 23746.547118
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001093 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1197
[LightGBM] [Info] Number of data points in the train set: 47744, number of used features: 12
[LightGBM] [Info] Start training from score 23799.374979




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001077 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1193
[LightGBM] [Info] Number of data points in the train set: 47744, number of used features: 12
[LightGBM] [Info] Start training from score 23769.040906




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001086 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1197
[LightGBM] [Info] Number of data points in the train set: 47744, number of used features: 12
[LightGBM] [Info] Start training from score 23758.885012




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001093 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1195
[LightGBM] [Info] Number of data points in the train set: 47744, number of used features: 12
[LightGBM] [Info] Start training from score 23648.348065




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001082 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1189
[LightGBM] [Info] Number of data points in the train set: 47744, number of used features: 12
[LightGBM] [Info] Start training from score 23757.086629




StackingRegressor
Model Training Performance
RMSE: 6397.266261525737
MAE: 3538.3520582835777
R2 score 89.15301874753251




In [None]:
# Placeholder cell. The stray token that used to be here was an undefined name,
# so it raised NameError and halted a top-to-bottom run before the tuning section.

Hyperparameter Tuning

In [None]:
# Shortlist of models carried forward to the randomized hyperparameter search,
# keyed by the label used to look up each model's parameter grid.
final_models = {
    # Averaging ensembles
    'Random Forest Regressor': RandomForestRegressor(),
    'VotingRegressor': VotingRegressor(estimators=estimators),
    'BaggingRegressor': BaggingRegressor(estimator=LinearRegression(), bootstrap=True),

    # Boosting
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'XGBRegressor': XGBRegressor(),
    'LGBMRegressor': LGBMRegressor(),
    'CatBoosting Regressor': CatBoostRegressor(verbose=False),

    # Stacked ensemble with an XGBoost meta-learner
    'StackingRegressor': StackingRegressor(estimators=estimators, final_estimator=XGBRegressor()),
}

In [None]:
# Search spaces for the randomized search. The keys must match the labels used
# in `final_models` exactly, because the tuning loop looks grids up by that
# label and falls back to an empty grid when the label is missing.
param_grids = {

    'Random Forest Regressor': {'n_estimators': [100, 200, 500], 'max_depth': [None, 10, 20]},

    'CatBoosting Regressor': {'depth': [4, 6, 8], 'iterations': [200, 500]},

    'XGBRegressor': {'n_estimators': [100, 200, 500], 'learning_rate': [0.01, 0.1, 0.2, 0.05]},

    'GradientBoostingRegressor': {'n_estimators': [100, 200, 500]},

    'LGBMRegressor': {'n_estimators': [100, 200, 500]},

    # BaggingRegressor itself has no `max_depth` parameter (and its base
    # estimator here is a linear model), so only bagging-level settings are searched.
    'BaggingRegressor': {
        'n_estimators': [10, 20],
        'max_samples': [0.5, 0.7, 1.0],
    },
}

In [None]:
# Tune each shortlisted model with a randomized search (3-fold CV, R2 scoring),
# then score the refitted best estimator on the held-out test split.
trained_model_list = []
model_list = []
r2_list = []

n_iter_cap = 5  # Upper bound on parameter combinations tried per model

for model_name, model in final_models.items():
    print(f"Tuning {model_name}...")

    grid = param_grids.get(model_name, {})
    if not grid:
        # Make the fallback visible: a missing or misspelled grid key would
        # otherwise silently turn "tuning" into cross-validating the defaults.
        print(f"No parameter grid defined for {model_name}; cross-validating default settings.")

    # Every grid here is a dict of plain lists, so the number of distinct
    # combinations is the product of the list lengths (1 for an empty grid).
    # Capping n_iter at that size avoids the "space smaller than n_iter" warning.
    n_candidates = int(np.prod([len(values) for values in grid.values()]))

    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=grid,
        n_iter=max(1, min(n_iter_cap, n_candidates)),
        scoring='r2',
        cv=3,
        n_jobs=-1,
        random_state=42
    )

    # Fit with hyperparameter tuning
    search.fit(X_train_encoded, y_train)

    # Best configuration, refitted on the full training split by the search
    best_model = search.best_estimator_
    trained_model_list.append(best_model)
    model_list.append(model_name)

    # Evaluate on the held-out test split
    y_pred = best_model.predict(X_test_encoded)
    mae, rmse, r2_square = evaluate_model(y_test, y_pred)
    r2_list.append(r2_square)

    print(f"Best Params for {model_name}: {search.best_params_}")
    print(f"R2 Score: {r2_square*100:.2f}%\n")