In [61]:
## Importing Libraries

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, accuracy_score
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from xgboost import XGBRegressor

In [62]:
## Load the test-car CSV into a DataFrame and preview the first rows
DATA_FILE = '24-testcar-2024-01-29.csv'
df = pd.read_csv(DATA_FILE)
df.head()

Unnamed: 0,Model Year,Vehicle Manufacturer Name,Veh Mfr Code,Represented Test Veh Make,Represented Test Veh Model,Test Vehicle ID,Test Veh Configuration #,Test Veh Displacement (L),Actual Tested Testgroup,Vehicle Type,...,Set Coef A (lbf),Set Coef B (lbf/mph),Set Coef C (lbf/mph**2),Aftertreatment Device Cd,Aftertreatment Device Desc,Police - Emergency Vehicle?,Averaging Group ID,Averaging Weighting Factor,Averaging Method Cd,Averging Method Desc
0,2024,aston martin,ASX,Aston Martin,DB12 V8,5723PT5601,0,4.0,RASXV04.0AES,Car,...,5.67,0.0083,0.0221,TWC,Three-way catalyst,N,,,N,No averaging
1,2024,aston martin,ASX,Aston Martin,DB12 V8,5723PT5601,0,4.0,RASXV04.0AES,Car,...,5.67,0.0083,0.0221,TWC,Three-way catalyst,N,,,N,No averaging
2,2024,aston martin,ASX,Aston Martin,DBX,8001PT8342,1,4.0,MASXJ04.0AEX,Both,...,-4.88,-0.5318,0.0367,TWC,Three-way catalyst,N,,,N,No averaging
3,2024,aston martin,ASX,Aston Martin,DBX,8001PT8342,1,4.0,MASXJ04.0AEX,Both,...,-4.88,-0.5318,0.0367,TWC,Three-way catalyst,N,,,N,No averaging
4,2024,aston martin,ASX,Aston Martin,DBX 707,8841PT8473,0,4.0,PASXJ04.0AEX,Both,...,-4.88,-0.5318,0.0367,TWC,Three-way catalyst,N,,,N,No averaging


In [63]:
## Keep only the columns suitable for training the model (predictors plus the CO2 target) and preview
selected_columns = [
    'Vehicle Manufacturer Name',
    'Test Veh Displacement (L)',
    'Vehicle Type',
    'Rated Horsepower',
    '# of Cylinders and Rotors',
    'Drive System Code',
    'Equivalent Test Weight (lbs.)',
    'CO2 (g/mi)',
]
# .copy() makes this an independent frame; without it pandas flags the result
# as a slice and the later in-place edits raise SettingWithCopyWarning.
df = df[selected_columns].copy()
df.head()

Unnamed: 0,Vehicle Manufacturer Name,Test Veh Displacement (L),Vehicle Type,Rated Horsepower,# of Cylinders and Rotors,Drive System Code,Equivalent Test Weight (lbs.),CO2 (g/mi)
0,aston martin,4.0,Car,680,8.0,R,4500,484.329
1,aston martin,4.0,Car,680,8.0,R,4500,283.235
2,aston martin,4.0,Both,550,8.0,4,5500,521.26
3,aston martin,4.0,Both,550,8.0,4,5500,327.54
4,aston martin,4.0,Both,707,8.0,4,5500,486.75


In [64]:
## Count the missing values in each column
df.isna().sum()

Vehicle Manufacturer Name          0
Test Veh Displacement (L)          0
Vehicle Type                       0
Rated Horsepower                   0
# of Cylinders and Rotors        493
Drive System Code                  0
Equivalent Test Weight (lbs.)      0
CO2 (g/mi)                       590
dtype: int64

In [65]:
## Drop every row that contains a null value and report the remaining shape
# Rebind instead of using inplace=True: in-place dropna on a frame obtained by
# column selection raises SettingWithCopyWarning and has no performance benefit.
df = df.dropna()
df.shape

(3345, 8)

In [66]:
## Use numerical codes instead of category names for the Vehicle Type column
# The nested dict restricts the replacement to 'Vehicle Type'; a bare
# df.replace('Car', 0) would also rewrite matching values in any other column.
# Values not listed in the mapping are left unchanged.
vehicle_type_codes = {'Car': 0, 'Both': 1, 'Truck': 2}
df = df.replace({'Vehicle Type': vehicle_type_codes})


In [67]:
## Standardise the numerical features with StandardScaler
# NOTE(review): the scaler is fit on all rows, before the train/test split
# performed later in the notebook, so test-row statistics feed into the
# scaling. Consider fitting on the training rows only.
numeric_features = [
    'Test Veh Displacement (L)',
    'Rated Horsepower',
    '# of Cylinders and Rotors',
    'Equivalent Test Weight (lbs.)',
]
scaler = StandardScaler()
scaled_columns = scaler.fit_transform(df[numeric_features])
df[numeric_features] = scaled_columns

In [68]:
## One-hot encode the categorical columns as integer indicator columns
categorical_features = ['Vehicle Manufacturer Name', 'Drive System Code']
df = pd.get_dummies(df, columns=categorical_features, dtype='int')

In [69]:
## Separate features and label, then hold out 20% of the rows for testing
target_column = 'CO2 (g/mi)'
X = df.drop(columns=[target_column])
Y = df[target_column]
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [70]:
## Define the model and tune its hyper-parameters with GridSearchCV

# Named base_regressor rather than xgb so the `xgb` alias of the xgboost
# module (imported at the top of the notebook) is not overwritten.
base_regressor = XGBRegressor()

# Every combination of these values is evaluated (3 x 3 x 3 candidates).
param_grid = {
    'n_estimators': [40, 80, 120],
    'learning_rate': [0.02, 0.05, 0.1],
    'max_depth': [2, 4, 8],
}

# 5-fold cross-validation; the score is negative MSE, so values closer to
# zero are better. n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(
    estimator=base_regressor,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

grid_search.fit(X_train, Y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

Best Parameters: {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 80}
Best Score: -6162.275961369277


In [71]:
## Fit the final model with the tuned hyper-parameters and report R^2 scores

# Take the hyper-parameters from the grid search (best_params) instead of
# copying them by hand, so this cell stays correct if the grid or data changes.
model = XGBRegressor(**best_params)

model.fit(X_train, Y_train)

Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)

# R^2 on the training rows versus the held-out rows; a large gap between
# the two would indicate over-fitting.
r2_train = r2_score(Y_train, Y_train_pred)
r2_test = r2_score(Y_test, Y_test_pred)

print("R^2 score on training data:", r2_train)
print("R^2 score on testing data:", r2_test)

R^2 score on training data: 0.6282942095459338
R^2 score on testing data: 0.5673583501087107
