# FuelNet: Artificial Intelligence Tool for Fuel Consumption Prediction in Heavy Vehicles 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score,median_absolute_error,mean_squared_log_error,explained_variance_score

In [2]:
# Load the fuel-consumption table from a CSV file in the current working directory.
data = pd.read_csv(r'FuelConsumption.csv')
# Bare expression so the notebook renders a preview of the loaded frame.
data

Unnamed: 0,MODELYEAR,MAKE,MODEL,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2014,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,2014,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,2014,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,2014,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,2014,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1062,2014,VOLVO,XC60 AWD,SUV - SMALL,3.0,6,AS6,X,13.4,9.8,11.8,24,271
1063,2014,VOLVO,XC60 AWD,SUV - SMALL,3.2,6,AS6,X,13.2,9.5,11.5,25,264
1064,2014,VOLVO,XC70 AWD,SUV - SMALL,3.0,6,AS6,X,13.4,9.8,11.8,24,271
1065,2014,VOLVO,XC70 AWD,SUV - SMALL,3.2,6,AS6,X,12.9,9.3,11.3,25,260


In [3]:
# Preview the first five rows to sanity-check column names and value formats.
data.head()

Unnamed: 0,MODELYEAR,MAKE,MODEL,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2014,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,2014,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,2014,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,2014,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,2014,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


In [4]:
# Summarise column dtypes and non-null counts to spot missing data and text columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067 entries, 0 to 1066
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   MODELYEAR                 1067 non-null   int64  
 1   MAKE                      1067 non-null   object 
 2   MODEL                     1067 non-null   object 
 3   VEHICLECLASS              1067 non-null   object 
 4   ENGINESIZE                1067 non-null   float64
 5   CYLINDERS                 1067 non-null   int64  
 6   TRANSMISSION              1067 non-null   object 
 7   FUELTYPE                  1067 non-null   object 
 8   FUELCONSUMPTION_CITY      1067 non-null   float64
 9   FUELCONSUMPTION_HWY       1067 non-null   float64
 10  FUELCONSUMPTION_COMB      1067 non-null   float64
 11  FUELCONSUMPTION_COMB_MPG  1067 non-null   int64  
 12  CO2EMISSIONS              1067 non-null   int64  
dtypes: float64(4), int64(4), object(5)
memory usage: 108.5+ KB


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

In [6]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [7]:
# Total number of cells in the frame, i.e. rows * columns.
data.size

In [8]:
# Number of missing values in each column.
data.isnull().sum()

In [9]:
# Element-wise missing-value mask (True where a cell is null).
# NOTE(review): this renders a boolean frame as large as the data itself; the
# per-column counts from `data.isnull().sum()` in the previous cell are usually
# enough, so consider dropping this cell.
data.isnull()

In [10]:
# Inspect the prediction target: its distinct values first, then its non-null count.
print(data['FUELCONSUMPTION_COMB_MPG'].unique())
data['FUELCONSUMPTION_COMB_MPG'].count()

In [11]:
# Text columns that must become integer codes before they can be fed to the models.
columns_encode = ['MAKE','MODEL','VEHICLECLASS','TRANSMISSION','FUELTYPE']

# Keep one fitted encoder per column. Re-fitting a single shared LabelEncoder
# (as was done before) discards the mapping of every column except the last,
# which makes it impossible to translate codes back to the original labels.
# Usage: label_encoders['MAKE'].inverse_transform(codes)
label_encoders = {}

for col in columns_encode:
    # A fresh encoder per column; `label_encoder` is still bound to the last
    # column's encoder after the loop, exactly as in the earlier version.
    label_encoder = LabelEncoder()
    data[col] = label_encoder.fit_transform(data[col])
    label_encoders[col] = label_encoder

# NOTE: `data` is modified in place. Re-running this cell without reloading the
# CSV re-fits the encoders on the integer codes, losing the string mapping.
# `data` now holds integer codes in place of the original strings.
data

In [12]:
# Feature matrix: every column of `data` except the prediction target.
# NOTE(review): x still contains FUELCONSUMPTION_CITY / _HWY / _COMB and
# CO2EMISSIONS. The target looks like a unit conversion of FUELCONSUMPTION_COMB
# (presumably L/100 km -> mpg; confirm), which would leak the answer into the
# features and inflate every score reported below.
x = data.drop(columns=['FUELCONSUMPTION_COMB_MPG'])
x

In [13]:
# Target vector: combined fuel economy in miles per gallon.
y=data['FUELCONSUMPTION_COMB_MPG']
y

In [14]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the full data set (before the train/test
# split) and its output only feeds the PCA cell; the models further down are
# trained on the unscaled `x`.
scaler = StandardScaler()
scaled_data = scaler.fit(x).transform(x)

# Dump the standardised matrix for a quick visual check.
print(scaled_data)

In [15]:
from sklearn.decomposition import PCA
# Keep the 8 leading principal components of the standardised features.
# NOTE(review): 8 is a hard-coded choice with no stated rationale -- confirm.
pca = PCA(n_components=8)

# Fit and transform the data using PCA
# NOTE(review): `transformed_data` is not referenced by the train/test split cell
# that follows (it splits the raw `x`), so this projection currently has no
# effect on the trained models.
transformed_data = pca.fit_transform(scaled_data)
transformed_data

In [16]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the raw `x` is split here, not `scaled_data` / `transformed_data`.
(x_train, x_test,
 y_train, y_test) = train_test_split(x, y, test_size=0.2, random_state=70)
# Only the last bare expression of a cell is rendered, so just y_train is shown.
x_train
y_train

In [17]:
# Baseline model: ordinary least-squares regression fitted on the training split.
model = LinearRegression()
model.fit(x_train,y_train)

In [18]:
# Predict the target for the held-out test features with the linear model.
# NOTE(review): `model` is rebound to a RandomForestRegressor in a later cell, so
# re-running this cell after that one would store random-forest output in y_pred.
y_pred = model.predict(x_test)
y_pred

In [19]:
# Root-mean-squared error of the linear model on the test split.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# print() stringifies its arguments itself, so no explicit str() is needed.
print("RMSE TestData = ", rmse)

In [20]:
# Remaining hold-out metrics for the linear-regression predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

r2 = r2_score(y_true=y_test, y_pred=y_pred)
medae = median_absolute_error(y_true=y_test, y_pred=y_pred)
# NOTE(review): mean_squared_log_error rejects negative values; a linear model
# can in principle predict below zero, which would make this line raise.
msle = mean_squared_log_error(y_true=y_test, y_pred=y_pred)
explained_var = explained_variance_score(y_true=y_test, y_pred=y_pred)

In [21]:
# Report the linear-regression (LR) hold-out metrics computed in the preceding cells.
print("MSE of LR:", mse)
print("RMSE of LR:", rmse)
print("R-squared with LR:", r2)
print("Median Absolute Error using LR:", medae)
# Label typo fixed: "usign" -> "using".
print("Mean Squared Logarithmic Error using LR:", msle)
print("Explained Variance Score using LR:", explained_var)

In [22]:
# Create a scatter plot of the true target values (Y_test) against the predicted target values (Y_pred)
# True vs. predicted target values for the linear model. Points lying on the
# blue diagonal are perfect predictions.
y_lo, y_hi = min(y_test), max(y_test)
plt.scatter(y_test, y_pred, color='red')
plt.plot([y_lo, y_hi], [y_lo, y_hi], color='blue')  # identity line for reference
plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.title('Scatter Plot of True vs. Predicted Values obtained using LR model')
plt.show()

In [23]:
# Second model: random-forest regressor fitted on the same training split.
# A random forest bootstraps rows and samples candidate features at random, so
# without a fixed random_state every run reports different metrics. The seed
# matches the one used for the train/test split.
# NOTE: this rebinds `model`, replacing the linear-regression estimator.
model = RandomForestRegressor(random_state=70)
model.fit(x_train,y_train)

In [24]:
# Predict the target for the held-out test features with the model fitted in the
# previous cell (the random forest when the notebook is run top to bottom).
y_pred1 = model.predict(x_test)
y_pred1

In [25]:
# Hold-out metrics for the random-forest predictions (y_pred1).
mse1 = mean_squared_error(y_test, y_pred1)
# Derive RMSE from the MSE already computed instead of scoring the test set twice.
rmse1 = np.sqrt(mse1)
r21 = r2_score(y_test, y_pred1)
medae1 = median_absolute_error(y_test, y_pred1)
msle1 = mean_squared_log_error(y_test, y_pred1)
explained_var1 = explained_variance_score(y_test, y_pred1)

In [26]:
# Report the random-forest (RFR) hold-out metrics, one "label value" line each.
rfr_report = [
    ("MSE of RFR model:", mse1),
    ("RMSE of RFR model:", rmse1),
    ("R-squared with RFR model:", r21),
    ("Median Absolute Error using RFR model:", medae1),
    ("Mean Squared Logarithmic Error using RFR model:", msle1),
    ("Explained Variance Score using RFR model:", explained_var1),
]
for metric_label, metric_value in rfr_report:
    print(metric_label, metric_value)

In [27]:
# Create a scatter plot of the true target values (Y_test) against the predicted target values (Y_pred)
# True vs. predicted target values for the random-forest model. Points lying on
# the blue diagonal are perfect predictions.
y_lo, y_hi = min(y_test), max(y_test)
plt.scatter(y_test, y_pred1, color='magenta')
plt.plot([y_lo, y_hi], [y_lo, y_hi], color='blue')  # identity line for reference
plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.title('Scatter Plot of True vs. Predicted Values obtained using RFR model')
plt.show()

In [28]:
# "MPG" (as in FUELCONSUMPTION_COMB_MPG) refers to miles per gallon

In [29]:
# Create a DataFrame with the actual and predicted values
# Raise the display limit so the whole test split (20% of the 1067 rows) is
# rendered instead of being truncated. This changes a global pandas option.
pd.set_option("display.max_rows",300)

# Side-by-side table of the true targets and the random-forest predictions.
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred1})

# A cell renders only its last bare expression, so the frame is referenced once,
# at the end (the earlier mid-cell `df` produced no output).
df