In [12]:
# Importing the libraries 

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

# Ignore harmless warnings 

import warnings 
warnings.filterwarnings("ignore")

# Set to display all the columns in dataset

pd.set_option("display.max_columns", None)

# Import psql to run queries 

import pandasql as psql

In [13]:
# load the CO2 Emission dataset 

CO2Emission = pd.read_csv(r"C:\Users\NIKITHA PAGADALA\Downloads\MY2021_Fuel_Consumption_Ratings (1).csv", header=0)

# Copy the file to back-up file

CO2Emission_bk = CO2Emission.copy()

# Display first 5 records

CO2Emission.head()

Unnamed: 0,Year,Make,Model,Vehicle_Class,Engine_Size,Cylinders,Transmission,Fuel_Type,Fuel_Consumption_city,Fuel_Consumption_Hwy,Fuel_Consumption_Comb,Fuel_Consumption_Comb_MPG,CO2_Emissions,CO2_Rating,Smog_Rating
0,2021,Acura,ILX,Compact,2.4,4,AM8,Z,9.9,7.0,8.6,33,199,6,3
1,2021,Acura,NSX,Two-seater,3.5,6,AM9,Z,11.1,10.8,11.0,26,256,4,3
2,2021,Acura,RDX SH-AWD,SUV: Small,2.0,4,AS10,Z,11.0,8.6,9.9,29,232,5,6
3,2021,Acura,RDX SH-AWD A-SPEC,SUV: Small,2.0,4,AS10,Z,11.3,9.1,10.3,27,242,5,6
4,2021,Acura,TLX SH-AWD,Compact,2.0,4,AS10,Z,11.2,8.0,9.8,29,230,5,7


In [14]:
CO2Emission.dtypes

Year                           int64
Make                          object
Model                         object
Vehicle_Class                 object
Engine_Size                  float64
Cylinders                      int64
Transmission                  object
Fuel_Type                     object
Fuel_Consumption_city        float64
Fuel_Consumption_Hwy         float64
Fuel_Consumption_Comb        float64
Fuel_Consumption_Comb_MPG      int64
CO2_Emissions                  int64
CO2_Rating                     int64
Smog_Rating                    int64
dtype: object

In [15]:

CO2Emission = CO2Emission.drop(['Year', 'Make', 'Model', 'Vehicle_Class', 'Transmission',
                                 'Fuel_Type', 'CO2_Rating', 'Smog_Rating'], axis = 1)
CO2Emission.head()
# drop the columns with categorical data

Unnamed: 0,Engine_Size,Cylinders,Fuel_Consumption_city,Fuel_Consumption_Hwy,Fuel_Consumption_Comb,Fuel_Consumption_Comb_MPG,CO2_Emissions
0,2.4,4,9.9,7.0,8.6,33,199
1,3.5,6,11.1,10.8,11.0,26,256
2,2.0,4,11.0,8.6,9.9,29,232
3,2.0,4,11.3,9.1,10.3,27,242
4,2.0,4,11.2,8.0,9.8,29,230


In [16]:
IndepVar = []
for col in CO2Emission.columns:
    if col != 'CO2_Emissions':
        IndepVar.append(col)

TargetVar = 'CO2_Emissions'

x = CO2Emission[IndepVar]
y = CO2Emission[TargetVar]

In [17]:
from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

# Display the shape of the train_data and test_data

x_train.shape, x_test.shape, y_train.shape, y_test.shape

((618, 6), (265, 6), (618,), (265,))

In [18]:
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

x_train = mmscaler.fit_transform(x_train)
x_train = pd.DataFrame(x_train)

x_test = mmscaler.fit_transform(x_test)
x_test = pd.DataFrame(x_test)

In [19]:
# Build the model with Gradient Boosting Regressor

from sklearn.linear_model import LinearRegression  

# Create object for the model

ModelMLR = LinearRegression()

# Train the model with training data

ModelMLR.fit(x_train, y_train)

# Predict the model with test dataset

y_pred = ModelMLR.predict(x_test)

# Evaluation metrics for Regression analysis

from sklearn import metrics

print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))  
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))  
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))

# Define the function to calculate the MAPE - Mean Absolute Percentage Error

def MAPE (y_test, y_pred): 
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# Evaluation of MAPE 

result = MAPE(y_test, y_pred)
print('Mean Absolute Percentage Error (MAPE):', round(result, 3), '%')

# Calculate Adjusted R squared values 

r_squared = round(metrics.r2_score(y_test, y_pred),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
print('Adj R Square: ', adjusted_r_squared)

Mean Absolute Error (MAE): 20.284
Mean Squared Error (MSE): 646.683
Root Mean Squared Error (RMSE): 25.43
R2_score: 0.845908
Root Mean Squared Log Error (RMSLE): 3.236
Mean Absolute Percentage Error (MAPE): 7.12 %
Adj R Square:  0.844853


In [20]:
Results = pd.DataFrame({'CO2_Emissions_A':y_test, 'CO2_Emissions_P':y_pred})

# Merge two Dataframes on index of both the dataframes

ResultsFinal = CO2Emission_bk.merge(Results, left_index=True, right_index=True)
ResultsFinal.sample(5)

Unnamed: 0,Year,Make,Model,Vehicle_Class,Engine_Size,Cylinders,Transmission,Fuel_Type,Fuel_Consumption_city,Fuel_Consumption_Hwy,Fuel_Consumption_Comb,Fuel_Consumption_Comb_MPG,CO2_Emissions,CO2_Rating,Smog_Rating,CO2_Emissions_A,CO2_Emissions_P
163,2021,Chevrolet,Camaro,Subcompact,3.6,6,M6,X,14.4,9.0,12.0,24,281,4,6,281,258.46196
881,2021,Volvo,XC90 T5 AWD,SUV: Standard,2.0,4,AS8,Z,11.5,8.4,10.1,28,236,5,5,236,218.564087
404,2021,Honda,Ridgeline AWD,Pickup truck: Standard,3.5,6,AS9,X,12.8,9.9,11.5,25,271,4,3,271,250.339741
711,2021,Porsche,718 Cayman GT4,Two-seater,4.0,6,AM7,Z,13.0,9.9,11.6,24,271,4,5,271,253.699821
494,2021,Kia,Sorento AWD,SUV: Small,2.5,4,AS8,X,10.1,9.2,9.7,29,227,5,5,227,212.955228
