# ESTIMATION OF CO2 EMITTED BY CAR

## 1. IMPORTING LIBRARIES

In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

## 2. READING AND EXPLORING THE DATA

In [2]:
train = pd.read_csv('FuelConsumption.csv')
train.shape

(1067, 13)

In [3]:
train.columns

Index(['MODELYEAR', 'MAKE', 'MODEL', 'VEHICLECLASS', 'ENGINESIZE', 'CYLINDERS',
       'TRANSMISSION', 'FUELTYPE', 'FUELCONSUMPTION_CITY',
       'FUELCONSUMPTION_HWY', 'FUELCONSUMPTION_COMB',
       'FUELCONSUMPTION_COMB_MPG', 'CO2EMISSIONS'],
      dtype='object')

In [4]:
train.head()

Unnamed: 0,MODELYEAR,MAKE,MODEL,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2014,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,2014,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,2014,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,2014,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,2014,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067 entries, 0 to 1066
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   MODELYEAR                 1067 non-null   int64  
 1   MAKE                      1067 non-null   object 
 2   MODEL                     1067 non-null   object 
 3   VEHICLECLASS              1067 non-null   object 
 4   ENGINESIZE                1067 non-null   float64
 5   CYLINDERS                 1067 non-null   int64  
 6   TRANSMISSION              1067 non-null   object 
 7   FUELTYPE                  1067 non-null   object 
 8   FUELCONSUMPTION_CITY      1067 non-null   float64
 9   FUELCONSUMPTION_HWY       1067 non-null   float64
 10  FUELCONSUMPTION_COMB      1067 non-null   float64
 11  FUELCONSUMPTION_COMB_MPG  1067 non-null   int64  
 12  CO2EMISSIONS              1067 non-null   int64  
dtypes: float64(4), int64(4), object(5)
memory usage: 108.5+ KB


In [6]:
train.nunique()

MODELYEAR                     1
MAKE                         39
MODEL                       663
VEHICLECLASS                 16
ENGINESIZE                   45
CYLINDERS                     7
TRANSMISSION                 22
FUELTYPE                      4
FUELCONSUMPTION_CITY        167
FUELCONSUMPTION_HWY         118
FUELCONSUMPTION_COMB        148
FUELCONSUMPTION_COMB_MPG     43
CO2EMISSIONS                159
dtype: int64

In [7]:
train.describe()

Unnamed: 0,MODELYEAR,ENGINESIZE,CYLINDERS,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
count,1067.0,1067.0,1067.0,1067.0,1067.0,1067.0,1067.0,1067.0
mean,2014.0,3.346298,5.794752,13.296532,9.474602,11.580881,26.441425,256.228679
std,0.0,1.415895,1.797447,4.101253,2.79451,3.485595,7.468702,63.372304
min,2014.0,1.0,3.0,4.6,4.9,4.7,11.0,108.0
25%,2014.0,2.0,4.0,10.25,7.5,9.0,21.0,207.0
50%,2014.0,3.4,6.0,12.6,8.8,10.9,26.0,251.0
75%,2014.0,4.3,8.0,15.55,10.85,13.35,31.0,294.0
max,2014.0,8.4,12.0,30.2,20.5,25.8,60.0,488.0


## 3. CHECKING FOR NAN VALUES

In [8]:
train.isnull().sum()

MODELYEAR                   0
MAKE                        0
MODEL                       0
VEHICLECLASS                0
ENGINESIZE                  0
CYLINDERS                   0
TRANSMISSION                0
FUELTYPE                    0
FUELCONSUMPTION_CITY        0
FUELCONSUMPTION_HWY         0
FUELCONSUMPTION_COMB        0
FUELCONSUMPTION_COMB_MPG    0
CO2EMISSIONS                0
dtype: int64

### Nan values are not present

## 4. FILTERING THE DATA

### Removing Outliers from the Data

### Removing the outliers in FUELCONSUMPTION_COMB

In [9]:
train.shape

(1067, 13)

In [10]:
print(train.FUELCONSUMPTION_COMB.min())
print(train.FUELCONSUMPTION_COMB.max())

4.7
25.8


In [11]:
print(len(train[train['FUELCONSUMPTION_COMB'] > 23]))
print(len(train[train['FUELCONSUMPTION_COMB'] < 5.4]))

5
3


In [12]:
train = train[train['FUELCONSUMPTION_COMB'] < 23]
train = train[train['FUELCONSUMPTION_COMB'] > 5.4]
train.shape

(1057, 13)

### Removing the outliers in ENGINESIZE

In [13]:
print(train.ENGINESIZE.min())
print(train.ENGINESIZE.max())

1.0
8.4


In [14]:
print(len(train[train['ENGINESIZE'] > 6.7]))
print(len(train[train['ENGINESIZE'] < 1.2]))

4
3


In [15]:
train = train[train['ENGINESIZE'] < 6.7]
train = train[train['ENGINESIZE'] > 1.2]
train.shape

(1042, 13)

### Removing the outliers in CYLINDERS

In [16]:
print(train.CYLINDERS.min())
print(train.CYLINDERS.max())

4
12


In [17]:
print(len(train[train['CYLINDERS'] > 10]))
print(len(train[train['CYLINDERS'] < 5]))

13
413


In [18]:
train = train[train['CYLINDERS'] < 10]
train.shape

(1023, 13)

### Removing the outliers in MAKE 

In [19]:
train.MAKE.value_counts()

FORD             86
CHEVROLET        82
BMW              63
MERCEDES-BENZ    59
TOYOTA           47
GMC              47
AUDI             44
PORSCHE          44
VOLKSWAGEN       41
DODGE            39
MINI             36
KIA              33
NISSAN           33
CADILLAC         32
JEEP             31
MAZDA            27
HYUNDAI          24
SUBARU           23
JAGUAR           22
LEXUS            22
INFINITI         21
HONDA            20
CHRYSLER         19
LAND ROVER       19
BUICK            16
MITSUBISHI       14
RAM              13
ACURA            12
LINCOLN          11
VOLVO            11
FIAT             10
SCION             9
MASERATI          6
ASTON MARTIN      4
BENTLEY           3
Name: MAKE, dtype: int64

In [20]:
train = train[train['MAKE'] != 'BENTLEY']
train.shape

(1020, 13)

### Removing the outliers in CO2EMISSIONS

In [21]:
print(train.CO2EMISSIONS.min())
print(train.CO2EMISSIONS.max())

126
435


In [22]:
print(len(train[train['CO2EMISSIONS'] > 425]))
print(len(train[train['CO2EMISSIONS'] < 135]))

4
4


In [23]:
train = train[train['CO2EMISSIONS'] < 425]
train = train[train['CO2EMISSIONS'] > 135]
train.shape

(1012, 13)

### Droping the Unnecessary Columns

In [24]:
train = train.drop(columns=['MODELYEAR', 'FUELCONSUMPTION_CITY', 'FUELCONSUMPTION_HWY', 'FUELCONSUMPTION_COMB_MPG'])

In [25]:
train.head()

Unnamed: 0,MAKE,MODEL,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_COMB,CO2EMISSIONS
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,8.5,196
1,ACURA,ILX,COMPACT,2.4,4,M6,Z,9.6,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,5.9,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,11.1,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,10.6,244


### Performing label encoding for categorical data

In [26]:
label_encoder = LabelEncoder()

In [27]:
train['MAKE'] = label_encoder.fit_transform(train['MAKE'])
train['MODEL'] = label_encoder.fit_transform(train['MODEL'])
train['VEHICLECLASS'] = label_encoder.fit_transform(train['VEHICLECLASS'])
train['TRANSMISSION'] = label_encoder.fit_transform(train['TRANSMISSION'])
train['FUELTYPE'] = label_encoder.fit_transform(train['FUELTYPE'])
train.head()

Unnamed: 0,MAKE,MODEL,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_COMB,CO2EMISSIONS
0,0,311,0,2.0,4,9,3,8.5,196
1,0,311,0,2.4,4,19,3,9.6,221
2,0,312,0,1.5,4,16,3,5.9,136
3,0,370,11,3.5,6,10,3,11.1,255
4,0,454,11,3.5,6,10,3,10.6,244


In [28]:
train.dtypes

MAKE                      int32
MODEL                     int32
VEHICLECLASS              int32
ENGINESIZE              float64
CYLINDERS                 int64
TRANSMISSION              int32
FUELTYPE                  int32
FUELCONSUMPTION_COMB    float64
CO2EMISSIONS              int64
dtype: object

## 5.SEPARATEING DEPENDENT, INDEPENDENT VARIABLES AND TRAIN, TEST DATA

In [29]:
y = train.CO2EMISSIONS
X = train.drop(['CO2EMISSIONS'],axis=1)

In [30]:
X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.2)

## 6. TRAINING THE MODEL

### 1. Linear Regression

In [31]:
model1 = LinearRegression() 
model1.fit(X_train, y_train) 
y_pred = model1.predict(X_valid) 

In [32]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_valid, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_valid, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_valid, y_pred)))
print("R2 score : %f" % r2_score(y_valid,y_pred))

Mean Absolute Error: 15.504061376495718
Mean Squared Error: 455.57851592804394
Root Mean Squared Error: 21.34428532249426
R2 score : 0.862163


### 2. Random Forest Regressor

In [33]:
model2 = RandomForestRegressor(n_estimators=200)
model2.fit(X_train, y_train)
y_pred2 = model2.predict(X_valid)

In [34]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_valid, y_pred2))
print('Mean Squared Error:', metrics.mean_squared_error(y_valid, y_pred2))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_valid, y_pred2)))
print("R2 score : %f" % r2_score(y_valid,y_pred2))

Mean Absolute Error: 1.4607635467980278
Mean Squared Error: 37.709551108374384
Root Mean Squared Error: 6.140810297377243
R2 score : 0.988591


### 3. XGBoost Regressor

In [35]:
model3 = XGBRegressor(n_estimators=1000, learning_rate=0.05)
model3.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_valid, y_valid)], verbose=False)
y_pred3 = model3.predict(X_valid)

In [36]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_valid, y_pred3))
print('Mean Squared Error:', metrics.mean_squared_error(y_valid, y_pred3))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_valid, y_pred3)))
print("R2 score : %f" % r2_score(y_valid,y_pred3))

Mean Absolute Error: 1.226267622022206
Mean Squared Error: 10.986751685762007
Root Mean Squared Error: 3.3146269301026936
R2 score : 0.996676


### 4. Ridge Regressor

In [37]:
model4 = Ridge(alpha=1.0)
model4.fit(X_train, y_train)
y_pred4 = model4.predict(X_valid)

In [38]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_valid, y_pred4))
print('Mean Squared Error:', metrics.mean_squared_error(y_valid, y_pred4))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_valid, y_pred4)))
print("R2 score : %f" % r2_score(y_valid,y_pred4))

Mean Absolute Error: 15.500136931643329
Mean Squared Error: 455.3520256305759
Root Mean Squared Error: 21.33897902034153
R2 score : 0.862232


### 5. Lasso Regressor

In [39]:
model5 = Lasso(alpha=1.0)
model5.fit(X_train, y_train)
y_pred5 = model5.predict(X_valid)

In [40]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_valid, y_pred5))
print('Mean Squared Error:', metrics.mean_squared_error(y_valid, y_pred5))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_valid, y_pred5)))
print("R2 score : %f" % r2_score(y_valid,y_pred5))

Mean Absolute Error: 15.348249959681059
Mean Squared Error: 449.6601221382378
Root Mean Squared Error: 21.205190924352408
R2 score : 0.863954


### XGBoost Regressor has the best performance among the regressors chosen. So choosing XGBoost for prediciton on test values.

In [41]:
y_estimated = model3.predict(X_valid)

In [42]:
y_valid = y_valid.reset_index(drop=True)
y_estimated = pd.DataFrame(y_estimated)

output=pd.concat([y_valid, y_estimated],axis=1)
output.columns = ['Actual CO2 Emission', 'Estimated CO2 Emission']
output

Unnamed: 0,Actual CO2 Emission,Estimated CO2 Emission
0,276,278.450531
1,242,241.108505
2,285,287.277008
3,342,340.504364
4,297,295.769745
...,...,...
198,200,200.029663
199,225,224.987473
200,292,291.893799
201,269,270.583954
