In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
# Load the fuel-consumption dataset from the CSV file next to the notebook.
data = pd.read_csv(filepath_or_buffer="FuelConsumption.csv")

In [3]:
# Preview the first ten rows of the raw dataset.
data.head(n=10)

Unnamed: 0,MODELYEAR,MAKE,MODEL,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2014,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,2014,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,2014,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,2014,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,2014,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244
5,2014,ACURA,RLX,MID-SIZE,3.5,6,AS6,Z,11.9,7.7,10.0,28,230
6,2014,ACURA,TL,MID-SIZE,3.5,6,AS6,Z,11.8,8.1,10.1,28,232
7,2014,ACURA,TL AWD,MID-SIZE,3.7,6,AS6,Z,12.8,9.0,11.1,25,255
8,2014,ACURA,TL AWD,MID-SIZE,3.7,6,M6,Z,13.4,9.5,11.6,24,267
9,2014,ACURA,TSX,COMPACT,2.4,4,AS5,Z,10.6,7.5,9.2,31,212


In [4]:
# Keep only the predictor columns and the CO2EMISSIONS target used for modelling.
selected_columns = [
    'ENGINESIZE',
    'CYLINDERS',
    'FUELCONSUMPTION_COMB',
    'FUELCONSUMPTION_COMB_MPG',
    'CO2EMISSIONS',
]
data = data[selected_columns]

In [5]:
# Confirm the reduced frame: first five rows.
data.head(n=5)

Unnamed: 0,ENGINESIZE,CYLINDERS,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2.0,4,8.5,33,196
1,2.4,4,9.6,29,221
2,1.5,4,5.9,48,136
3,3.5,6,11.1,25,255
4,3.5,6,10.6,27,244


### Separating the data into features (X) and target (y) for training the model

In [6]:
# Independent variables (features). Selecting with a list of column names
# returns a DataFrame, so X stays 2-D as the estimator requires.
feature_columns = ['ENGINESIZE', 'CYLINDERS', 'FUELCONSUMPTION_COMB', 'FUELCONSUMPTION_COMB_MPG']
X = data[feature_columns]

In [7]:
# Display the feature matrix (the four predictor columns selected above).
X

Unnamed: 0,ENGINESIZE,CYLINDERS,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG
0,2.0,4,8.5,33
1,2.4,4,9.6,29
2,1.5,4,5.9,48
3,3.5,6,11.1,25
4,3.5,6,10.6,27
...,...,...,...,...
1062,3.0,6,11.8,24
1063,3.2,6,11.5,25
1064,3.0,6,11.8,24
1065,3.2,6,11.3,25


In [8]:
# Dependent variable (target): a single column, so y is a 1-D Series.
target_column = 'CO2EMISSIONS'
y = data[target_column]

In [9]:
# Display the target Series (CO2EMISSIONS).
y

0       196
1       221
2       136
3       255
4       244
       ... 
1062    271
1063    264
1064    271
1065    260
1066    294
Name: CO2EMISSIONS, Length: 1067, dtype: int64

### Training the model 

In [10]:
# Hold out 20% of the rows for evaluation. Fixing random_state makes the split,
# and therefore every metric reported below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

Ml_reg = LinearRegression()      # ordinary least-squares linear regression estimator

Ml_reg.fit(X_train, y_train)     # .fit() learns the coefficients from the training rows

y_pred = Ml_reg.predict(X_test)  # .predict() produces estimates for the held-out rows

In [11]:
# Error metrics on the held-out test set: mean squared error and mean absolute error.
# scikit-learn metric functions are documented as metric(y_true, y_pred). These two
# metrics are symmetric, so the values do not depend on the order, but keeping the
# documented order avoids mistakes with asymmetric metrics such as r2_score.

print(f"The mean squared errors is {mean_squared_error(y_test, y_pred)}")

print(f"The mean absolute errors is {mean_absolute_error(y_test, y_pred)}")

The mean squared errors is 465.0118687611934
The mean absolute errors is 14.093188185209007


In [12]:
# R² (coefficient of determination) of the predictions on the test set.
# The result is stored under its own name: assigning it to `r2_score` would replace
# the imported function with a float, and re-running this cell would then raise
# "TypeError: ... object is not callable".

r2_value = r2_score(y_test, y_pred)
print(f'The r2_score/accuracy value is {r2_value:.2f}')

The r2_score/accuracy value is 0.87


### Predicting CO2 emissions on real/unknown data

In [13]:
# Display the reduced DataFrame (four feature columns plus CO2EMISSIONS) for reference.
data

Unnamed: 0,ENGINESIZE,CYLINDERS,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2.0,4,8.5,33,196
1,2.4,4,9.6,29,221
2,1.5,4,5.9,48,136
3,3.5,6,11.1,25,255
4,3.5,6,10.6,27,244
...,...,...,...,...,...
1062,3.0,6,11.8,24,271
1063,3.2,6,11.5,25,264
1064,3.0,6,11.8,24,271
1065,3.2,6,11.3,25,260


In [14]:
# Predict CO2 emissions for a single hypothetical vehicle.
# The model was fitted on a DataFrame, so the input is passed as a one-row DataFrame
# with the same column names and order. This avoids scikit-learn's "X does not have
# valid feature names" warning and makes the meaning of each value explicit.
# NOTE(review): in the rows shown earlier, FUELCONSUMPTION_COMB 11.1 appears with 25 MPG
# and 33 MPG appears with 8.5, so this combination may not describe a realistic
# vehicle. Confirm the intended inputs.
new_vehicle = pd.DataFrame(
    [[2.4, 4, 11.1, 33]],
    columns=['ENGINESIZE', 'CYLINDERS', 'FUELCONSUMPTION_COMB', 'FUELCONSUMPTION_COMB_MPG'],
)
Ml_reg.predict(new_vehicle)



array([210.37736002])