In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import pylab as pl

In [2]:
data = pd.read_csv('data/FuelConsumption.csv')
data.head()

Unnamed: 0,MODELYEAR,MAKE,MODEL,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2014,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,2014,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,2014,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,2014,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,2014,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


In [3]:
data.columns

Index(['MODELYEAR', 'MAKE', 'MODEL', 'VEHICLECLASS', 'ENGINESIZE', 'CYLINDERS',
       'TRANSMISSION', 'FUELTYPE', 'FUELCONSUMPTION_CITY',
       'FUELCONSUMPTION_HWY', 'FUELCONSUMPTION_COMB',
       'FUELCONSUMPTION_COMB_MPG', 'CO2EMISSIONS'],
      dtype='object')

In [4]:
data = data.drop(['MAKE', 'MODEL', 'VEHICLECLASS', 'TRANSMISSION', 'FUELTYPE', ], axis = 1)
data.head()

Unnamed: 0,MODELYEAR,ENGINESIZE,CYLINDERS,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2014,2.0,4,9.9,6.7,8.5,33,196
1,2014,2.4,4,11.2,7.7,9.6,29,221
2,2014,1.5,4,6.0,5.8,5.9,48,136
3,2014,3.5,6,12.7,9.1,11.1,25,255
4,2014,3.5,6,12.1,8.7,10.6,27,244


In [5]:
data.columns

Index(['MODELYEAR', 'ENGINESIZE', 'CYLINDERS', 'FUELCONSUMPTION_CITY',
       'FUELCONSUMPTION_HWY', 'FUELCONSUMPTION_COMB',
       'FUELCONSUMPTION_COMB_MPG', 'CO2EMISSIONS'],
      dtype='object')

In [6]:
x = data[['MODELYEAR', 'ENGINESIZE', 'CYLINDERS', 'FUELCONSUMPTION_CITY',
       'FUELCONSUMPTION_HWY', 'FUELCONSUMPTION_COMB',
       'FUELCONSUMPTION_COMB_MPG']]
x.head()

Unnamed: 0,MODELYEAR,ENGINESIZE,CYLINDERS,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG
0,2014,2.0,4,9.9,6.7,8.5,33
1,2014,2.4,4,11.2,7.7,9.6,29
2,2014,1.5,4,6.0,5.8,5.9,48
3,2014,3.5,6,12.7,9.1,11.1,25
4,2014,3.5,6,12.1,8.7,10.6,27


In [7]:
y = data['CO2EMISSIONS']
y.head()

0    196
1    221
2    136
3    255
4    244
Name: CO2EMISSIONS, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.2, random_state = 2)

In [9]:
x_train.shape

(213, 7)

In [10]:
x_test.shape

(854, 7)

In [11]:
y_train.shape

(213,)

In [12]:
y_test.shape

(854,)

In [13]:
from sklearn.linear_model import LinearRegression
regr = LinearRegression()
regr.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [14]:
y_test[0:10]

455    292
954    288
738    301
913    286
702    170
321    168
765    327
343    373
641    347
309    258
Name: CO2EMISSIONS, dtype: int64

In [15]:
ypred = regr.predict(x_test)
ypred[0:10]

array([311.69704089, 248.20739831, 310.49988096, 249.55923803,
       172.79069902, 170.77033628, 317.80258419, 337.59568897,
       329.26573184, 263.69018568])

In [16]:
regr.intercept_

200.4809776470093

In [17]:
regr.coef_

array([  0.        ,   5.23948348,  10.84050507, -42.48325292,
       -33.19811405,  80.68953863,  -2.99265059])

In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mean_absolute_error(y_test, ypred)

13.86754263449759

In [19]:
mean_squared_error(y_test, ypred)

422.48594536256417

In [20]:
r2_score(y_test, ypred)

0.892148747621274

In [21]:
from sklearn.model_selection import cross_val_score
cv_results = cross_val_score(regr, x_train, y_train, cv = 50)
cv_results

array([ 0.85312116,  0.67534678,  0.76585514,  0.97440676,  0.87029978,
        0.72713676, -0.80921852,  0.97328746,  0.94027921,  0.97805251,
        0.52184782,  0.9411551 ,  0.93194725,  0.85705154,  0.0308005 ,
        0.97968755,  0.81580061,  0.69965409,  0.82556979,  0.93793292,
        0.84910008,  0.71279908,  0.75027336,  0.59447662,  0.99021299,
        0.69921981,  0.96092316,  0.92452235,  0.86565753,  0.87621903,
        0.81784487,  0.91606545,  0.82135292,  0.95540193,  0.76759169,
        0.25902401,  0.80224687,  0.9538598 ,  0.86750277,  0.92742781,
        0.17729343,  0.85275561,  0.65141568,  0.80297307,  0.90355643,
        0.9469783 ,  0.79405868,  0.32778462,  0.93664786,  0.88642452])

In [22]:
np.min(cv_results)

-0.8092185203577734

In [23]:
np.max(cv_results)

0.9902129851899849

In [24]:
np.mean(cv_results)

0.7616324900663655