In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import pylab as pl

In [2]:
# Load the fuel-consumption dataset (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer='data/FuelConsumption.csv')
# Preview the first five rows to sanity-check the parsed columns.
data.head(n=5)

Unnamed: 0,MODELYEAR,MAKE,MODEL,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2014,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,2014,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,2014,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,2014,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,2014,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


In [3]:
# List the column labels of the loaded frame; the feature and target names used below come from here.
data.columns

Index(['MODELYEAR', 'MAKE', 'MODEL', 'VEHICLECLASS', 'ENGINESIZE', 'CYLINDERS',
       'TRANSMISSION', 'FUELTYPE', 'FUELCONSUMPTION_CITY',
       'FUELCONSUMPTION_HWY', 'FUELCONSUMPTION_COMB',
       'FUELCONSUMPTION_COMB_MPG', 'CO2EMISSIONS'],
      dtype='object')

In [4]:
# Feature matrix: select ENGINESIZE with a list of labels so the result stays a
# one-column DataFrame (2-D) rather than collapsing to a Series.
x = data.loc[:, ['ENGINESIZE']]
x.head(n=5)

Unnamed: 0,ENGINESIZE
0,2.0
1,2.4
2,1.5
3,3.5
4,3.5


In [5]:
# Target vector: the CO2EMISSIONS column as a 1-D Series.
y = data['CO2EMISSIONS']
y.head(n=5)

0    196
1    221
2    136
3    255
4    244
Name: CO2EMISSIONS, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. train_test_split shuffles the rows, so
# random_state is pinned: without it every Restart & Run All draws a different
# split and the coefficients and metrics computed below are not reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.7, random_state=42
)

In [7]:
# Training features as (rows, columns); one column because x holds only ENGINESIZE.
x_train.shape

(746, 1)

In [8]:
# Held-out features as (rows, columns); train and test rows together should account for every row of x.
x_test.shape

(321, 1)

In [9]:
# Training target is 1-D; its length should equal the row count of x_train.
y_train.shape

(746,)

In [10]:
# Held-out target is 1-D; its length should equal the row count of x_test.
y_test.shape

(321,)

In [11]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit of CO2EMISSIONS on ENGINESIZE, using the training split only.
# fit() returns the estimator itself, so construction and fitting are chained and the
# fitted estimator is displayed as the cell's result.
regr = LinearRegression().fit(x_train, y_train)
regr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [14]:
# Fitted slope(s): one coefficient per feature column, so a single ENGINESIZE coefficient here.
regr.coef_

array([39.10011346])

In [15]:
# Rank of the training feature matrix as reported by the fitted estimator.
regr.rank_

1

In [16]:
# Fitted intercept: the model's predicted CO2EMISSIONS when ENGINESIZE is 0.
regr.intercept_

124.68931544199808

In [13]:
# First 15 actual target values of the held-out set (positional, not by index label),
# for eyeballing against the predictions.
y_test.head(15)

308     237
100     213
239     292
716     186
1025    197
251     184
1012    168
658     239
143     212
398     278
13      359
132     370
181     377
973     168
259     159
Name: CO2EMISSIONS, dtype: int64

In [12]:
# Predict CO2EMISSIONS for the held-out engine sizes, then show the first 15
# predictions so they line up with the first 15 actual values of y_test.
yhat = regr.predict(x_test)
yhat[:15]

array([265.44972389, 241.98965581, 331.91991676, 222.43959908,
       202.88954236, 179.42947428, 183.33948563, 261.53971254,
       179.42947428, 261.53971254, 355.37998484, 296.72981465,
       367.11001887, 195.06951966, 171.60945159])

In [22]:
# Predict for a single hypothetical engine size. The model was fitted on a DataFrame
# with an ENGINESIZE column, so pass a one-row frame carrying the same column name:
# a bare nested list has no feature names and triggers a feature-name mismatch
# warning on scikit-learn >= 1.0. The predicted value itself is unchanged.
regr.predict(pd.DataFrame({'ENGINESIZE': [2.146541235]}))

array([208.61932127])

In [23]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Mean absolute error on the held-out set, in the same units as CO2EMISSIONS.
mean_absolute_error(y_true=y_test, y_pred=yhat)

23.765162750800712

In [24]:
# Mean squared error on the held-out set (squared target units; large misses weigh more than in MAE).
mean_squared_error(y_true=y_test, y_pred=yhat)

967.9136826454857

In [25]:
# Coefficient of determination (R^2) of the predictions against the held-out targets.
r2_score(y_true=y_test, y_pred=yhat)

0.7657164252115989

In [27]:
from sklearn.model_selection import cross_val_score

# 50-fold cross-validation on the training split; each entry is one fold's score
# (the estimator's default scorer, which is R^2 for regressors).
# NOTE(review): with a training split of only a few hundred rows, 50 folds leaves
# very small validation folds, so individual fold scores are noisy — consider a
# smaller cv (e.g. 5 or 10).
cv_results = cross_val_score(estimator=regr, X=x_train, y=y_train, cv=50)
cv_results

array([0.74727805, 0.73534004, 0.69565197, 0.75014615, 0.90846681,
       0.62969061, 0.57467088, 0.63091762, 0.63332484, 0.63004756,
       0.82296712, 0.67740015, 0.80632714, 0.57942079, 0.9116687 ,
       0.72329786, 0.12659974, 0.54461172, 0.80486994, 0.61546706,
       0.65945072, 0.80035359, 0.8748674 , 0.72701682, 0.68571352,
       0.74936418, 0.85806533, 0.71594674, 0.5968282 , 0.84841741,
       0.77704539, 0.87299003, 0.70560508, 0.7612273 , 0.83928296,
       0.7311735 , 0.59784597, 0.75476564, 0.7104808 , 0.78377682,
       0.70824174, 0.72989991, 0.79881617, 0.94089162, 0.69473172,
       0.81376047, 0.72245127, 0.68776401, 0.69056642, 0.84280783])

In [28]:
# Average fold score across the cross-validation run.
cv_results.mean()

0.7245662668666006

In [29]:
# Worst single-fold score; a value far below the mean points to an unstable or unlucky fold.
cv_results.min()

0.1265997367045124

In [30]:
# Best single-fold score across the cross-validation run.
cv_results.max()

0.9408916217341986