Importing the necessary packages.

In [243]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression
# Shared model and encoder instances used by the cells below.
lr = LinearRegression()  # constructed with default settings
labelencoder = LabelEncoder()  # refit on each categorical column in turn

Loading the CSV file and deleting the columns that are not needed.

In [244]:
# Read the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("FuelConsumptionCo2.csv")

In [245]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,MODELYEAR,MAKE,MODEL,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2014,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,2014,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,2014,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,2014,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,2014,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


In [246]:
# Check which distinct model years appear in the data.
df['MODELYEAR'].unique()

array([2014])

There is only one MODELYEAR value (2014), so it carries no information and it makes no sense to include it as a feature.

In [247]:
# MODELYEAR holds a single value, so it cannot help explain the target.
df = df.drop(columns='MODELYEAR')

We use LabelEncoder to map each category in the categorical columns to a unique integer, since we can't compute on string-valued variables directly.

In [248]:
# Replace every string-valued column with integer codes, one column at a time.
# The shared encoder is refit for each column, so after this cell it only
# remembers the mapping of the last column processed.
categorical_cols = ['MAKE', 'MODEL', 'VEHICLECLASS', 'TRANSMISSION', 'FUELTYPE']
for col in categorical_cols:
    df[col] = labelencoder.fit_transform(df[col])

Next, we filter out the variables that are only weakly correlated with the target variable, CO2EMISSIONS.

In [249]:
# Pairwise correlation between all (now numeric) columns.
cor = df.corr()
# Only the strength of the relationship with the target matters, not its sign.
cor_target = cor["CO2EMISSIONS"].abs()
# Keep the columns whose absolute correlation with the target exceeds 0.5.
relevant_features = cor_target.loc[cor_target > 0.5]
relevant_features

ENGINESIZE                  0.874154
CYLINDERS                   0.849685
FUELCONSUMPTION_CITY        0.898039
FUELCONSUMPTION_HWY         0.861748
FUELCONSUMPTION_COMB        0.892129
FUELCONSUMPTION_COMB_MPG    0.906394
CO2EMISSIONS                1.000000
Name: CO2EMISSIONS, dtype: float64

Dropping the columns with low correlation, i.e. those whose absolute correlation with the target is below 0.5.

In [250]:
# Remove the label-encoded columns that did not pass the 0.5 correlation cut.
low_corr_cols = ['MAKE', 'MODEL', 'VEHICLECLASS', 'FUELTYPE', 'TRANSMISSION']
df = df.drop(columns=low_corr_cols)

In [251]:
# Preview the remaining feature columns together with the target.
df.head()

Unnamed: 0,ENGINESIZE,CYLINDERS,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2.0,4,9.9,6.7,8.5,33,196
1,2.4,4,11.2,7.7,9.6,29,221
2,1.5,4,6.0,5.8,5.9,48,136
3,3.5,6,12.7,9.1,11.1,25,255
4,3.5,6,12.1,8.7,10.6,27,244


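We use min-max normalization, rescaling every column (the target included) to the range [0, 1]. This step is really important for reading the error: MSE is scale-dependent, so if you skip the following cell and run all the other cells, the MSE is reported in squared original CO2EMISSIONS units and looks absurdly large by comparison.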

In [252]:
# Min-max scale every column of the frame, target included.
# NOTE(review): the minima and ranges come from the full dataset, before the
# train/test split performed later, and the reported MSE is therefore in
# scaled units rather than original CO2EMISSIONS units.
col_min = df.min()
col_range = df.max() - col_min
df = (df - col_min) / col_range

In [253]:
# Preview the rescaled values.
df.head()

Unnamed: 0,ENGINESIZE,CYLINDERS,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,0.135135,0.111111,0.207031,0.115385,0.180095,0.44898,0.231579
1,0.189189,0.111111,0.257812,0.179487,0.232227,0.367347,0.297368
2,0.067568,0.111111,0.054688,0.057692,0.056872,0.755102,0.073684
3,0.337838,0.333333,0.316406,0.269231,0.303318,0.285714,0.386842
4,0.337838,0.333333,0.292969,0.24359,0.279621,0.326531,0.357895


In [254]:
# Separate the target column from the remaining feature columns.
x = df.drop(columns="CO2EMISSIONS")
y = df["CO2EMISSIONS"]

Splitting the dataset into train and test sets, then fitting sklearn's default LinearRegression model on the training set.

In [259]:
# Plain NumPy versions of the target vector and the feature matrix.
y_ = y.to_numpy()
x_ = x.to_numpy()

# Prepend a constant column of ones (a bias term) to the features.
# NOTE(review): `lr` already fits its own intercept (fit_intercept=True), so
# this column is redundant for it; kept in case later cells rely on it — confirm.
bias_column = np.ones((len(x_), 1), dtype=int)
x_ = np.hstack((bias_column, x_))

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_, y_, test_size=0.2, random_state=21
)

lr.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

Predicting the outputs for the test set and comparing them with the actual values to compute the error using the mean squared error (MSE).

In [260]:
# Predict the (scaled) target for the held-out test rows.
y_test_pred = lr.predict(x_test)

In [261]:
# MSE on the held-out test split; the target was min-max scaled earlier,
# so this value is in scaled units, not original CO2EMISSIONS units.
mean_squared_error(y_test, y_test_pred)

0.0033393910980578094