importing necessary packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

loading the csv file and deleting columns not needed

In [3]:
df = pd.read_csv("FuelConsumptionCo2.csv")

In [4]:
df.head()

Unnamed: 0,MODELYEAR,MAKE,MODEL,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2014,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,2014,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,2014,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,2014,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,2014,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


In [5]:
df['MODELYEAR'].unique()

array([2014])

There is only 1 MODELYEAR, hence it makes no sense to include it as a parameter

In [6]:
df = df.drop(['MODELYEAR'], axis=1)

We use labelencoder to assign unique values to all the categorical data types as we cant operate on variables that are string.

In [7]:
df['MAKE'], df['MODEL'], df['VEHICLECLASS'], df['TRANSMISSION'], df['FUELTYPE'] = \
labelencoder.fit_transform(df['MAKE']), labelencoder.fit_transform(df['MODEL']), labelencoder.fit_transform(df['VEHICLECLASS']), \
labelencoder.fit_transform(df['TRANSMISSION']), labelencoder.fit_transform(df['FUELTYPE'])

Next, we filter out the variables that have less correlation with the target parameter - CO2EMISSIONS

In [8]:
cor = df.corr()
cor_target = abs(cor["CO2EMISSIONS"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.5]
relevant_features

ENGINESIZE                  0.874154
CYLINDERS                   0.849685
FUELCONSUMPTION_CITY        0.898039
FUELCONSUMPTION_HWY         0.861748
FUELCONSUMPTION_COMB        0.892129
FUELCONSUMPTION_COMB_MPG    0.906394
CO2EMISSIONS                1.000000
Name: CO2EMISSIONS, dtype: float64

dropping the columns with low correlation, ones below 0.5

In [9]:
df = df.drop(['MAKE', 'MODEL', 'VEHICLECLASS', 'FUELTYPE', 'TRANSMISSION'], axis=1)

In [10]:
df.head()

Unnamed: 0,ENGINESIZE,CYLINDERS,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2.0,4,9.9,6.7,8.5,33,196
1,2.4,4,11.2,7.7,9.6,29,221
2,1.5,4,6.0,5.8,5.9,48,136
3,3.5,6,12.7,9.1,11.1,25,255
4,3.5,6,12.1,8.7,10.6,27,244


We use min-max normalization. This step is really important. You could skip the following cell and try out all other cells to  get some absurd value as the MSE error

In [11]:
df=(df-df.min())/(df.max()-df.min())

In [12]:
df.head()

Unnamed: 0,ENGINESIZE,CYLINDERS,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,0.135135,0.111111,0.207031,0.115385,0.180095,0.44898,0.231579
1,0.189189,0.111111,0.257812,0.179487,0.232227,0.367347,0.297368
2,0.067568,0.111111,0.054688,0.057692,0.056872,0.755102,0.073684
3,0.337838,0.333333,0.316406,0.269231,0.303318,0.285714,0.386842
4,0.337838,0.333333,0.292969,0.24359,0.279621,0.326531,0.357895


In [13]:
y,x = df["CO2EMISSIONS"], df.drop(["CO2EMISSIONS"], axis=1)

splitting the dataset into train and test functions

In [14]:
x_, y_ = x.to_numpy(), y.to_numpy()
x_ = np.append(arr = np.ones((len(x_), 1)).astype(int), values = x, axis = 1) 

x_train,x_test,y_train,y_test = train_test_split(x_,y_, test_size = 0.2, random_state=21)

In [15]:
a=0.01
X = []
for row in x_train:
    r = [1]
    for item in row:
        r.append(item)
    X.append(r)
    
X = np.asmatrix(X)

In [16]:
theta = np.zeros(((X[0].size), 1))

In [17]:
Y = y_train.reshape(-1,1)

In [18]:
h = np.dot(X, theta)
h.shape

(853, 1)

In [19]:
temp = np.zeros(theta.shape)
cost = np.sum (np.dot(np.transpose(h-Y), (h-Y)))*(1/(2*X.shape[0]))

In [20]:
temp = np.zeros(theta.shape)

The gradientDescent function and the subsequent code are similar to the ones we used for Linear Regression, except that here we represent theta as an array.

One thing I usually do is to examine and play with the shapes of all matrices and arrays and see how they could be combined to produce the required resultant matrix

In [21]:
def gradientDescent(theta, X):
    h = np.dot(X, theta)
    cost = np.sum(np.sum((h-Y)**2))*(1/(2*X.shape[0]))
    temp = theta - np.dot(X.T, h-Y) * (a/X.shape[0])
    theta = temp
    return(theta, X, cost)

In [29]:
oldCost = 0
theta = np.ones(theta.shape)
X = np.ones(X.shape)
for i in range(0, 10000):
    (theta, X, cost) = gradientDescent(theta, X)
    if i%1000 == 0:
        print(cost)
        print(theta)

28.955263880454513
[[0.92391923]
 [0.92391923]
 [0.92391923]
 [0.92391923]
 [0.92391923]
 [0.92391923]
 [0.92391923]
 [0.92391923]]
0.013847910580828569
[[0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]]
0.013847910580828569
[[0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]]
0.013847910580828569
[[0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]]
0.013847910580828569
[[0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]]
0.013847910580828569
[[0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]]
0.013847910580828569
[[0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]
 [0.04899041]]
0.013847910580828569
[[0.04899041]
 [0.04899041]
 [0.04899041]
 

In [23]:
theta

array([[0.04899041],
       [0.04899041],
       [0.04899041],
       [0.04899041],
       [0.04899041],
       [0.04899041],
       [0.04899041],
       [0.04899041]])

predicting the y values for the test dataset and evaluating our model with the MSE function

In [24]:
X_test = []
for row in x_test:
    r = [1]
    for item in row:
        r.append(item)
    X_test.append(r)

In [25]:
mean_squared_error(np.dot(X_test, theta), y_test)

0.05530668773441035

You can achieve way better results using the built-in sklearn function.