# <center> Multiple Linear Regression on CO2 Emissions : Core Logic Approach </center>

## Importing Dependencies

In [1]:
import numpy as np
import pandas as pd
import math
import random

## Importing the dataset into the environment

In [2]:
co2Data = pd.read_csv("Datasets/CO2emission.csv") #loading dataset

## Inspecting the dataset

In [3]:
co2Data[:5] # head()

Unnamed: 0,Make,Model,VehicleClass,EngineSize,Cylinders,FuelConsumptionCity,CO2Emissions
0,ACURA,ILX,COMPACT,2.0,4,9.9,196
1,ACURA,ILX,COMPACT,2.4,4,11.2,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,6.0,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,12.7,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,12.1,244


In [4]:
co2Data.shape # (rows, columns)

(7385, 7)

In [5]:
co2Data.isnull().sum()

Make                   0
Model                  0
VehicleClass           0
EngineSize             0
Cylinders              0
FuelConsumptionCity    0
CO2Emissions           0
dtype: int64

In [6]:
co2Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7385 entries, 0 to 7384
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Make                 7385 non-null   object 
 1   Model                7385 non-null   object 
 2   VehicleClass         7385 non-null   object 
 3   EngineSize           7385 non-null   float64
 4   Cylinders            7385 non-null   int64  
 5   FuelConsumptionCity  7385 non-null   float64
 6   CO2Emissions         7385 non-null   int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 404.0+ KB


In [7]:
co2Data.describe()

Unnamed: 0,EngineSize,Cylinders,FuelConsumptionCity,CO2Emissions
count,7385.0,7385.0,7385.0,7385.0
mean,3.160068,5.61503,12.556534,250.584699
std,1.35417,1.828307,3.500274,58.512679
min,0.9,3.0,4.2,96.0
25%,2.0,4.0,10.1,208.0
50%,3.0,6.0,12.1,246.0
75%,3.7,6.0,14.6,288.0
max,8.4,16.0,30.6,522.0


In [8]:
co2Data.duplicated().sum() #number of duplicated rows

1295

In [9]:
co2Data = co2Data.drop_duplicates() #dropping duplicated rows
co2Data.duplicated().sum() #number of duplicated rows

0

In [10]:
co2Data.shape

(6090, 7)

In [11]:
# Classes within the 'Make' attribute
print("The categories in the column {} are as follows = {}".format("'Make'",set(co2Data.Make)))
print("The count of categories in the column {} = {}".format("'Make'", len(set(co2Data.Make))))

The categories in the column 'Make' are as follows = {'MAZDA', 'HYUNDAI', 'JAGUAR', 'ASTON MARTIN', 'LINCOLN', 'NISSAN', 'DODGE', 'MERCEDES-BENZ', 'TOYOTA', 'BMW', 'VOLVO', 'FORD', 'FIAT', 'CHEVROLET', 'ROLLS-ROYCE', 'BUGATTI', 'CHRYSLER', 'AUDI', 'BUICK', 'KIA', 'GENESIS', 'MINI', 'SUBARU', 'SRT', 'GMC', 'SMART', 'JEEP', 'LAMBORGHINI', 'HONDA', 'ACURA', 'MITSUBISHI', 'INFINITI', 'PORSCHE', 'LAND ROVER', 'MASERATI', 'ALFA ROMEO', 'RAM', 'VOLKSWAGEN', 'CADILLAC', 'LEXUS', 'SCION', 'BENTLEY'}
The count of categories in the column 'Make' = 42


In [12]:
# Classes within the 'Model' attribute
print("The count of categories in the column {} = {}".format("'Model'", len(set(co2Data.Model))))
print("The categories in the column {} are as follows = {}".format("'Model'",set(co2Data.Model)))

The count of categories in the column 'Model' = 2053
The categories in the column 'Model' are as follows = {'F-150 RAPTOR 4X4', 'Transit Connect Wagon LWB', 'F-TYPE SVR AWD Coupe', 'A4', 'AMG SL 65', 'MACAN TURBO KIT', '528i SEDAN', 'AMG GLE 63 S 4MATIC Coupe', '230i COUPE', 'Range Rover 5.0 Supercharged', 'Kona', 'CT5-V AWD', 'F-TYPE R AWD Coupe', 'S 65 AMG COUPE', 'F-TYPE COUPE R-DYNAMIC AWD', 'AMG GT S COUPE', 'Giulia AWD', 'FRONTIER', 'Yukon XL FFV', 'M760Li xDrive', 'CAMRY XLE/XSE', 'S 560 CABRIOLET', 'E 400 COUPE', 'X3 xDrive30i', 'Sequoia 4WD', 'Q70', 'E 400 CABRIOLET', 'CAMARO ZL1', 'Forte', 'VELOSTER', 'X6 xDRIVE35i', 'Cayenne', 'LANCER RALLIART', 'CHALLENGER SRT HELLCAT', 'Taurus AWD', 'GRAND CHEROKEE 4X4 TRACKHAWK', 'PANAMERA', 'Discovery TD6 Diesel', 'ACTIVEHYBRID 3', 'ML 63 AMG 4MATIC', 'M240i xDRIVE COUPE', 'SONATA HYBRID', 'ATS', 'CHARGER SRT HELLCAT', '650i xDrive Gran Coupe', 'COOPER CONVERTIBLE', 'Journey FFV', 'Corvette', 'CRUZE DIESEL', 'Canyon 4WD', 'SIERRA eASSIST

In [13]:
# Classes within the 'VehicleClass' attribute
print("The categories in the column {} are as follows = {}".format("'VehicleClass'",set(co2Data.VehicleClass)))
print("The count of categories in the column {} = {}".format("'VehicleClass'", len(set(co2Data.VehicleClass))))

The categories in the column 'VehicleClass' are as follows = {'SUBCOMPACT', 'MINICOMPACT', 'PICKUP TRUCK - SMALL', 'SUV - STANDARD', 'MID-SIZE', 'COMPACT', 'VAN - PASSENGER', 'SUV - SMALL', 'MINIVAN', 'FULL-SIZE', 'PICKUP TRUCK - STANDARD', 'VAN - CARGO', 'SPECIAL PURPOSE VEHICLE', 'STATION WAGON - SMALL', 'STATION WAGON - MID-SIZE', 'TWO-SEATER'}
The count of categories in the column 'VehicleClass' = 16


## Target Encoding

- <i> Target encoding captures the relationship with the target variable without increasing dimensionality excessively, unlike one-hot encoding. </i>
- <i> In contrast, label encoding can mislead the model because it imposes an ordinal relationship. </i>

In [14]:
def targetEncodingScheme(feature):
    classesFromFeature = list(set(co2Data[feature]))
    classesFromFeature = { key : [] for key in classesFromFeature}
    for index in range(len(co2Data)):
        classesFromFeature[co2Data[feature].iloc[index]].append(co2Data['CO2Emissions'].iloc[index])
    classesFromFeature = {category: format((sum(emissions) / len(classesFromFeature[category])), ".2f") for category, emissions in classesFromFeature.items()}
    return classesFromFeature

In [15]:
classesFromMake = targetEncodingScheme('Make')
classesFromMake

{'MAZDA': '192.74',
 'HYUNDAI': '207.80',
 'JAGUAR': '271.88',
 'ASTON MARTIN': '344.28',
 'LINCOLN': '264.70',
 'NISSAN': '236.13',
 'DODGE': '276.90',
 'MERCEDES-BENZ': '279.20',
 'TOYOTA': '225.27',
 'BMW': '252.43',
 'VOLVO': '230.17',
 'FORD': '261.96',
 'FIAT': '192.29',
 'CHEVROLET': '264.81',
 'ROLLS-ROYCE': '389.38',
 'BUGATTI': '522.00',
 'CHRYSLER': '244.05',
 'AUDI': '252.80',
 'BUICK': '233.94',
 'KIA': '214.31',
 'GENESIS': '282.57',
 'MINI': '196.64',
 'SUBARU': '218.18',
 'SRT': '389.00',
 'GMC': '300.61',
 'SMART': '151.43',
 'JEEP': '250.49',
 'LAMBORGHINI': '402.27',
 'HONDA': '191.45',
 'ACURA': '221.08',
 'MITSUBISHI': '201.12',
 'INFINITI': '252.62',
 'PORSCHE': '258.55',
 'LAND ROVER': '287.85',
 'MASERATI': '320.04',
 'ALFA ROMEO': '231.32',
 'RAM': '294.18',
 'VOLKSWAGEN': '209.57',
 'CADILLAC': '266.23',
 'LEXUS': '242.11',
 'SCION': '198.12',
 'BENTLEY': '367.91'}

In [16]:
classesFromModel = targetEncodingScheme('Model')
classesFromModel

{'F-150 RAPTOR 4X4': '376.50',
 'Transit Connect Wagon LWB': '232.50',
 'F-TYPE SVR AWD Coupe': '299.00',
 'A4': '195.60',
 'AMG SL 65': '338.00',
 'MACAN TURBO KIT': '286.00',
 '528i SEDAN': '205.00',
 'AMG GLE 63 S 4MATIC Coupe': '357.00',
 '230i COUPE': '208.00',
 'Range Rover 5.0 Supercharged': '305.00',
 'Kona': '186.50',
 'CT5-V AWD': '276.00',
 'F-TYPE R AWD Coupe': '305.00',
 'S 65 AMG COUPE': '345.00',
 'F-TYPE COUPE R-DYNAMIC AWD': '265.00',
 'AMG GT S COUPE': '302.50',
 'Giulia AWD': '217.00',
 'FRONTIER': '285.50',
 'Yukon XL FFV': '306.00',
 'M760Li xDrive': '355.00',
 'CAMRY XLE/XSE': '173.00',
 'S 560 CABRIOLET': '276.00',
 'E 400 COUPE': '237.00',
 'X3 xDrive30i': '219.00',
 'Sequoia 4WD': '384.50',
 'Q70': '273.67',
 'E 400 CABRIOLET': '231.50',
 'CAMARO ZL1': '358.88',
 'Forte': '173.25',
 'VELOSTER': '182.29',
 'X6 xDRIVE35i': '267.00',
 'Cayenne': '267.00',
 'LANCER RALLIART': '267.00',
 'CHALLENGER SRT HELLCAT': '345.80',
 'Taurus AWD': '294.00',
 'GRAND CHEROKEE 4

In [17]:
classesFromVehicleClass = targetEncodingScheme('VehicleClass')
classesFromVehicleClass

{'SUBCOMPACT': '244.08',
 'MINICOMPACT': '236.63',
 'PICKUP TRUCK - SMALL': '276.33',
 'SUV - STANDARD': '306.55',
 'MID-SIZE': '222.82',
 'COMPACT': '218.04',
 'VAN - PASSENGER': '397.21',
 'SUV - SMALL': '236.37',
 'MINIVAN': '261.70',
 'FULL-SIZE': '264.52',
 'PICKUP TRUCK - STANDARD': '301.94',
 'VAN - CARGO': '361.50',
 'SPECIAL PURPOSE VEHICLE': '237.66',
 'STATION WAGON - SMALL': '202.54',
 'STATION WAGON - MID-SIZE': '241.21',
 'TWO-SEATER': '282.42'}

In [18]:
co2Data['Make'] = co2Data['Make'].map(classesFromMake).astype(float)
co2Data['Model'] = co2Data['Model'].map(classesFromModel).astype(float)
co2Data['VehicleClass'] = co2Data['VehicleClass'].map(classesFromVehicleClass).astype(float)

In [19]:
co2Data[:18]

Unnamed: 0,Make,Model,VehicleClass,EngineSize,Cylinders,FuelConsumptionCity,CO2Emissions
0,221.08,200.29,218.04,2.0,4,9.9,196
1,221.08,200.29,218.04,2.4,4,11.2,221
2,221.08,138.0,218.04,1.5,4,6.0,136
3,221.08,255.0,236.37,3.5,6,12.7,255
4,221.08,243.75,236.37,3.5,6,12.1,244
5,221.08,232.5,222.82,3.5,6,11.9,230
6,221.08,232.0,222.82,3.5,6,11.8,232
7,221.08,261.0,222.82,3.7,6,12.8,255
8,221.08,261.0,222.82,3.7,6,13.4,267
9,221.08,225.33,218.04,2.4,4,10.6,212


## Correlation Analysis

### Pearson's Correlation Coefficient

- The Pearson correlation coefficient \( r \) is a measure of the linear relationship between two variables. It is calculated using the formula:

$$ 
r = \frac{n \sum (XY) - \sum X \sum Y}{\sqrt{[n \sum (X^2) - (\sum X)^2][n \sum (Y^2) - (\sum Y)^2]}}
$$

- Dependent Variable    = CO2Emissions
- Independent Variables = EngineSize, Cylinders, FuelConsumptionCity

In [20]:
def Karl_Pearson_Correlation(indeFeature, depenFeature):
    X = co2Data[indeFeature]
    Y = co2Data[depenFeature]
    n = len(Y)        
    sumX = sum(X)
    sumY = sum(Y)
    sumXY = sum(X[:] * Y[:])
    sumXsq = sum(X[:] * X[:])
    sumYsq = sum(Y[:] * Y[:])
    numerator = (n * sumXY) - (sumX * sumY)
    denominator = (n * sumXsq - (sumX ** 2)) * (n * sumYsq - (sumY ** 2))
    if denominator != 0:
        corr = numerator / math.sqrt(denominator)
    else:
        corr = None
    return corr

In [21]:
correlations = {"EngineSize-CO2Emissions": [], "Cylinders-CO2Emissions": [], "FuelConsumptionCity-CO2Emissions": []}
for feature in ["EngineSize", "Cylinders", "FuelConsumptionCity"]:
    corr = Karl_Pearson_Correlation(feature, "CO2Emissions")
    correlations[f"{feature}-CO2Emissions"].append(corr)
    correlations[f"{feature}-CO2Emissions"].append(corr ** 2)
pd.DataFrame(correlations, index = ['Correlation (r)', 'Goodness of Fit (r^2)'])

Unnamed: 0,EngineSize-CO2Emissions,Cylinders-CO2Emissions,FuelConsumptionCity-CO2Emissions
Correlation (r),0.855194,0.834444,0.918415
Goodness of Fit (r^2),0.731356,0.696298,0.843487


In [22]:
#confusion matrix for examining multicollinearity
columns = co2Data.drop(columns = ['Make', 'Model', 'VehicleClass'], axis = 1).columns.tolist()
corr = {"EngineSize": [], "Cylinders": [], "FuelConsumptionCity": [], "CO2Emissions": []}
for x in columns:
    for y in columns:
        corr[x].append(Karl_Pearson_Correlation(x, y))
confusionMatrix = pd.DataFrame(corr, index = columns)
confusionMatrix

Unnamed: 0,EngineSize,Cylinders,FuelConsumptionCity,CO2Emissions
EngineSize,1.0,0.928464,0.834919,0.855194
Cylinders,0.928464,1.0,0.8015,0.834444
FuelConsumptionCity,0.834919,0.8015,1.0,0.918415
CO2Emissions,0.855194,0.834444,0.918415,1.0


## <center> The Multiple Linear Regression </center>

### ► Splitting the data into training and testing sets in an 80:20 ratio

In [23]:
n = int(len(co2Data) * 80/100)
indexForTrainingSet = random.sample(range(len(co2Data)), n)
print(f"the length of the sampled indices: {len(indexForTrainingSet)}")
print(f"Is duplicated? : {len(indexForTrainingSet) != len(set(indexForTrainingSet))}")
indexForTrainingSet[:10]

the length of the sampled indices: 4872
Is duplicated? : False


[4385, 5808, 5259, 1043, 374, 309, 2136, 1255, 1964, 925]

In [24]:
print(f"Is this exactly 80%? : {len(indexForTrainingSet) == n}")

Is this exactly 80%? : True


In [25]:
trainingSet = {'Make': [], 'Model': [], 'VehicleClass': [], 'EngineSize': [], 'Cylinders': [], 'FuelConsumptionCity': [], 'CO2Emissions': []}
testingSet = {'Make': [], 'Model': [], 'VehicleClass': [], 'EngineSize': [], 'Cylinders': [], 'FuelConsumptionCity': [], 'CO2Emissions': []}
for index in range(len(co2Data)):
    if index in indexForTrainingSet:
        trainingSet['Make'].append(co2Data.iloc[index]['Make'])
        trainingSet['Model'].append(co2Data.iloc[index]['Model'])
        trainingSet['VehicleClass'].append(co2Data.iloc[index]['VehicleClass'])
        trainingSet['EngineSize'].append(co2Data.iloc[index]['EngineSize'])
        trainingSet['Cylinders'].append(co2Data.iloc[index]['Cylinders'])
        trainingSet['FuelConsumptionCity'].append(co2Data.iloc[index]['FuelConsumptionCity'])
        trainingSet['CO2Emissions'].append(co2Data.iloc[index]['CO2Emissions'])
    elif index not in indexForTrainingSet:
        testingSet['Make'].append(co2Data.iloc[index]['Make'])
        testingSet['Model'].append(co2Data.iloc[index]['Model'])
        testingSet['VehicleClass'].append(co2Data.iloc[index]['VehicleClass'])
        testingSet['EngineSize'].append(co2Data.iloc[index]['EngineSize'])
        testingSet['Cylinders'].append(co2Data.iloc[index]['Cylinders'])
        testingSet['FuelConsumptionCity'].append(co2Data.iloc[index]['FuelConsumptionCity'])
        testingSet['CO2Emissions'].append(co2Data.iloc[index]['CO2Emissions'])
trainingSet = pd.DataFrame(trainingSet).reset_index(drop = True)
testingSet = pd.DataFrame(testingSet).reset_index(drop = True)

In [26]:
print(f"the length of the training set: {len(trainingSet)}")
print(f"the length of the testing set: {len(testingSet)}")
print(f"the ratio of the training set to the entire dataset: {int((len(trainingSet) / len(co2Data)) * 100)}%")
print(f"the ratio of the testing set to the entire dataset: {int((len(testingSet) / len(co2Data)) * 100)}%")

the length of the training set: 4872
the length of the testing set: 1218
the ratio of the training set to the entire dataset: 80%
the ratio of the testing set to the entire dataset: 20%


In [27]:
trainingSet[:5]

Unnamed: 0,Make,Model,VehicleClass,EngineSize,Cylinders,FuelConsumptionCity,CO2Emissions
0,221.08,200.29,218.04,2.0,4.0,9.9,196.0
1,221.08,200.29,218.04,2.4,4.0,11.2,221.0
2,221.08,255.0,236.37,3.5,6.0,12.7,255.0
3,221.08,243.75,236.37,3.5,6.0,12.1,244.0
4,221.08,232.5,222.82,3.5,6.0,11.9,230.0


In [28]:
testingSet[:5]

Unnamed: 0,Make,Model,VehicleClass,EngineSize,Cylinders,FuelConsumptionCity,CO2Emissions
0,221.08,138.0,218.04,1.5,4.0,6.0,136.0
1,221.08,225.33,218.04,2.4,4.0,11.2,225.0
2,344.28,350.6,282.42,4.7,8.0,18.1,354.0
3,252.8,215.5,244.08,2.0,4.0,10.8,214.0
4,252.8,230.56,222.82,3.0,6.0,12.8,251.0


### ► Model training

#### ➔ Calculating Coefficients for Multiple Linear Regression Using Ordinary Least Squares

In [29]:
matrixX = trainingSet[['Make', 'Model', 'VehicleClass', 'EngineSize', 'Cylinders', 'FuelConsumptionCity']][:5]
ones = np.ones((matrixX.shape[0],1))
matrixX = np.hstack((ones, matrixX))

In [30]:
matrixX

array([[  1.  , 221.08, 200.29, 218.04,   2.  ,   4.  ,   9.9 ],
       [  1.  , 221.08, 200.29, 218.04,   2.4 ,   4.  ,  11.2 ],
       [  1.  , 221.08, 255.  , 236.37,   3.5 ,   6.  ,  12.7 ],
       [  1.  , 221.08, 243.75, 236.37,   3.5 ,   6.  ,  12.1 ],
       [  1.  , 221.08, 232.5 , 222.82,   3.5 ,   6.  ,  11.9 ]])

In [31]:
matrixY = np.matrix(trainingSet['CO2Emissions'][:5]).T

In [32]:
matrixY

matrix([[196.],
        [221.],
        [255.],
        [244.],
        [230.]])

In [33]:
matrixXTrans = matrixX.T
matrixXTrans

array([[  1.  ,   1.  ,   1.  ,   1.  ,   1.  ],
       [221.08, 221.08, 221.08, 221.08, 221.08],
       [200.29, 200.29, 255.  , 243.75, 232.5 ],
       [218.04, 218.04, 236.37, 236.37, 222.82],
       [  2.  ,   2.4 ,   3.5 ,   3.5 ,   3.5 ],
       [  4.  ,   4.  ,   6.  ,   6.  ,   6.  ],
       [  9.9 ,  11.2 ,  12.7 ,  12.1 ,  11.9 ]])

In [34]:
matrix_XXT = np.matmul(matrixX, matrixXTrans)
matrix_XXT

array([[136652.9021, 136666.5721, 151646.1612, 149386.9587, 144177.2742],
       [136666.5721, 136682.0921, 151664.0712, 149404.0887, 144194.1442],
       [151646.1612, 151664.0712, 169982.6833, 167106.3133, 161032.2098],
       [149386.9587, 149404.0887, 167106.3133, 164356.8658, 158409.4448],
       [144177.2742, 144194.1442, 161032.2098, 158409.4448, 152772.2288]])

## <center> ......  An updated version will be published  ........... </center>