# <center> Multiple Linear Regression on CO2 Emissions : Core Logic Approach </center>

## Importing Dependencies

In [162]:
import numpy as np
import pandas as pd
import math
import random

## Importing the dataset into the environment

In [163]:
co2Data = pd.read_csv("Datasets/CO2emission.csv") #loading dataset

## Inspecting the dataset

In [164]:
co2Data[:5] # head()

Unnamed: 0,Make,Model,VehicleClass,EngineSize,Cylinders,FuelConsumptionCity,CO2Emissions
0,ACURA,ILX,COMPACT,2.0,4,9.9,196
1,ACURA,ILX,COMPACT,2.4,4,11.2,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,6.0,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,12.7,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,12.1,244


In [165]:
co2Data.shape # (rows, columns)

(7385, 7)

In [166]:
co2Data.isnull().sum()

Make                   0
Model                  0
VehicleClass           0
EngineSize             0
Cylinders              0
FuelConsumptionCity    0
CO2Emissions           0
dtype: int64

In [167]:
co2Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7385 entries, 0 to 7384
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Make                 7385 non-null   object 
 1   Model                7385 non-null   object 
 2   VehicleClass         7385 non-null   object 
 3   EngineSize           7385 non-null   float64
 4   Cylinders            7385 non-null   int64  
 5   FuelConsumptionCity  7385 non-null   float64
 6   CO2Emissions         7385 non-null   int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 404.0+ KB


In [168]:
co2Data.describe()

Unnamed: 0,EngineSize,Cylinders,FuelConsumptionCity,CO2Emissions
count,7385.0,7385.0,7385.0,7385.0
mean,3.160068,5.61503,12.556534,250.584699
std,1.35417,1.828307,3.500274,58.512679
min,0.9,3.0,4.2,96.0
25%,2.0,4.0,10.1,208.0
50%,3.0,6.0,12.1,246.0
75%,3.7,6.0,14.6,288.0
max,8.4,16.0,30.6,522.0


In [169]:
co2Data.duplicated().sum() #number of duplicated rows

1295

In [170]:
co2Data = co2Data.drop_duplicates() #dropping duplicated rows
co2Data.duplicated().sum() #number of duplicated rows

0

In [171]:
co2Data.shape

(6090, 7)

In [172]:
# Classes within the 'Make' attribute
print("The categories in the column {} are as follows = {}".format("'Make'",set(co2Data.Make)))
print("The count of categories in the column {} = {}".format("'Make'", len(set(co2Data.Make))))

The categories in the column 'Make' are as follows = {'NISSAN', 'PORSCHE', 'CHRYSLER', 'FORD', 'LAND ROVER', 'FIAT', 'KIA', 'DODGE', 'BUGATTI', 'HYUNDAI', 'ASTON MARTIN', 'ACURA', 'SUBARU', 'JAGUAR', 'RAM', 'CHEVROLET', 'ROLLS-ROYCE', 'CADILLAC', 'VOLKSWAGEN', 'BMW', 'BENTLEY', 'AUDI', 'SRT', 'MASERATI', 'GMC', 'LAMBORGHINI', 'HONDA', 'INFINITI', 'MINI', 'SMART', 'TOYOTA', 'JEEP', 'LEXUS', 'SCION', 'ALFA ROMEO', 'VOLVO', 'LINCOLN', 'MERCEDES-BENZ', 'MITSUBISHI', 'GENESIS', 'BUICK', 'MAZDA'}
The count of categories in the column 'Make' = 42


In [173]:
# Classes within the 'Model' attribute
print("The count of categories in the column {} = {}".format("'Model'", len(set(co2Data.Model))))
print("The categories in the column {} are as follows = {}".format("'Model'",set(co2Data.Model)))

The count of categories in the column 'Model' = 2053
The categories in the column 'Model' are as follows = {'SAVANA 2500 PASSENGER', 'Edge', 'SANTA FE SPORT AWD', 'COOPER S ROADSTER', 'Wrangler Unlimited 4X4', '535i xDRIVE SEDAN', 'WRANGLER JK 4X4', 'SAVANA 1500 CARGO CONV', 'Sierra WT', 'AMG GLC 63 S 4MATIC+', 'SILVERADO 4WD FFV', '435i xDRIVE GRAN COUPE', 'STINGER AWD', 'Panamera 4S Executive', 'ProMaster City', '370Z', 'MAZDA6 TURBO', 'XTS AWD', 'Palisade', 'CAYENNE S HYBRID', 'M4 Cabriolet', 'TRANSIT CONNECT WAGON FFV', 'AMG SLC 43', 'GL 63 AMG', 'CRUZE', '911 Carrera 4 Cabriolet', 'Cayenne Turbo', 'S5', 'A4 ALLROAD QUATTRO', 'Range Rover Evoque', 'Macan', 'COOPER S COUPE', 'RANGE ROVER V8 5.0 SC FFV', 'JOHN COOPER WORKS 3 DOOR', 'M5 Sedan', 'Flex AWD GTDI', 'XJ R-Sport AWD', 'XJL PORTFOLIO 3.0 AWD', 'MDX SH-AWD', 'M550i xDrive', 'ACTIVEHYBRID 7L', '911 TURBO S CABRIOLET', 'Traverse AWD', 'AVENGER', 'FORTWO CABRIOLET', 'M240i xDRIVE COUPE', 'Edge AWD', 'iQ', 'AMG CLA 45', 'HURACAN 

In [174]:
# Classes within the 'VehicleClass' attribute
print("The categories in the column {} are as follows = {}".format("'VehicleClass'",set(co2Data.VehicleClass)))
print("The count of categories in the column {} = {}".format("'VehicleClass'", len(set(co2Data.VehicleClass))))

The categories in the column 'VehicleClass' are as follows = {'VAN - PASSENGER', 'COMPACT', 'PICKUP TRUCK - SMALL', 'STATION WAGON - SMALL', 'FULL-SIZE', 'SPECIAL PURPOSE VEHICLE', 'TWO-SEATER', 'MINICOMPACT', 'STATION WAGON - MID-SIZE', 'PICKUP TRUCK - STANDARD', 'MID-SIZE', 'VAN - CARGO', 'SUV - SMALL', 'MINIVAN', 'SUBCOMPACT', 'SUV - STANDARD'}
The count of categories in the column 'VehicleClass' = 16


## Target Encoding

- <i> Target encoding captures the relationship with the target variable without increasing dimensionality excessively, unlike one-hot encoding. </i>
- <i> In contrast, label encoding can mislead the model because it imposes an ordinal relationship. </i>

In [175]:
def targetEncodingScheme(feature):
    classesFromFeature = list(set(co2Data[feature]))
    classesFromFeature = { key : [] for key in classesFromFeature}
    for index in range(len(co2Data)):
        classesFromFeature[co2Data[feature].iloc[index]].append(co2Data['CO2Emissions'].iloc[index])
    classesFromFeature = {category: format((sum(emissions) / len(classesFromFeature[category])), ".2f") for category, emissions in classesFromFeature.items()}
    return classesFromFeature

In [176]:
classesFromMake = targetEncodingScheme('Make')
classesFromMake

{'NISSAN': '236.13',
 'PORSCHE': '258.55',
 'CHRYSLER': '244.05',
 'FORD': '261.96',
 'LAND ROVER': '287.85',
 'FIAT': '192.29',
 'KIA': '214.31',
 'DODGE': '276.90',
 'BUGATTI': '522.00',
 'HYUNDAI': '207.80',
 'ASTON MARTIN': '344.28',
 'ACURA': '221.08',
 'SUBARU': '218.18',
 'JAGUAR': '271.88',
 'RAM': '294.18',
 'CHEVROLET': '264.81',
 'ROLLS-ROYCE': '389.38',
 'CADILLAC': '266.23',
 'VOLKSWAGEN': '209.57',
 'BMW': '252.43',
 'BENTLEY': '367.91',
 'AUDI': '252.80',
 'SRT': '389.00',
 'MASERATI': '320.04',
 'GMC': '300.61',
 'LAMBORGHINI': '402.27',
 'HONDA': '191.45',
 'INFINITI': '252.62',
 'MINI': '196.64',
 'SMART': '151.43',
 'TOYOTA': '225.27',
 'JEEP': '250.49',
 'LEXUS': '242.11',
 'SCION': '198.12',
 'ALFA ROMEO': '231.32',
 'VOLVO': '230.17',
 'LINCOLN': '264.70',
 'MERCEDES-BENZ': '279.20',
 'MITSUBISHI': '201.12',
 'GENESIS': '282.57',
 'BUICK': '233.94',
 'MAZDA': '192.74'}

In [177]:
classesFromModel = targetEncodingScheme('Model')
classesFromModel

{'SAVANA 2500 PASSENGER': '426.00',
 'Edge': '226.50',
 'SANTA FE SPORT AWD': '259.83',
 'COOPER S ROADSTER': '186.75',
 'Wrangler Unlimited 4X4': '276.00',
 '535i xDRIVE SEDAN': '237.00',
 'WRANGLER JK 4X4': '298.50',
 'SAVANA 1500 CARGO CONV': '362.00',
 'Sierra WT': '275.00',
 'AMG GLC 63 S 4MATIC+': '309.00',
 'SILVERADO 4WD FFV': '301.30',
 '435i xDRIVE GRAN COUPE': '234.00',
 'STINGER AWD': '265.00',
 'Panamera 4S Executive': '261.00',
 'ProMaster City': '232.00',
 '370Z': '262.83',
 'MAZDA6 TURBO': '208.00',
 'XTS AWD': '270.75',
 'Palisade': '250.00',
 'CAYENNE S HYBRID': '251.00',
 'M4 Cabriolet': '289.50',
 'TRANSIT CONNECT WAGON FFV': '246.00',
 'AMG SLC 43': '236.00',
 'GL 63 AMG': '369.00',
 'CRUZE': '175.82',
 '911 Carrera 4 Cabriolet': '238.00',
 'Cayenne Turbo': '329.00',
 'S5': '256.44',
 'A4 ALLROAD QUATTRO': '223.50',
 'Range Rover Evoque': '225.00',
 'Macan': '264.00',
 'COOPER S COUPE': '186.75',
 'RANGE ROVER V8 5.0 SC FFV': '344.50',
 'JOHN COOPER WORKS 3 DOOR': 

In [178]:
classesFromVehicleClass = targetEncodingScheme('VehicleClass')
classesFromVehicleClass

{'VAN - PASSENGER': '397.21',
 'COMPACT': '218.04',
 'PICKUP TRUCK - SMALL': '276.33',
 'STATION WAGON - SMALL': '202.54',
 'FULL-SIZE': '264.52',
 'SPECIAL PURPOSE VEHICLE': '237.66',
 'TWO-SEATER': '282.42',
 'MINICOMPACT': '236.63',
 'STATION WAGON - MID-SIZE': '241.21',
 'PICKUP TRUCK - STANDARD': '301.94',
 'MID-SIZE': '222.82',
 'VAN - CARGO': '361.50',
 'SUV - SMALL': '236.37',
 'MINIVAN': '261.70',
 'SUBCOMPACT': '244.08',
 'SUV - STANDARD': '306.55'}

In [179]:
co2Data['Make'] = co2Data['Make'].map(classesFromMake).astype(float)
co2Data['Model'] = co2Data['Model'].map(classesFromModel).astype(float)
co2Data['VehicleClass'] = co2Data['VehicleClass'].map(classesFromVehicleClass).astype(float)

In [180]:
co2Data[:18]

Unnamed: 0,Make,Model,VehicleClass,EngineSize,Cylinders,FuelConsumptionCity,CO2Emissions
0,221.08,200.29,218.04,2.0,4,9.9,196
1,221.08,200.29,218.04,2.4,4,11.2,221
2,221.08,138.0,218.04,1.5,4,6.0,136
3,221.08,255.0,236.37,3.5,6,12.7,255
4,221.08,243.75,236.37,3.5,6,12.1,244
5,221.08,232.5,222.82,3.5,6,11.9,230
6,221.08,232.0,222.82,3.5,6,11.8,232
7,221.08,261.0,222.82,3.7,6,12.8,255
8,221.08,261.0,222.82,3.7,6,13.4,267
9,221.08,225.33,218.04,2.4,4,10.6,212


## Correlation Analysis

### Pearson's Correlation Coefficient

- The Pearson correlation coefficient \( r \) is a measure of the linear relationship between two variables. It is calculated using the formula:

$$ 
r = \frac{n \sum (XY) - \sum X \sum Y}{\sqrt{[n \sum (X^2) - (\sum X)^2][n \sum (Y^2) - (\sum Y)^2]}}
$$

- Dependent Variable    = CO2Emissions
- Independent Variables = EngineSize, Cylinders, FuelConsumptionCity

In [181]:
def Karl_Pearson_Correlation(indeFeature, depenFeature):
    X = co2Data[indeFeature]
    Y = co2Data[depenFeature]
    n = len(Y)        
    sumX = sum(X)
    sumY = sum(Y)
    sumXY = sum(X[:] * Y[:])
    sumXsq = sum(X[:] * X[:])
    sumYsq = sum(Y[:] * Y[:])
    numerator = (n * sumXY) - (sumX * sumY)
    denominator = (n * sumXsq - (sumX ** 2)) * (n * sumYsq - (sumY ** 2))
    if denominator != 0:
        corr = numerator / math.sqrt(denominator)
    else:
        corr = None
    return corr

In [182]:
correlations = {"EngineSize-CO2Emissions": [], "Cylinders-CO2Emissions": [], "FuelConsumptionCity-CO2Emissions": []}
for feature in ["EngineSize", "Cylinders", "FuelConsumptionCity"]:
    corr = Karl_Pearson_Correlation(feature, "CO2Emissions")
    correlations[f"{feature}-CO2Emissions"].append(corr)
    correlations[f"{feature}-CO2Emissions"].append(corr ** 2)
pd.DataFrame(correlations, index = ['Correlation (r)', 'Goodness of Fit (r^2)'])

Unnamed: 0,EngineSize-CO2Emissions,Cylinders-CO2Emissions,FuelConsumptionCity-CO2Emissions
Correlation (r),0.855194,0.834444,0.918415
Goodness of Fit (r^2),0.731356,0.696298,0.843487


In [183]:
#confusion matrix for examining multicollinearity
columns = co2Data.drop(columns = ['Make', 'Model', 'VehicleClass'], axis = 1).columns.tolist()
corr = {"EngineSize": [], "Cylinders": [], "FuelConsumptionCity": [], "CO2Emissions": []}
for x in columns:
    for y in columns:
        corr[x].append(Karl_Pearson_Correlation(x, y))
confusionMatrix = pd.DataFrame(corr, index = columns)
confusionMatrix

Unnamed: 0,EngineSize,Cylinders,FuelConsumptionCity,CO2Emissions
EngineSize,1.0,0.928464,0.834919,0.855194
Cylinders,0.928464,1.0,0.8015,0.834444
FuelConsumptionCity,0.834919,0.8015,1.0,0.918415
CO2Emissions,0.855194,0.834444,0.918415,1.0


## <center> The Multiple Linear Regression </center>

### ► Splitting the data into training and testing sets in an 80:20 ratio

In [184]:
n = int(len(co2Data) * 80/100)
indexForTrainingSet = random.sample(range(len(co2Data)), n)
print(f"the length of the sampled indices: {len(indexForTrainingSet)}")
print(f"Is duplicated? : {len(indexForTrainingSet) != len(set(indexForTrainingSet))}")
indexForTrainingSet[:10]

the length of the sampled indices: 4872
Is duplicated? : False


[3585, 3275, 5814, 5514, 408, 4300, 3805, 3398, 5412, 3]

In [185]:
print(f"Is this exactly 80%? : {len(indexForTrainingSet) == n}")

Is this exactly 80%? : True


In [186]:
trainingSet = {'Make': [], 'Model': [], 'VehicleClass': [], 'EngineSize': [], 'Cylinders': [], 'FuelConsumptionCity': [], 'CO2Emissions': []}
testingSet = {'Make': [], 'Model': [], 'VehicleClass': [], 'EngineSize': [], 'Cylinders': [], 'FuelConsumptionCity': [], 'CO2Emissions': []}
for index in range(len(co2Data)):
    if index in indexForTrainingSet:
        trainingSet['Make'].append(co2Data.iloc[index]['Make'])
        trainingSet['Model'].append(co2Data.iloc[index]['Model'])
        trainingSet['VehicleClass'].append(co2Data.iloc[index]['VehicleClass'])
        trainingSet['EngineSize'].append(co2Data.iloc[index]['EngineSize'])
        trainingSet['Cylinders'].append(co2Data.iloc[index]['Cylinders'])
        trainingSet['FuelConsumptionCity'].append(co2Data.iloc[index]['FuelConsumptionCity'])
        trainingSet['CO2Emissions'].append(co2Data.iloc[index]['CO2Emissions'])
    elif index not in indexForTrainingSet:
        testingSet['Make'].append(co2Data.iloc[index]['Make'])
        testingSet['Model'].append(co2Data.iloc[index]['Model'])
        testingSet['VehicleClass'].append(co2Data.iloc[index]['VehicleClass'])
        testingSet['EngineSize'].append(co2Data.iloc[index]['EngineSize'])
        testingSet['Cylinders'].append(co2Data.iloc[index]['Cylinders'])
        testingSet['FuelConsumptionCity'].append(co2Data.iloc[index]['FuelConsumptionCity'])
        testingSet['CO2Emissions'].append(co2Data.iloc[index]['CO2Emissions'])
trainingSet = pd.DataFrame(trainingSet).reset_index(drop = True)
testingSet = pd.DataFrame(testingSet).reset_index(drop = True)

In [187]:
print(f"the length of the training set: {len(trainingSet)}")
print(f"the length of the testing set: {len(testingSet)}")
print(f"the ratio of the training set to the entire dataset: {int((len(trainingSet) / len(co2Data)) * 100)}%")
print(f"the ratio of the testing set to the entire dataset: {int((len(testingSet) / len(co2Data)) * 100)}%")

the length of the training set: 4872
the length of the testing set: 1218
the ratio of the training set to the entire dataset: 80%
the ratio of the testing set to the entire dataset: 20%


In [188]:
trainingSet[:5]

Unnamed: 0,Make,Model,VehicleClass,EngineSize,Cylinders,FuelConsumptionCity,CO2Emissions
0,221.08,200.29,218.04,2.0,4.0,9.9,196.0
1,221.08,200.29,218.04,2.4,4.0,11.2,221.0
2,221.08,138.0,218.04,1.5,4.0,6.0,136.0
3,221.08,255.0,236.37,3.5,6.0,12.7,255.0
4,221.08,243.75,236.37,3.5,6.0,12.1,244.0


In [189]:
testingSet[:5]

Unnamed: 0,Make,Model,VehicleClass,EngineSize,Cylinders,FuelConsumptionCity,CO2Emissions
0,221.08,261.0,222.82,3.7,6.0,13.4,267.0
1,221.08,225.33,218.04,2.4,4.0,10.6,212.0
2,344.28,347.2,236.63,5.9,12.0,18.0,359.0
3,252.8,213.78,218.04,2.0,4.0,10.8,214.0
4,252.8,223.8,244.08,2.0,4.0,11.5,230.0


### ► Model training

#### ➔ Calculating Coefficients for Multiple Linear Regression Using Ordinary Least Squares

In [190]:
matrixX = trainingSet[['Make', 'Model', 'VehicleClass', 'EngineSize', 'Cylinders', 'FuelConsumptionCity']][:5]
ones = np.ones((matrixX.shape[0],1))
matrixX = np.hstack((ones, matrixX))

In [191]:
matrixX

array([[  1.  , 221.08, 200.29, 218.04,   2.  ,   4.  ,   9.9 ],
       [  1.  , 221.08, 200.29, 218.04,   2.4 ,   4.  ,  11.2 ],
       [  1.  , 221.08, 138.  , 218.04,   1.5 ,   4.  ,   6.  ],
       [  1.  , 221.08, 255.  , 236.37,   3.5 ,   6.  ,  12.7 ],
       [  1.  , 221.08, 243.75, 236.37,   3.5 ,   6.  ,  12.1 ]])

In [192]:
matrixY = np.matrix(trainingSet['CO2Emissions'][:5]).T

In [193]:
matrixY

matrix([[196.],
        [221.],
        [136.],
        [255.],
        [244.]])

In [194]:
matrixXTrans = matrixX.T
matrixXTrans

array([[  1.  ,   1.  ,   1.  ,   1.  ,   1.  ],
       [221.08, 221.08, 221.08, 221.08, 221.08],
       [200.29, 200.29, 138.  , 255.  , 243.75],
       [218.04, 218.04, 218.04, 236.37, 236.37],
       [  2.  ,   2.4 ,   1.5 ,   3.5 ,   3.5 ],
       [  4.  ,   4.  ,   4.  ,   6.  ,   6.  ],
       [  9.9 ,  11.2 ,   6.  ,  12.7 ,  12.1 ]])

In [195]:
matrix_XXT = np.matmul(matrixX, matrixXTrans)
matrix_XXT

array([[136652.9021, 136666.5721, 124137.228 , 151646.1612, 149386.9587],
       [136666.5721, 136682.0921, 124145.628 , 151664.0712, 149404.0887],
       [124137.228 , 124145.628 , 115517.058 , 135710.9312, 134154.8312],
       [151646.1612, 151664.0712, 135710.9312, 169982.6833, 167106.3133],
       [149386.9587, 149404.0887, 134154.8312, 167106.3133, 164356.8658]])

## <center> ......  Still on Developing ........... </center>