In [None]:
import pandas
import numpy as np
import matplotlib.pyplot as plot
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import os
import seaborn as sns
from sklearn.preprocessing import StandardScaler







In [None]:
# load the data 
data = pandas.read_csv('co2_emissions_data.csv')

print(data.head())



In [None]:
# step 1 : data analysis 


# check nulls in the data
print(data.isnull().sum())

# get description of the data to detect the scale of the data 
numericFeatures = data.select_dtypes(include=[np.number])
print(numericFeatures.describe())
 

In [None]:

sns.pairplot(data, diag_kind='hist')
plot.show()


In [None]:
# as the heatmap works numerical values so we well select the numerical values only
numeric_cols = data.select_dtypes(include=[np.number])
correlation_matrix = numeric_cols.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='summer')
plot.show()


In [None]:
"""Make                                0
Model                               0
Vehicle Class                       0
Engine Size(L)                      0
Cylinders                           0
Transmission                        0
Fuel Type                           0
Fuel Consumption City (L/100 km)    0
Fuel Consumption Hwy (L/100 km)     0
Fuel Consumption Comb (L/100 km)    0
Fuel Consumption Comb (mpg)         0
CO2 Emissions(g/km)                 0
Emission Class      """

#Step 2  Preprocessing
feature1  = 'Cylinders'
feature2 = 'Fuel Consumption Comb (L/100 km)'
output = 'CO2 Emissions(g/km)'
alpha = 0.1   # learning rate 0.1
maxIterations = 1000



data = shuffle(data, random_state=100)  # shuffle the data

trainingData, testData = train_test_split(data, test_size=0.3, random_state=100) # split the data into training and testing data

scaler = StandardScaler()
trainingData = pandas.DataFrame(scaler.fit_transform(trainingData[[feature1, feature2, output]]), columns=[feature1, feature2, output])
testData = pandas.DataFrame(scaler.fit_transform(testData[[feature1, feature2, output]]), columns=[feature1, feature2, output])

x1 = trainingData[feature1]
x2 = trainingData[feature2] 


print(x1.describe())
print(x2.describe())

# scaler = StandardScaler()

# x1Normalized = scaler.fit_transform(x1.values.reshape(-1, 1))
# x2Noralized = scaler.fit_transform(x2.values.reshape(-1, 1))

# print(x1Normalized[:5])
# print(x2Noralized[:5])


In [None]:
# Step 3 - Training the model

x = np.column_stack((x1, x2))
y = trainingData[output]  ## TODO : check if we need to normalize the y values

# y = (y - y.mean()) / y.std() 

# yMean = y.mean()
# yStd = y.std()

# y = (y - yMean) / yStd

# def denormalize_y(yNormalized, yMean, yStd):
#     return yNormalized * yStd + yMean


In [None]:

# print(x[:5])
# print(y[:5])

costs = []

def  fitGD(x, y, alpha, maxIterations):
    x = np.c_[np.ones(x.shape[0]), x]  # Add a column of ones to x for the bias term
    thetas = np.random.rand(x.shape[1]) # initialize thetas with zeros based on the number of features in the data + 1 for the bias
    for i in range(maxIterations):
        h = np.dot( x, thetas)
        for j in range(len(thetas)):
            partialDerivative = (1/len(y)) * np.sum((h - y) * x[:, j])
            thetas[j] = thetas[j] - alpha * partialDerivative
            cost = (1/len(y)) * np.sum(np.square(h - y))
            costs.append(cost)
            
    return thetas


thetas = fitGD(x, y, alpha, maxIterations)

plot.plot(costs)
plot.xlabel('Number of Iterations')
plot.ylabel('Cost')
plot.show()

print(thetas)




In [215]:

# Prepare the test data
x1Test = testData[feature1]
x2Test = testData[feature2]

# x1TestNormalized = scaler.transform(x1Test.values.reshape(-1, 1))
# x2TestNormalized = scaler.transform(x2Test.values.reshape(-1, 1))

xTest = np.column_stack((x1Test, x2Test))
xTest = np.c_[np.ones(xTest.shape[0]), xTest]  # Add a column of ones to x_test for the bias term

# Predict the CO2 emissions for the test set
yTest = testData[output]



yPred = np.dot( xTest, thetas)



# # Inverse transform only the output column
# yPred = pandas.DataFrame(scaler.inverse_transform(testData[[feature1, feature2, output]])[:, 2].reshape(-1, 1), columns=[output])
# yTest = pandas.DataFrame(scaler.inverse_transform(testData[[feature1, feature2, output]])[:, 2].reshape(-1, 1), columns=[output])

# print(yPred[:5])
# print(yTest[:5])


# yPred = denormalize_y(yPred, yMean, yStd)



# Calculate the R2 score
r2 = r2_score(yTest, yPred)
print(f'R2 score: {r2}')



print(thetas)

## TODO check this : The logistic regression model should be a stochastic gradient
## descent classifier



R2 score: 0.8812404252877073
[-1.17576158e-16  2.93912439e-01  6.88954267e-01]
