In [1]:
import math, copy
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

file_path = 'AirQualityUCI.xlsx'

# Load the Excel file into a DataFrame
data = pd.read_excel(file_path)

# Display the DataFrame
data.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10,18:00:00,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,2004-03-10,19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,2004-03-10,20:00:00,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,2004-03-10,21:00:00,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,2004-03-10,22:00:00,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


In [2]:
data.isnull().sum()

Date             0
Time             0
CO(GT)           0
PT08.S1(CO)      0
NMHC(GT)         0
C6H6(GT)         0
PT08.S2(NMHC)    0
NOx(GT)          0
PT08.S3(NOx)     0
NO2(GT)          0
PT08.S4(NO2)     0
PT08.S5(O3)      0
T                0
RH               0
AH               0
dtype: int64

In [3]:
data['Time'] = pd.to_numeric(data['Time'], errors='coerce')
nan_indices = data['Time'].isna()  # Find indices of NaN values
num_nans = nan_indices.sum()  # Count the number of NaN values
random_numbers = np.random.rand(num_nans)  # Generate random numbers between 0 and 1

# Replace NaN values with random numbers
data.loc[nan_indices, 'Time'] = random_numbers
data.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10,0.154416,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,2004-03-10,0.954006,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,2004-03-10,0.181431,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,2004-03-10,0.668073,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,2004-03-10,0.375807,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


In [4]:
data.isnull().sum()

Date             0
Time             0
CO(GT)           0
PT08.S1(CO)      0
NMHC(GT)         0
C6H6(GT)         0
PT08.S2(NMHC)    0
NOx(GT)          0
PT08.S3(NOx)     0
NO2(GT)          0
PT08.S4(NO2)     0
PT08.S5(O3)      0
T                0
RH               0
AH               0
dtype: int64

In [5]:
data.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10,0.154416,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,2004-03-10,0.954006,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,2004-03-10,0.181431,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,2004-03-10,0.668073,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,2004-03-10,0.375807,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


In [6]:
data.dtypes

Date             datetime64[ns]
Time                    float64
CO(GT)                  float64
PT08.S1(CO)             float64
NMHC(GT)                  int64
C6H6(GT)                float64
PT08.S2(NMHC)           float64
NOx(GT)                 float64
PT08.S3(NOx)            float64
NO2(GT)                 float64
PT08.S4(NO2)            float64
PT08.S5(O3)             float64
T                       float64
RH                      float64
AH                      float64
dtype: object

In [7]:
data['Date'] = (data['Date'] - data['Date'].min()) / (data['Date'].max() - data['Date'].min())
data.dtypes

Date             float64
Time             float64
CO(GT)           float64
PT08.S1(CO)      float64
NMHC(GT)           int64
C6H6(GT)         float64
PT08.S2(NMHC)    float64
NOx(GT)          float64
PT08.S3(NOx)     float64
NO2(GT)          float64
PT08.S4(NO2)     float64
PT08.S5(O3)      float64
T                float64
RH               float64
AH               float64
dtype: object

In [8]:
data.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,0.0,0.154416,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,0.0,0.954006,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,0.0,0.181431,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,0.0,0.668073,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,0.0,0.375807,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


In [9]:
data = pd.DataFrame(data)

x = data.iloc[0:,0:12]
x

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3)
0,0.0,0.154416,2.6,1360.00,150,11.881723,1045.50,166.0,1056.25,113.0,1692.00,1267.50
1,0.0,0.954006,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25
2,0.0,0.181431,2.2,1402.00,88,8.997817,939.25,131.0,1140.00,114.0,1554.50,1074.00
3,0.0,0.668073,2.2,1375.50,80,9.228796,948.25,172.0,1092.00,122.0,1583.75,1203.25
4,0.0,0.375807,1.6,1272.25,51,6.518224,835.50,131.0,1205.00,116.0,1490.00,1110.00
...,...,...,...,...,...,...,...,...,...,...,...,...
9352,1.0,0.768221,3.1,1314.25,-200,13.529605,1101.25,471.7,538.50,189.8,1374.25,1728.50
9353,1.0,0.484562,2.4,1162.50,-200,11.355157,1027.00,353.3,603.75,179.2,1263.50,1269.00
9354,1.0,0.287625,2.4,1142.00,-200,12.374538,1062.50,293.0,603.25,174.7,1240.75,1092.00
9355,1.0,0.424150,2.1,1002.50,-200,9.547187,960.50,234.5,701.50,155.7,1041.00,769.75


In [10]:
x['13'] =1
x

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),13
0,0.0,0.154416,2.6,1360.00,150,11.881723,1045.50,166.0,1056.25,113.0,1692.00,1267.50,1
1,0.0,0.954006,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,1
2,0.0,0.181431,2.2,1402.00,88,8.997817,939.25,131.0,1140.00,114.0,1554.50,1074.00,1
3,0.0,0.668073,2.2,1375.50,80,9.228796,948.25,172.0,1092.00,122.0,1583.75,1203.25,1
4,0.0,0.375807,1.6,1272.25,51,6.518224,835.50,131.0,1205.00,116.0,1490.00,1110.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9352,1.0,0.768221,3.1,1314.25,-200,13.529605,1101.25,471.7,538.50,189.8,1374.25,1728.50,1
9353,1.0,0.484562,2.4,1162.50,-200,11.355157,1027.00,353.3,603.75,179.2,1263.50,1269.00,1
9354,1.0,0.287625,2.4,1142.00,-200,12.374538,1062.50,293.0,603.25,174.7,1240.75,1092.00,1
9355,1.0,0.424150,2.1,1002.50,-200,9.547187,960.50,234.5,701.50,155.7,1041.00,769.75,1


In [11]:
x = np.array(x)
x

array([[0.00000000e+00, 1.54415696e-01, 2.60000000e+00, ...,
        1.69200000e+03, 1.26750000e+03, 1.00000000e+00],
       [0.00000000e+00, 9.54006086e-01, 2.00000000e+00, ...,
        1.55875000e+03, 9.72250000e+02, 1.00000000e+00],
       [0.00000000e+00, 1.81431355e-01, 2.20000000e+00, ...,
        1.55450000e+03, 1.07400000e+03, 1.00000000e+00],
       ...,
       [1.00000000e+00, 2.87625154e-01, 2.40000000e+00, ...,
        1.24075000e+03, 1.09200000e+03, 1.00000000e+00],
       [1.00000000e+00, 4.24150227e-01, 2.10000000e+00, ...,
        1.04100000e+03, 7.69750000e+02, 1.00000000e+00],
       [1.00000000e+00, 2.13213372e-01, 2.20000000e+00, ...,
        1.12850000e+03, 8.16000000e+02, 1.00000000e+00]])

In [12]:
y = np.array(data.iloc[0:,12:13])
y

array([[13.5999999 ],
       [13.29999995],
       [11.9000001 ],
       ...,
       [26.89999962],
       [28.32499981],
       [28.50000048]])

In [29]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# Cost function
# def cost(data, params):
#     total_cost = 0
#     for i in range(9000):
#         total_cost += (1/9000) * ((data[i] * params).sum() - y[i]) ** 2
#     return total_cost
def cost(x, y, params):
    total_cost = 0
    m = len(y)
    for i in range(m):
        total_cost += (1 / (2 * m)) * ((np.dot(x[i], params) - y[i]) ** 2)
    return total_cost

# Gradient descent
# def gd(data, params, lrate, iter_value):
#     for i in range(iter_value):
#         slopes = np.zeros(13)  # Initialize slopes as a NumPy array
#         for j in range(9000):
#             for k in range(13):
#                 slopes[k] += (1/9000) * ((data[j] * params).sum() - y[j]) * data[j][k]
#         params = params - lrate * slopes
#         print("Cost : ",cost(data, params))
#     return params
def gd(x, y, params, lrate, iter_value):
    m = len(y)
    for i in range(iter_value):
        slopes = np.zeros(13)  # Initialize slopes as a NumPy array
        for j in range(m):
            prediction = np.dot(x[j], params)
            error = prediction - y[j]
            for k in range(13):
                slopes[k] += (1 / m) * error * x[j][k]
        params = params - lrate * slopes
        if i % 500 == 0:
            print(f"Iteration {i}: Cost {cost(x, y, params)}")
    return params

# Running gradient descent
params = np.zeros(13)
lrate = 0.001
iter_value = 5000
params = gd(X_train, y_train,  params, lrate, iter_value)
print("Parameters : ",params)


Iteration 0: Cost [6.65838524e+09]


  slopes[k] += (1 / m) * error * x[j][k]


KeyboardInterrupt: 