In [1]:
import math, copy
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

file_path = 'AirQualityUCI.xlsx'

# Load the Excel file into a DataFrame
data = pd.read_excel(file_path)

# Display the DataFrame
data.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10,18:00:00,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,2004-03-10,19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,2004-03-10,20:00:00,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,2004-03-10,21:00:00,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,2004-03-10,22:00:00,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


In [2]:
data.dtypes

Date             datetime64[ns]
Time                     object
CO(GT)                  float64
PT08.S1(CO)             float64
NMHC(GT)                  int64
C6H6(GT)                float64
PT08.S2(NMHC)           float64
NOx(GT)                 float64
PT08.S3(NOx)            float64
NO2(GT)                 float64
PT08.S4(NO2)            float64
PT08.S5(O3)             float64
T                       float64
RH                      float64
AH                      float64
dtype: object

In [3]:

# Assuming df is your DataFrame
# Convert timestamp dtype columns to numeric
timestamp_cols = data.select_dtypes(include=['datetime64']).columns
for col in timestamp_cols:
    data[col] = pd.to_numeric(data[col])

# Convert float dtype columns to numeric
float_cols = data.select_dtypes(include=['float64']).columns
for col in float_cols:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Convert object dtype columns to numeric
object_cols = data.select_dtypes(include=['object']).columns
for col in object_cols:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Displaying the converted DataFrame
data.head()


Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,1078876800000000000,,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,1078876800000000000,,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,1078876800000000000,,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,1078876800000000000,,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,1078876800000000000,,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


In [4]:
data.dtypes

Date               int64
Time             float64
CO(GT)           float64
PT08.S1(CO)      float64
NMHC(GT)           int64
C6H6(GT)         float64
PT08.S2(NMHC)    float64
NOx(GT)          float64
PT08.S3(NOx)     float64
NO2(GT)          float64
PT08.S4(NO2)     float64
PT08.S5(O3)      float64
T                float64
RH               float64
AH               float64
dtype: object

In [5]:
from sklearn.model_selection import train_test_split


def compute_cost(x, y, w, b):
    m = x.shape[0] 
    cost = 0
    
    for i in range(m):
        f_wb = np.dot(w, x[i]) + b
        cost += (f_wb - y[i]) ** 2
    total_cost = 1 / (2 * m) * cost

    return total_cost

def compute_gradient(x, y, w, b): 
    m = x.shape[0]    
    dj_dw = np.zeros_like(w)
    dj_db = 0
    
    for i in range(m):  
        f_wb = np.dot(w, x[i]) + b 
        dj_dw_i = (f_wb - y[i]) * x[i]
        dj_db_i = f_wb - y[i]
        dj_dw += dj_dw_i
        dj_db += dj_db_i
    dj_dw /= m
    dj_db /= m
        
    return dj_dw, dj_db

def gradient_descent(x, y, w_in, b_in, alpha, num_iters, cost_function, gradient_function): 
    w = copy.deepcopy(w_in)
    J_history = []
    p_history = []
    b = b_in
    w = w_in
    
    for i in range(num_iters):
        dj_dw, dj_db = gradient_function(x, y, w , b)     

        b -= alpha * dj_db                            
        w -= alpha * dj_dw

        J_history.append(cost_function(x, y, w , b))
        p_history.append([w, b])

        if i % math.ceil(num_iters / 10) == 0:
            print(f"Iteration {i:4}: Cost {J_history[-1]:0.2e} ",
                  f"dj_dw: {np.linalg.norm(dj_dw): 0.3e}, dj_db: {dj_db: 0.3e}  ",
                  f"w: {np.linalg.norm(w): 0.3e}, b:{b: 0.5e}")
 
    return w, b, J_history, p_history

x = np.array(data.iloc[:, 0:12])  # Selecting all rows and the first 12 columns for features
y = np.array(data.iloc[0:,12:13])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# Assuming x_train has shape (m, 12) and y_train has shape (m,)
# Initialize parameters
w_init = np.zeros(12)  # Assuming 12 features
b_init = 0
# Some gradient descent settings
iterations = 5000
tmp_alpha = 1.0e-10
# Run gradient descent
w_final, b_final, J_hist, p_hist = gradient_descent(x_train ,y_train, w_init, b_init, tmp_alpha, 
                                                    iterations, compute_cost, compute_gradient)
print(f"(w,b) found by gradient descent: ({w_final}, {b_final})")


TypeError: unsupported format string passed to numpy.ndarray.__format__