In [1]:
import math, copy
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

file_path = 'AirQualityUCI.xlsx'

# Load the Excel file into a DataFrame
data = pd.read_excel(file_path)

# Display the DataFrame
data.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10,18:00:00,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,2004-03-10,19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,2004-03-10,20:00:00,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,2004-03-10,21:00:00,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,2004-03-10,22:00:00,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


In [2]:
data.isnull().sum()

Date             0
Time             0
CO(GT)           0
PT08.S1(CO)      0
NMHC(GT)         0
C6H6(GT)         0
PT08.S2(NMHC)    0
NOx(GT)          0
PT08.S3(NOx)     0
NO2(GT)          0
PT08.S4(NO2)     0
PT08.S5(O3)      0
T                0
RH               0
AH               0
dtype: int64

In [3]:
data.dtypes

Date             datetime64[ns]
Time                     object
CO(GT)                  float64
PT08.S1(CO)             float64
NMHC(GT)                  int64
C6H6(GT)                float64
PT08.S2(NMHC)           float64
NOx(GT)                 float64
PT08.S3(NOx)            float64
NO2(GT)                 float64
PT08.S4(NO2)            float64
PT08.S5(O3)             float64
T                       float64
RH                      float64
AH                      float64
dtype: object

In [4]:
timestamp_cols = data.select_dtypes(include=['datetime64']).columns
for col in timestamp_cols:
    data[col] = pd.to_numeric(data[col])
    
object_cols = data.select_dtypes(include=['object']).columns
data = data.drop(object_cols, axis=1)
    
data.head()

Unnamed: 0,Date,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,1078876800000000000,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,1078876800000000000,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,1078876800000000000,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,1078876800000000000,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,1078876800000000000,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794


In [7]:

# Drop rows with NaN values
data.dropna(inplace=True)

# Separate features (X) and target (y)
X = np.array(data.iloc[:, 0:11])  # Selecting all rows and the first 12 columns for features
y = np.array(data.iloc[0:,11:12])

# Split data into training and testing sets (75% training, 25% testing)
split_index = int(0.75 * len(data))
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

# Feature Scaling
X_train_scaled = (X_train - np.mean(X_train, axis=0)) / np.std(X_train, axis=0)
X_test_scaled = (X_test - np.mean(X_test, axis=0)) / np.std(X_test, axis=0)

# Define functions for multivariate linear regression

def compute_cost(X, y, w, b):
    m = X.shape[0] 
    cost = np.sum((np.dot(X, w) + b - y) ** 2)
    return cost / (2 * m)

def compute_gradient(X, y, w, b):
    m = X.shape[0]
    dw = np.dot(X.T, (np.dot(X, w) + b - y)) / m
    db = np.sum(np.dot(X, w) + b - y) / m
    return dw, db

def gradient_descent(X, y, w_init, b_init, alpha, num_iters):
    w = copy.deepcopy(w_init)
    b = b_init
    m = X.shape[0]
    J_history = []
    for i in range(num_iters):
        dw, db = compute_gradient(X, y, w, b)
        w -= alpha * dw
        b -= alpha * db
        cost = compute_cost(X, y, w, b)
        J_history.append(cost)
        if i % math.ceil(num_iters / 10) == 0:
            print(f"Iteration {i:4}: Cost {cost:0.2e}")
    return w, b, J_history

# Initialize parameters
w_init = np.zeros(X_train_scaled.shape[1])
b_init = 0

# Gradient Descent settings
iterations = 5000
learning_rate = 0.0001

# Run Gradient Descent
w_final, b_final, J_hist = gradient_descent(X_train_scaled, y_train, w_init, b_init, learning_rate, iterations)

# Plot cost versus iteration
plt.plot(J_hist)
plt.title("Cost vs. Iteration")
plt.xlabel("Iteration")
plt.ylabel("Cost")
plt.show()

# Test the model on the testing set
y_pred = np.dot(X_test_scaled, w_final) + b_final

# Calculate test performance (e.g., mean squared error)
mse = np.mean((y_test - y_pred) ** 2)
print(f"Test Mean Squared Error: {mse}")

# # Plot the predicted values vs. actual values
# plt.scatter(y_test, y_pred)
# plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red')  # Plot the 45-degree line
# # plt.xlabel("Actual Values")
# # plt.ylabel("Predicted Values")
# # plt.title("Actual vs. Predicted Values")
# plt.show()

  X_test_scaled = (X_test - np.mean(X_test, axis=0)) / np.std(X_test, axis=0)


ValueError: operands could not be broadcast together with shapes (11,) (11,7017) (11,) 

In [None]:
data = np.array(data)

# Split the data into training and testing sets
data_tr = data[:30]  # Training data
data_ts = data[30:]  # Testing data

# Extract features (x) and target (y) for training and testing
x_train = data_tr[:, :12]  # Features for training
y_train = data_tr[:, 12]   # Target for training
x_test = data_ts[:, :12]   # Features for testing
y_test = data_ts[:, 12]    # Target for testing

# Function to compute the cost (Mean Squared Error)
def compute_cost(x, y, w, b):
    m = x.shape[0]
    f_wb = np.dot(x, w) + b
    cost = np.sum((f_wb - y) ** 2) / (2 * m)
    return cost

# Function to compute the gradient
def compute_gradient(x, y, w, b):
    m = x.shape[0]
    f_wb = np.dot(x, w) + b
    dj_dw = np.dot(x.T, (f_wb - y)) / m
    dj_db = np.sum(f_wb - y) / m
    return dj_dw, dj_db

# Gradient descent function
def gradient_descent(x, y, w_in, b_in, alpha, num_iters, cost_function, gradient_function):
    w = copy.deepcopy(w_in)
    b = b_in
    J_history = []

    for i in range(num_iters):
        dj_dw, dj_db = gradient_function(x, y, w, b)
        w -= alpha * dj_dw
        b -= alpha * dj_db

        cost = cost_function(x, y, w, b)
        J_history.append(cost)

        if i % (num_iters // 10) == 0:
            print(f"Iteration {i:4}: Cost {cost:0.2e}")

    return w, b, J_history

# Initialize parameters
w_init = np.zeros(x_train.shape[1])  # Initialize weights to zeros
b_init = 0  # Initialize bias to zero

# Gradient descent settings
iterations = 5000
alpha = 0.0001

# Run gradient descent for training
w_final, b_final, J_hist_train = gradient_descent(x_train, y_train, w_init, b_init, alpha, iterations, compute_cost, compute_gradient)

# Compute predictions for training and testing
y_train_pred = np.dot(x_train, w_final) + b_final
y_test_pred = np.dot(x_test, w_final) + b_final

# Compute Mean Squared Error (MSE) for training and testing
mse_train = np.mean((y_train_pred - y_train) ** 2)
mse_test = np.mean((y_test_pred - y_test) ** 2)

print(f"Mean Squared Error (MSE) for training data: {mse_train}")
print(f"Mean Squared Error (MSE) for testing data: {mse_test}")