In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb  # Import XGBoost
import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import xgboost as xgb
import time

# Start the wall-clock timer; the elapsed time is reported at the end of the run.
start_time = time.time()

# Location of the Excel workbook. It is an empty string here, so it must be
# filled in before this cell can run.
file_path = ''

# Load the workbook, parse 'iranidate' into timestamps (unparseable entries
# become NaT because of errors='coerce') and discard the rows whose date could
# not be parsed.
# NOTE(review): pd.to_datetime is called without a format or calendar hint --
# presumably the column already holds Gregorian-style dates; confirm.
df = (
    pd.read_excel(file_path)
    .assign(iranidate=lambda frame: pd.to_datetime(frame['iranidate'], errors='coerce'))
    .dropna(subset=['iranidate'])
)

def convert_to_numeric_date(date):
    """Express a date as a whole number of days relative to 2000-01-01.

    Parameters
    ----------
    date : datetime-like
        Any object that supports subtraction of a ``datetime`` and yields a
        result with a ``.days`` attribute (e.g. ``datetime`` or a pandas
        ``Timestamp``).

    Returns
    -------
    int
        The ``.days`` component of ``date - datetime(2000, 1, 1)``; negative
        for dates before the reference day. Any time-of-day part only
        contributes through that whole-day component.
    """
    reference_day = datetime(year=2000, month=1, day=1)
    elapsed = date - reference_day
    return elapsed.days

# Swap every parsed timestamp for its integer day offset (see convert_to_numeric_date).
df = df.assign(iranidate=df['iranidate'].map(convert_to_numeric_date))

# Integer-encode the 'node' labels into a new 'node_encoded' column; the fitted
# encoder stays available as `label_encoder`.
label_encoder = LabelEncoder()
df = df.assign(node_encoded=label_encoder.fit_transform(df['node']))

# Handling NaN values by averaging the nearest non-empty cells above and below
# in the same column.
def _fill_gaps_with_neighbor_mean(series):
    """Return a copy of ``series`` whose interior gaps are filled by a neighbour mean.

    The column is walked top to bottom by *position*. A missing cell is
    replaced by the mean of:

    * the closest cell above it that currently holds a value (this includes a
      gap that was filled earlier in the same pass), and
    * the closest originally-present value below it.

    Cells that lack a value on either side (leading or trailing gaps) stay
    missing.

    Working on positions rather than index labels matters here: the frame has
    had rows removed by ``dropna`` earlier, so its index labels are not
    guaranteed to equal row positions, and mixing labels with ``.iloc``
    slicing picks the wrong neighbours.

    Parameters
    ----------
    series : pandas.Series
        Numeric column, possibly containing missing values.

    Returns
    -------
    pandas.Series
        Float series with the same index and name as ``series``.
    """
    values = series.astype(float).to_numpy(copy=True)
    n_cells = len(values)

    # Backward pass over the ORIGINAL data: for every position remember the
    # nearest valid value strictly below it.
    value_below = np.full(n_cells, np.nan)
    upcoming = np.nan
    for pos in range(n_cells - 1, -1, -1):
        value_below[pos] = upcoming
        if not np.isnan(values[pos]):
            upcoming = values[pos]

    # Forward pass: fill gaps, carrying along the latest available value above.
    value_above = np.nan
    for pos in range(n_cells):
        if np.isnan(values[pos]):
            if not np.isnan(value_above) and not np.isnan(value_below[pos]):
                values[pos] = (value_above + value_below[pos]) / 2
        if not np.isnan(values[pos]):
            value_above = values[pos]

    return pd.Series(values, index=series.index, name=series.name)


# NOTE(review): 'NO2', 'Highest tempreture: 12pm' and 'Wind:km/h' are modelled
# further down but are not gap-filled here -- confirm whether that is intended.
# NOTE(review): neighbours are taken from adjacent rows of the whole frame, so a
# gap can be filled from rows belonging to a different 'node' -- confirm the
# row ordering makes that acceptable.
for col in ['CO', 'O3', 'SO2', 'PM10', 'PM2.5', 'AQI']:
    # Columns without gaps are left untouched so their dtype is preserved.
    if df[col].isnull().any():
        # Assign positionally; the helper's output is in the frame's row order.
        df[col] = _fill_gaps_with_neighbor_mean(df[col]).to_numpy()


# Train and evaluate one XGBoost regressor per (node, target column) pair and
# print its hold-out metrics, then report the total run time.

# Check sample threshold (applies to the dataset as a whole)
your_threshold_value = 100
if len(df) < your_threshold_value:
    print("Insufficient samples in the dataset.")
else:
    output_columns = ['CO', 'O3', 'NO2', 'SO2', 'PM10', 'PM2.5', 'AQI', 'Highest tempreture: 12pm', 'Wind:km/h']
    unique_nodes = df['node'].unique()
    test_fraction = 0.2  # share of each node's rows held out for evaluation

    for node_value in unique_nodes:
        node_df = df[df['node'] == node_value]

        # The dataset-level threshold above says nothing about an individual
        # node. train_test_split raises when a split would be empty, and R^2 is
        # undefined on fewer than two test rows, so skip nodes that are too
        # small instead of crashing the whole run.
        n_test_rows = int(np.ceil(test_fraction * len(node_df)))
        if n_test_rows < 2 or len(node_df) - n_test_rows < 1:
            print(f"Too few rows ({len(node_df)}) for Node {node_value} to train and evaluate a model. Skipping this node.")
            continue

        for output_column in output_columns:
            if node_df[output_column].isnull().any():
                print(f"NaN values found in {output_column} for Node {node_value}. Handle missing values before modeling.")
                continue

            # NOTE(review): node_df holds a single node, so 'node_encoded' is
            # constant here and carries no signal; combined with
            # colsample_bytree=0.3 over two features, some trees presumably
            # see only that constant column -- confirm and consider dropping it.
            X = node_df[['iranidate', 'node_encoded']]
            Y = node_df[[output_column]]

            # NOTE(review): this is a random split over dated rows, so test
            # days are interleaved with training days -- confirm that this
            # (rather than a chronological hold-out) is the intended evaluation.
            X_train, X_test, Y_train, Y_test = train_test_split(
                X, Y, test_size=test_fraction, random_state=42
            )

            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Use XGBoost model instead of LSTM
            model = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1,
                                     max_depth=5, alpha=10, n_estimators=100)

            model.fit(X_train_scaled, Y_train.values.ravel())  # Fit XGBoost model

            Y_pred = model.predict(X_test_scaled)
            # Clamp predictions at zero only for targets that are never
            # negative in the training data (concentrations, AQI, wind speed).
            # A target such as temperature can legitimately be below zero and
            # must not be clipped.
            if (Y_train.values >= 0).all():
                Y_pred = np.clip(Y_pred, a_min=0, a_max=None)

            mse = mean_squared_error(Y_test, Y_pred)
            rmse = np.sqrt(mse)
            r2 = r2_score(Y_test, Y_pred)
            evs = explained_variance_score(Y_test, Y_pred)

            print(f"For Node {node_value} and output column {output_column}:")
            print(f"Mean Squared Error (MSE): {mse}")
            print(f"Root Mean Squared Error (RMSE): {rmse}")
            print(f"R-squared Score: {r2}")
            print(f"Explained Variance Score: {evs}")

# Record the end time
end_time = time.time()

# Calculate the duration of the program execution
duration_seconds = end_time - start_time

# Print the duration of the program execution
print(f"Duration of the program execution: {duration_seconds} seconds")

2024-02-20 21:15:06.116839: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


For Node اقدسیه.منطقه1 and output column CO:
Mean Squared Error (MSE): 52.2541058549973
Root Mean Squared Error (RMSE): 7.228700149750113
R-squared Score: 0.353086761503526
Explained Variance Score: 0.35316849303861186
For Node اقدسیه.منطقه1 and output column O3:
Mean Squared Error (MSE): 337.4835357256914
Root Mean Squared Error (RMSE): 18.370724964619427
R-squared Score: 0.7235773886747228
Explained Variance Score: 0.7257126126431405
For Node اقدسیه.منطقه1 and output column NO2:
Mean Squared Error (MSE): 444.4083255603937
Root Mean Squared Error (RMSE): 21.080994415833274
R-squared Score: 0.26873131690692065
Explained Variance Score: 0.275142838929904
For Node اقدسیه.منطقه1 and output column SO2:
Mean Squared Error (MSE): 46.29400070561974
Root Mean Squared Error (RMSE): 6.803969481532066
R-squared Score: 0.373595845478783
Explained Variance Score: 0.38638797509246403
For Node اقدسیه.منطقه1 and output column PM10:
Mean Squared Error (MSE): 133.61824715387297
Root Mean Squared Error (