In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
# Import the time module
import time

# Record the start time
start_time = time.time()

# Read data from Excel file
file_path = ''
df = pd.read_excel(file_path)

# Convert 'iranidate' to datetime and drop rows with invalid dates
df['iranidate'] = pd.to_datetime(df['iranidate'], errors='coerce')
df = df.dropna(subset=['iranidate'])

# Function to convert date values to numeric representation
def convert_to_numeric_date(date):
    base_date = datetime(2000, 1, 1)
    return (date - base_date).days

df['iranidate'] = df['iranidate'].apply(convert_to_numeric_date)

# Encode 'node' column
label_encoder = LabelEncoder()
df['node_encoded'] = label_encoder.fit_transform(df['node'])

# Handling NaN values by finding the nearest non-empty cells in the same column 
for col in ['CO', 'O3', 'SO2', 'PM10', 'PM2.5', 'AQI']:
    for idx, value in df[col].items():
        if pd.isnull(value):
            upper_cell = df[col].iloc[:idx].last_valid_index()
            lower_cell = df[col].iloc[idx + 1:].first_valid_index()
            if upper_cell is not None and lower_cell is not None:
                avg = (df.at[upper_cell, col] + df.at[lower_cell, col]) / 2
                df.at[idx, col] = avg

# Check sample threshold
your_threshold_value = 100
if len(df) < your_threshold_value:
    print("Insufficient samples in the dataset.")
else:
    output_columns = ['CO', 'O3', 'NO2', 'SO2', 'PM10', 'PM2.5', 'AQI', 'Highest tempreture: 12pm', 'Wind:km/h']
    unique_nodes = df['node'].unique()  

    # Dictionary to store weighted predictions for each node and column
    weighted_predictions = {}

    for node_value in unique_nodes:
        node_df = df[df['node'] == node_value]  
        
        # Dictionary to store predictions and true values for each column
        columns_data = {}

        for output_column in output_columns:
            if node_df[output_column].isnull().any():
                print(f"NaN values found in {output_column} for Node {node_value}. Handle missing values before modeling.")
            else:
                X = node_df[['iranidate', 'node_encoded']]
                Y = node_df[[output_column]]  
                X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
                
                # XGBoost model as the first model
                xgb_model_1 = xgb.XGBRegressor(colsample_bytree=0.8, learning_rate=0.1, max_depth=5, alpha=10, n_estimators=100)
                xgb_model_1.fit(X_train, Y_train)
                Y1_pred_xgb = xgb_model_1.predict(X_test)

                # XGBoost model as the secondary model, taking Y1_pred_xgb as input
                xgb_model_2 = xgb.XGBRegressor(colsample_bytree=0.8, learning_rate=0.1, max_depth=5, alpha=10, n_estimators=100)
                xgb_model_2.fit(np.concatenate((X_test, Y1_pred_xgb.reshape(-1, 1)), axis=1), Y_test)
                Y2_pred_xgb = xgb_model_2.predict(np.concatenate((X_test, Y1_pred_xgb.reshape(-1, 1)), axis=1))

                # Calculate errors for the third XGBoost model
                mse_xgb_1 = mean_squared_error(Y_test, Y1_pred_xgb)
                mse_xgb_2 = mean_squared_error(Y_test, Y2_pred_xgb)
                rmse_xgb_2 = np.sqrt(mse_xgb_2)
                r2_xgb_2 = r2_score(Y_test, Y2_pred_xgb)
                evs_xgb_2 = explained_variance_score(Y_test, Y2_pred_xgb)

                # Store predictions and true values for each column in the dictionary
                columns_data[output_column] = (Y2_pred_xgb, Y_test)

                # Output results for the local model with weighted predictions
                print(f"Local Model Results for Node {node_value} - {output_column} with Weighted Predictions:")
                print(f"MSE for XGB_1: {mse_xgb_1}")
                print(f"MSE for XGB_2: {mse_xgb_2}")
                print(f"RMSE for XGB_2: {rmse_xgb_2}")
                print(f"R2 for XGB_2: {r2_xgb_2}")
                print(f"EVS for XGB_2: {evs_xgb_2}")

                # Store weighted predictions for each node and column
                if node_value not in weighted_predictions:
                    weighted_predictions[node_value] = {output_column: (Y2_pred_xgb, Y_test)}
                else:
                    weighted_predictions[node_value][output_column] = (Y2_pred_xgb, Y_test)

# Aggregate predictions for the global model using weighted average
global_X_test = []
global_Y_test = []

for node, columns in weighted_predictions.items():
    for column, (node_prediction, Y_test) in columns.items():
        global_X_test.extend(node_prediction * r2_xgb_2)  # Apply weights using the R2 score from the third XGBoost model
        global_Y_test.extend(Y_test.values)

# Create and train the global model (Random Forest)
global_model = RandomForestRegressor(random_state=42)
global_model.fit(np.array(global_X_test).reshape(-1, 1), global_Y_test)  # Reshape for RF

# Predict using the global model
global_Y_pred = global_model.predict(np.array(global_X_test).reshape(-1, 1))
global_mse = mean_squared_error(global_Y_test, global_Y_pred)

# Print results for the global model
print("Global Model Results:")
print(f"Mean Squared Error (MSE) for Global Model: {global_mse}")

# Record the end time
end_time = time.time()

# Calculate the duration of the program execution
duration_seconds = end_time - start_time

# Print the duration of the program execution
print(f"Duration of the program execution: {duration_seconds} seconds")



2024-02-27 00:03:51.417456: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Local Model Results for Node اقدسیه.منطقه1 - CO with Weighted Predictions:
MSE for XGB_1: 52.2541058549973
MSE for XGB_2: 15.116398722483718
RMSE for XGB_2: 3.887981317146948
R2 for XGB_2: 0.812856840779131
EVS for XGB_2: 0.8128932616389081
Local Model Results for Node اقدسیه.منطقه1 - O3 with Weighted Predictions:
MSE for XGB_1: 337.4835357256914
MSE for XGB_2: 23.880381419412664
RMSE for XGB_2: 4.886755715135827
R2 for XGB_2: 0.980440297992009
EVS for XGB_2: 0.9804469685305613
Local Model Results for Node اقدسیه.منطقه1 - NO2 with Weighted Predictions:
MSE for XGB_1: 444.4083255603937
MSE for XGB_2: 42.09529000349086
RMSE for XGB_2: 6.4880883165606535
R2 for XGB_2: 0.9307326944281319
EVS for XGB_2: 0.9307328126846613
Local Model Results for Node اقدسیه.منطقه1 - SO2 with Weighted Predictions:
MSE for XGB_1: 46.29400070561974
MSE for XGB_2: 9.333661788075924
RMSE for XGB_2: 3.0551042188566866
R2 for XGB_2: 0.8737062161007625
EVS for XGB_2: 0.8737207582559169
Local Model Results for Node 

  return fit_method(estimator, *args, **kwargs)


Global Model Results:
Mean Squared Error (MSE) for Global Model: 5.191857072402621
Duration of the program execution: 17.415807962417603 seconds
