In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
# Import the time module
import time

# Record the start time
start_time = time.time()

# Specify the folder path where the 22 files are located
folder_path = ''

# Initialize an empty list to store all the dataframes
all_dfs = []

# Loop through each file in the specified folder
for filename in os.listdir(folder_path):
    if filename.endswith(".xlsx"):
        file_path = os.path.join(folder_path, filename)
        
        # Read the Excel file into a dataframe
        df = pd.read_excel(file_path)
        
        # Convert 'iranidate' to datetime and handle missing values
        df['iranidate'] = pd.to_datetime(df['iranidate'], errors='coerce')
        
        def convert_to_numeric_date(date):
            base_date = datetime(2000, 1, 1)
            return (date - base_date).days
        df['iranidate'] = df['iranidate'].apply(convert_to_numeric_date)
        
        # Encode 'node' column
        label_encoder = LabelEncoder()
        df['node_encoded'] = label_encoder.fit_transform(df['node'])
        
        # Handling NaN values by finding the nearest non-empty cells in the same column
        for col in ['CO', 'O3', 'SO2', 'PM10', 'PM2.5', 'AQI']:
            for idx, value in df[col].items():
                if pd.isnull(value):  
                    upper_cell = df[col].iloc[:idx].last_valid_index()  
                    lower_cell = df[col].iloc[idx + 1:].first_valid_index()  
                    if upper_cell is not None and lower_cell is not None:  
                        avg = (df.at[upper_cell, col] + df.at[lower_cell, col]) / 2  
                        df.at[idx, col] = avg  
        
        # Append the modified dataframe to the list
        all_dfs.append(df)

# Concatenate all dataframes into a single dataframe
df = pd.concat(all_dfs, ignore_index=True)

# Check sample threshold
your_threshold_value = 100
if len(df) < your_threshold_value:
    print("Insufficient samples in the dataset.")
else:
    output_columns = ['CO', 'O3', 'NO2', 'SO2', 'PM10', 'PM2.5', 'AQI', 'Highest tempreture: 12pm', 'Wind:km/h']
    unique_nodes = df['node_encoded'].unique()  

    # Dictionary to store weighted predictions for each node and column
    weighted_predictions = {}

    # Dictionary to store weighted predictions for each node and column
    weighted_predictions = {}
    xgb_model_1 = xgb.XGBRegressor(colsample_bytree=0.8, learning_rate=0.3, max_depth=10, alpha=10, n_estimators=200)
    xgb_model_2 = xgb.XGBRegressor(colsample_bytree=0.8, learning_rate=0.3, max_depth=10, alpha=10, n_estimators=200)
    xgb_model_3 = xgb.XGBRegressor(colsample_bytree=0.8, learning_rate=0.3, max_depth=10, alpha=10, n_estimators=200)

    for node_value in unique_nodes:
        node_df = df[df['node_encoded'] == node_value]  
        
        # Dictionary to store predictions and true values for each column
        columns_data = {}

        for output_column in output_columns:
            if node_df[output_column].isnull().any():
                print(f"NaN values found in {output_column} for Node {node_value}. Handle missing values before modeling.")
            else:
                # Modify X to include the 'node' column along with 'iranidate'
                X = node_df[['iranidate', 'node_encoded']]
                Y = node_df[[output_column]]  
                X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

                xgb_model_1.fit(X_train, Y_train)
                Y1_pred_xgb = xgb_model_1.predict(X_test)
            
                xgb_model_2.fit(np.concatenate((X_test, Y1_pred_xgb.reshape(-1, 1)), axis=1), Y_test)
                Y2_pred_xgb = xgb_model_2.predict(np.concatenate((X_test, Y1_pred_xgb.reshape(-1, 1)), axis=1))

                xgb_model_3.fit(np.concatenate((X_test, Y2_pred_xgb.reshape(-1, 1)), axis=1), Y_test)
                Y3_pred_xgb = xgb_model_3.predict(np.concatenate((X_test, Y2_pred_xgb.reshape(-1, 1)), axis=1))
        

                # Calculate errors for the third XGBoost model
                mse_xgb_1 = mean_squared_error(Y_test, Y1_pred_xgb)
                mse_xgb_2 = mean_squared_error(Y_test, Y2_pred_xgb)
                mse_xgb_3 = mean_squared_error(Y_test, Y3_pred_xgb)
                rmse_xgb_3 = np.sqrt(mse_xgb_3)
                r2_xgb_1 = r2_score(Y_test, Y1_pred_xgb)
                r2_xgb_2 = r2_score(Y_test, Y2_pred_xgb)
                r2_xgb_3 = r2_score(Y_test, Y3_pred_xgb)
                evs_xgb_3 = explained_variance_score(Y_test, Y3_pred_xgb)

                # Store predictions and true values for each column in the dictionary
                columns_data[output_column] = (Y3_pred_xgb, Y_test)

                # Output results for the local model with weighted predictions
                print(f"Local Model Results for Node {node_value} - {output_column} with Weighted Predictions:")
                print(f"MSE for XGBoost (first Model): {mse_xgb_1}")
                print(f"MSE for XGBoost (second Model): {mse_xgb_2}")
                print(f"MSE for XGBoost (Third Model): {mse_xgb_3}")
                print(f"RMSE for XGBoost (Third Model): {rmse_xgb_3}")
                print(f"R2 for XGBoost (First Model): {r2_xgb_1}")
                print(f"R2 for XGBoost (Second Model): {r2_xgb_2}")
                print(f"R2 for XGBoost (Third Model): {r2_xgb_3}")
                print(f"EVS for XGBoost (Third Model): {evs_xgb_3}")

                # Store weighted predictions for each node and column
                if node_value not in weighted_predictions:
                    weighted_predictions[node_value] = {output_column: (Y3_pred_xgb, Y_test)}
                else:
                    weighted_predictions[node_value][output_column] = (Y3_pred_xgb, Y_test)

# Aggregate predictions for the global model using weighted average
global_X_test = []
global_Y_test = []

for node, columns in weighted_predictions.items():
    for column, (node_prediction, Y_test) in columns.items():
        global_X_test.extend(node_prediction * r2_xgb_3)  # Apply weights using the R2 score from the third XGBoost model
        global_Y_test.extend(Y_test.values)

# Create and train the global model (Random Forest)
global_model = RandomForestRegressor(random_state=42)
global_model.fit(np.array(global_X_test).reshape(-1, 1), global_Y_test)  # Reshape for RF

# Predict using the global model
global_Y_pred = global_model.predict(np.array(global_X_test).reshape(-1, 1))
global_mse = mean_squared_error(global_Y_test, global_Y_pred)

# Print results for the global model
print("Global Model Results:")
print(f"Mean Squared Error (MSE) for Global Model: {global_mse}")


# Record the end time
end_time = time.time()

# Calculate the duration of the program execution
duration_seconds = end_time - start_time

# Print the duration of the program execution
print(f"Duration of the program execution: {duration_seconds} seconds")




Local Model Results for Node 0 - CO with Weighted Predictions:
MSE for XGBoost (first Model): 108.87770980872973
MSE for XGBoost (second Model): 79.09272084888384
MSE for XGBoost (Third Model): 78.32795453676799
RMSE for XGBoost (Third Model): 8.850308160553958
R2 for XGBoost (Third Model): 0.5536969548584336
EVS for XGBoost (Third Model): 0.553696955066949
Local Model Results for Node 0 - O3 with Weighted Predictions:
MSE for XGBoost (first Model): 286.0230706674861
MSE for XGBoost (second Model): 195.98568458529672
MSE for XGBoost (Third Model): 195.41329190107254
RMSE for XGBoost (Third Model): 13.979030434943352
R2 for XGBoost (Third Model): 0.8476986587614481
EVS for XGBoost (Third Model): 0.8476986587619935
Local Model Results for Node 0 - NO2 with Weighted Predictions:
MSE for XGBoost (first Model): 421.2842300152824
MSE for XGBoost (second Model): 307.9520590697534
MSE for XGBoost (Third Model): 307.02740523439917
RMSE for XGBoost (Third Model): 17.522197500153887
R2 for XGBoos

  return fit_method(estimator, *args, **kwargs)


Global Model Results:
Mean Squared Error (MSE) for Global Model: 173.01246429349823
Duration of the program execution: 20.72672200202942 seconds
