In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
# Import the time module
import time

# Record the start time
start_time = time.time()

# Read data from Excel file
file_path = ''
df = pd.read_excel(file_path)

# Convert 'iranidate' to datetime and drop rows with invalid dates
df['iranidate'] = pd.to_datetime(df['iranidate'], errors='coerce')
df = df.dropna(subset=['iranidate'])

# Function to convert date values to numeric representation
def convert_to_numeric_date(date):
    base_date = datetime(2000, 1, 1)
    return (date - base_date).days

df['iranidate'] = df['iranidate'].apply(convert_to_numeric_date)

# Encode 'node' column
label_encoder = LabelEncoder()
df['node_encoded'] = label_encoder.fit_transform(df['node'])

# Handling NaN values by finding the nearest non-empty cells in the same column 
for col in ['CO', 'O3', 'SO2', 'PM10', 'PM2.5', 'AQI']:
    for idx, value in df[col].items():
        if pd.isnull(value):
            upper_cell = df[col].iloc[:idx].last_valid_index()
            lower_cell = df[col].iloc[idx + 1:].first_valid_index()
            if upper_cell is not None and lower_cell is not None:
                avg = (df.at[upper_cell, col] + df.at[lower_cell, col]) / 2
                df.at[idx, col] = avg

# Check sample threshold
your_threshold_value = 100
if len(df) < your_threshold_value:
    print("Insufficient samples in the dataset.")
else:
    output_columns = ['CO', 'O3', 'NO2', 'SO2', 'PM10', 'PM2.5', 'AQI', 'Highest tempreture: 12pm']
    unique_nodes = df['node'].unique()  

    # Dictionary to store weighted predictions for each node and column
    weighted_predictions = {}
                
    for node_value in unique_nodes:
        node_df = df[df['node'] == node_value]  
        
        # Dictionary to store predictions and true values for each column
        columns_data = {}

        for output_column in output_columns:
            if node_df[output_column].isnull().any():
                print(f"NaN values found in {output_column} for Node {node_value}. Handle missing values before modeling.")
            else:
                # Modify X to include the 'node' column along with 'iranidate'
                X = node_df[['iranidate', 'node_encoded']]
                Y = node_df[[output_column]]  
                X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
                xgb_model_1 = xgb.XGBRegressor(colsample_bytree=0.8, learning_rate=0.3, max_depth=10, alpha=10, n_estimators=200)
                xgb_model_1.fit(X_train, Y_train)
                Y1_pred_xgb = xgb_model_1.predict(X_test)
                xgb_model_2 = xgb.XGBRegressor(colsample_bytree=0.8, learning_rate=0.3, max_depth=10, alpha=10, n_estimators=200)
                xgb_model_2.fit(np.concatenate((X_test, Y1_pred_xgb.reshape(-1, 1)), axis=1), Y_test)
                Y2_pred_xgb = xgb_model_2.predict(np.concatenate((X_test, Y1_pred_xgb.reshape(-1, 1)), axis=1))
                xgb_model_3 = xgb.XGBRegressor(colsample_bytree=0.8, learning_rate=0.3, max_depth=10, alpha=10, n_estimators=200)
                xgb_model_3.fit(np.concatenate((X_test, Y2_pred_xgb.reshape(-1, 1)), axis=1), Y_test)
                Y3_pred_xgb = xgb_model_3.predict(np.concatenate((X_test, Y2_pred_xgb.reshape(-1, 1)), axis=1))
                xgb_model_4 = xgb.XGBRegressor(colsample_bytree=0.8, learning_rate=0.3, max_depth=10, alpha=10, n_estimators=200)
                xgb_model_4.fit(np.concatenate((X_test, Y3_pred_xgb.reshape(-1, 1)), axis=1), Y_test)
                Y4_pred_xgb = xgb_model_4.predict(np.concatenate((X_test, Y3_pred_xgb.reshape(-1, 1)), axis=1))

                # Calculate errors for the fourth XGBoost model
                mse_xgb_1 = mean_squared_error(Y_test, Y1_pred_xgb)
                mse_xgb_2 = mean_squared_error(Y_test, Y2_pred_xgb)
                mse_xgb_3 = mean_squared_error(Y_test, Y3_pred_xgb)
                mse_xgb_4 = mean_squared_error(Y_test, Y4_pred_xgb)
                rmse_xgb_4 = np.sqrt(mse_xgb_4)
                r2_xgb_4 = r2_score(Y_test, Y4_pred_xgb)
                evs_xgb_4 = explained_variance_score(Y_test, Y4_pred_xgb)

                # Store predictions and true values for each column in the dictionary
                columns_data[output_column] = (Y4_pred_xgb, Y_test)

                # Output results for the local model with weighted predictions
                print(f"Local Model Results for Node {node_value} - {output_column} with Weighted Predictions:")
                print(f"MSE for XGB_1: {mse_xgb_1}")
                print(f"MSE for XGB_2: {mse_xgb_2}")
                print(f"MSE for XGB_3: {mse_xgb_3}")
                print(f"MSE for XGB_4: {mse_xgb_4}")
                print(f"RMSE for XGB_4: {rmse_xgb_4}")
                print(f"R2 for XGB_4: {r2_xgb_4}")
                print(f"EVS for XGB_4: {evs_xgb_4}")

                # Store weighted predictions for each node and column
                if node_value not in weighted_predictions:
                    weighted_predictions[node_value] = {output_column: (Y4_pred_xgb, Y_test)}
                else:
                    weighted_predictions[node_value][output_column] = (Y4_pred_xgb, Y_test)


# Aggregate predictions for the global model using weighted average
global_X_test = []
global_Y_test = []

for node, columns in weighted_predictions.items():
    for column, (node_prediction, Y_test) in columns.items():
        global_X_test.extend(node_prediction * r2_xgb_4)  # Apply weights using the R2 score from the fourth XGBoost model
        global_Y_test.extend(Y_test.values)

# Create and train the global model (Random Forest)
global_model = RandomForestRegressor(random_state=42)
global_model.fit(np.array(global_X_test).reshape(-1, 1), global_Y_test)  # Reshape for RF

# Predict using the global model
global_Y_pred = global_model.predict(np.array(global_X_test).reshape(-1, 1))
global_mse = mean_squared_error(global_Y_test, global_Y_pred)

# Print results for the global model
print("Global Model Results:")
print(f"Mean Squared Error (MSE) for Global Model: {global_mse}")

# Record the end time
end_time = time.time()

# Calculate the duration of the program execution
duration_seconds = end_time - start_time

# Print the duration of the program execution
print(f"Duration of the program execution: {duration_seconds} seconds")


2024-02-27 22:55:22.614336: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Local Model Results for Node اقدسیه.منطقه1 - CO with Weighted Predictions:
MSE for XGB_1: 51.02335610081563
MSE for XGB_2: 12.96339893055318
MSE for XGB_3: 5.473984936654395
MSE for XGB_4: 4.501487987712225
RMSE for XGB_4: 2.1216710366388623
R2 for XGB_4: 0.9442709405407346
EVS for XGB_4: 0.9442713475910993
Local Model Results for Node اقدسیه.منطقه1 - O3 with Weighted Predictions:
MSE for XGB_1: 415.29830034606607
MSE for XGB_2: 16.294605543988634
MSE for XGB_3: 8.004895184759615
MSE for XGB_4: 6.041959425388637
RMSE for XGB_4: 2.458039752605445
R2 for XGB_4: 0.995051212799771
EVS for XGB_4: 0.9950512156419704
Local Model Results for Node اقدسیه.منطقه1 - NO2 with Weighted Predictions:
MSE for XGB_1: 456.89028626028204
MSE for XGB_2: 18.92046274248124
MSE for XGB_3: 8.38755324626844
MSE for XGB_4: 5.942090836809127
RMSE for XGB_4: 2.437640424018507
R2 for XGB_4: 0.9902223592783201
EVS for XGB_4: 0.9902223659370338
Local Model Results for Node اقدسیه.منطقه1 - SO2 with Weighted Prediction

  return fit_method(estimator, *args, **kwargs)


Global Model Results:
Mean Squared Error (MSE) for Global Model: 1.3866003143517984
Duration of the program execution: 42.953099966049194 seconds
