### Import libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import matplotlib.pyplot as plt

### Load dataset

In [5]:
# Location of the monthly solar dataset CSV, relative to the notebook's working directory.
file_path = "./data/1991-2005/monthly/solar_dataset.csv"

# Read the whole CSV into a DataFrame; later cells select feature/target columns from it.
data = pd.read_csv(filepath_or_buffer=file_path)

### Data Preprocessing

In [6]:
# Model inputs and target. 'Year' and 'Month' are kept as two separate columns
# in the feature list (they are not merged into a single datetime column).
features = ['Azimuth (deg)', 'Longitude', 'Elevation', 'Latitude', 'Year', 'Month']
target = 'Merged Glo (Wh/m^2)'

# Only these columns are standardized; 'Year' and 'Month' are passed through unchanged.
scaled_columns = ['Azimuth (deg)', 'Longitude', 'Elevation', 'Latitude']

# Take an explicit copy: `data[features]` is a slice of `data`, and assigning
# columns into a slice triggers pandas' SettingWithCopyWarning and leaves it
# ambiguous whether `data` itself is modified.
X = data[features].copy()
y = data[target]

# Standardize the selected columns (zero mean, unit variance).
# NOTE(review): the scaler is fitted on every row *before* the
# train/validation/test split below, so statistics of the held-out rows
# influence the scaling. Fit on the training rows only if strict isolation
# of the validation/test sets is required.
scaler = StandardScaler()
X[scaled_columns] = scaler.fit_transform(X[scaled_columns])


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


### Train Test Validation Split

In [7]:
# Train-Validation-Test Split (60% Train, 20% Validation, 20% Test)
# Step 1: hold out 40% of all rows; the remaining 60% becomes the training set.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
# Step 2: split the held-out 40% in half -> 20% validation and 20% test of the full data.
# (The previous fractions, 0.3 then 0.6, produced a 70/12/18 split instead of 60/20/20.)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


### Model Training with XGBoost

In [None]:
# Early stopping is configured on the estimator itself: passing
# `early_stopping_rounds` to fit() was deprecated in XGBoost 1.6 and removed
# in 2.0, where it raises a TypeError.
xgboost_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    early_stopping_rounds=10,
)

# Training stops once the validation metric has not improved for 10 rounds.
xgboost_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

# Clear the early-stopping setting after training so this estimator can be
# refit later without supplying a validation set. The best iteration found
# above is stored on the trained booster and is not affected by this.
xgboost_model.set_params(early_stopping_rounds=None)

### Prediction and Evaluation of the XGBoost Model

In [9]:
# Score the held-out test rows with the fitted model.
y_pred = xgboost_model.predict(X_test)

# Compare the predictions against the true target values.
mse_test = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2_test = r2_score(y_true=y_test, y_pred=y_pred)

# Report both metrics, one per line.
print(
    f"Mean Squared Error on Test Data: {mse_test}",
    f"R-squared on Test Data: {r2_test}",
    sep="\n",
)

model_id                                                    rmse          mse      mae    rmsle    mean_residual_deviance
StackedEnsemble_BestOfFamily_5_AutoML_1_20241007_170129  11532.7  1.33004e+08  6896.7       nan               1.33004e+08
StackedEnsemble_AllModels_3_AutoML_1_20241007_170129     12744.9  1.62433e+08  7418.34      nan               1.62433e+08
StackedEnsemble_AllModels_4_AutoML_1_20241007_170129     12745.4  1.62446e+08  7418.17      nan               1.62446e+08
StackedEnsemble_BestOfFamily_4_AutoML_1_20241007_170129  12800.4  1.63851e+08  7333.92      nan               1.63851e+08
GBM_grid_1_AutoML_1_20241007_170129_model_11             12848.3  1.65078e+08  7327.33      nan               1.65078e+08
GBM_grid_1_AutoML_1_20241007_170129_model_9              14487.9  2.099e+08    8211.98      nan               2.099e+08
GBM_grid_1_AutoML_1_20241007_170129_model_5              15016.7  2.25501e+08  9013.27      nan               2.25501e+08
StackedEnsemble_BestOfFami

### Retrain the XGBoost model on the train+test dataset

In [None]:
# Fold the test rows back into the training rows (features and targets are
# concatenated in the same order so they stay aligned).
y_train_full = pd.concat([y_train, y_test])
X_train_full = pd.concat([X_train, X_test])

# Refit the same estimator object on the combined data.
# NOTE(review): after this cell runs, `xgboost_model` has seen the test rows,
# so re-running the evaluation cell afterwards would give optimistic metrics.
xgboost_model.fit(X=X_train_full, y=y_train_full)

### Save the trained model

In [None]:
# Persist the trained regressor with joblib.
model_filename = 'xgboost_solar_model.pkl'
joblib.dump(xgboost_model, model_filename)
print(f"Model saved to {model_filename}")

# The model was trained on standardized inputs, so the fitted scaler is saved
# alongside it; the same transformation has to be applied to new data before
# calling predict().
scaler_filename = 'xgboost_solar_scaler.pkl'
joblib.dump(scaler, scaler_filename)
print(f"Scaler saved to {scaler_filename}")

### Plotting Actual vs Predicted

In [None]:
# Overlay the actual and predicted target values for the test rows, in row order.
fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(y_test.values, label='Actual')
ax.plot(y_pred, label='Predicted', linestyle='--')

ax.set_xlabel('Samples')
ax.set_ylabel('Merged Glo (Wh/m^2)')
ax.set_title('Actual vs Predicted Solar Irradiation (Test Data)')
ax.legend()

plt.show()

In [None]:
# Report the range of the test-set predictions: largest value first, then smallest.
highest_prediction = max(y_pred)
lowest_prediction = min(y_pred)

print("Maximum predicted value:", highest_prediction)
print("Minimum predicted value:", lowest_prediction)