In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PowerTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb

In [4]:
# ----------------- Load Data -----------------
df = pd.read_excel("Solardata_final_1.xlsx")
df["Date and Time"] = pd.to_datetime(df["Date and Time"])
df["hour"] = df["Date and Time"].dt.hour
df["dayofweek"] = df["Date and Time"].dt.dayofweek
df["month"] = df["Date and Time"].dt.month

In [5]:
target = "Output Power (kW)"
X = df.drop(columns=["SL No.", "Date and Time", target])
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# ----------------- Preprocessing -----------------
num_cols = X.select_dtypes(include=["number"]).columns
cat_cols = X.select_dtypes(include=["object", "category"]).columns

numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("power", PowerTransformer()),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols)
])

In [8]:
# ----------------- Model -----------------
xgb_pipeline = Pipeline([
    ("preproc", preprocessor),
    ("xgb", xgb.XGBRegressor(random_state=42, n_jobs=-1, verbosity=0))
])

In [9]:
# ----------------- Hyperparameter Tuning -----------------

# "xgb__n_estimators" →     NUMBER OF TREES (BOOSTING ROUNDS).
# "xgb__max_depth" →        MAXIMUM DEPTH OF TREES (CONTROLS COMPLEXITY).
# "xgb__learning_rate" →    HOW MUCH EACH TREE CONTRIBUTES (LOWER = SLOWER BUT MORE ACCURATE).
# "xgb__subsample" →        FRACTION OF TRAINING SAMPLES USED PER TREE (LIKE BAGGING).
# "xgb__colsample_bytree" → FRACTION OF FEATURES USED PER TREE.


param_dist = {
    "xgb__n_estimators": [100, 200, 400],
    "xgb__max_depth": [3, 5, 7, 9],
    "xgb__learning_rate": [0.01, 0.05, 0.1],
    "xgb__subsample": [0.7, 0.8, 1.0],
    "xgb__colsample_bytree": [0.7, 0.8, 1.0]
}

search = RandomizedSearchCV(
    xgb_pipeline,       # PIPELINE THAT CONTAINS PREPROCESSING + XGBOOST MODEL.
    param_distributions=param_dist, n_iter=15, cv=3, scoring="neg_mean_absolute_error",
    random_state=42,    # ENSURES REPRODUCIBILITY. SETTING  random_state=42 (OR ANY NUMBER) MAKES THE PROCESS DETERMINISTIC AND REPRODUCIBLE.
    n_jobs=-1,          # USE ALL CPU CORES FOR SPEED.
    verbose=1           # PRINTS PROGRESS INFO. verbose=0 : NO OUTPUT. verbose=1 : PRINTS BASIC PROGRESS INFO (E.G., “FITTING 3 FOLDS FOR EACH OF 15 CANDIDATES…”). HIGHER VALUES (LIKE 2 OR 3) : MORE DETAILED LOGGING.
)

search.fit(X_train, y_train) # RUNS THE HYPERPARAMETER SEARCH ON YOUR TRAINING DATA.

best_xgb = search.best_estimator_
preds = best_xgb.predict(X_test)

Fitting 3 folds for each of 15 candidates, totalling 45 fits


In [10]:
print("Best Hyperparameters:", search.best_params_)
print("MAE:", mean_absolute_error(y_test, preds))
print("RMSE:", mean_squared_error(y_test, preds))
print("R²:", r2_score(y_test, preds))

Best Hyperparameters: {'xgb__subsample': 1.0, 'xgb__n_estimators': 200, 'xgb__max_depth': 5, 'xgb__learning_rate': 0.05, 'xgb__colsample_bytree': 1.0}
MAE: 9.77991716999412
RMSE: 314.3452828322736
R²: 0.999992012737639
