In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PowerTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# ------------------------ LOAD DATA ------------------------
df = pd.read_excel("Solardata_final_1.xlsx")
df["Date and Time"] = pd.to_datetime(df["Date and Time"])
df["hour"] = df["Date and Time"].dt.hour
df["dayofweek"] = df["Date and Time"].dt.dayofweek
df["month"] = df["Date and Time"].dt.month

In [3]:
target = "Output Power (kW)"
X = df.drop(columns=["SL No.", "Date and Time", target])
y = df[target]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# -------------------------- PREPROCESSING --------------------------
num_cols = X.select_dtypes(include=["number"]).columns
cat_cols = X.select_dtypes(include=["object", "category"]).columns

In [7]:
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("power", PowerTransformer()),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

In [8]:
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols)
])

In [9]:
# --------------------------- MODEL ---------------------------
rf_pipeline = Pipeline([
    ("preproc", preprocessor),
    ("rf", RandomForestRegressor(random_state=42, n_jobs=-1))
])

In [10]:
# ------------------------ HYPERPARAMETER TUNING ------------------------

# n_estimators:      HOW MANY TREES SHOULD THE FOREST HAVE? MORE TREES = USUALLY BETTER ACCURACY BUT SLOWER.
# max_depth:         HOW DEEP SHOULD EACH TREE GO? DEEPER TREES CAN CAPTURE MORE DETAIL BUT MAY OVERFIT.
# min_samples_split: MINIMUM NUMBER OF DATA POINTS NEEDED TO SPLIT A NODE INTO TWO. IF TOO SMALL → OVERFIT; TOO LARGE → UNDERFIT.
# min_samples_leaf:  MINIMUM NUMBER OF DATA POINTS THAT A LEAF (END OF BRANCH) CAN HAVE. PREVENTS VERY TINY LEAVES THAT OVERFIT.

param_dist = {
    "rf__n_estimators": [100, 200, 400],
    "rf__max_depth": [None, 10, 20, 30],
    "rf__min_samples_split": [2, 5, 10],
    "rf__min_samples_leaf": [1, 2, 4]
}

# PICKS RANDOM COMBINATIONS
search = RandomizedSearchCV(
    rf_pipeline, param_distributions=param_dist,
    n_iter=15,  # 15 RANDOM COMBINATIONS.
    cv=3,       # 3-FOLD CROSS-VALIDATION: IT SPLITS THE TRAINING DATA INTO 3 PARTS, TRAINS ON 2, TESTS ON 1, AND REPEATS 3 TIMES. THIS CHECKS IF THE MODEL WORKS WELL ON DIFFERENT SUBSETS OF DATA.
    scoring="neg_mean_absolute_error", # TRYING TO MAKE THE ERRORS AS SMALL AS POSSIBLE
    random_state=42,
    n_jobs=-1,
    verbose=1
)

search.fit(X_train, y_train)

Fitting 3 folds for each of 15 candidates, totalling 45 fits


In [11]:
# -------------------- EVALUATE --------------------
best_rf = search.best_estimator_
preds = best_rf.predict(X_test)

In [13]:
print("Best Hyperparameters:", search.best_params_)
print("MAE:", mean_absolute_error(y_test, preds))
print("RMSE:", mean_squared_error(y_test, preds))
print("R²:", r2_score(y_test, preds))

Best Hyperparameters: {'rf__n_estimators': 400, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2, 'rf__max_depth': 30}
MAE: 0.3398457062402131
RMSE: 2.6142287364810812
R²: 0.9999999335745375
