In [None]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# 1. Load dataset
# Path is relative to the current working directory.
df = pd.read_csv("solarpowergeneration.csv")

# 2. Rename columns
# Positional rename: the assignment raises if the file does not contain
# exactly these ten columns, in this order.
df.columns = [
    "distance_to_solar_noon",
    "temperature",
    "wind_direction",
    "wind_speed",
    "sky_cover",
    "visibility",
    "humidity",
    "average_wind_speed",
    "average_pressure",
    "power_generated",
]

# 3. Fill missing values
# Impute gaps in average_wind_speed with the column median. The result is
# assigned back to the column rather than filled in place on a selection.
wind_speed_median = df['average_wind_speed'].median()
df['average_wind_speed'] = df['average_wind_speed'].fillna(wind_speed_median)

# 4. Cap outliers
# Work on a copy so `df` keeps the uncapped values.
df_final = df.copy()
features_to_cap = [
    'wind_direction', 'visibility', 'humidity',
    'wind_speed', 'average_pressure', 'average_wind_speed',
]

# Tukey fences: anything beyond 1.5 * IQR from the quartiles is pulled back
# to the fence value instead of being dropped, so the row count is unchanged.
for col in features_to_cap:
    first_quartile = df_final[col].quantile(0.25)
    third_quartile = df_final[col].quantile(0.75)
    spread = third_quartile - first_quartile
    floor_value = first_quartile - 1.5 * spread
    ceiling_value = third_quartile + 1.5 * spread
    # Clip the raw array (not the Series) so the column is replaced by a
    # plain ndarray; NaN entries pass through untouched.
    df_final[col] = np.clip(df_final[col].to_numpy(), floor_value, ceiling_value)

# 5. Log transform target
# Missing and negative readings are both mapped to the 1e-10 floor, and any
# remaining value below the floor (e.g. exact zeros) is raised to it, so the
# logarithm is always taken of a strictly positive number.
target = df_final['power_generated'].astype(np.float64)
target = target.fillna(1e-10).clip(lower=1e-10)
df_final['power_generated'] = np.log(target)

# 6. Drop multicollinear feature
# errors='ignore' makes this a no-op when the column is already gone.
cols_to_drop = ['average_wind_speed']
df_final.drop(columns=cols_to_drop, errors='ignore', inplace=True)

# 7. Split data
# Separate the features from the log-transformed target BEFORE scaling, so the
# target never goes through the feature scaler. The split uses the same
# test_size and random_state as before, so the same rows land in each subset.
X = df_final.drop(columns=['power_generated'])
y = df_final['power_generated']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=70)

# 8. Scale features
# The scaler is fit on the training features only:
#   * no test-set statistics leak into the preprocessing, and
#   * the fitted scaler expects exactly the columns in `X.columns`, so the
#     pickled scaler and the pickled feature list agree at inference time.
# The target is left as log(power), so a prediction is inverted with np.exp.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X.columns, index=X_test.index
)

# 9. Grid Search with Random Forest
# 3 x 3 x 3 x 3 = 81 hyper-parameter combinations, each evaluated with
# 5-fold cross-validation on the training split (405 fits in total).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Fixed seed keeps the forest reproducible across runs.
rf_model = RandomForestRegressor(random_state=70)

# No `scoring` is given, so candidates are ranked by the estimator's own
# score method; n_jobs=-1 uses every available core.
grid_search = GridSearchCV(rf_model, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search = grid_search.fit(X_train, y_train)

# 10. Evaluate model
# best_estimator_ is the winning configuration from the grid search.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test)

# Hold-out metrics, expressed in the same units as y_test.
# NOTE(review): these values are computed but never printed or stored here.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# 11. Save model, scaler, and feature names
# Each artifact is pickled to its own file in the working directory; the
# feature list records the column order the model was trained with.
artifacts = {
    "random_forest_model.pkl": best_rf_model,
    "scaler.pkl": scaler,
    "features.pkl": X.columns.tolist(),
}

for artifact_path, artifact in artifacts.items():
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

print("OVER")

Fitting 5 folds for each of 81 candidates, totalling 405 fits
