In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [13]:
# Read the power-generation CSV into a DataFrame, report its dimensions,
# and preview the first rows as the cell's rich output.
csv_path = "updated_power_generation_data.csv"
df = pd.read_csv(csv_path)
print("Dataset shape:", df.shape)
df.head()

Dataset shape: (4213, 24)


Unnamed: 0,temperature_2_m_above_gnd,relative_humidity_2_m_above_gnd,mean_sea_level_pressure_MSL,total_precipitation_sfc,snowfall_amount_sfc,total_cloud_cover_sfc,high_cloud_cover_high_cld_lay,medium_cloud_cover_mid_cld_lay,low_cloud_cover_low_cld_lay,shortwave_radiation_backwards_sfc,...,wind_speed_900_mb,wind_direction_900_mb,wind_gust_10_m_above_gnd,angle_of_incidence,zenith,azimuth,generated_power_kw,location,latitude,longitude
0,2.17,31,1035.0,0.0,0.0,0.0,0,0,0,0.0,...,6.62,337.62,24.48,58.753108,83.237322,128.33543,454.10095,"Shahpur, Madhya Pradesh",21.2293,76.1703
1,2.31,27,1035.1,0.0,0.0,0.0,0,0,0,1.78,...,4.61,321.34,21.96,45.408585,75.143041,139.6553,1411.9994,"Sittwe, Rakhine",17.821,90.7426
2,3.65,33,1035.4,0.0,0.0,0.0,0,0,0,108.58,...,3.76,286.7,14.04,32.848282,68.820648,152.53769,2214.8493,"Doshi, Wilayat-e Baghlan",35.3876,68.6836
3,5.82,30,1035.4,0.0,0.0,0.0,0,0,0,258.1,...,3.08,339.44,19.8,22.699288,64.883536,166.90159,2527.6092,"Bhopal, Madhya Pradesh",23.2394,77.2415
4,7.73,27,1034.4,0.0,0.0,0.0,0,0,0,375.58,...,6.62,22.38,16.56,19.199908,63.795208,182.13526,2640.2034,"Paradip Garh, Odisha",19.4967,88.1186


In [4]:
# 3. Quick EDA summary
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; wrapping it in print() would add a stray "None" line.
df.info()
# Summary statistics for the numeric columns, rendered as a rich table.
display(df.describe())
# Count nulls per column and list only the columns that actually have some.
missing = df.isnull().sum()
print("\nMissing values per column:\n", missing[missing>0])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4213 entries, 0 to 4212
Data columns (total 21 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   temperature_2_m_above_gnd          4213 non-null   float64
 1   relative_humidity_2_m_above_gnd    4213 non-null   int64  
 2   mean_sea_level_pressure_MSL        4213 non-null   float64
 3   total_precipitation_sfc            4213 non-null   float64
 4   snowfall_amount_sfc                4213 non-null   float64
 5   total_cloud_cover_sfc              4213 non-null   float64
 6   high_cloud_cover_high_cld_lay      4213 non-null   int64  
 7   medium_cloud_cover_mid_cld_lay     4213 non-null   int64  
 8   low_cloud_cover_low_cld_lay        4213 non-null   int64  
 9   shortwave_radiation_backwards_sfc  4213 non-null   float64
 10  wind_speed_10_m_above_gnd          4213 non-null   float64
 11  wind_direction_10_m_above_gnd      4213 non-null   float

Unnamed: 0,temperature_2_m_above_gnd,relative_humidity_2_m_above_gnd,mean_sea_level_pressure_MSL,total_precipitation_sfc,snowfall_amount_sfc,total_cloud_cover_sfc,high_cloud_cover_high_cld_lay,medium_cloud_cover_mid_cld_lay,low_cloud_cover_low_cld_lay,shortwave_radiation_backwards_sfc,...,wind_direction_10_m_above_gnd,wind_speed_80_m_above_gnd,wind_direction_80_m_above_gnd,wind_speed_900_mb,wind_direction_900_mb,wind_gust_10_m_above_gnd,angle_of_incidence,zenith,azimuth,generated_power_kw
count,4213.0,4213.0,4213.0,4213.0,4213.0,4213.0,4213.0,4213.0,4213.0,4213.0,...,4213.0,4213.0,4213.0,4213.0,4213.0,4213.0,4213.0,4213.0,4213.0,4213.0
mean,15.068111,51.361025,1019.337812,0.031759,0.002808,34.05699,14.458818,20.023499,21.373368,387.759036,...,195.078452,18.978483,191.166862,16.36319,192.447911,20.583489,50.83749,59.980947,169.167651,1134.347313
std,8.853677,23.525864,7.022867,0.170212,0.038015,42.843638,30.711707,36.387948,38.013885,278.459293,...,106.626782,11.99996,108.760021,9.88533,106.516195,12.648899,26.638965,19.857711,64.568385,937.957247
min,-5.35,7.0,997.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.54,0.0,1.12,0.0,1.12,0.72,3.755323,17.727761,54.379093,0.000595
25%,8.39,32.0,1014.5,0.0,0.0,0.0,0.0,0.0,0.0,142.4,...,153.19,10.14,130.24,9.18,148.22,11.16,29.408181,45.291631,114.1366,231.70045
50%,14.75,48.0,1018.1,0.0,0.0,8.7,0.0,0.0,0.0,381.81,...,191.77,16.24,187.77,14.49,187.99,18.0,47.335557,62.142611,163.24165,971.64265
75%,21.29,70.0,1023.6,0.0,0.0,100.0,9.0,10.0,10.0,599.86,...,292.07,26.14,292.04,21.97,288.0,27.0,69.197492,74.346737,225.08562,2020.9667
max,34.9,100.0,1046.8,3.2,1.68,100.0,100.0,100.0,100.0,952.3,...,360.0,66.88,360.0,61.11,360.0,84.96,121.63592,128.41537,289.04518,3056.7941



Missing values per column:
 Series([], dtype: int64)


In [14]:
# Separate the regression target from the predictor columns: `y` is the target
# series, `X` is every other column of `df`.
target_column = "generated_power_kw"
y = df[target_column]
X = df.drop(columns=[target_column])

In [6]:
# 5. Train-test split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

Train shape: (3370, 20) Test shape: (843, 20)


In [15]:
# Split features and target 80/20 with a fixed seed, then unpack the four
# pieces in the order train_test_split returns them.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [8]:
# 7. Train the model
# Fits `pipeline` on the training split and prints a confirmation message.
# NOTE(review): in this notebook's cell order `pipeline` is first assigned in a
# later cell, so on a fresh top-to-bottom run this call would raise NameError.
# This cell looks like a leftover from an earlier session — TODO remove it or
# move it below the cell that builds `pipeline`.
pipeline.fit(X_train, y_train)
print('Training completed')

Training completed


In [16]:
# Only int64/float64 columns of X are routed into the model.
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns

# Numeric branch: fill missing values with the column median, then scale.
median_imputer = SimpleImputer(strategy="median")
standard_scaler = StandardScaler()
numeric_transformer = Pipeline(
    steps=[("imputer", median_imputer), ("scaler", standard_scaler)]
)

# remainder="drop" is spelled out for the reader: any column of X that is not
# listed in `numeric_features` is discarded by the transformer.
preprocessor = ColumnTransformer(
    transformers=[("num", numeric_transformer, numeric_features)],
    remainder="drop",
)

In [17]:
# Random forest regressor with 200 trees; the seed is fixed for repeatability
# and n_jobs=-1 is passed so tree building can run in parallel.
model = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1,
)

# Chain preprocessing and the regressor so both are fitted and applied together.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", model),
    ]
)

In [18]:
# Fit the preprocessing + model pipeline on the training split; the fitted
# pipeline is the cell's displayed value.
pipeline.fit(X=X_train, y=y_train)

In [19]:
# Score the fitted pipeline on the held-out split.
preds = pipeline.predict(X_test)
print("MAE:", mean_absolute_error(y_test, preds))
# RMSE is computed as the square root of the MSE instead of passing
# `squared=False`: that keyword was deprecated in scikit-learn 1.4 and removed
# in 1.6, where it raises TypeError. For this single-output target the value
# is the same.
print("RMSE:", mean_squared_error(y_test, preds) ** 0.5)
print("R²:", r2_score(y_test, preds))

MAE: 259.9340212025408
RMSE: 408.68511089921105
R²: 0.8171536453605311




In [20]:
# Serialize the fitted pipeline (preprocessing + model) to disk with joblib,
# then confirm the file name to the reader.
joblib.dump(value=pipeline, filename="solar_power_model.pkl")
print("Model saved as solar_power_model.pkl")

Model saved as solar_power_model.pkl
