# Solar Power Generation — Regression Project (Lite)

Target: `power-generated`  
Dataset: `solarpowergeneration (1).csv`

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import pickle

# Load the solar power generation dataset and preview the first rows.
DATA_PATH = r"/mnt/data/solarpowergeneration (1).csv"
df = pd.read_csv(DATA_PATH)
df.head()


## EDA

In [None]:

# --- Distribution of the target column ---
plt.figure(figsize=(6, 3))
df['power-generated'].hist(bins=30)
plt.title('Distribution of power-generated')
plt.show()

# --- Correlation matrix of the numeric columns, rendered as an image ---
corr = df.corr(numeric_only=True)
plt.figure(figsize=(6, 4))
plt.imshow(corr, aspect='auto')
plt.colorbar()
plt.xticks(range(corr.shape[1]), corr.columns, rotation=90)
plt.yticks(range(corr.shape[0]), corr.index)
plt.title('Correlation Heatmap')
plt.show()

# --- Scatter plots for the three features with the largest absolute
# --- correlation to the target (the target's self-correlation is dropped) ---
cwt = (
    corr['power-generated']
    .drop(labels=['power-generated'])
    .abs()
    .sort_values(ascending=False)
)
top3 = cwt.index[:3].tolist()
for feature in top3:
    plt.figure(figsize=(6, 3))
    plt.scatter(df[feature], df['power-generated'], s=8)
    plt.xlabel(feature)
    plt.ylabel('power-generated')
    plt.title(f'{feature} vs power-generated')
    plt.show()


## Modeling & Results

In [None]:

from pprint import pprint

# Split the frame into the feature matrix and the regression target.
target = 'power-generated'
X = df.drop(columns=[target])
y = df[target]

# Alpha grids searched internally by RidgeCV / LassoCV (log-spaced).
ridge_alphas = np.logspace(-3, 3, 21)
lasso_alphas = np.logspace(-3, 1, 15)

# Candidate pipelines. Every pipeline median-imputes missing values; the
# linear models are additionally standardised, the tree ensembles are not.
models = {
    "LinearRegression": Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("model", LinearRegression()),
    ]),
    "RidgeCV": Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("model", RidgeCV(alphas=ridge_alphas, cv=5)),
    ]),
    "LassoCV": Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("model", LassoCV(alphas=lasso_alphas, cv=5, max_iter=20000, random_state=42)),
    ]),
    "RandomForest": Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("model", RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=1)),
    ]),
    "GradientBoosting": Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("model", GradientBoostingRegressor(random_state=42)),
    ]),
}

# 80/20 hold-out split used for the test_* metrics below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

results = []
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for name, pipe in models.items():
    # 5-fold shuffled CV R^2 computed on the full dataset (X, y).
    cv = cross_val_score(pipe, X, y, cv=kf, scoring="r2", n_jobs=1)

    # Separate fit on the training split, scored on the hold-out split.
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)

    results.append({
        "model": name,
        "cv_r2_mean": float(np.mean(cv)),
        "cv_r2_std": float(np.std(cv)),
        "test_MAE": float(mean_absolute_error(y_test, pred)),
        # RMSE as sqrt(MSE): the `squared=False` keyword of
        # mean_squared_error was deprecated in scikit-learn 1.4 and removed
        # in 1.6, so taking the root explicitly works on every version.
        "test_RMSE": float(np.sqrt(mean_squared_error(y_test, pred))),
        "test_R2": float(r2_score(y_test, pred)),
    })

# Leaderboard, best hold-out R^2 first.
res_df = pd.DataFrame(results).sort_values("test_R2", ascending=False)
res_df


## Persist Best Model + Residuals

In [None]:

import os
import pickle

from sklearn.base import clone

# The leaderboard is sorted by hold-out R^2, so row 0 is the winner.
best_name = res_df.iloc[0]['model']
print('Best model:', best_name)

# Refit on full data and save.
# clone() returns an unfitted copy of the pipeline registered in `models`
# with identical hyperparameters, so the persisted model cannot drift from
# the definition that was actually evaluated. An unknown name raises
# KeyError instead of silently falling back to a different model.
best_pipe = clone(models[best_name])
best_pipe.fit(X, y)

model_path = "best_model_lite.pkl"
with open(model_path, "wb") as f:
    pickle.dump(best_pipe, f)

# Residuals plot
# NOTE: best_pipe was just fit on all of X, so these are in-sample residuals.
pred_all = best_pipe.predict(X)
residuals = y - pred_all
plt.figure(figsize=(6, 3))
plt.scatter(pred_all, residuals, s=8)
plt.axhline(0, linestyle="--")
plt.xlabel("Predicted")
plt.ylabel("Residuals")
plt.title(f"Residuals ({best_name})")
plt.show()

model_path
