# Solar Power Prediction Model
## Renewable Energy Forecasting using Machine Learning

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Global look for every figure in this notebook.
# NOTE(review): the palette is applied after the style sheet, presumably so
# the style sheet does not override it — keep this order.
plot_style, colour_palette = 'seaborn-v0_8', "husl"
plt.style.use(plot_style)
sns.set_palette(colour_palette)

## 1. Data Loading and Preprocessing

In [None]:
# Load your dataset
# df = pd.read_csv('your_solar_data.csv')

# Sample data generation for demonstration.
# The seed makes the synthetic dataset reproducible from run to run.
np.random.seed(42)
n_samples = 2000

# One timestamp per hour. The step is given as an explicit Timedelta rather
# than the 'H' alias, which pandas 2.2 deprecated in favour of 'h'; a
# Timedelta is accepted by both older and newer pandas releases.
dates = pd.date_range('2023-01-01', periods=n_samples, freq=pd.Timedelta(hours=1))

# NOTE: the random draws below happen in dictionary order; reordering the
# entries would change the generated values for the fixed seed.
df = pd.DataFrame({
    'Date and Time': dates,
    # Irradiance values are clipped to the ranges given below
    'Global Irradiance (GHI)': np.random.normal(400, 200, n_samples).clip(0, 1000),
    'Direct Normal Irradiance (DNI)': np.random.normal(300, 150, n_samples).clip(0, 800),
    'Azimuth Angle': np.random.uniform(0, 360, n_samples),
    'Dry Bulb Temperature': np.random.normal(25, 10, n_samples),
    'Wet Bulb Temperature': np.random.normal(20, 8, n_samples),
    'Dew Point Temperature': np.random.normal(15, 8, n_samples),
    'Relative Humidity': np.random.uniform(30, 90, n_samples),
    'Cloud Coverage': np.random.uniform(0, 100, n_samples),
})

# Target: a linear combination of irradiance, temperature and cloud cover
# plus Gaussian noise, floored at zero so the output is never negative.
df['Solar Power (kWh)'] = (
    0.005 * df['Global Irradiance (GHI)'] +
    0.003 * df['Direct Normal Irradiance (DNI)'] +
    0.01 * df['Dry Bulb Temperature'] -
    0.002 * df['Cloud Coverage'] +
    np.random.normal(0, 0.5, n_samples)
).clip(0, None)

print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Report how many values are missing in each column
missing_per_column = df.isnull().sum()
print("Missing values:")
print(missing_per_column)

# Fill gaps in the numeric columns with that column's median
column_medians = df.median(numeric_only=True)
df = df.fillna(column_medians)

# Summary statistics (shown as the cell output)
df.describe()

## 2. Feature Engineering

In [None]:
# Parse the timestamp column, then derive calendar features from it
df['DateTime'] = pd.to_datetime(df['Date and Time'])
stamp = df['DateTime'].dt
df['Hour'] = stamp.hour
df['Day'] = stamp.day
df['Month'] = stamp.month
df['DayOfYear'] = stamp.dayofyear

# Season code per month: 0 for Dec-Feb, 1 for Mar-May, 2 for Jun-Aug, 3 for Sep-Nov
season_by_month = {month: (month % 12) // 3 for month in range(1, 13)}
df['Season'] = df['Month'].map(season_by_month)

# Cyclical encoding: map hour and month onto a circle so that the end of a
# cycle (hour 23, month 12) sits next to its start (hour 0, month 1).
hour_angle = 2 * np.pi * df['Hour'] / 24
month_angle = 2 * np.pi * df['Month'] / 12
df['Hour_sin'] = np.sin(hour_angle)
df['Hour_cos'] = np.cos(hour_angle)
df['Month_sin'] = np.sin(month_angle)
df['Month_cos'] = np.cos(month_angle)

print("New features created:")
preview_cols = ['Hour', 'Month', 'Season', 'Hour_sin', 'Hour_cos']
print(df[preview_cols].head())

## 3. Exploratory Data Analysis

In [None]:
# Pairwise correlations between all numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df.loc[:, numeric_cols].corr()

# Annotated heatmap with the colour scale centred on zero correlation
heatmap_options = dict(annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

In [None]:
# Mean solar power for each hour of the day, drawn as a bar chart
hourly_mean_power = df.groupby('Hour')['Solar Power (kWh)'].mean()

plt.figure(figsize=(12, 4))
hourly_mean_power.plot(kind='bar')
plt.title('Average Solar Power Output by Hour')
plt.xlabel('Hour of Day')
plt.ylabel('Solar Power (kWh)')
plt.xticks(rotation=0)  # keep the hour labels horizontal
plt.tight_layout()
plt.show()

## 4. Model Training and Evaluation

In [None]:
# Prepare features: raw measurements plus the engineered time features
feature_cols = [
    'Global Irradiance (GHI)', 'Direct Normal Irradiance (DNI)',
    'Azimuth Angle', 'Dry Bulb Temperature', 'Wet Bulb Temperature',
    'Dew Point Temperature', 'Relative Humidity', 'Cloud Coverage',
    'Hour', 'Month', 'Season', 'DayOfYear',
    'Hour_sin', 'Hour_cos', 'Month_sin', 'Month_cos'
]

X = df[feature_cols]
y = df['Solar Power (kWh)']

# Train-test split FIRST, on the unscaled data. Fitting the scaler before the
# split would let test-set means/variances leak into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features: learn the statistics from the training rows only, then
# apply that same transformation to the held-out rows.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw), columns=feature_cols, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=feature_cols, index=X_test_raw.index
)

# Full feature matrix under the training-fitted scaler, kept for any code
# that still expects `X_scaled`.
X_scaled = pd.DataFrame(scaler.transform(X), columns=feature_cols, index=X.index)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

In [None]:
# Candidate regressors, keyed by display name. The fixed random_state keeps
# the forest reproducible; n_jobs=-1 uses every available core.
models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'Linear Regression': LinearRegression(),
}

# Fit each model on the same training split
for estimator in models.values():
    estimator.fit(X_train, y_train)

print("Models trained successfully!")

In [None]:
# Score every trained model on the held-out test set
results = {}

for name, model in models.items():
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    # The raw predictions are kept alongside the metrics for later plotting
    results[name] = {
        'RMSE': np.sqrt(mse),
        'MAE': mean_absolute_error(y_test, y_pred),
        'R²': r2_score(y_test, y_pred),
        'predictions': y_pred
    }

# Build a table of the scalar metrics only (one row per model)
summary = {}
for name, metrics in results.items():
    summary[name] = {
        key: value for key, value in metrics.items() if key != 'predictions'
    }
results_df = pd.DataFrame(summary).T

print("Model Performance:")
print(results_df.round(4))

## 5. Results Visualization

In [None]:
# Plot actual vs predicted, one panel per evaluated model.
# The grid is sized from `results` instead of a hard-coded 2, so adding or
# removing a model no longer breaks the axes indexing.
n_models = len(results)
fig, axes = plt.subplots(1, n_models, figsize=(7.5 * n_models, 5), squeeze=False)
axes = axes.ravel()  # squeeze=False always returns a 2-D array; flatten it to 1-D

# End points of the "perfect prediction" diagonal, computed once
diagonal = [y_test.min(), y_test.max()]

for ax, (name, metrics) in zip(axes, results.items()):
    ax.scatter(y_test, metrics['predictions'], alpha=0.6, s=20)
    ax.plot(diagonal, diagonal, 'r--', lw=2)
    ax.set_xlabel('Actual Solar Power (kWh)')
    ax.set_ylabel('Predicted Solar Power (kWh)')
    ax.set_title(f'{name}\nR² = {metrics["R²"]:.3f}, RMSE = {metrics["RMSE"]:.3f}')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Feature importance as reported by the fitted Random Forest
rf_model = models['Random Forest']
importance_table = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf_model.feature_importances_
})
# Ascending sort, so the most important feature is the last row and is drawn
# as the top bar of the horizontal chart.
feature_importance = importance_table.sort_values('importance', ascending=True)

plt.figure(figsize=(10, 8))
plt.barh(feature_importance['feature'], feature_importance['importance'])
plt.xlabel('Feature Importance')
plt.title('Random Forest Feature Importance')
plt.tight_layout()
plt.show()

# The last five rows are the top five; reverse them to list the best first
top_five = feature_importance.tail(5)[::-1]
print("Top 5 Most Important Features:")
print(top_five)

## 6. Model Deployment Preparation

In [None]:
# Persist the best-scoring model together with the fitted scaler
import joblib

# "Best" means the highest R² on the test set
best_model_name = max(results, key=lambda model_name: results[model_name]['R²'])
best_model = models[best_model_name]
best_r2 = results[best_model_name]['R²']

# Both artefacts are written to the current working directory
joblib.dump(best_model, 'solar_power_model.pkl')
joblib.dump(scaler, 'feature_scaler.pkl')

print(f"Best model ({best_model_name}) saved as 'solar_power_model.pkl'")
print(f"Feature scaler saved as 'feature_scaler.pkl'")
print(f"Best model R² score: {best_r2:.4f}")