# Solar Energy Prediction Using Machine Learning

This notebook implements a machine learning model to predict solar energy output based on weather and environmental features.

## Objectives:
1. Load and explore raw solar energy data
2. Preprocess and clean the dataset
3. Split data into training and testing sets
4. Train multiple machine learning models
5. Evaluate and compare model performance

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

print('Libraries imported successfully!')

## 2. Load Raw Dataset

In [None]:
# Load the raw dataset (the path is resolved relative to the working directory)
df_raw = pd.read_csv('raw_solar_data.csv')

print(f'Dataset Shape: {df_raw.shape}')
print('\nFirst 5 records:')
print(df_raw.head())

print('\nDataset Information:')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
df_raw.info()

print('\nBasic Statistics:')
print(df_raw.describe())

## 3. Data Exploration and Visualization

In [None]:
# Count missing values per column
print('Missing Values:')
missing_per_column = df_raw.isnull().sum()
print(missing_per_column)

# One row of three panels: the target's distribution, then the target plotted
# against irradiance and against cloud cover.
energy_output = df_raw['Solar_Energy_Output']
explore_fig, (ax_hist, ax_irradiance, ax_cloud) = plt.subplots(1, 3, figsize=(12, 4))

ax_hist.hist(energy_output, bins=30, edgecolor='black')
ax_hist.set_title('Distribution of Solar Energy Output')
ax_hist.set_xlabel('Solar Energy (kWh)')
ax_hist.set_ylabel('Frequency')

ax_irradiance.scatter(df_raw['Solar_Irradiance'], energy_output, alpha=0.5)
ax_irradiance.set_title('Solar Irradiance vs Energy Output')
ax_irradiance.set_xlabel('Solar Irradiance (W/m²)')
ax_irradiance.set_ylabel('Energy Output (kWh)')

ax_cloud.scatter(df_raw['Cloud_Cover'], energy_output, alpha=0.5, color='orange')
ax_cloud.set_title('Cloud Cover vs Energy Output')
ax_cloud.set_xlabel('Cloud Cover (%)')
ax_cloud.set_ylabel('Energy Output (kWh)')

explore_fig.tight_layout()
plt.show()

print('Data exploration complete!')

## 4. Data Preprocessing

In [None]:
# Step 1: Remove duplicates
# drop_duplicates() already returns a new frame, so df_raw is left untouched
# and df_processed starts life as an independent, de-duplicated copy.
n_rows_before = len(df_raw)
df_processed = df_raw.drop_duplicates().reset_index(drop=True)
n_rows_after = len(df_processed)

print(f'Records before removing duplicates: {n_rows_before}')
print(f'Records after removing duplicates: {n_rows_after}')

# Step 2: Remove outliers using IQR method
def remove_outliers(dataframe, columns, factor=1.5):
    """Drop rows that fall outside the IQR fences of any listed column.

    For every column in `columns` the fences are
    [Q1 - factor * IQR, Q3 + factor * IQR], with both quartiles taken from the
    unfiltered input. A row is kept only if it lies inside the fences
    (bounds inclusive) for all listed columns.

    The fences are computed for every column before any row is dropped.
    Filtering column by column instead would recompute later quartiles on
    already-trimmed data, making the set of removed rows depend on the order
    in which the columns are listed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data; it is not modified.
    columns : iterable of str
        Numeric columns to check.
    factor : float, default 1.5
        Multiple of the IQR that sets the distance of the fences.

    Returns
    -------
    pandas.DataFrame
        Copy of the rows that pass every check, keeping their original index
        labels. Rows with a missing value in a checked column are dropped as
        well, because NaN never compares as lying inside the fences.
    """
    # Positional boolean mask, so duplicate or non-contiguous index labels
    # cannot cause alignment surprises.
    keep = np.ones(len(dataframe), dtype=bool)
    for col in columns:
        q1 = dataframe[col].quantile(0.25)
        q3 = dataframe[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr
        keep &= dataframe[col].between(lower_bound, upper_bound).to_numpy()
    # .copy() hands back an independent frame that callers can add columns to.
    return dataframe.loc[keep].copy()

# Outlier filtering covers every numeric column except the calendar fields.
# Building the list with a filter (instead of list.remove) avoids a ValueError
# when Hour or Month is missing from the numeric selection, and
# include='number' also picks up numeric dtypes other than int64/float64.
calendar_cols = {'Hour', 'Month'}
numeric_cols = [
    col
    for col in df_processed.select_dtypes(include='number').columns
    if col not in calendar_cols
]
# NOTE(review): this list includes the target column if it is numeric, so rows
# with extreme target values are removed too -- confirm that this is intended.

print(f'\nRecords before removing outliers: {len(df_processed)}')
# Re-number the rows after filtering so the index stays contiguous; later
# steps that rebuild frames from arrays then line up label-for-label.
df_processed = remove_outliers(df_processed, numeric_cols).reset_index(drop=True)
print(f'Records after removing outliers: {len(df_processed)}')

print(f'\nDataset shape after cleaning: {df_processed.shape}')

## 5. Feature Engineering

In [None]:
def _season_for_month(month):
    """Return the season label for a month number; any value not listed maps to 'Fall'."""
    if month in (12, 1, 2):
        return 'Winter'
    if month in (3, 4, 5):
        return 'Spring'
    if month in (6, 7, 8):
        return 'Summer'
    return 'Fall'


# Daylight flag: 1 for hours 6 through 18 (both inclusive), otherwise 0
daylight_mask = df_processed['Hour'].between(6, 18)
df_processed['Is_Daylight'] = daylight_mask.astype(int)

# Season label derived from the month number
df_processed['Season'] = df_processed['Month'].apply(_season_for_month)

# Integer code for each season label
season_mapping = {'Winter': 0, 'Spring': 1, 'Summer': 2, 'Fall': 3}
df_processed['Season_Encoded'] = df_processed['Season'].map(season_mapping)

print('New features created:')
print('- Is_Daylight')
print('- Season')
print('- Season_Encoded')
print(f'\nDataset shape after feature engineering: {df_processed.shape}')

## 6. Feature Scaling and Data Normalization

In [None]:
# Drop columns that are not model inputs: Date, and the text Season label
# (Season_Encoded carries the same information as an integer code).
df_model = df_processed.drop(columns=['Date', 'Season'])

# Separate features and target
X = df_model.drop(columns='Solar_Energy_Output')
y = df_model['Solar_Energy_Output']

print(f'Features shape: {X.shape}')
print(f'Target shape: {y.shape}')

# Normalize features
# NOTE(review): fitting the scaler on all rows before the train/test split lets
# test-set statistics influence the transform; for an unbiased evaluation the
# scaler should be fitted on the training rows only.
scaler = StandardScaler()
# fit_transform returns a bare array, so the frame is rebuilt with X's own
# index. A default RangeIndex would no longer match y's labels once rows have
# been filtered out upstream, and any index-aligned operation between the two
# would pair up the wrong rows.
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

print('\nFeatures after scaling:')
print(X_scaled.head())
print('\nFeature statistics after scaling:')
print(X_scaled.describe())

## 7. Train-Test Split

In [None]:
# Split data into training and testing sets (80-20 split)
# The split is made on the unscaled features so that the scaler can be fitted
# on the training rows alone. Standardising with statistics computed over all
# rows would leak information about the test set into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training rows only, then apply the same transform to both splits.
# `scaler` is rebound here so it is the transformer that matches the models.
scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(
    scaler.transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

print(f'Training set size: {len(X_train)}')
print(f'Testing set size: {len(X_test)}')
print(f'\nTraining features shape: {X_train.shape}')
print(f'Testing features shape: {X_test.shape}')

## 8. Model Training

In [None]:
# Candidate regressors, keyed by the name used in printed reports and plots
models = {}
models['Linear Regression'] = LinearRegression()
models['Random Forest'] = RandomForestRegressor(n_estimators=100, random_state=42)
models['Gradient Boosting'] = GradientBoostingRegressor(n_estimators=100, random_state=42)

# Fit every candidate on the training split and keep the fitted estimators
trained_models = {}
for model_name, estimator in models.items():
    print(f'Training {model_name}...')
    # fit() returns the estimator itself, so the fitted object is stored directly
    trained_models[model_name] = estimator.fit(X_train, y_train)
    print(f'{model_name} trained successfully!\n')

print('All models trained successfully!')

## 9. Model Evaluation

In [None]:
def _regression_scores(y_true, y_pred):
    """Return the (MSE, MAE, R²) triple for one set of predictions."""
    return (
        mean_squared_error(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )


# Per-model metrics on both splits, keyed by model name
results = {}
banner = '=' * 70

print(banner)
print('MODEL EVALUATION RESULTS')
print(banner)

for model_name, estimator in trained_models.items():
    # Score the same fitted estimator on the training and the test split
    train_mse, train_mae, train_r2 = _regression_scores(y_train, estimator.predict(X_train))
    test_mse, test_mae, test_r2 = _regression_scores(y_test, estimator.predict(X_test))

    results[model_name] = dict(
        train_mse=train_mse,
        test_mse=test_mse,
        train_mae=train_mae,
        test_mae=test_mae,
        train_r2=train_r2,
        test_r2=test_r2,
    )

    print(f'\n{model_name}:')
    print(f'  Train MSE: {train_mse:.4f}, Test MSE: {test_mse:.4f}')
    print(f'  Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}')
    print(f'  Train R²: {train_r2:.4f}, Test R²: {test_r2:.4f}')

print('\n' + banner)

## 10. Visualization of Results

In [None]:
# Create comparison visualizations: three train/test bar panels plus a
# predicted-vs-actual scatter for the best model
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

model_names = list(results.keys())
x = np.arange(len(model_names))
width = 0.35


def _draw_train_test_bars(ax, train_key, test_key, ylabel, title,
                          train_color=None, test_color=None):
    """Draw side-by-side train/test bars for one metric of `results` on `ax`.

    A colour of None leaves the choice to matplotlib's default colour cycle.
    """
    train_values = [results[m][train_key] for m in model_names]
    test_values = [results[m][test_key] for m in model_names]

    # Only forward `color` when one was requested, so the default-cycle case
    # issues exactly the same call as a bar() without a colour argument.
    train_style = {} if train_color is None else {'color': train_color}
    test_style = {} if test_color is None else {'color': test_color}

    ax.bar(x - width/2, train_values, width, label='Train', alpha=0.8, **train_style)
    ax.bar(x + width/2, test_values, width, label='Test', alpha=0.8, **test_style)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_xticks(x)
    ax.set_xticklabels(model_names, rotation=45, ha='right')
    ax.legend()
    ax.grid(axis='y', alpha=0.3)


# MSE, MAE and R² comparisons
_draw_train_test_bars(axes[0, 0], 'train_mse', 'test_mse',
                      'MSE', 'Mean Squared Error Comparison')
_draw_train_test_bars(axes[0, 1], 'train_mae', 'test_mae',
                      'MAE', 'Mean Absolute Error Comparison',
                      train_color='green', test_color='lightgreen')
_draw_train_test_bars(axes[1, 0], 'train_r2', 'test_r2',
                      'R² Score', 'R² Score Comparison',
                      train_color='orange', test_color='lightsalmon')

# Best model (highest test R²): predictions against actual values
best_model_name = max(results, key=lambda candidate: results[candidate]['test_r2'])
best_model = trained_models[best_model_name]
y_test_pred = best_model.predict(X_test)

parity_ax = axes[1, 1]
actual_range = [y_test.min(), y_test.max()]
parity_ax.scatter(y_test, y_test_pred, alpha=0.5)
# Dashed diagonal marks where predicted equals actual
parity_ax.plot(actual_range, actual_range, 'r--', lw=2)
parity_ax.set_xlabel('Actual Values')
parity_ax.set_ylabel('Predicted Values')
parity_ax.set_title(f'Best Model ({best_model_name}) - Predictions vs Actual')
parity_ax.grid(alpha=0.3)

fig.tight_layout()
plt.show()

print(f'Best Model: {best_model_name} (R² = {results[best_model_name]["test_r2"]:.4f})')

## 11. Feature Importance (for Tree-based Models)

In [None]:
# Rank the input features by the fitted Random Forest's importances
rf_model = trained_models['Random Forest']
importance_columns = {
    'Feature': X_train.columns,
    'Importance': rf_model.feature_importances_,
}
feature_importance = pd.DataFrame(importance_columns).sort_values(
    'Importance', ascending=False
)
top_features = feature_importance.head(10)

print('Feature Importance (Top 10):')
print(top_features)

# Horizontal bars, with the most important feature at the top
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
importance_ax.barh(top_features['Feature'], top_features['Importance'])
importance_ax.set_xlabel('Importance')
importance_ax.set_title('Top 10 Most Important Features')
importance_ax.invert_yaxis()
importance_fig.tight_layout()
plt.show()

## 12. Conclusions and Recommendations

### Key Findings:
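1. **Data Quality**: The raw dataset contained 300 records with 15 features. After duplicate and IQR-based outlier removal, 252 clean records remained (check these figures against the counts printed in Sections 2 and 4 whenever the notebook is re-run).
2. **Model Performance**: Ensemble methods (Random Forest and Gradient Boosting) outperformed Linear Regression on the held-out test set (see the metrics printed in Section 9).
3. **Key Features**: Solar Irradiance and Cloud Cover are the most important predictors of solar energy output according to the Random Forest feature importances (Section 11).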
4. **Recommendations**:
   - Use Random Forest or Gradient Boosting for production predictions
   - Collect more data to improve model generalization
   - Consider time-series features for temporal patterns
   - Implement real-time monitoring with model updates