In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# load dataset
data_path = '/workspace/COMP3610-Renewable-Energy-Prediction/data/processed/final_combined_dataset.csv'
df = pd.read_csv(list(data_path.keys())[0])

# display first few rows
print(df.head())

# check for missing values
print(df.isnull().sum())

AttributeError: 'str' object has no attribute 'keys'

In [None]:
# convert time column to datetime
df['time'] = pd.to_datetime(df['time'])

# extract time features
df['hour'] = df['time'].dt.hour
df['day_of_week'] = df['time'].dt.dayofweek
df['month'] = df['time'].dt.month
df['season'] = df['time'].dt.month % 12 // 3 + 1  # 1:Winter, 2:Spring, 3:Summer, 4:Fall

# create lag features (previous hour's solar production)
df['prev_hour_solar'] = df['Solar'].shift(1)

# create interaction features (solar power efficiency)
if 'solar_radiation' in df.columns:
    df['solar_efficiency'] = df['Solar'] / (df['solar_radiation'] + 1e-6)  # Add small value to avoid division by zero

# drop NaN values from lag features
df.dropna(inplace=True)

features = ['temp', 'dwpt', 'rhum', 'prcp', 'wspd', 'pres', 'hour',
            'day_of_week', 'month', 'season', 'prev_hour_solar']
target = 'Solar'

X = df[features]
y = df[target]

# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# svm model
svm_model = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
svm_model.fit(X_train_scaled, y_train)

y_pred_svm = svm_model.predict(X_test_scaled)

# evaluation
mse_svm = mean_squared_error(y_test, y_pred_svm)
mae_svm = mean_absolute_error(y_test, y_pred_svm)
r2_svm = r2_score(y_test, y_pred_svm)

print(f"SVM Performance:")
print(f"MSE: {mse_svm:.2f}")
print(f"MAE: {mae_svm:.2f}")
print(f"R² Score: {r2_svm:.2f}")

# predictions vs actual
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_svm, alpha=0.3)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
plt.xlabel('Actual Solar Production')
plt.ylabel('Predicted Solar Production')
plt.title('SVM: Actual vs Predicted Solar Production')
plt.show()

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# prepare data for LSTM
def create_sequences(data, targets, time_steps=24):
    X, y = [], []
    for i in range(len(data) - time_steps):
        X.append(data[i:(i + time_steps)])
        y.append(targets[i + time_steps])
    return np.array(X), np.array(y)

time_steps = 24  # sequence length
X_train_lstm, y_train_lstm = create_sequences(X_train_scaled, y_train.values, time_steps)
X_test_lstm, y_test_lstm = create_sequences(X_test_scaled, y_test.values, time_steps)

# shape for LSTM
X_train_lstm = X_train_lstm.reshape(X_train_lstm.shape[0], X_train_lstm.shape[1], X_train_scaled.shape[1])
X_test_lstm = X_test_lstm.reshape(X_test_lstm.shape[0], X_test_lstm.shape[1], X_test_scaled.shape[1])

# model
lstm_model = Sequential([
    LSTM(64, input_shape=(time_steps, X_train_scaled.shape[1]), return_sequences=True),
    Dropout(0.2),
    LSTM(32),
    Dropout(0.2),
    Dense(1)
])

lstm_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# prevent overfitting
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# train
history = lstm_model.fit(
    X_train_lstm, y_train_lstm,
    epochs=50,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=1
)

# plot
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.legend()
plt.title('Loss Over Epochs')

plt.subplot(1, 2, 2)
plt.plot(history.history['mae'], label='Train MAE')
plt.plot(history.history['val_mae'], label='Validation MAE')
plt.legend()
plt.title('MAE Over Epochs')
plt.show()

# evaluate
y_pred_lstm = lstm_model.predict(X_test_lstm).flatten()

mse_lstm = mean_squared_error(y_test_lstm, y_pred_lstm)
mae_lstm = mean_absolute_error(y_test_lstm, y_pred_lstm)
r2_lstm = r2_score(y_test_lstm, y_pred_lstm)

print(f"\nLSTM Performance:")
print(f"MSE: {mse_lstm:.2f}")
print(f"MAE: {mae_lstm:.2f}")
print(f"R² Score: {r2_lstm:.2f}")

# predictions vs actual
plt.figure(figsize=(10, 6))
plt.scatter(y_test_lstm, y_pred_lstm, alpha=0.3)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
plt.xlabel('Actual Solar Production')
plt.ylabel('Predicted Solar Production')
plt.title('LSTM: Actual vs Predicted Solar Production')
plt.show()

In [None]:
# compare models
results = pd.DataFrame({
    'Model': ['SVM', 'LSTM'],
    'MSE': [mse_svm, mse_lstm],
    'MAE': [mae_svm, mae_lstm],
    'R² Score': [r2_svm, r2_lstm]
})

print("\nModel Comparison:")
print(results)

# plot comparison
plt.figure(figsize=(12, 4))
for i, metric in enumerate(['MSE', 'MAE', 'R² Score']):
    plt.subplot(1, 3, i+1)
    sns.barplot(x='Model', y=metric, data=results)
    plt.title(metric)
plt.tight_layout()
plt.show()

In [None]:
# save models
import joblib
from tensorflow.keras.models import save_model

joblib.dump(svm_model, 'svm_solar_model.pkl') # SVM model
save_model(lstm_model, 'lstm_solar_model.h5') # LSTM model
joblib.dump(scaler, 'scaler.pkl') # scaler
results.to_csv('model_results.csv', index=False) # results