In [12]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX

# --- 1. Load and Prepare Data ---
try:
    # The workbook is looked up relative to the current working directory.
    df = pd.read_excel('solar_data.xlsx')
except FileNotFoundError:
    # Abort with the message on stderr and a non-zero exit status. The bare
    # exit() helper is injected by the `site` module for interactive use and,
    # called without arguments, reports success even though loading failed.
    raise SystemExit(
        "Error: 'solar_data.xlsx' not found. Please ensure the file is in the correct directory."
    ) from None


# Build one timestamp per row by joining the 'Date' and 'Time' columns as text
# and parsing the result; the resampling step further down needs a datetime
# index to group by calendar day.
df['datetime'] = pd.to_datetime(df['Date'].astype(str) + ' ' + df['Time'].astype(str))

# Index the frame by that timestamp.
df.set_index('datetime', inplace=True)

# Drop rows that carry no energy reading so they cannot enter the totals.
df.dropna(subset=['Energy Produced (Wh)'], inplace=True)


# --- 2. Aggregate to Daily Totals (The Correct Step for Annual Forecasting) ---
# Bucket the readings by calendar day ('D') and add up the energy column in
# each bucket. Working on daily totals removes the within-day variation, so
# the model only has to deal with the day-to-day (and ultimately annual) shape.
# NOTE(review): a day with no readings at all is presumably reported as 0 by
# this sum rather than as missing - confirm that is the intended treatment.
daily_data = df.resample('D')['Energy Produced (Wh)'].sum()

# Sanity check: show the first five daily totals.
print("--- Aggregated Daily Data (First 5 Days) ---")
print(daily_data.head(5))
print("-" * 40)


# --- 3. Define and Train the SARIMAX Model ---
# An annual cycle can only be estimated when the series covers the cycle more
# than once, so warn (without stopping) below two years, i.e. 730 daily rows.
if daily_data.shape[0] < 730:
    print("\nWarning: You have less than two years of data.")
    print("The model may struggle to find a reliable annual pattern.")

# Model specification:
#   order=(1, 1, 1)               - baseline non-seasonal (p, d, q) terms.
#   seasonal_order=(1, 1, 1, 365) - seasonal (P, D, Q, s) terms; s=365 is the
#                                   season length in days, i.e. a pattern that
#                                   repeats once per year.
# NOTE(review): a seasonal period as long as 365 presumably makes fitting slow
# and memory-hungry - confirm this is acceptable for the available hardware.
model = SARIMAX(
    endog=daily_data,
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 365),
)



--- Aggregated Daily Data (First 5 Days) ---
datetime
2023-01-06    14000
2023-01-07    11848
2023-01-08    10876
2023-01-09    10940
2023-01-10     8132
Freq: D, Name: Energy Produced (Wh), dtype: int64
----------------------------------------


In [None]:
print("\nFitting SARIMAX model. This is computationally intensive and may take several minutes...")
# Estimate the model parameters from the daily series. disp=False suppresses
# the optimizer's convergence output.
results = model.fit(disp=False)
print("Model fitting complete.")

# Print the model summary to inspect coefficients and diagnostics.
print(results.summary())


# --- 4. Forecast and Visualize ---
# Forecast horizon: the 365 days that follow the end of the historical series.
forecast_steps = 365
forecast = results.get_forecast(steps=forecast_steps)

# Point forecast plus the lower/upper bounds of its confidence interval
# (conf_int() is called with its default level, labelled 95% in the plot).
forecast_mean = forecast.predicted_mean
forecast_conf_int = forecast.conf_int()

# Draw everything on a single wide figure.
plt.figure(figsize=(16, 8))

# Historical daily totals.
plt.plot(daily_data.index, daily_data, label='Historical Daily Production', color='royalblue')

# Forecasted daily totals.
plt.plot(forecast_mean.index, forecast_mean, label=f'Forecast for Next {forecast_steps} Days', color='red', linestyle='--')

# Shade the band between the lower (column 0) and upper (column 1) bounds.
plt.fill_between(forecast_conf_int.index,
                 forecast_conf_int.iloc[:, 0],
                 forecast_conf_int.iloc[:, 1], color='pink', alpha=0.6, label='95% Confidence Interval')

plt.title('Solar Energy Production: Historical Data and Annual Forecast', fontsize=16)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Total Daily Energy Produced (Wh)', fontsize=12)
plt.legend(fontsize=10)
plt.grid(True, linestyle='--', alpha=0.6)

# Write the figure to disk BEFORE displaying it: once plt.show() returns, the
# current figure has typically been released, so saving afterwards would
# produce an empty image.
output_path = "annual_solar_forecast.png"
plt.savefig(output_path)
print(f"\nForecast plot saved to {output_path}")

plt.show()