In [12]:
import pandas as pd
import numpy as np
import joblib
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error

# Load and filter the dataset
data = pd.read_excel('final.xlsx')

# Filter only non-renewable energy plants (Gas, Nuclear, Thermal)
non_renewable_data = data[data['Type Of Station'].isin(['Gas', 'Nuclear', 'Thermal'])].copy()

# Ensure 'Date' column is in datetime format
non_renewable_data['Date'] = pd.to_datetime(non_renewable_data['Date'])

# Set the 'Date' column as index
non_renewable_data = non_renewable_data.set_index('Date')

# Function to check if a plant is operational and fill missing values
def preprocess_plant_data(group):
    # Ensure the index is unique before reindexing
    group = group.loc[~group.index.duplicated(keep='first')]
    
    # Fill missing dates with 0 if no data
    group = group.asfreq('D').fillna(0)
    
    # Fill missing values based on whether the plant is operational
    if group['Declared Capability (MWh)'].eq(0).all():
        group['Declared Capability (MWh)'].fillna(0, inplace=True)
    else:
        # Fill missing values with the average of the week (7-day rolling average)
        group['Declared Capability (MWh)'] = group['Declared Capability (MWh)'].fillna(
            group['Declared Capability (MWh)'].rolling(window=7, min_periods=1).mean()
        )
    
    return group

# Apply preprocessing to each plant
non_renewable_data = non_renewable_data.groupby('Station').apply(preprocess_plant_data)

# Drop rows with missing values in other important columns
non_renewable_data.dropna(subset=['Declared Capability (MWh)'], inplace=True)

# Aggregate data at the daily level
daily_data = non_renewable_data.groupby('Date')['Declared Capability (MWh)'].sum()

# Train-test split
train_size = int(len(daily_data) * 0.8)
train, test = daily_data[:train_size], daily_data[train_size:]

# Build and train the ARIMA model
model = ARIMA(train, order=(5,1,0))  # Example order; adjust based on model selection criteria
model_fit = model.fit()  # Removed disp=0

# Make predictions
predictions = model_fit.forecast(steps=len(test))

# Calculate the error
error = mean_squared_error(test, predictions)
print(f'Mean Squared Error: {error}')

# Save the trained model
joblib.dump(model_fit, 'energy_consumption_arima_model.joblib')
print("Model training complete and files saved.")


  group = group.asfreq('D').fillna(0)
  group = group.asfreq('D').fillna(0)
  group = group.asfreq('D').fillna(0)
  group = group.asfreq('D').fillna(0)
  group = group.asfreq('D').fillna(0)
  group = group.asfreq('D').fillna(0)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  group['Declared Capability (MWh)'].fillna(0, inplace=True)
  group = group.asfreq('D').fillna(0)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] =

Mean Squared Error: 94290548262.4169
Model training complete and files saved.


  non_renewable_data = non_renewable_data.groupby('Station').apply(preprocess_plant_data)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
