In [2]:
import pandas as pd
import numpy as np

# We will need these for modeling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [3]:
# Load the cleaned and aggregated daily data
df_daily = pd.read_csv('G:/MyProjects/german_energy_forecasting/data/processed/german_energy_daily.csv', index_col=0, parse_dates=True)

# Display the first 5 rows and a summary of the data
print("First 5 rows of the loaded daily data:")
print(df_daily.head())
print("\nDataFrame Info:")
df_daily.info()

First 5 rows of the loaded daily data:
               Biomass       Demand      Fossil     Nuclear      Solar  \
DateTime                                                                 
2021-01-01  113.159940   920.231628  517.806845  195.615402  21.556413   
2021-01-02  113.798372   986.910056  570.113227  195.100922  15.845982   
2021-01-03  113.417942  1006.676751  442.111483  182.142490   6.666677   
2021-01-04  112.539105  1228.483919  715.765355  181.634842   5.371087   
2021-01-05  111.863338  1257.987824  787.522772  187.590538   5.970385   

                  Wind  
DateTime                
2021-01-01   79.767715  
2021-01-02   92.913480  
2021-01-03  440.403073  
2021-01-04  387.644842  
2021-01-05  366.177548  

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 365 entries, 2021-01-01 to 2021-12-31
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Biomass  365 non-null    float64
 1   Demand  

In [4]:
# Define our target variable (what we want to predict)
y = df_daily['Demand']

# Define our feature variables (what we'll use to predict)
# We will use all other columns to predict 'Demand'
X = df_daily.drop(columns=['Demand'])

In [5]:
# Split the data with an 80/20 ratio for training/testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)

Shape of X_train: (292, 5)
Shape of X_test: (73, 5)


In [6]:
random_state=42 

In [7]:
# Initialize the Linear Regression model
model = LinearRegression()

# Train the model using the training data
model.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [8]:
# Use the trained model to make predictions on the test set
y_pred = model.predict(X_test)

In [9]:
# Calculate the Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate the Root Mean Squared Error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)

Mean Absolute Error (MAE): 48.34601894410412
Root Mean Squared Error (RMSE): 58.23225886055277


In [10]:
import pickle

# Save the trained model to a file
with open('../models/linear_regression_model.pkl', 'wb') as f:
    pickle.dump(model, f)

FileNotFoundError: [Errno 2] No such file or directory: '../models/linear_regression_model.pkl'

In [11]:
import pickle

# Save the trained model to a file
with open('G:/MyProjects/german_energy_forecasting/models/linear_regression_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [13]:
# Create a DataFrame for our predicted values
df_predictions = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Reset the index of the DataFrame to make 'DateTime' a column again
df_predictions = df_predictions.reset_index()

# Save this new DataFrame to a CSV file for Power BI
df_predictions.to_csv('G:/MyProjects/german_energy_forecasting/data/processed/energy_predictions.csv', index=False)