# Heating plant

We have a dataset from a real heating plant located in a medium-sized city in Europe. The heating plant heats water and distributes the heat around the city. Our goal is to predict temperature of the returning water based on actual and historical power settings of the plant, output water temperature and the outside temperatures measured at different locations in the city.

## Data import

In [8]:
import pandas as pd
import plotly.express as px

frames = []
for i in range (0,10):
    frames.append(pd.read_excel("power_plant.xlsx", sheet_name=i))

In [9]:
data = frames[0][['ts','power12']].merge(frames[1][['ts','power3']], on='ts').merge(frames[2][['ts','power4']], on='ts')
data = data.merge(frames[3][['ts','temp1']], on='ts')
data = data.merge(frames[4][['ts','temp2']], on='ts')
data = data.merge(frames[5][['ts','temp3']], on='ts')
data = data.merge(frames[6][['ts','temp4']], on='ts')
data = data.merge(frames[7][['ts','temp5']], on='ts')
data = data.merge(frames[8][['ts','temp_in']], on='ts')
data = data.merge(frames[9][['ts','temp_out']], on='ts')

data['ts'] = pd.to_datetime(data['ts'])
data.set_index('ts', inplace=True)
#data.index.freq = 'H'

px.scatter(data["temp_in"])


## Data cleaning and preparation




In [4]:
from datetime import timedelta

# Assuming 'ts' is the datetime index of the DataFrame
data['gap'] = (data.index.to_series() - data.index.to_series().shift(1)) != timedelta(hours=1)

# create a list of data frames based on the gap positions

attributes = ['power12', 'power3', 'power4', 'temp1', 'temp2', 'temp3', 'temp4', 'temp5', 'temp_in', 'temp_out']

dflist = []

start = 0
for stop in range(1, len(data)):
    if data.iloc[stop]['gap']:
        dflist.append(data[start:stop][attributes])
        start = stop
len(dflist)

w = 5 #window size
s = 1 #step
X_all = []
y_all = []

for df in dflist:
    for i in range(0, len(df)-w-1, s):
        X_all.append(df[i:i+w].values)
        y_all.append(df.iloc[i+w]['temp_in'])

In [5]:
df_tin = data["temp_in"]
df_tin

ts
2018-12-31 20:00:00+00:00    45.346010
2018-12-31 21:00:00+00:00    45.350819
2018-12-31 22:00:00+00:00    45.396163
2018-12-31 23:00:00+00:00    45.402380
2019-01-01 00:00:00+00:00    45.440688
                               ...    
2020-09-07 06:00:00+00:00     0.000000
2020-09-07 07:00:00+00:00     0.000000
2020-09-07 08:00:00+00:00     0.000000
2020-09-07 09:00:00+00:00     0.000000
2020-09-07 10:00:00+00:00     0.000000
Name: temp_in, Length: 13829, dtype: float64

# Time series decomposition

In [6]:
from statsmodels.tsa.seasonal import seasonal_decompose
import plotly.express as px

# Decomposition of the time series
decomposition = seasonal_decompose(data["temp_in"], model='additive')

# Extracting the trend, seasonality, and residuals
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

trend

ValueError: You must specify a period or x must be a pandas object with a PeriodIndex or a DatetimeIndex with a freq not set to None

In [None]:
import numpy as np

X_all = np.array(X_all)
y_all = np.array(y_all)

print (X_all.shape)
print (y_all.shape)

In [None]:
from statsmodels.tsa.arima.model import ARIMA

y_all = np.array(y_all)

train_size = int(len(y_all) * 0.8)  # 80% of data used for training
train, test = y_all[:train_size], y_all[train_size:]

# Fit an ARIMA model
p = 1
d = 0
q = 6
model = ARIMA(train, order=(p, d, q))
model_fit = model.fit()

predictions = model_fit.forecast(steps=len(test))



In [None]:
import plotly.express as px

# Create a DataFrame for visualization
df = pd.DataFrame({
    'Actual': test,
    'Predicted': predictions,
    'Time': range(len(test))
})

# Calculate the difference
df['Difference'] = df['Actual'] - df['Predicted']

# Create a line plot for actual vs predicted
fig = px.line(df, x='Time', y=['Actual', 'Predicted'])

# Add a bar plot for the difference
fig.add_bar(x=df['Time'], y=df['Difference'], name='Difference')

# Update layout for clarity
fig.update_layout(
    title="ARIMA Model Predictions vs Actual",
    xaxis_title="Time",
    yaxis_title="Values",
    legend_title="Legend",
    barmode='overlay'  # Overlays the bar plot on the line plot
)

# Show the plot
fig.show()


In [None]:
import random
from sklearn.ensemble import GradientBoostingRegressor


mean_baseline = []
last_values_baseline = []
ml_model = []
iters = 20
train_size = int(0.9*len(X_all))

X_all_flat = X_all.reshape(X_all.shape[0], -1)
y_all_baseline = X_all_flat[:, -2]

for i in range(iters):
    all_data = list(zip(X_all_flat, y_all, y_all_baseline))
    random.shuffle(all_data)
    X_all_flat_rand, y_all_rand, y_all_rand_baseline = zip(*all_data)
    X_train = np.array(X_all_flat_rand[:train_size])
    y_train = np.array(y_all_rand[:train_size])
    X_test = np.array(X_all_flat_rand[train_size:])
    y_test = np.array(y_all_rand[train_size:])
    y_baseline = np.array(y_all_rand_baseline[train_size:])
    
    print ("Training iteration {}.".format(i+1))
    regr = GradientBoostingRegressor(n_estimators=500)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    
    m = np.mean(y_train)
    y_mean = np.array([m for i in range(len(y_test))])
    
    mean_baseline.append(rmse(y_mean, y_test))
    last_values_baseline.append(rmse(y_baseline, y_test))
    ml_model.append(rmse(y_pred, y_test))