## Final models on simulated datasets (Training on zone 1 and testing on zone 8)
### Energy consumption normalized and GHI added 

##### *Same experiments as in file RF_OneClass_Final.ipynb to compare the results*

In [363]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, accuracy_score

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter

## 1) Baseline 
### Indoor climate and heating consumption

In [364]:
df_simulated = pd.read_csv('/Users/sararhiger/Desktop/Master thesis /Data/Processed data/Model improvements/Fault4_baseline.csv')

In [365]:
# Define the start and end times for training data
start_time_train = "2023-01-01 00:00:00"  
end_time_train = "2023-12-31 23:00:00"  

# Define the start and end times for testing data
start_time_test = "2023-01-01 00:00:00"  
end_time_test = "2023-12-31 23:00:00"

# Filter DataFrame for timestamps within the range
df_train_simulated = df_simulated[(df_simulated['Time'] >= start_time_train) & (df_simulated['Time'] <= end_time_train)]

df_test_simulated = df_simulated[(df_simulated['Time'] >= start_time_test) & (df_simulated['Time'] <= end_time_test)]

# Zone 1
X_train_zone1_simulated = df_train_simulated[['Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Local heating units, W, zone 1']]
y_train_zone1_simulated = df_train_simulated['FDD, zone 1']
X_test_zone1_simulated = df_test_simulated[['Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Local heating units, W, zone 1']]
y_test_zone1_simulated = df_test_simulated['FDD, zone 1']

# Zone 8
X_train_zone8_simulated = df_train_simulated[['Mean air temperature, degC, zone 8', 'Relative humidity, %, zone 8', 'CO2, ppm (vol), zone 8', 'Local heating units, W, zone 8']]
y_train_zone8_simulated = df_train_simulated['FDD, zone 8']
X_test_zone8_simulated = df_test_simulated[['Mean air temperature, degC, zone 8', 'Relative humidity, %, zone 8', 'CO2, ppm (vol), zone 8', 'Local heating units, W, zone 8']]
y_test_zone8_simulated = df_test_simulated['FDD, zone 8']

In [366]:
train_x = X_train_zone1_simulated
train_y = y_train_zone1_simulated

test_x = X_test_zone8_simulated
test_y = y_test_zone8_simulated

# Define zones and variables for plots 
test_temp = 'Mean air temperature, degC, zone 8'
test_heat = 'Local heating units, W, zone 8'

In [367]:
# Create figure
fig = go.Figure()

# Plot the temperature curve
fig.add_trace(go.Scatter(
    x=df_simulated['Time'], 
    y=df_simulated['Mean air temperature, degC, zone 1'], 
    mode='lines', 
    name='Zone 1 faulty operation open window during heating'
))

# Find timestamps where "FDD, zone 1" is NOT 0
fault_indices = df_simulated['FDD, zone 1'] != 0
fault_times = df_simulated.loc[fault_indices, 'Time']
fault_temps = df_simulated.loc[fault_indices, 'Mean air temperature, degC, zone 1']

# Add markers for fault occurrences
fig.add_trace(go.Scatter(
    x=fault_times,
    y=fault_temps,
    mode='markers',
    marker=dict(color='red', size=10, symbol='x'),
    name='Fault implemented (FDD ≠ 0)'
))

# Customize layout
fig.update_layout(
    title="Temperature with fault xx implemented in Zone 1",
    xaxis_title="Time",
    yaxis_title="Temperature (°C)",
    legend_title="Legend",
    template="plotly"
)

# Show the plot
fig.show()

### Random forest 

In [368]:
# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)#, class_weight='balanced')

# Train the model
rf_model.fit(train_x, train_y)

# Make predictions
y_pred_train = rf_model.predict(train_x)
y_pred = rf_model.predict(test_x)

# Evaluate the model
print("Accuracy:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))

print(rf_model.feature_importances_)

Accuracy: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8572
           4       1.00      1.00      1.00       188

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy: 0.9932648401826484

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8697
           4       0.55      0.33      0.42        63

    accuracy                           0.99      8760
   macro avg       0.77      0.67      0.71      8760
weighted avg       0.99      0.99      0.99      8760

[0.4140711  0.03755824 0.0295789  0.51879177]


In [369]:
# Store all runs
results = []

for i in range(10):
    rf_model = RandomForestClassifier()
    rf_model.fit(train_x, train_y)

    y_pred = rf_model.predict(test_x)
    report = classification_report(test_y, y_pred, output_dict=True)

    # Store per-class metrics
    run_result = {'run': i + 1}
    for label in report:
        if label not in ('accuracy', 'macro avg', 'weighted avg'):
            run_result[f'precision_class_{label}'] = report[label]['precision']
            run_result[f'recall_class_{label}'] = report[label]['recall']
            run_result[f'f1_class_{label}'] = report[label]['f1-score']
    
    results.append(run_result)

# Results as DataFrame
results_df = pd.DataFrame(results)

# Summary (mean and std)
summary = results_df.describe().loc[['mean', 'std']]

print("\nSummary (Mean ± Std):")
print(summary)

print("\nAll Per-Class Results:")
print(results_df)


Summary (Mean ± Std):
          run  precision_class_0  recall_class_0  f1_class_0  \
mean  5.50000           0.996999        0.956870     0.97585   
std   3.02765           0.002161        0.050651     0.02575   

      precision_class_4  recall_class_4  f1_class_4  
mean           0.373249        0.588889    0.277651  
std            0.329723        0.303987    0.147864  

All Per-Class Results:
   run  precision_class_0  recall_class_0  f1_class_0  precision_class_4  \
0    1           0.999357        0.894216    0.943868           0.059305   
1    2           0.996955        0.978843    0.987816           0.167421   
2    3           0.995867        0.997355    0.996611           0.540000   
3    4           0.999379        0.925722    0.961141           0.082386   
4    5           0.994396        0.999655    0.997018           0.823529   
5    6           0.994962        0.999080    0.997017           0.703704   
6    7           0.999356        0.891802    0.942520           0.

In [370]:
# Compute confusion matrix
cm = confusion_matrix(test_y, y_pred)

# Ensure labels are 0 and 1
labels = [0, 4]  

# Create the confusion matrix heatmap
fig = go.Figure(data=go.Heatmap(
    z=cm,
    x=labels,  # Predicted labels
    y=labels,  # Actual labels
    colorscale='Blues',
    showscale=True,
    text=cm,  # Display numbers in the cells
    texttemplate="%{text}",  # Ensure counts appear inside the cells
    hovertemplate='Actual: %{y}<br>Predicted: %{x}<br>Count: %{z}<extra></extra>'
))

# Update layout for binary classification (0 and 1 on axes)
fig.update_layout(
    title="Confusion Matrix for zone 8 opened window",
    width=400,   # Reduce width
    height=400,
    xaxis=dict(
        title="Predicted Label",
        tickmode='array',        # Custom tick labels
        tickvals=[0, 4],         # Only show 0 and 1
        showgrid=False
    ),
    yaxis=dict(
        title="Actual Label",
        tickmode='array',
        tickvals=[0, 4],         # Only show 0 and 1
        showgrid=False
    ),
    template="plotly_white"
)

# Show the plot
fig.show()

In [371]:
fig = go.Figure()

# Base temperature curve
fig.add_trace(go.Scatter(
    x=df_simulated['Time'], 
    y=df_simulated[test_temp], 
    mode='lines', 
    name='Zone 8 faulty operation'
))

# Convert y_pred to a Pandas Series with the correct index
y_pred_series = pd.Series(y_pred, index=test_y.index)

# Find indices for different cases
correct_1_indices = (test_y == 4) & (y_pred_series == 4)  # True 1, Pred 1 -> Green
false_negative_indices = (test_y == 4) & (y_pred_series == 0)  # True 1, Pred 0 -> Red
false_positive_indices = (test_y == 0) & (y_pred_series == 4)  # True 0, Pred 1 -> Orange

# Extract timestamps
correct_1_timestamps = df_simulated.loc[test_y.index, 'Time'][correct_1_indices]
false_negative_timestamps = df_simulated.loc[test_y.index, 'Time'][false_negative_indices]
false_positive_timestamps = df_simulated.loc[test_y.index, 'Time'][false_positive_indices]

# Extract corresponding temperatures
correct_1_temps = df_simulated.loc[test_y.index, test_temp][correct_1_indices]
false_negative_temps = df_simulated.loc[test_y.index, test_temp][false_negative_indices]
false_positive_temps = df_simulated.loc[test_y.index, test_temp][false_positive_indices]

# Add markers for correct predictions (Green)
fig.add_trace(go.Scatter(
    x=correct_1_timestamps,
    y=correct_1_temps,
    mode='markers',
    marker=dict(color='green', size=10),
    name='Correct (1 predicted as 1)'
))

# Add markers for false negatives (Red)
fig.add_trace(go.Scatter(
    x=false_negative_timestamps,
    y=false_negative_temps,
    mode='markers',
    marker=dict(color='red', size=10, symbol='x'),
    name='False Negative (1 predicted as 0)'
))

# Add markers for false positives (Orange)
fig.add_trace(go.Scatter(
    x=false_positive_timestamps,
    y=false_positive_temps,
    mode='markers',
    marker=dict(color='orange', size=10, symbol='triangle-up'),
    name='False Positive (0 predicted as 1)'
))

# Customize layout
fig.update_layout(
    title="Temperatures with faulty operation in zone 8 & Prediction Markers",
    xaxis_title="Time",
    yaxis_title="Temperature (°C)",
    legend_title="Legend",
    template="plotly"
)

# Show the plot
fig.show()

### XGboost

## 2) Baseline + heat

In [372]:
df_simulated = pd.read_csv('/Users/sararhiger/Desktop/Master thesis /Data/Processed data/Model improvements/Fault4_baseline_heat.csv')


In [373]:
# Define the start and end times for training data
start_time_train = "2023-01-01 00:00:00"  
end_time_train = "2023-12-31 23:00:00"  

# Define the start and end times for testing data
start_time_test = "2023-01-01 00:00:00"  
end_time_test = "2023-12-31 23:00:00"

# Filter DataFrame for timestamps within the range
df_train_simulated = df_simulated[(df_simulated['Time'] >= start_time_train) & (df_simulated['Time'] <= end_time_train)]

df_test_simulated = df_simulated[(df_simulated['Time'] >= start_time_test) & (df_simulated['Time'] <= end_time_test)]

# Zone 1
X_train_zone1_simulated = df_train_simulated[['Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Local heating units, W, zone 1']]
y_train_zone1_simulated = df_train_simulated['FDD, zone 1']
X_test_zone1_simulated = df_test_simulated[['Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Local heating units, W, zone 1']]
y_test_zone1_simulated = df_test_simulated['FDD, zone 1']

# Zone 8
X_train_zone8_simulated = df_train_simulated[['Mean air temperature, degC, zone 8', 'Relative humidity, %, zone 8', 'CO2, ppm (vol), zone 8', 'Local heating units, W, zone 8']]
y_train_zone8_simulated = df_train_simulated['FDD, zone 8']
X_test_zone8_simulated = df_test_simulated[['Mean air temperature, degC, zone 8', 'Relative humidity, %, zone 8', 'CO2, ppm (vol), zone 8', 'Local heating units, W, zone 8']]
y_test_zone8_simulated = df_test_simulated['FDD, zone 8']

In [374]:
train_x = X_train_zone1_simulated
train_y = y_train_zone1_simulated

test_x = X_test_zone8_simulated
test_y = y_test_zone8_simulated

# Define zones and variables for plots 
test_temp = 'Mean air temperature, degC, zone 8'
test_heat = 'Local heating units, W, zone 8'

In [375]:
# Create figure
fig = go.Figure()

# Plot the temperature curve
fig.add_trace(go.Scatter(
    x=df_simulated['Time'], 
    y=df_simulated['Mean air temperature, degC, zone 1'], 
    mode='lines', 
    name='Zone 1 faulty operation'
))

# Find timestamps where "FDD, zone 1" is NOT 0
fault_indices = df_simulated['FDD, zone 1'] != 0
fault_times = df_simulated.loc[fault_indices, 'Time']
fault_temps = df_simulated.loc[fault_indices, 'Mean air temperature, degC, zone 1']

# Add markers for fault occurrences
fig.add_trace(go.Scatter(
    x=fault_times,
    y=fault_temps,
    mode='markers',
    marker=dict(color='red', size=10, symbol='x'),
    name='Fault implemented (FDD ≠ 0)'
))

# Customize layout
fig.update_layout(
    title="Temperature with fault xx implemented in Zone 1",
    xaxis_title="Time",
    yaxis_title="Temperature (°C)",
    legend_title="Legend",
    template="plotly"
)

# Show the plot
fig.show()

### Random forest

In [376]:
# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)#, class_weight='balanced')

# Train the model
rf_model.fit(train_x, train_y)

# Make predictions
y_pred_train = rf_model.predict(train_x)
y_pred = rf_model.predict(test_x)

# Evaluate the model
print("Accuracy:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))

print(rf_model.feature_importances_)

Accuracy: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8572
           4       1.00      1.00      1.00       188

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy: 0.9948630136986302

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8697
           4       0.88      0.33      0.48        63

    accuracy                           0.99      8760
   macro avg       0.94      0.67      0.74      8760
weighted avg       0.99      0.99      0.99      8760

[0.4140711  0.03755824 0.0295789  0.51879177]


In [377]:
# Store all runs
results = []

for i in range(10):
    rf_model = RandomForestClassifier()
    rf_model.fit(train_x, train_y)

    y_pred = rf_model.predict(test_x)
    report = classification_report(test_y, y_pred, output_dict=True)

    # Store per-class metrics
    run_result = {'run': i + 1}
    for label in report:
        if label not in ('accuracy', 'macro avg', 'weighted avg'):
            run_result[f'precision_class_{label}'] = report[label]['precision']
            run_result[f'recall_class_{label}'] = report[label]['recall']
            run_result[f'f1_class_{label}'] = report[label]['f1-score']
    
    results.append(run_result)

# Results as DataFrame
results_df = pd.DataFrame(results)

# Summary (mean and std)
summary = results_df.describe().loc[['mean', 'std']]

print("\nSummary (Mean ± Std):")
print(summary)

print("\nAll Per-Class Results:")
print(results_df)


Summary (Mean ± Std):
          run  precision_class_0  recall_class_0  f1_class_0  \
mean  5.50000           0.998967        0.998942    0.998953   
std   3.02765           0.001367        0.001728    0.001007   

      precision_class_4  recall_class_4  f1_class_4  
mean           0.883656        0.857143    0.850485  
std            0.130382        0.189740    0.155736  

All Per-Class Results:
   run  precision_class_0  recall_class_0  f1_class_0  precision_class_4  \
0    1           0.995079        0.999770    0.997419           0.909091   
1    2           0.999425        0.998505    0.998965           0.816901   
2    3           0.999422        0.994136    0.996772           0.532110   
3    4           0.999310        0.999540    0.999425           0.934426   
4    5           0.999425        0.999655    0.999540           0.950820   
5    6           0.999425        0.999540    0.999483           0.935484   
6    7           0.999425        0.999540    0.999483           0.

In [378]:
# Compute confusion matrix
cm = confusion_matrix(test_y, y_pred)

# Ensure labels are 0 and 1
labels = [0, 4]  

# Create the confusion matrix heatmap
fig = go.Figure(data=go.Heatmap(
    z=cm,
    x=labels,  # Predicted labels
    y=labels,  # Actual labels
    colorscale='Blues',
    showscale=True,
    text=cm,  # Display numbers in the cells
    texttemplate="%{text}",  # Ensure counts appear inside the cells
    hovertemplate='Actual: %{y}<br>Predicted: %{x}<br>Count: %{z}<extra></extra>'
))

# Update layout for binary classification (0 and 1 on axes)
fig.update_layout(
    title="Confusion Matrix for zone 8 thermostat fault",
    width=500,   # Reduce width
    height=400,
    xaxis=dict(
        title="Predicted Label",
        tickmode='array',        # Custom tick labels
        tickvals=[0, 4],         # Only show 0 and 1
        showgrid=False
    ),
    yaxis=dict(
        title="Actual Label",
        tickmode='array',
        tickvals=[0, 4],         # Only show 0 and 1
        showgrid=False
    ),
    template="plotly_white"
)

# Show the plot
fig.show()

In [379]:
fig = go.Figure()

# Base temperature curve
fig.add_trace(go.Scatter(
    x=df_simulated['Time'], 
    y=df_simulated[test_temp], 
    mode='lines', 
    name='Temperature (C°)',
    yaxis='y1'
    ))

# Convert y_pred to a Pandas Series with the correct index
y_pred_series = pd.Series(y_pred, index=test_y.index)

# Find indices for different cases
correct_1_indices = (test_y == 4) & (y_pred_series == 4)  # True 1, Pred 1 -> Green
false_negative_indices = (test_y == 4) & (y_pred_series == 0)  # True 1, Pred 0 -> Red
false_positive_indices = (test_y == 0) & (y_pred_series == 4)  # True 0, Pred 1 -> Orange

# Extract timestamps
correct_1_timestamps = df_simulated.loc[test_y.index, 'Time'][correct_1_indices]
false_negative_timestamps = df_simulated.loc[test_y.index, 'Time'][false_negative_indices]
false_positive_timestamps = df_simulated.loc[test_y.index, 'Time'][false_positive_indices]

# Extract corresponding temperatures
correct_1_temps = df_simulated.loc[test_y.index, test_temp][correct_1_indices]
false_negative_temps = df_simulated.loc[test_y.index, test_temp][false_negative_indices]
false_positive_temps = df_simulated.loc[test_y.index, test_temp][false_positive_indices]

# Add markers for correct predictions (Green)
fig.add_trace(go.Scatter(
    x=correct_1_timestamps,
    y=correct_1_temps,
    mode='markers',
    marker=dict(color='green', size=10),
    name='Correct (2 predicted as 2)'
))

# Add markers for false negatives (Red)
fig.add_trace(go.Scatter(
    x=false_negative_timestamps,
    y=false_negative_temps,
    mode='markers',
    marker=dict(color='red', size=10, symbol='x'),
    name='False Negative (2 predicted as 0)'
))

# Add markers for false positives (Orange)
fig.add_trace(go.Scatter(
    x=false_positive_timestamps,
    y=false_positive_temps,
    mode='markers',
    marker=dict(color='orange', size=10, symbol='triangle-up'),
    name='False Positive (0 predicted as 2)'
))

# Customize layout
#fig.update_layout(
#    title="Temperature with low setpoint fault simulated in Zone 8 with prediction markers",
#    xaxis_title="Time",
 #   yaxis_title="Temperature (°C)",
   # legend_title="Legend",
    #template="plotly"
#)

# Update layout with secondary y-axis
fig.update_layout(
    title="Temperature & heating consumption in zone 8 with lowered setpoint simulated",
    xaxis_title="Time",
    yaxis=dict(
        title="Temperature (°C)",
        titlefont=dict(color="black"),
        tickfont=dict(color="black")
    ),
    yaxis2=dict(
        title="Heating Consumption (kWh)",  # Adjust unit if needed
        overlaying='y',
        side='right',
        titlefont=dict(color="black"),
        tickfont=dict(color="black"),
    ),
    legend=dict(
        x=1.05,         # Pushes legend to the right (1.0 is the edge of the plot area)
        y=1,            # Top alignment
        xanchor="left", # Anchor relative to left of the legend box
        yanchor="top"
    ),
    legend_title="Legend",
    template="plotly",
    height=600,
    width=1200,
)

# Show the plot

In [380]:
fig = go.Figure()

# Base temperature curve
fig.add_trace(go.Scatter(
    x=df_simulated['Time'], 
    y=df_simulated[test_heat], 
    mode='lines', 
    name='Zone 8 faulty operation'
))

# Convert y_pred to a Pandas Series with the correct index
y_pred_series = pd.Series(y_pred, index=test_y.index)

# Find indices for different cases
correct_1_indices = (test_y == 4) & (y_pred_series == 4)  # True 1, Pred 1 -> Green
false_negative_indices = (test_y == 4) & (y_pred_series == 0)  # True 1, Pred 0 -> Red
false_positive_indices = (test_y == 0) & (y_pred_series == 4)  # True 0, Pred 1 -> Orange

# Extract timestamps
correct_1_timestamps = df_simulated.loc[test_y.index, 'Time'][correct_1_indices]
false_negative_timestamps = df_simulated.loc[test_y.index, 'Time'][false_negative_indices]
false_positive_timestamps = df_simulated.loc[test_y.index, 'Time'][false_positive_indices]

# Extract corresponding temperatures
correct_1_temps = df_simulated.loc[test_y.index, test_heat][correct_1_indices]
false_negative_temps = df_simulated.loc[test_y.index, test_heat][false_negative_indices]
false_positive_temps = df_simulated.loc[test_y.index, test_heat][false_positive_indices]

# Add markers for correct predictions (Green)
fig.add_trace(go.Scatter(
    x=correct_1_timestamps,
    y=correct_1_temps,
    mode='markers',
    marker=dict(color='green', size=10),
    name='Correct (4 predicted as 4)'
))

# Add markers for false negatives (Red)
fig.add_trace(go.Scatter(
    x=false_negative_timestamps,
    y=false_negative_temps,
    mode='markers',
    marker=dict(color='red', size=10, symbol='x'),
    name='False Negative (4 predicted as 0)'
))

# Add markers for false positives (Orange)
fig.add_trace(go.Scatter(
    x=false_positive_timestamps,
    y=false_positive_temps,
    mode='markers',
    marker=dict(color='orange', size=10, symbol='triangle-up'),
    name='False Positive (0 predicted as 4)'
))

# Customize layout
fig.update_layout(
    title="Heating consumption with faulty operation in zone 8 & Prediction Markers",
    xaxis_title="Time",
    yaxis_title="Heat (kWh)",
    legend_title="Legend",
    template="plotly"
)

# Show the plot

## 3) Baseline + heat + outdoor temp 

In [381]:
df_simulated = pd.read_csv('/Users/sararhiger/Desktop/Master thesis /Data/Processed data/Model improvements/Fault4_baseline_heat_outdoor.csv')



In [382]:
# Define the start and end times for training data
start_time_train = "2023-01-01 00:00:00"  
end_time_train = "2023-12-31 23:00:00"  

# Define the start and end times for testing data
start_time_test = "2023-01-01 00:00:00"  
end_time_test = "2023-12-31 23:00:00"

# Filter DataFrame for timestamps within the range
df_train_simulated = df_simulated[(df_simulated['Time'] >= start_time_train) & (df_simulated['Time'] <= end_time_train)]

df_test_simulated = df_simulated[(df_simulated['Time'] >= start_time_test) & (df_simulated['Time'] <= end_time_test)]

# Zone 1
X_train_zone1_simulated = df_train_simulated[['Outdoor air temp','Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Local heating units, W, zone 1']]
y_train_zone1_simulated = df_train_simulated['FDD, zone 1']
X_test_zone1_simulated = df_test_simulated[['Outdoor air temp','Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Local heating units, W, zone 1']]
y_test_zone1_simulated = df_test_simulated['FDD, zone 1']

# Zone 8
X_train_zone8_simulated = df_train_simulated[[ 'Outdoor air temp','Mean air temperature, degC, zone 8', 'Relative humidity, %, zone 8', 'CO2, ppm (vol), zone 8', 'Local heating units, W, zone 8']]
y_train_zone8_simulated = df_train_simulated['FDD, zone 8']
X_test_zone8_simulated = df_test_simulated[['Outdoor air temp','Mean air temperature, degC, zone 8', 'Relative humidity, %, zone 8', 'CO2, ppm (vol), zone 8', 'Local heating units, W, zone 8']]
y_test_zone8_simulated = df_test_simulated['FDD, zone 8']

In [383]:
train_x = X_train_zone1_simulated
train_y = y_train_zone1_simulated

test_x = X_test_zone8_simulated
test_y = y_test_zone8_simulated

# Define zones and variables for plots 
test_temp = 'Mean air temperature, degC, zone 8'
test_heat = 'Local heating units, W, zone 8'

### Random Forest 

In [384]:
# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)#, class_weight='balanced')

# Train the model
rf_model.fit(train_x, train_y)

# Make predictions
y_pred_train = rf_model.predict(train_x)
y_pred = rf_model.predict(test_x)

# Evaluate the model
print("Accuracy:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))

print(rf_model.feature_importances_)

Accuracy: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8572
           4       1.00      1.00      1.00       188

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy: 0.9970319634703196

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8697
           4       0.91      0.65      0.76        63

    accuracy                           1.00      8760
   macro avg       0.95      0.83      0.88      8760
weighted avg       1.00      1.00      1.00      8760

[0.0221119  0.3574918  0.03435822 0.02878851 0.55724956]


In [385]:
# Store all runs
results = []

for i in range(10):
    rf_model = RandomForestClassifier()
    rf_model.fit(train_x, train_y)

    y_pred = rf_model.predict(test_x)
    report = classification_report(test_y, y_pred, output_dict=True)

    # Store per-class metrics
    run_result = {'run': i + 1}
    for label in report:
        if label not in ('accuracy', 'macro avg', 'weighted avg'):
            run_result[f'precision_class_{label}'] = report[label]['precision']
            run_result[f'recall_class_{label}'] = report[label]['recall']
            run_result[f'f1_class_{label}'] = report[label]['f1-score']
    
    results.append(run_result)

# Results as DataFrame
results_df = pd.DataFrame(results)

# Summary (mean and std)
summary = results_df.describe().loc[['mean', 'std']]

print("\nSummary (Mean ± Std):")
print(summary)

print("\nAll Per-Class Results:")
print(results_df)


Summary (Mean ± Std):
          run  precision_class_0  recall_class_0  f1_class_0  \
mean  5.50000           0.999735        0.998517    0.999125   
std   3.02765           0.000571        0.001211    0.000717   

      precision_class_4  recall_class_4  f1_class_4  
mean           0.835798        0.963492    0.891818  
std            0.108716        0.078852    0.084905  

All Per-Class Results:
   run  precision_class_0  recall_class_0  f1_class_0  precision_class_4  \
0    1           0.999885        0.998505    0.999195           0.826667   
1    2           1.000000        0.999540    0.999770           0.940299   
2    3           0.998275        0.998275    0.998275           0.761905   
3    4           1.000000        0.999425    0.999712           0.926471   
4    5           1.000000        0.999655    0.999827           0.954545   
5    6           1.000000        0.995746    0.997868           0.630000   
6    7           1.000000        0.997930    0.998964           0.

## 4) Baseline + heat + Outdoor + GHI 

In [387]:
df_simulated = pd.read_csv('/Users/sararhiger/Desktop/Master thesis /Data/Processed data/Model improvements/Fault4_baseline_heat_outdoor_GHI.csv')


In [388]:
# Define the start and end times for training data
start_time_train = "2023-01-01 00:00:00"  
end_time_train = "2023-12-31 23:00:00"  

# Define the start and end times for testing data
start_time_test = "2023-01-01 00:00:00"  
end_time_test = "2023-12-31 23:00:00"

# Filter DataFrame for timestamps within the range
df_train_simulated = df_simulated[(df_simulated['Time'] >= start_time_train) & (df_simulated['Time'] <= end_time_train)]

df_test_simulated = df_simulated[(df_simulated['Time'] >= start_time_test) & (df_simulated['Time'] <= end_time_test)]

# Zone 1
X_train_zone1_simulated = df_train_simulated[['Global Horizontal Irradiance, W/m2','Outdoor air temp','Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Local heating units, W, zone 1']]
y_train_zone1_simulated = df_train_simulated['FDD, zone 1']
X_test_zone1_simulated = df_test_simulated[['Global Horizontal Irradiance, W/m2','Outdoor air temp','Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Local heating units, W, zone 1']]
y_test_zone1_simulated = df_test_simulated['FDD, zone 1']

# Zone 8
X_train_zone8_simulated = df_train_simulated[['Global Horizontal Irradiance, W/m2', 'Outdoor air temp','Mean air temperature, degC, zone 8', 'Relative humidity, %, zone 8', 'CO2, ppm (vol), zone 8', 'Local heating units, W, zone 8']]
y_train_zone8_simulated = df_train_simulated['FDD, zone 8']
X_test_zone8_simulated = df_test_simulated[['Global Horizontal Irradiance, W/m2', 'Outdoor air temp','Mean air temperature, degC, zone 8', 'Relative humidity, %, zone 8', 'CO2, ppm (vol), zone 8', 'Local heating units, W, zone 8']]
y_test_zone8_simulated = df_test_simulated['FDD, zone 8']

In [389]:
train_x = X_train_zone1_simulated
train_y = y_train_zone1_simulated

test_x = X_test_zone8_simulated
test_y = y_test_zone8_simulated

# Define zones and variables for plots 
test_temp = 'Mean air temperature, degC, zone 8'
test_heat = 'Local heating units, W, zone 8'

In [390]:
# Create figure
fig = go.Figure()

# Plot the temperature curve
fig.add_trace(go.Scatter(
    x=df_simulated['Time'], 
    y=df_simulated['Mean air temperature, degC, zone 1'], 
    mode='lines', 
    name='Zone 1 faulty operation open window during heating'
))

# Find timestamps where "FDD, zone 1" is NOT 0
fault_indices = df_simulated['FDD, zone 1'] != 0
fault_times = df_simulated.loc[fault_indices, 'Time']
fault_temps = df_simulated.loc[fault_indices, 'Mean air temperature, degC, zone 1']

# Add markers for fault occurrences
fig.add_trace(go.Scatter(
    x=fault_times,
    y=fault_temps,
    mode='markers',
    marker=dict(color='red', size=10, symbol='x'),
    name='Fault implemented (FDD ≠ 0)'
))

# Customize layout
fig.update_layout(
    title="Temperature with fault 4 implemented in Zone 1",
    xaxis_title="Time",
    yaxis_title="Temperature (°C)",
    legend_title="Legend",
    template="plotly"
)

# Show the plot
fig.show()

### Random forest

In [391]:
# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)#, class_weight='balanced')

# Train the model
rf_model.fit(train_x, train_y)

# Make predictions
y_pred_train = rf_model.predict(train_x)
y_pred = rf_model.predict(test_x)

# Evaluate the model
print("Accuracy:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))

print(rf_model.feature_importances_)

Accuracy: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8572
           4       1.00      1.00      1.00       188

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy: 0.9987442922374429

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8697
           4       0.85      1.00      0.92        63

    accuracy                           1.00      8760
   macro avg       0.93      1.00      0.96      8760
weighted avg       1.00      1.00      1.00      8760

[0.02083009 0.02255519 0.31722544 0.03998682 0.02048719 0.57891527]


In [392]:
# Store all runs
results = []

for i in range(10):
    rf_model = RandomForestClassifier()
    rf_model.fit(train_x, train_y)

    y_pred = rf_model.predict(test_x)
    report = classification_report(test_y, y_pred, output_dict=True)

    # Store per-class metrics
    run_result = {'run': i + 1}
    for label in report:
        if label not in ('accuracy', 'macro avg', 'weighted avg'):
            run_result[f'precision_class_{label}'] = report[label]['precision']
            run_result[f'recall_class_{label}'] = report[label]['recall']
            run_result[f'f1_class_{label}'] = report[label]['f1-score']
    
    results.append(run_result)

# Results as DataFrame
results_df = pd.DataFrame(results)

# Summary (mean and std)
summary = results_df.describe().loc[['mean', 'std']]

print("\nSummary (Mean ± Std):")
print(summary)

print("\nAll Per-Class Results:")
print(results_df)


Summary (Mean ± Std):
          run  precision_class_0  recall_class_0  f1_class_0  \
mean  5.50000           0.998841        0.999494    0.999167   
std   3.02765           0.001384        0.000238    0.000692   

      precision_class_4  recall_class_4  f1_class_4  
mean           0.921612        0.839683    0.868377  
std            0.032659        0.191568    0.123246  

All Per-Class Results:
   run  precision_class_0  recall_class_0  f1_class_0  precision_class_4  \
0    1           0.996104        0.999655    0.997877           0.906250   
1    2           1.000000        0.999540    0.999770           0.940299   
2    3           0.998850        0.998850    0.998850           0.841270   
3    4           1.000000        0.999655    0.999827           0.954545   
4    5           1.000000        0.999540    0.999770           0.940299   
5    6           0.997361        0.999655    0.998507           0.930233   
6    7           0.999885        0.999540    0.999712           0.

In [393]:
# Compute confusion matrix
cm = confusion_matrix(test_y, y_pred)

# Ensure labels are 0 and 4
labels = [0, 4]  

# Create the confusion matrix heatmap
fig = go.Figure(data=go.Heatmap(
    z=cm,
    x=labels,  # Predicted labels
    y=labels,  # Actual labels
    colorscale='Blues',
    showscale=True,
    text=cm,  # Display numbers in the cells
    texttemplate="%{text}",  # Ensure counts appear inside the cells
    hovertemplate='Actual: %{y}<br>Predicted: %{x}<br>Count: %{z}<extra></extra>'
))

# Update layout for binary classification (0 and 1 on axes)
fig.update_layout(
    title="Confusion Matrix for zone 8 fault 4 ",
    width=500,   # Reduce width
    height=400,
    xaxis=dict(
        title="Predicted Label",
        tickmode='array',        # Custom tick labels
        tickvals=[0, 4],         # Only show 0 and 1
        showgrid=False
    ),
    yaxis=dict(
        title="Actual Label",
        tickmode='array',
        tickvals=[0, 4],         # Only show 0 and 1
        showgrid=False
    ),
    template="plotly_white"
)

# Show the plot
fig.show()

In [394]:
fig = go.Figure()

# Base temperature curve
fig.add_trace(go.Scatter(
    x=df_simulated['Time'], 
    y=df_simulated[test_heat], 
    mode='lines', 
    name='Zone 8 faulty operation'
))

# Convert y_pred to a Pandas Series with the correct index
y_pred_series = pd.Series(y_pred, index=test_y.index)

# Find indices for different cases
correct_1_indices = (test_y == 4) & (y_pred_series == 4)  # True 1, Pred 1 -> Green
false_negative_indices = (test_y == 4) & (y_pred_series == 0)  # True 1, Pred 0 -> Red
false_positive_indices = (test_y == 0) & (y_pred_series == 4)  # True 0, Pred 1 -> Orange

# Extract timestamps
correct_1_timestamps = df_simulated.loc[test_y.index, 'Time'][correct_1_indices]
false_negative_timestamps = df_simulated.loc[test_y.index, 'Time'][false_negative_indices]
false_positive_timestamps = df_simulated.loc[test_y.index, 'Time'][false_positive_indices]

# Extract corresponding temperatures
correct_1_temps = df_simulated.loc[test_y.index, test_heat][correct_1_indices]
false_negative_temps = df_simulated.loc[test_y.index, test_heat][false_negative_indices]
false_positive_temps = df_simulated.loc[test_y.index, test_heat][false_positive_indices]

# Add markers for correct predictions (Green)
fig.add_trace(go.Scatter(
    x=correct_1_timestamps,
    y=correct_1_temps,
    mode='markers',
    marker=dict(color='green', size=10),
    name='Correct (4 predicted as 4)'
))

# Add markers for false negatives (Red)
fig.add_trace(go.Scatter(
    x=false_negative_timestamps,
    y=false_negative_temps,
    mode='markers',
    marker=dict(color='red', size=10, symbol='x'),
    name='False Negative (4 predicted as 0)'
))

# Add markers for false positives (Orange)
fig.add_trace(go.Scatter(
    x=false_positive_timestamps,
    y=false_positive_temps,
    mode='markers',
    marker=dict(color='orange', size=10, symbol='triangle-up'),
    name='False Positive (0 predicted as 4)'
))

# Customize layout
fig.update_layout(
    title="Heating consumption with faulty operation in zone 8 & prediction markers",
    xaxis_title="Time",
    yaxis_title="Heat (kWh)",
    legend_title="Legend",
    template="plotly", 
    height = 500,
    width = 1200
)

# Show the plot

## 5) Normalized temperature

In [395]:
df_simulated = pd.read_csv('/Users/sararhiger/Desktop/Master thesis /Data/Processed data/Model improvements/Fault4_baseline_outdoor_GHI_heat_temp.csv')

In [396]:
# Define the start and end times for training data
start_time_train = "2023-01-01 00:00:00"  
end_time_train = "2023-12-31 23:00:00"  

# Define the start and end times for testing data
start_time_test = "2023-01-01 00:00:00"  
end_time_test = "2023-12-31 23:00:00"

# Filter DataFrame for timestamps within the range
df_train_simulated = df_simulated[(df_simulated['Time'] >= start_time_train) & (df_simulated['Time'] <= end_time_train)]

df_test_simulated = df_simulated[(df_simulated['Time'] >= start_time_test) & (df_simulated['Time'] <= end_time_test)]

# Zone 1
X_train_zone1_simulated = df_train_simulated[['Global Horizontal Irradiance, W/m2','Outdoor air temp','Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Local heating units, W, zone 1']]
y_train_zone1_simulated = df_train_simulated['FDD, zone 1']
X_test_zone1_simulated = df_test_simulated[['Global Horizontal Irradiance, W/m2','Outdoor air temp','Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Local heating units, W, zone 1']]
y_test_zone1_simulated = df_test_simulated['FDD, zone 1']

# Zone 8
X_train_zone8_simulated = df_train_simulated[['Global Horizontal Irradiance, W/m2', 'Outdoor air temp','Mean air temperature, degC, zone 8', 'Relative humidity, %, zone 8', 'CO2, ppm (vol), zone 8', 'Local heating units, W, zone 8']]
y_train_zone8_simulated = df_train_simulated['FDD, zone 8']
X_test_zone8_simulated = df_test_simulated[['Global Horizontal Irradiance, W/m2', 'Outdoor air temp','Mean air temperature, degC, zone 8', 'Relative humidity, %, zone 8', 'CO2, ppm (vol), zone 8', 'Local heating units, W, zone 8']]
y_test_zone8_simulated = df_test_simulated['FDD, zone 8']

In [397]:
train_x = X_train_zone1_simulated
train_y = y_train_zone1_simulated

test_x = X_test_zone8_simulated
test_y = y_test_zone8_simulated

# Define zones and variables for plots 
test_temp = 'Mean air temperature, degC, zone 8'
test_heat = 'Local heating units, W, zone 8'

In [398]:
# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)#, class_weight='balanced')

# Train the model
rf_model.fit(train_x, train_y)

# Make predictions
y_pred_train = rf_model.predict(train_x)
y_pred = rf_model.predict(test_x)

# Evaluate the model
print("Accuracy:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))

print(rf_model.feature_importances_)

Accuracy: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8760

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8760

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760

[0. 0. 0. 0. 0. 0.]


In [399]:
# Store all runs
results = []

for i in range(10):
    rf_model = RandomForestClassifier()
    rf_model.fit(train_x, train_y)

    y_pred = rf_model.predict(test_x)
    report = classification_report(test_y, y_pred, output_dict=True)

    # Store per-class metrics
    run_result = {'run': i + 1}
    for label in report:
        if label not in ('accuracy', 'macro avg', 'weighted avg'):
            run_result[f'precision_class_{label}'] = report[label]['precision']
            run_result[f'recall_class_{label}'] = report[label]['recall']
            run_result[f'f1_class_{label}'] = report[label]['f1-score']
    
    results.append(run_result)

# Results as DataFrame
results_df = pd.DataFrame(results)

# Summary (mean and std)
summary = results_df.describe().loc[['mean', 'std']]

print("\nSummary (Mean ± Std):")
print(summary)

print("\nAll Per-Class Results:")
print(results_df)


Summary (Mean ± Std):
          run  precision_class_0  recall_class_0  f1_class_0
mean  5.50000                1.0             1.0         1.0
std   3.02765                0.0             0.0         0.0

All Per-Class Results:
   run  precision_class_0  recall_class_0  f1_class_0
0    1                1.0             1.0         1.0
1    2                1.0             1.0         1.0
2    3                1.0             1.0         1.0
3    4                1.0             1.0         1.0
4    5                1.0             1.0         1.0
5    6                1.0             1.0         1.0
6    7                1.0             1.0         1.0
7    8                1.0             1.0         1.0
8    9                1.0             1.0         1.0
9   10                1.0             1.0         1.0


In [400]:
fig = go.Figure()

# Base temperature curve
fig.add_trace(go.Scatter(
    x=df_simulated['Time'], 
    y=df_simulated[test_heat], 
    mode='lines', 
    name='Zone 8 faulty operation'
))

# Convert y_pred to a Pandas Series with the correct index
y_pred_series = pd.Series(y_pred, index=test_y.index)

# Find indices for different cases
correct_1_indices = (test_y == 4) & (y_pred_series == 4)  # True 1, Pred 1 -> Green
false_negative_indices = (test_y == 4) & (y_pred_series == 0)  # True 1, Pred 0 -> Red
false_positive_indices = (test_y == 0) & (y_pred_series == 4)  # True 0, Pred 1 -> Orange

# Extract timestamps
correct_1_timestamps = df_simulated.loc[test_y.index, 'Time'][correct_1_indices]
false_negative_timestamps = df_simulated.loc[test_y.index, 'Time'][false_negative_indices]
false_positive_timestamps = df_simulated.loc[test_y.index, 'Time'][false_positive_indices]

# Extract corresponding temperatures
correct_1_temps = df_simulated.loc[test_y.index, test_heat][correct_1_indices]
false_negative_temps = df_simulated.loc[test_y.index, test_heat][false_negative_indices]
false_positive_temps = df_simulated.loc[test_y.index, test_heat][false_positive_indices]

# Add markers for correct predictions (Green)
fig.add_trace(go.Scatter(
    x=correct_1_timestamps,
    y=correct_1_temps,
    mode='markers',
    marker=dict(color='green', size=10),
    name='Correct (4 predicted as 4)'
))

# Add markers for false negatives (Red)
fig.add_trace(go.Scatter(
    x=false_negative_timestamps,
    y=false_negative_temps,
    mode='markers',
    marker=dict(color='red', size=10, symbol='x'),
    name='False Negative (4 predicted as 0)'
))

# Add markers for false positives (Orange)
fig.add_trace(go.Scatter(
    x=false_positive_timestamps,
    y=false_positive_temps,
    mode='markers',
    marker=dict(color='orange', size=10, symbol='triangle-up'),
    name='False Positive (0 predicted as 4)'
))

# Customize layout
fig.update_layout(
    title="Heating consumption with faulty operation in zone 8 & prediction markers",
    xaxis_title="Time",
    yaxis_title="Heat (kWh)",
    legend_title="Legend",
    template="plotly", 
    height = 500,
    width = 1200
)

# Show the plot

In [401]:
df_simulated = pd.read_csv('/Users/sararhiger/Desktop/Master thesis /Data/Processed data/Model improvements/Fault4_baseline_heat_temp.csv')

In [402]:
# Define the start and end times for training data
start_time_train = "2023-01-01 00:00:00"  
end_time_train = "2023-12-31 23:00:00"  

# Define the start and end times for testing data
start_time_test = "2023-01-01 00:00:00"  
end_time_test = "2023-12-31 23:00:00"

# Filter DataFrame for timestamps within the range
df_train_simulated = df_simulated[(df_simulated['Time'] >= start_time_train) & (df_simulated['Time'] <= end_time_train)]

df_test_simulated = df_simulated[(df_simulated['Time'] >= start_time_test) & (df_simulated['Time'] <= end_time_test)]

# Zone 1
X_train_zone1_simulated = df_train_simulated[['Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Local heating units, W, zone 1']]
y_train_zone1_simulated = df_train_simulated['FDD, zone 1']
X_test_zone1_simulated = df_test_simulated[['Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Local heating units, W, zone 1']]
y_test_zone1_simulated = df_test_simulated['FDD, zone 1']

# Zone 8
X_train_zone8_simulated = df_train_simulated[['Mean air temperature, degC, zone 8', 'Relative humidity, %, zone 8', 'CO2, ppm (vol), zone 8', 'Local heating units, W, zone 8']]
y_train_zone8_simulated = df_train_simulated['FDD, zone 8']
X_test_zone8_simulated = df_test_simulated[['Mean air temperature, degC, zone 8', 'Relative humidity, %, zone 8', 'CO2, ppm (vol), zone 8', 'Local heating units, W, zone 8']]
y_test_zone8_simulated = df_test_simulated['FDD, zone 8']

In [403]:
train_x = X_train_zone1_simulated
train_y = y_train_zone1_simulated

test_x = X_test_zone8_simulated
test_y = y_test_zone8_simulated

# Define zones and variables for plots 
test_temp = 'Mean air temperature, degC, zone 8'
test_heat = 'Local heating units, W, zone 8'

In [404]:
# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)#, class_weight='balanced')

# Train the model
rf_model.fit(train_x, train_y)

# Make predictions
y_pred_train = rf_model.predict(train_x)
y_pred = rf_model.predict(test_x)

# Evaluate the model
print("Accuracy:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))

print(rf_model.feature_importances_)

Accuracy: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8572
           4       1.00      1.00      1.00       188

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy: 0.9948630136986302

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8697
           4       0.88      0.33      0.48        63

    accuracy                           0.99      8760
   macro avg       0.94      0.67      0.74      8760
weighted avg       0.99      0.99      0.99      8760

[0.4140711  0.03755824 0.0295789  0.51879177]


In [405]:
# Store all runs
results = []

for i in range(10):
    rf_model = RandomForestClassifier()
    rf_model.fit(train_x, train_y)

    y_pred = rf_model.predict(test_x)
    report = classification_report(test_y, y_pred, output_dict=True)

    # Store per-class metrics
    run_result = {'run': i + 1}
    for label in report:
        if label not in ('accuracy', 'macro avg', 'weighted avg'):
            run_result[f'precision_class_{label}'] = report[label]['precision']
            run_result[f'recall_class_{label}'] = report[label]['recall']
            run_result[f'f1_class_{label}'] = report[label]['f1-score']
    
    results.append(run_result)

# Results as DataFrame
results_df = pd.DataFrame(results)

# Summary (mean and std)
summary = results_df.describe().loc[['mean', 'std']]

print("\nSummary (Mean ± Std):")
print(summary)

print("\nAll Per-Class Results:")
print(results_df)


Summary (Mean ± Std):
          run  precision_class_0  recall_class_0  f1_class_0  \
mean  5.50000           0.997709        0.999011    0.998357   
std   3.02765           0.002244        0.001798    0.001241   

      precision_class_4  recall_class_4  f1_class_4  
mean           0.860147        0.682540    0.717922  
std            0.134078        0.311406    0.240229  

All Per-Class Results:
   run  precision_class_0  recall_class_0  f1_class_0  precision_class_4  \
0    1           0.999310        0.999655    0.999483           0.950000   
1    2           0.994395        0.999540    0.996961           0.777778   
2    3           0.999306        0.993906    0.996599           0.518182   
3    4           0.995078        0.999655    0.997361           0.869565   
4    5           0.999540        0.999655    0.999598           0.951613   
5    6           0.999540        0.999540    0.999540           0.936508   
6    7           0.994850        0.999540    0.997190           0.

In [406]:
fig = go.Figure()

# Base temperature curve
fig.add_trace(go.Scatter(
    x=df_simulated['Time'], 
    y=df_simulated[test_heat], 
    mode='lines', 
    name='Zone 8 faulty operation'
))

# Convert y_pred to a Pandas Series with the correct index
y_pred_series = pd.Series(y_pred, index=test_y.index)

# Find indices for different cases
correct_1_indices = (test_y == 4) & (y_pred_series == 4)  # True 1, Pred 1 -> Green
false_negative_indices = (test_y == 4) & (y_pred_series == 0)  # True 1, Pred 0 -> Red
false_positive_indices = (test_y == 0) & (y_pred_series == 4)  # True 0, Pred 1 -> Orange

# Extract timestamps
correct_1_timestamps = df_simulated.loc[test_y.index, 'Time'][correct_1_indices]
false_negative_timestamps = df_simulated.loc[test_y.index, 'Time'][false_negative_indices]
false_positive_timestamps = df_simulated.loc[test_y.index, 'Time'][false_positive_indices]

# Extract corresponding temperatures
correct_1_temps = df_simulated.loc[test_y.index, test_heat][correct_1_indices]
false_negative_temps = df_simulated.loc[test_y.index, test_heat][false_negative_indices]
false_positive_temps = df_simulated.loc[test_y.index, test_heat][false_positive_indices]

# Add markers for correct predictions (Green)
fig.add_trace(go.Scatter(
    x=correct_1_timestamps,
    y=correct_1_temps,
    mode='markers',
    marker=dict(color='green', size=10),
    name='Correct (4 predicted as 4)'
))

# Add markers for false negatives (Red)
fig.add_trace(go.Scatter(
    x=false_negative_timestamps,
    y=false_negative_temps,
    mode='markers',
    marker=dict(color='red', size=10, symbol='x'),
    name='False Negative (4 predicted as 0)'
))

# Add markers for false positives (Orange)
fig.add_trace(go.Scatter(
    x=false_positive_timestamps,
    y=false_positive_temps,
    mode='markers',
    marker=dict(color='orange', size=10, symbol='triangle-up'),
    name='False Positive (0 predicted as 4)'
))

# Customize layout
fig.update_layout(
    title="Heating consumption with faulty operation in zone 8 & prediction markers",
    xaxis_title="Time",
    yaxis_title="Heat (kWh)",
    legend_title="Legend",
    template="plotly", 
    height = 500,
    width = 1200
)

# Show the plot