## Final models on simulated datasets (Training on zone 1 and testing on zone 8)
### Energy consumption normalized and GHI added 

##### *Same experiments as in file RF_OneClass_Final.ipynb to compare the results*

In [292]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, accuracy_score

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter

## 1) Fault 1
### Training zone 1 and testing on zone 8 the whole year

In [293]:
df_simulated = pd.read_csv('/Users/sararhiger/Desktop/Master thesis /Data/Processed data/Fault1_Window_dataprocessed.csv')

In [294]:
# Define the start and end times for training data
start_time_train = "2023-01-01 00:00:00"  
end_time_train = "2023-12-31 23:00:00"  

# Define the start and end times for testing data
start_time_test = "2023-01-01 00:00:00"  
end_time_test = "2023-12-31 23:00:00"

# Filter DataFrame for timestamps within the range
df_train_simulated = df_simulated[(df_simulated['Time'] >= start_time_train) & (df_simulated['Time'] <= end_time_train)]

df_test_simulated = df_simulated[(df_simulated['Time'] >= start_time_test) & (df_simulated['Time'] <= end_time_test)]

# Zone 1
X_train_zone1_simulated = df_train_simulated[['Global Horizontal Irradiance, W/m2','Outdoor air temp','Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Local heating units, W, zone 1']]
y_train_zone1_simulated = df_train_simulated['FDD, zone 1']
X_test_zone1_simulated = df_test_simulated[['Global Horizontal Irradiance, W/m2','Outdoor air temp','Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Local heating units, W, zone 1']]
y_test_zone1_simulated = df_test_simulated['FDD, zone 1']

# Zone 8
X_train_zone8_simulated = df_train_simulated[['Global Horizontal Irradiance, W/m2', 'Outdoor air temp','Mean air temperature, degC, zone 8', 'Relative humidity, %, zone 8', 'CO2, ppm (vol), zone 8', 'Local heating units, W, zone 8']]
y_train_zone8_simulated = df_train_simulated['FDD, zone 8']
X_test_zone8_simulated = df_test_simulated[['Global Horizontal Irradiance, W/m2', 'Outdoor air temp','Mean air temperature, degC, zone 8', 'Relative humidity, %, zone 8', 'CO2, ppm (vol), zone 8', 'Local heating units, W, zone 8']]
y_test_zone8_simulated = df_test_simulated['FDD, zone 8']

In [295]:
train_x = X_train_zone1_simulated
train_y = y_train_zone1_simulated

test_x = X_test_zone8_simulated
test_y = y_test_zone8_simulated

# Define zones and variables for plots 
test_temp = 'Mean air temperature, degC, zone 8'
test_heat = 'Local heating units, W, zone 8'

### Random forest 

In [296]:
# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(class_weight='balanced')

# Train the model
rf_model.fit(train_x, train_y)

# Make predictions
y_pred_train = rf_model.predict(train_x)
y_pred = rf_model.predict(test_x)

# Evaluate the model
print("Accuracy:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))

print(rf_model.feature_importances_)

Accuracy: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8712
           1       1.00      1.00      1.00        48

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy: 0.9990867579908675

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8747
           1       0.86      0.46      0.60        13

    accuracy                           1.00      8760
   macro avg       0.93      0.73      0.80      8760
weighted avg       1.00      1.00      1.00      8760

[0.08439283 0.08202836 0.28606189 0.01935506 0.055073   0.47308886]


In [297]:
# Store all runs
results = []

for i in range(10):
    rf_model = RandomForestClassifier()
    rf_model.fit(train_x, train_y)

    y_pred = rf_model.predict(test_x)
    report = classification_report(test_y, y_pred, output_dict=True)

    # Store per-class metrics
    run_result = {'run': i + 1}
    for label in report:
        if label not in ('accuracy', 'macro avg', 'weighted avg'):
            run_result[f'precision_class_{label}'] = report[label]['precision']
            run_result[f'recall_class_{label}'] = report[label]['recall']
            run_result[f'f1_class_{label}'] = report[label]['f1-score']
    
    results.append(run_result)

# Results as DataFrame
results_df = pd.DataFrame(results)

# Summary (mean and std)
summary = results_df.describe().loc[['mean', 'std']]

print("\nSummary (Mean ± Std):")
print(summary)

print("\nAll Per-Class Results:")
print(results_df)


Summary (Mean ± Std):
          run  precision_class_0  recall_class_0  f1_class_0  \
mean  5.50000           0.999177    9.998857e-01    0.999531   
std   3.02765           0.000048    2.340556e-16    0.000024   

      precision_class_1  recall_class_1  f1_class_1  
mean           0.852381        0.446154    0.585263  
std            0.010039        0.032434    0.031068  

All Per-Class Results:
   run  precision_class_0  recall_class_0  f1_class_0  precision_class_1  \
0    1           0.999200        0.999886    0.999543           0.857143   
1    2           0.999200        0.999886    0.999543           0.857143   
2    3           0.999200        0.999886    0.999543           0.857143   
3    4           0.999200        0.999886    0.999543           0.857143   
4    5           0.999200        0.999886    0.999543           0.857143   
5    6           0.999086        0.999886    0.999486           0.833333   
6    7           0.999200        0.999886    0.999543           0.

In [298]:
# Compute confusion matrix
cm = confusion_matrix(test_y, y_pred)

# Ensure labels are 0 and 1
labels = [0, 1]  

# Create the confusion matrix heatmap
fig = go.Figure(data=go.Heatmap(
    z=cm,
    x=labels,  # Predicted labels
    y=labels,  # Actual labels
    colorscale='Blues',
    showscale=True,
    text=cm,  # Display numbers in the cells
    texttemplate="%{text}",  # Ensure counts appear inside the cells
    hovertemplate='Actual: %{y}<br>Predicted: %{x}<br>Count: %{z}<extra></extra>'
))

# Update layout for binary classification (0 and 1 on axes)
fig.update_layout(
    title="Confusion Matrix for zone 8 opened window",
    width=400,   # Reduce width
    height=400,
    xaxis=dict(
        title="Predicted Label",
        tickmode='array',        # Custom tick labels
        tickvals=[0, 1],         # Only show 0 and 1
        showgrid=False
    ),
    yaxis=dict(
        title="Actual Label",
        tickmode='array',
        tickvals=[0, 1],         # Only show 0 and 1
        showgrid=False
    ),
    template="plotly_white"
)

# Show the plot
fig.show()

In [299]:
fig = go.Figure()

# Base temperature curve
fig.add_trace(go.Scatter(
    x=df_simulated['Time'], 
    y=df_simulated[test_temp], 
    mode='lines', 
    name='Zone 8 faulty operation'
))

# Convert y_pred to a Pandas Series with the correct index
y_pred_series = pd.Series(y_pred, index=test_y.index)

# Find indices for different cases
correct_1_indices = (test_y == 1) & (y_pred_series == 1)  # True 1, Pred 1 -> Green
false_negative_indices = (test_y == 1) & (y_pred_series == 0)  # True 1, Pred 0 -> Red
false_positive_indices = (test_y == 0) & (y_pred_series == 1)  # True 0, Pred 1 -> Orange

# Extract timestamps
correct_1_timestamps = df_simulated.loc[test_y.index, 'Time'][correct_1_indices]
false_negative_timestamps = df_simulated.loc[test_y.index, 'Time'][false_negative_indices]
false_positive_timestamps = df_simulated.loc[test_y.index, 'Time'][false_positive_indices]

# Extract corresponding temperatures
correct_1_temps = df_simulated.loc[test_y.index, test_temp][correct_1_indices]
false_negative_temps = df_simulated.loc[test_y.index, test_temp][false_negative_indices]
false_positive_temps = df_simulated.loc[test_y.index, test_temp][false_positive_indices]

# Add markers for correct predictions (Green)
fig.add_trace(go.Scatter(
    x=correct_1_timestamps,
    y=correct_1_temps,
    mode='markers',
    marker=dict(color='green', size=10),
    name='Correct (1 predicted as 1)'
))

# Add markers for false negatives (Red)
fig.add_trace(go.Scatter(
    x=false_negative_timestamps,
    y=false_negative_temps,
    mode='markers',
    marker=dict(color='red', size=10, symbol='x'),
    name='False Negative (1 predicted as 0)'
))

# Add markers for false positives (Orange)
fig.add_trace(go.Scatter(
    x=false_positive_timestamps,
    y=false_positive_temps,
    mode='markers',
    marker=dict(color='orange', size=10, symbol='triangle-up'),
    name='False Positive (0 predicted as 1)'
))

# Customize layout
fig.update_layout(
    title="Temperatures with faulty operation in zone 8 & Prediction Markers",
    xaxis_title="Time",
    yaxis_title="Temperature (°C)",
    legend_title="Legend",
    template="plotly"
)

# Show the plot
fig.show()

### XGboost

In [300]:
# Initialize the XGBoost Classifier
xgb_model = XGBClassifier()

# Train the model
xgb_model.fit(train_x, train_y)

# Make predictions
y_pred_train = xgb_model.predict(train_x)
y_pred = xgb_model.predict(test_x)

# Evaluate the model
print("Accuracy on Train Set:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy on Test Set:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))

Accuracy on Train Set: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8712
           1       1.00      1.00      1.00        48

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy on Test Set: 0.9987442922374429

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8747
           1       0.75      0.23      0.35        13

    accuracy                           1.00      8760
   macro avg       0.87      0.62      0.68      8760
weighted avg       1.00      1.00      1.00      8760



In [301]:
# Store all runs
results = []

for i in range(10):
    xgb_model = XGBClassifier()
    xgb_model.fit(train_x, train_y)

    y_pred = xgb_model.predict(test_x)
    report = classification_report(test_y, y_pred, output_dict=True)

    # Store per-class metrics
    run_result = {'run': i + 1}
    for label in report:
        if label not in ('accuracy', 'macro avg', 'weighted avg'):
            run_result[f'precision_class_{label}'] = report[label]['precision']
            run_result[f'recall_class_{label}'] = report[label]['recall']
            run_result[f'f1_class_{label}'] = report[label]['f1-score']
    
    results.append(run_result)

# Results as DataFrame
results_df = pd.DataFrame(results)

# Summary (mean and std)
summary = results_df.describe().loc[['mean', 'std']]

print("\nSummary (Mean ± Std):")
print(summary)

print("\nAll Per-Class Results:")
print(results_df)


Summary (Mean ± Std):
          run  precision_class_0  recall_class_0    f1_class_0  \
mean  5.50000           0.998858    9.998857e-01  9.993715e-01   
std   3.02765           0.000000    2.340556e-16  1.170278e-16   

      precision_class_1  recall_class_1    f1_class_1  
mean               0.75        0.230769  3.529412e-01  
std                0.00        0.000000  5.851389e-17  

All Per-Class Results:
   run  precision_class_0  recall_class_0  f1_class_0  precision_class_1  \
0    1           0.998858        0.999886    0.999372               0.75   
1    2           0.998858        0.999886    0.999372               0.75   
2    3           0.998858        0.999886    0.999372               0.75   
3    4           0.998858        0.999886    0.999372               0.75   
4    5           0.998858        0.999886    0.999372               0.75   
5    6           0.998858        0.999886    0.999372               0.75   
6    7           0.998858        0.999886    0.999372 

## 2) Fault 2 - setpoint 0 

In [302]:
df_simulated = pd.read_csv('/Users/sararhiger/Desktop/Master thesis /Data/Processed data/Fault2_LowSetpoint_dataprocessed.csv')


In [303]:
# Define the start and end times for training data
start_time_train = "2023-01-01 00:00:00"  
end_time_train = "2023-12-31 23:00:00"  

# Define the start and end times for testing data
start_time_test = "2023-01-01 00:00:00"  
end_time_test = "2023-12-31 23:00:00"

# Filter DataFrame for timestamps within the range
df_train_simulated = df_simulated[(df_simulated['Time'] >= start_time_train) & (df_simulated['Time'] <= end_time_train)]

df_test_simulated = df_simulated[(df_simulated['Time'] >= start_time_test) & (df_simulated['Time'] <= end_time_test)]

# Zone 1
X_train_zone1_simulated = df_train_simulated[['Global Horizontal Irradiance, W/m2','Outdoor air temp','Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Local heating units, W, zone 1']]
y_train_zone1_simulated = df_train_simulated['FDD, zone 1']
X_test_zone1_simulated = df_test_simulated[['Global Horizontal Irradiance, W/m2','Outdoor air temp','Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Local heating units, W, zone 1']]
y_test_zone1_simulated = df_test_simulated['FDD, zone 1']

# Zone 8
X_train_zone8_simulated = df_train_simulated[['Global Horizontal Irradiance, W/m2', 'Outdoor air temp','Mean air temperature, degC, zone 8', 'Relative humidity, %, zone 8', 'CO2, ppm (vol), zone 8', 'Local heating units, W, zone 8']]
y_train_zone8_simulated = df_train_simulated['FDD, zone 8']
X_test_zone8_simulated = df_test_simulated[['Global Horizontal Irradiance, W/m2', 'Outdoor air temp','Mean air temperature, degC, zone 8', 'Relative humidity, %, zone 8', 'CO2, ppm (vol), zone 8', 'Local heating units, W, zone 8']]
y_test_zone8_simulated = df_test_simulated['FDD, zone 8']

In [304]:
train_x = X_train_zone1_simulated
train_y = y_train_zone1_simulated

test_x = X_test_zone8_simulated
test_y = y_test_zone8_simulated

# Define zones and variables for plots 
test_temp = 'Mean air temperature, degC, zone 8'
test_heat = 'Local heating units, W, zone 8'

### Random forest

In [305]:
# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(class_weight='balanced')

# Train the model
rf_model.fit(train_x, train_y)

# Make predictions
y_pred_train = rf_model.predict(train_x)
y_pred = rf_model.predict(test_x)

# Evaluate the model
print("Accuracy:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))

print(rf_model.feature_importances_)

Accuracy: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8657
           2       1.00      1.00      1.00       103

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy: 0.9963470319634703

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8689
           2       1.00      0.55      0.71        71

    accuracy                           1.00      8760
   macro avg       1.00      0.77      0.85      8760
weighted avg       1.00      1.00      1.00      8760

[0.08586644 0.18534767 0.37763584 0.16829202 0.05959547 0.12326256]


In [306]:
# Store all runs
results = []

for i in range(10):
    rf_model = RandomForestClassifier()
    rf_model.fit(train_x, train_y)

    y_pred = rf_model.predict(test_x)
    report = classification_report(test_y, y_pred, output_dict=True)

    # Store per-class metrics
    run_result = {'run': i + 1}
    for label in report:
        if label not in ('accuracy', 'macro avg', 'weighted avg'):
            run_result[f'precision_class_{label}'] = report[label]['precision']
            run_result[f'recall_class_{label}'] = report[label]['recall']
            run_result[f'f1_class_{label}'] = report[label]['f1-score']
    
    results.append(run_result)

# Results as DataFrame
results_df = pd.DataFrame(results)

# Summary (mean and std)
summary = results_df.describe().loc[['mean', 'std']]

print("\nSummary (Mean ± Std):")
print(summary)

print("\nAll Per-Class Results:")
print(results_df)


Summary (Mean ± Std):
          run  precision_class_0  recall_class_0  f1_class_0  \
mean  5.50000       9.963302e-01        0.999862    0.998093   
std   3.02765       5.541545e-07        0.000152    0.000076   

      precision_class_2  recall_class_2  f1_class_2  
mean           0.971064    5.492958e-01    0.701527  
std            0.031035    1.170278e-16    0.008229  

All Per-Class Results:
   run  precision_class_0  recall_class_0  f1_class_0  precision_class_2  \
0    1           0.996331        1.000000    0.998162           1.000000   
1    2           0.996331        1.000000    0.998162           1.000000   
2    3           0.996330        0.999885    0.998104           0.975000   
3    4           0.996330        0.999770    0.998047           0.951220   
4    5           0.996331        1.000000    0.998162           1.000000   
5    6           0.996330        0.999770    0.998047           0.951220   
6    7           0.996330        0.999885    0.998104           0.

In [307]:
# Compute confusion matrix
cm = confusion_matrix(test_y, y_pred)

# Ensure labels are 0 and 1
labels = [0, 2]  

# Create the confusion matrix heatmap
fig = go.Figure(data=go.Heatmap(
    z=cm,
    x=labels,  # Predicted labels
    y=labels,  # Actual labels
    colorscale='Blues',
    showscale=True,
    text=cm,  # Display numbers in the cells
    texttemplate="%{text}",  # Ensure counts appear inside the cells
    hovertemplate='Actual: %{y}<br>Predicted: %{x}<br>Count: %{z}<extra></extra>'
))

# Update layout for binary classification (0 and 1 on axes)
fig.update_layout(
    title="Confusion Matrix for zone 8 thermostat fault",
    width=500,   # Reduce width
    height=400,
    xaxis=dict(
        title="Predicted Label",
        tickmode='array',        # Custom tick labels
        tickvals=[0, 2],         # Only show 0 and 1
        showgrid=False
    ),
    yaxis=dict(
        title="Actual Label",
        tickmode='array',
        tickvals=[0, 2],         # Only show 0 and 1
        showgrid=False
    ),
    template="plotly_white"
)

# Show the plot
fig.show()

In [308]:
fig = go.Figure()

# Base temperature curve
fig.add_trace(go.Scatter(
    x=df_simulated['Time'], 
    y=df_simulated[test_temp], 
    mode='lines', 
    name='Temperature (C°)',
    yaxis='y1'
    ))

# Convert y_pred to a Pandas Series with the correct index
y_pred_series = pd.Series(y_pred, index=test_y.index)

# Find indices for different cases
correct_1_indices = (test_y == 2) & (y_pred_series == 2)  # True 1, Pred 1 -> Green
false_negative_indices = (test_y == 2) & (y_pred_series == 0)  # True 1, Pred 0 -> Red
false_positive_indices = (test_y == 0) & (y_pred_series == 2)  # True 0, Pred 1 -> Orange

# Extract timestamps
correct_1_timestamps = df_simulated.loc[test_y.index, 'Time'][correct_1_indices]
false_negative_timestamps = df_simulated.loc[test_y.index, 'Time'][false_negative_indices]
false_positive_timestamps = df_simulated.loc[test_y.index, 'Time'][false_positive_indices]

# Extract corresponding temperatures
correct_1_temps = df_simulated.loc[test_y.index, test_temp][correct_1_indices]
false_negative_temps = df_simulated.loc[test_y.index, test_temp][false_negative_indices]
false_positive_temps = df_simulated.loc[test_y.index, test_temp][false_positive_indices]

# Add markers for correct predictions (Green)
fig.add_trace(go.Scatter(
    x=correct_1_timestamps,
    y=correct_1_temps,
    mode='markers',
    marker=dict(color='green', size=10),
    name='Correct (2 predicted as 2)'
))

# Add markers for false negatives (Red)
fig.add_trace(go.Scatter(
    x=false_negative_timestamps,
    y=false_negative_temps,
    mode='markers',
    marker=dict(color='red', size=10, symbol='x'),
    name='False Negative (2 predicted as 0)'
))

# Add markers for false positives (Orange)
fig.add_trace(go.Scatter(
    x=false_positive_timestamps,
    y=false_positive_temps,
    mode='markers',
    marker=dict(color='orange', size=10, symbol='triangle-up'),
    name='False Positive (0 predicted as 2)'
))

# Customize layout
#fig.update_layout(
#    title="Temperature with low setpoint fault simulated in Zone 8 with prediction markers",
#    xaxis_title="Time",
 #   yaxis_title="Temperature (°C)",
   # legend_title="Legend",
    #template="plotly"
#)

# Update layout with secondary y-axis
fig.update_layout(
    title="Temperature & heating consumption in zone 8 with lowered setpoint simulated",
    xaxis_title="Time",
    font=dict(size=14),
    yaxis=dict(
        title="Temperature (°C)",
        titlefont=dict(color="black"),
        tickfont=dict(color="black")
    ),
    yaxis2=dict(
        title="Heating Consumption (kWh)",  # Adjust unit if needed
        overlaying='y',
        side='right',
        titlefont=dict(color="black"),
        tickfont=dict(color="black"),
    ),
    legend=dict(
        x=1.09,         # Pushes legend to the right (1.0 is the edge of the plot area)
        y=1,            # Top alignment
        xanchor="left", # Anchor relative to left of the legend box
        yanchor="top"
    ),
    legend_title="Legend",
    template="plotly",
    height=400,
    width=1100,
)

# Show the plot

In [309]:
fig = go.Figure()

# Base temperature curve
fig.add_trace(go.Scatter(
    x=df_simulated['Time'], 
    y=df_simulated[test_temp], 
    mode='lines', 
    name='Temperature (C°)',
    yaxis='y1'
    ))

# Base temperature curve
fig.add_trace(go.Scatter(
    x=df_simulated['Time'], 
    y=df_simulated[test_heat], 
    mode='lines', 
    name='Normalized heating consumption',
    yaxis='y2',
    line=dict(dash='dot')
    ))

# Convert y_pred to a Pandas Series with the correct index
y_pred_series = pd.Series(y_pred, index=test_y.index)

# Find indices for different cases
correct_1_indices = (test_y == 2) & (y_pred_series == 2)  # True 1, Pred 1 -> Green
false_negative_indices = (test_y == 2) & (y_pred_series == 0)  # True 1, Pred 0 -> Red
false_positive_indices = (test_y == 0) & (y_pred_series == 2)  # True 0, Pred 1 -> Orange

# Extract timestamps
correct_1_timestamps = df_simulated.loc[test_y.index, 'Time'][correct_1_indices]
false_negative_timestamps = df_simulated.loc[test_y.index, 'Time'][false_negative_indices]
false_positive_timestamps = df_simulated.loc[test_y.index, 'Time'][false_positive_indices]

# Extract corresponding temperatures
correct_1_temps = df_simulated.loc[test_y.index, test_temp][correct_1_indices]
false_negative_temps = df_simulated.loc[test_y.index, test_temp][false_negative_indices]
false_positive_temps = df_simulated.loc[test_y.index, test_temp][false_positive_indices]

# Add markers for correct predictions (Green)
fig.add_trace(go.Scatter(
    x=correct_1_timestamps,
    y=correct_1_temps,
    mode='markers',
    marker=dict(color='green', size=10),
    name='Correct (2 predicted as 2)'
))

# Add markers for false negatives (Red)
fig.add_trace(go.Scatter(
    x=false_negative_timestamps,
    y=false_negative_temps,
    mode='markers',
    marker=dict(color='red', size=10, symbol='x'),
    name='False Negative (2 predicted as 0)'
))

# Add markers for false positives (Orange)
fig.add_trace(go.Scatter(
    x=false_positive_timestamps,
    y=false_positive_temps,
    mode='markers',
    marker=dict(color='orange', size=10, symbol='triangle-up'),
    name='False Positive (0 predicted as 2)'
))

# Customize layout
#fig.update_layout(
#    title="Temperature with low setpoint fault simulated in Zone 8 with prediction markers",
#    xaxis_title="Time",
 #   yaxis_title="Temperature (°C)",
   # legend_title="Legend",
    #template="plotly"
#)

# Update layout with secondary y-axis
fig.update_layout(
    title="Temperature & heating consumption in zone 8 with lowered setpoint simulated",
    xaxis_title="Time",
    font=dict(size=14),
    yaxis=dict(
        title="Temperature (°C)",
        titlefont=dict(color="black"),
        tickfont=dict(color="black")
    ),
    yaxis2=dict(
        title="Normalized heating",  # Adjust unit if needed
        overlaying='y',
        side='right',
        titlefont=dict(color="black"),
        tickfont=dict(color="black"),
    ),
    legend=dict(
        x=1.09,         # Pushes legend to the right (1.0 is the edge of the plot area)
        y=1,            # Top alignment
        xanchor="left", # Anchor relative to left of the legend box
        yanchor="top"
    ),
    legend_title="Legend",
    template="plotly",
    height=400,
    width=1100,
)

In [310]:
fig = go.Figure()

# Base temperature curve
fig.add_trace(go.Scatter(
    x=df_simulated['Time'], 
    y=df_simulated[test_heat], 
    mode='lines', 
    name='Zone 8 faulty operation'
))

# Convert y_pred to a Pandas Series with the correct index
y_pred_series = pd.Series(y_pred, index=test_y.index)

# Find indices for different cases
correct_1_indices = (test_y == 2) & (y_pred_series == 2)  # True 1, Pred 1 -> Green
false_negative_indices = (test_y == 2) & (y_pred_series == 0)  # True 1, Pred 0 -> Red
false_positive_indices = (test_y == 0) & (y_pred_series == 2)  # True 0, Pred 1 -> Orange

# Extract timestamps
correct_1_timestamps = df_simulated.loc[test_y.index, 'Time'][correct_1_indices]
false_negative_timestamps = df_simulated.loc[test_y.index, 'Time'][false_negative_indices]
false_positive_timestamps = df_simulated.loc[test_y.index, 'Time'][false_positive_indices]

# Extract corresponding temperatures
correct_1_temps = df_simulated.loc[test_y.index, test_heat][correct_1_indices]
false_negative_temps = df_simulated.loc[test_y.index, test_heat][false_negative_indices]
false_positive_temps = df_simulated.loc[test_y.index, test_heat][false_positive_indices]

# Add markers for correct predictions (Green)
fig.add_trace(go.Scatter(
    x=correct_1_timestamps,
    y=correct_1_temps,
    mode='markers',
    marker=dict(color='green', size=10),
    name='Correct (2 predicted as 2)'
))

# Add markers for false negatives (Red)
fig.add_trace(go.Scatter(
    x=false_negative_timestamps,
    y=false_negative_temps,
    mode='markers',
    marker=dict(color='red', size=10, symbol='x'),
    name='False Negative (2 predicted as 0)'
))

# Add markers for false positives (Orange)
fig.add_trace(go.Scatter(
    x=false_positive_timestamps,
    y=false_positive_temps,
    mode='markers',
    marker=dict(color='orange', size=10, symbol='triangle-up'),
    name='False Positive (0 predicted as 2)'
))

# Customize layout
fig.update_layout(
    title="Heating consumption with faulty operation in zone 8 & Prediction Markers",
    xaxis_title="Time",
    yaxis_title="Heat (kWh)",
    legend_title="Legend",
    template="plotly"
)

# Show the plot

### XGBoost

In [311]:
train_y = train_y.replace(2, 1)
test_y = test_y.replace(2, 1)

# Initialize the XGBoost Classifier
xgb_model = XGBClassifier()

# Train the model
xgb_model.fit(train_x, train_y)

# Make predictions
y_pred_train = xgb_model.predict(train_x)
y_pred = xgb_model.predict(test_x)

# Evaluate the model
print("Accuracy on Train Set:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy on Test Set:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))

Accuracy on Train Set: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8657
           1       1.00      1.00      1.00       103

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy on Test Set: 0.9950913242009133

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8689
           1       0.78      0.55      0.64        71

    accuracy                           1.00      8760
   macro avg       0.89      0.77      0.82      8760
weighted avg       0.99      1.00      0.99      8760



In [312]:
# Store all runs
results = []

for i in range(10):
    xgb_model = XGBClassifier()
    xgb_model.fit(train_x, train_y)

    y_pred = xgb_model.predict(test_x)
    report = classification_report(test_y, y_pred, output_dict=True)

    # Store per-class metrics
    run_result = {'run': i + 1}
    for label in report:
        if label not in ('accuracy', 'macro avg', 'weighted avg'):
            run_result[f'precision_class_{label}'] = report[label]['precision']
            run_result[f'recall_class_{label}'] = report[label]['recall']
            run_result[f'f1_class_{label}'] = report[label]['f1-score']
    
    results.append(run_result)

# Results as DataFrame
results_df = pd.DataFrame(results)

# Summary (mean and std)
summary = results_df.describe().loc[['mean', 'std']]

print("\nSummary (Mean ± Std):")
print(summary)

print("\nAll Per-Class Results:")
print(results_df)


Summary (Mean ± Std):
          run  precision_class_0  recall_class_0  f1_class_0  \
mean  5.50000       9.963261e-01    9.987340e-01    0.997529   
std   3.02765       1.170278e-16    1.170278e-16    0.000000   

      precision_class_1  recall_class_1  f1_class_1  
mean       7.800000e-01    5.492958e-01    0.644628  
std        1.170278e-16    1.170278e-16    0.000000  

All Per-Class Results:
   run  precision_class_0  recall_class_0  f1_class_0  precision_class_1  \
0    1           0.996326        0.998734    0.997529               0.78   
1    2           0.996326        0.998734    0.997529               0.78   
2    3           0.996326        0.998734    0.997529               0.78   
3    4           0.996326        0.998734    0.997529               0.78   
4    5           0.996326        0.998734    0.997529               0.78   
5    6           0.996326        0.998734    0.997529               0.78   
6    7           0.996326        0.998734    0.997529             

## 3) Fejl 3 - Defekt CO2 sensor

In [313]:
df_simulated = pd.read_csv('/Users/sararhiger/Desktop/Master thesis /Data/Processed data/Fault3_CO2_dataprocessed.csv')

df_simulated

Unnamed: 0,Time,"Global Horizontal Irradiance, W/m2",Outdoor air temp,"Mean air temperature, degC, zone 1","Relative humidity, %, zone 1","CO2, ppm (vol), zone 1","Local heating units, W, zone 1","FDD, zone 1","Mean air temperature, degC, zone 8","Relative humidity, %, zone 8","CO2, ppm (vol), zone 8","Local heating units, W, zone 8","FDD, zone 8",Ideal heaters and other local u
0,2023-01-01 00:00:00,-0.073490,9.083333,20.999,0.24754,400.0,0.692990,0,20.998,0.24757,400.00,0.701106,0,0.673722
1,2023-01-01 01:00:00,0.034139,9.858333,20.999,0.24139,400.0,0.682082,0,20.998,0.24141,400.00,0.692060,0,0.683357
2,2023-01-01 02:00:00,-0.044210,10.631667,20.999,0.23849,400.0,0.689492,0,20.998,0.23851,400.00,0.707839,0,0.703352
3,2023-01-01 03:00:00,-0.080542,10.600000,20.999,0.23750,400.0,0.694407,0,20.998,0.23751,400.00,0.720235,0,0.718769
4,2023-01-01 04:00:00,-0.061235,10.510000,21.000,0.24057,400.0,0.697044,0,20.999,0.24058,400.00,0.729179,0,0.730608
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2023-12-31 19:00:00,-0.233520,4.926667,20.999,0.25542,400.0,0.661836,0,20.994,0.25553,400.18,0.597789,0,0.517586
8756,2023-12-31 20:00:00,-0.179327,5.053333,20.999,0.25297,400.0,0.667600,0,20.996,0.25302,400.03,0.626767,0,0.568380
8757,2023-12-31 21:00:00,-0.150863,5.141667,20.999,0.24982,400.0,0.676220,0,20.997,0.24986,400.01,0.653065,0,0.603620
8758,2023-12-31 22:00:00,-0.224063,5.200000,20.999,0.24829,400.0,0.684392,0,20.997,0.24832,400.00,0.675645,0,0.637174


In [314]:
# Define the start and end times for training data
start_time_train = "2023-01-01 00:00:00"  
end_time_train = "2023-12-31 23:00:00"  

# Define the start and end times for testing data
start_time_test = "2023-01-01 00:00:00"  
end_time_test = "2023-12-31 23:00:00"

# Filter DataFrame for timestamps within the range
df_train_simulated = df_simulated[(df_simulated['Time'] >= start_time_train) & (df_simulated['Time'] <= end_time_train)]

df_test_simulated = df_simulated[(df_simulated['Time'] >= start_time_test) & (df_simulated['Time'] <= end_time_test)]

# Zone 1
X_train_zone1_simulated = df_train_simulated[['Global Horizontal Irradiance, W/m2','Outdoor air temp','Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Local heating units, W, zone 1']]
y_train_zone1_simulated = df_train_simulated['FDD, zone 1']
X_test_zone1_simulated = df_test_simulated[['Global Horizontal Irradiance, W/m2','Outdoor air temp','Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Local heating units, W, zone 1']]
y_test_zone1_simulated = df_test_simulated['FDD, zone 1']

# Zone 8
X_train_zone8_simulated = df_train_simulated[['Global Horizontal Irradiance, W/m2', 'Outdoor air temp','Mean air temperature, degC, zone 8', 'Relative humidity, %, zone 8', 'CO2, ppm (vol), zone 8', 'Local heating units, W, zone 8']]
y_train_zone8_simulated = df_train_simulated['FDD, zone 8']
X_test_zone8_simulated = df_test_simulated[['Global Horizontal Irradiance, W/m2', 'Outdoor air temp','Mean air temperature, degC, zone 8', 'Relative humidity, %, zone 8', 'CO2, ppm (vol), zone 8', 'Local heating units, W, zone 8']]
y_test_zone8_simulated = df_test_simulated['FDD, zone 8']

In [315]:
train_x = X_train_zone1_simulated
train_y = y_train_zone1_simulated

test_x = X_test_zone8_simulated
test_y = y_test_zone8_simulated

# Define zones and variables for plots 
test_temp = 'Mean air temperature, degC, zone 8'
test_heat = 'Local heating units, W, zone 8'

### Random Forest 

In [316]:
# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(class_weight='balanced')

# Train the model
rf_model.fit(train_x, train_y)

# Make predictions
y_pred_train = rf_model.predict(train_x)
y_pred = rf_model.predict(test_x)

# Evaluate the model
print("Accuracy:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))

print(rf_model.feature_importances_)

Accuracy: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7315
           3       1.00      1.00      1.00      1445

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8062
           3       1.00      1.00      1.00       698

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760

[0.0045066  0.0153943  0.0047447  0.02636897 0.92688269 0.02210274]


In [317]:
# Store all runs
results = []

for i in range(10):
    rf_model = RandomForestClassifier()
    rf_model.fit(train_x, train_y)

    y_pred = rf_model.predict(test_x)
    report = classification_report(test_y, y_pred, output_dict=True)

    # Store per-class metrics
    run_result = {'run': i + 1}
    for label in report:
        if label not in ('accuracy', 'macro avg', 'weighted avg'):
            run_result[f'precision_class_{label}'] = report[label]['precision']
            run_result[f'recall_class_{label}'] = report[label]['recall']
            run_result[f'f1_class_{label}'] = report[label]['f1-score']
    
    results.append(run_result)

# Results as DataFrame
results_df = pd.DataFrame(results)

# Summary (mean and std)
summary = results_df.describe().loc[['mean', 'std']]

print("\nSummary (Mean ± Std):")
print(summary)

print("\nAll Per-Class Results:")
print(results_df)


Summary (Mean ± Std):
          run  precision_class_0  recall_class_0  f1_class_0  \
mean  5.50000                1.0             1.0         1.0   
std   3.02765                0.0             0.0         0.0   

      precision_class_3  recall_class_3  f1_class_3  
mean                1.0             1.0         1.0  
std                 0.0             0.0         0.0  

All Per-Class Results:
   run  precision_class_0  recall_class_0  f1_class_0  precision_class_3  \
0    1                1.0             1.0         1.0                1.0   
1    2                1.0             1.0         1.0                1.0   
2    3                1.0             1.0         1.0                1.0   
3    4                1.0             1.0         1.0                1.0   
4    5                1.0             1.0         1.0                1.0   
5    6                1.0             1.0         1.0                1.0   
6    7                1.0             1.0         1.0             

### XGboost

In [318]:
train_y = train_y.replace(3, 1)
test_y = test_y.replace(3, 1)

# Initialize the XGBoost Classifier
xgb_model = XGBClassifier()

# Train the model
xgb_model.fit(train_x, train_y)

# Make predictions
y_pred_train = xgb_model.predict(train_x)
y_pred = xgb_model.predict(test_x)

# Evaluate the model
print("Accuracy on Train Set:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy on Test Set:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))

Accuracy on Train Set: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7315
           1       1.00      1.00      1.00      1445

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy on Test Set: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8062
           1       1.00      1.00      1.00       698

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760



In [319]:
# Store all runs
results = []

for i in range(10):
    xgb_model = XGBClassifier()
    xgb_model.fit(train_x, train_y)

    y_pred = xgb_model.predict(test_x)
    report = classification_report(test_y, y_pred, output_dict=True)

    # Store per-class metrics
    run_result = {'run': i + 1}
    for label in report:
        if label not in ('accuracy', 'macro avg', 'weighted avg'):
            run_result[f'precision_class_{label}'] = report[label]['precision']
            run_result[f'recall_class_{label}'] = report[label]['recall']
            run_result[f'f1_class_{label}'] = report[label]['f1-score']
    
    results.append(run_result)

# Results as DataFrame
results_df = pd.DataFrame(results)

# Summary (mean and std)
summary = results_df.describe().loc[['mean', 'std']]

print("\nSummary (Mean ± Std):")
print(summary)

print("\nAll Per-Class Results:")
print(results_df)


Summary (Mean ± Std):
          run  precision_class_0  recall_class_0  f1_class_0  \
mean  5.50000                1.0             1.0         1.0   
std   3.02765                0.0             0.0         0.0   

      precision_class_1  recall_class_1  f1_class_1  
mean                1.0             1.0         1.0  
std                 0.0             0.0         0.0  

All Per-Class Results:
   run  precision_class_0  recall_class_0  f1_class_0  precision_class_1  \
0    1                1.0             1.0         1.0                1.0   
1    2                1.0             1.0         1.0                1.0   
2    3                1.0             1.0         1.0                1.0   
3    4                1.0             1.0         1.0                1.0   
4    5                1.0             1.0         1.0                1.0   
5    6                1.0             1.0         1.0                1.0   
6    7                1.0             1.0         1.0             

## 4) Fejl 4 - High setpoint

In [320]:
df_simulated = pd.read_csv('/Users/sararhiger/Desktop/Master thesis /Data/Processed data/Fault4_HighSetpoint_dataprocessed.csv')


In [321]:
# Define the start and end times for training data
start_time_train = "2023-01-01 00:00:00"  
end_time_train = "2023-12-31 23:00:00"  

# Define the start and end times for testing data
start_time_test = "2023-01-01 00:00:00"  
end_time_test = "2023-12-31 23:00:00"

# Filter DataFrame for timestamps within the range
df_train_simulated = df_simulated[(df_simulated['Time'] >= start_time_train) & (df_simulated['Time'] <= end_time_train)]

df_test_simulated = df_simulated[(df_simulated['Time'] >= start_time_test) & (df_simulated['Time'] <= end_time_test)]

# Zone 1
X_train_zone1_simulated = df_train_simulated[['Global Horizontal Irradiance, W/m2','Outdoor air temp','Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Local heating units, W, zone 1']]
y_train_zone1_simulated = df_train_simulated['FDD, zone 1']
X_test_zone1_simulated = df_test_simulated[['Global Horizontal Irradiance, W/m2','Outdoor air temp','Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Local heating units, W, zone 1']]
y_test_zone1_simulated = df_test_simulated['FDD, zone 1']

# Zone 8
X_train_zone8_simulated = df_train_simulated[['Global Horizontal Irradiance, W/m2', 'Outdoor air temp','Mean air temperature, degC, zone 8', 'Relative humidity, %, zone 8', 'CO2, ppm (vol), zone 8', 'Local heating units, W, zone 8']]
y_train_zone8_simulated = df_train_simulated['FDD, zone 8']
X_test_zone8_simulated = df_test_simulated[['Global Horizontal Irradiance, W/m2', 'Outdoor air temp','Mean air temperature, degC, zone 8', 'Relative humidity, %, zone 8', 'CO2, ppm (vol), zone 8', 'Local heating units, W, zone 8']]
y_test_zone8_simulated = df_test_simulated['FDD, zone 8']

In [322]:
train_x = X_train_zone1_simulated
train_y = y_train_zone1_simulated

test_x = X_test_zone8_simulated
test_y = y_test_zone8_simulated

# Define zones and variables for plots 
test_temp = 'Mean air temperature, degC, zone 8'
test_heat = 'Local heating units, W, zone 8'

### Random forest

In [323]:
# Initialize the Random Forest Classifier
#rf_model = RandomForestClassifier(class_weight='balanced')
rf_model = RandomForestClassifier(n_estimators=500)

# Train the model
rf_model.fit(train_x, train_y)

# Make predictions
y_pred_train = rf_model.predict(train_x)
y_pred = rf_model.predict(test_x)

# Evaluate the model
print("Accuracy:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))

print(rf_model.feature_importances_)

Accuracy: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8572
           4       1.00      1.00      1.00       188

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy: 0.9995433789954338

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8697
           4       0.94      1.00      0.97        63

    accuracy                           1.00      8760
   macro avg       0.97      1.00      0.98      8760
weighted avg       1.00      1.00      1.00      8760

[0.0217155  0.02262941 0.37607017 0.03187796 0.02433777 0.52336919]


In [324]:
# Store all runs
results = []

for i in range(10):
    rf_model = RandomForestClassifier()
    rf_model.fit(train_x, train_y)

    y_pred = rf_model.predict(test_x)
    report = classification_report(test_y, y_pred, output_dict=True)

    # Store per-class metrics
    run_result = {'run': i + 1}
    for label in report:
        if label not in ('accuracy', 'macro avg', 'weighted avg'):
            run_result[f'precision_class_{label}'] = report[label]['precision']
            run_result[f'recall_class_{label}'] = report[label]['recall']
            run_result[f'f1_class_{label}'] = report[label]['f1-score']
    
    results.append(run_result)

# Results as DataFrame
results_df = pd.DataFrame(results)

# Summary (mean and std)
summary = results_df.describe().loc[['mean', 'std']]

print("\nSummary (Mean ± Std):")
print(summary)

print("\nAll Per-Class Results:")
print(results_df)


Summary (Mean ± Std):
          run  precision_class_0  recall_class_0  f1_class_0  \
mean  5.50000           0.999472        0.999299    0.999385   
std   3.02765           0.000991        0.000732    0.000534   

      precision_class_4  recall_class_4  f1_class_4  
mean           0.912621        0.926984    0.912498  
std            0.067371        0.137199    0.083652  

All Per-Class Results:
   run  precision_class_0  recall_class_0  f1_class_0  precision_class_4  \
0    1           1.000000        0.999655    0.999827           0.954545   
1    2           0.999770        0.999310    0.999540           0.910448   
2    3           0.997018        0.999655    0.998335           0.925000   
3    4           0.999770        0.999425    0.999597           0.924242   
4    5           1.000000        0.999540    0.999770           0.940299   
5    6           1.000000        0.999540    0.999770           0.940299   
6    7           0.999770        0.999540    0.999655           0.

In [325]:
# Compute confusion matrix
cm = confusion_matrix(test_y, y_pred)

# Ensure labels are 0 and 4
labels = [0, 4]  

# Create the confusion matrix heatmap
fig = go.Figure(data=go.Heatmap(
    z=cm,
    x=labels,  # Predicted labels
    y=labels,  # Actual labels
    colorscale='Blues',
    showscale=True,
    text=cm,  # Display numbers in the cells
    texttemplate="%{text}",  # Ensure counts appear inside the cells
    hovertemplate='Actual: %{y}<br>Predicted: %{x}<br>Count: %{z}<extra></extra>'
))

# Update layout for binary classification (0 and 1 on axes)
fig.update_layout(
    title="Confusion Matrix for zone 8 fault 4 ",
    width=500,   # Reduce width
    height=400,
    xaxis=dict(
        title="Predicted Label",
        tickmode='array',        # Custom tick labels
        tickvals=[0, 4],         # Only show 0 and 1
        showgrid=False
    ),
    yaxis=dict(
        title="Actual Label",
        tickmode='array',
        tickvals=[0, 4],         # Only show 0 and 1
        showgrid=False
    ),
    template="plotly_white"
)

# Show the plot
fig.show()

In [326]:
fig = go.Figure()

# Base temperature curve
fig.add_trace(go.Scatter(
    x=df_simulated['Time'], 
    y=df_simulated[test_heat], 
    mode='lines', 
    name='Zone 8 faulty operation'
))

# Convert y_pred to a Pandas Series with the correct index
y_pred_series = pd.Series(y_pred, index=test_y.index)

# Find indices for different cases
correct_1_indices = (test_y == 4) & (y_pred_series == 4)  # True 1, Pred 1 -> Green
false_negative_indices = (test_y == 4) & (y_pred_series == 0)  # True 1, Pred 0 -> Red
false_positive_indices = (test_y == 0) & (y_pred_series == 4)  # True 0, Pred 1 -> Orange

# Extract timestamps
correct_1_timestamps = df_simulated.loc[test_y.index, 'Time'][correct_1_indices]
false_negative_timestamps = df_simulated.loc[test_y.index, 'Time'][false_negative_indices]
false_positive_timestamps = df_simulated.loc[test_y.index, 'Time'][false_positive_indices]

# Extract corresponding temperatures
correct_1_temps = df_simulated.loc[test_y.index, test_heat][correct_1_indices]
false_negative_temps = df_simulated.loc[test_y.index, test_heat][false_negative_indices]
false_positive_temps = df_simulated.loc[test_y.index, test_heat][false_positive_indices]

# Add markers for correct predictions (Green)
fig.add_trace(go.Scatter(
    x=correct_1_timestamps,
    y=correct_1_temps,
    mode='markers',
    marker=dict(color='green', size=10),
    name='Correct (4 predicted as 4)'
))

# Add markers for false negatives (Red)
fig.add_trace(go.Scatter(
    x=false_negative_timestamps,
    y=false_negative_temps,
    mode='markers',
    marker=dict(color='red', size=10, symbol='x'),
    name='False Negative (4 predicted as 0)'
))

# Add markers for false positives (Orange)
fig.add_trace(go.Scatter(
    x=false_positive_timestamps,
    y=false_positive_temps,
    mode='markers',
    marker=dict(color='orange', size=10, symbol='triangle-up'),
    name='False Positive (0 predicted as 4)'
))

# Customize layout
fig.update_layout(
    title="Heating consumption with faulty operation in zone 8 & prediction markers",
    xaxis_title="Time",
    yaxis_title="Heat (kWh)",
    legend_title="Legend",
    template="plotly", 
    height = 500,
    width = 1200
)

# Show the plot

### XGBoost

In [327]:
train_y = train_y.replace(4, 1)
test_y = test_y.replace(4, 1)

# Initialize the XGBoost Classifier
xgb_model = XGBClassifier()

# Train the model
xgb_model.fit(train_x, train_y)

# Make predictions
y_pred_train = xgb_model.predict(train_x)
y_pred = xgb_model.predict(test_x)

# Evaluate the model
print("Accuracy on Train Set:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy on Test Set:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))

Accuracy on Train Set: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8572
           1       1.00      1.00      1.00       188

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy on Test Set: 0.9941780821917808

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      8697
           1       0.88      0.22      0.35        63

    accuracy                           0.99      8760
   macro avg       0.93      0.61      0.68      8760
weighted avg       0.99      0.99      0.99      8760



In [328]:
# Store all runs
results = []

for i in range(10):
    xgb_model = XGBClassifier()
    xgb_model.fit(train_x, train_y)

    y_pred = xgb_model.predict(test_x)
    report = classification_report(test_y, y_pred, output_dict=True)

    # Store per-class metrics
    run_result = {'run': i + 1}
    for label in report:
        if label not in ('accuracy', 'macro avg', 'weighted avg'):
            run_result[f'precision_class_{label}'] = report[label]['precision']
            run_result[f'recall_class_{label}'] = report[label]['recall']
            run_result[f'f1_class_{label}'] = report[label]['f1-score']
    
    results.append(run_result)

# Results as DataFrame
results_df = pd.DataFrame(results)

# Summary (mean and std)
summary = results_df.describe().loc[['mean', 'std']]

print("\nSummary (Mean ± Std):")
print(summary)

print("\nAll Per-Class Results:")
print(results_df)


Summary (Mean ± Std):
          run  precision_class_0  recall_class_0  f1_class_0  \
mean  5.50000           0.994396         0.99977    0.997076   
std   3.02765           0.000000         0.00000    0.000000   

      precision_class_1  recall_class_1    f1_class_1  
mean              0.875    2.222222e-01  3.544304e-01  
std               0.000    5.851389e-17  5.851389e-17  

All Per-Class Results:
   run  precision_class_0  recall_class_0  f1_class_0  precision_class_1  \
0    1           0.994396         0.99977    0.997076              0.875   
1    2           0.994396         0.99977    0.997076              0.875   
2    3           0.994396         0.99977    0.997076              0.875   
3    4           0.994396         0.99977    0.997076              0.875   
4    5           0.994396         0.99977    0.997076              0.875   
5    6           0.994396         0.99977    0.997076              0.875   
6    7           0.994396         0.99977    0.997076       

## Multiclass model 
### Træning på zone 1 og test på zone 8 
### Normalized energy and GHI added but temperature not normalized

In [329]:
df_simulated = pd.read_csv('/Users/sararhiger/Desktop/Master thesis /Data/Processed data/Multiclass_dataprocessed.csv')

In [330]:
# Define the start and end times for training data
start_time_train = "2023-01-01 00:00:00"  
end_time_train = "2023-12-31 23:00:00"  

# Define the start and end times for testing data
start_time_test = "2023-01-01 00:00:00"  
end_time_test = "2023-12-31 23:00:00"

# Filter DataFrame for timestamps within the range
df_train_simulated = df_simulated[(df_simulated['Time'] >= start_time_train) & (df_simulated['Time'] <= end_time_train)]

df_test_simulated = df_simulated[(df_simulated['Time'] >= start_time_test) & (df_simulated['Time'] <= end_time_test)]

# Zone 1
X_train_zone1_simulated = df_train_simulated[['Global Horizontal Irradiance, W/m2','Outdoor air temp','Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Local heating units, W, zone 1']]
y_train_zone1_simulated = df_train_simulated['FDD, zone 1']
X_test_zone1_simulated = df_test_simulated[['Global Horizontal Irradiance, W/m2','Outdoor air temp','Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Local heating units, W, zone 1']]
y_test_zone1_simulated = df_test_simulated['FDD, zone 1']

# Zone 8
X_train_zone8_simulated = df_train_simulated[['Global Horizontal Irradiance, W/m2', 'Outdoor air temp','Mean air temperature, degC, zone 8', 'Relative humidity, %, zone 8', 'CO2, ppm (vol), zone 8', 'Local heating units, W, zone 8']]
y_train_zone8_simulated = df_train_simulated['FDD, zone 8']
X_test_zone8_simulated = df_test_simulated[['Global Horizontal Irradiance, W/m2', 'Outdoor air temp','Mean air temperature, degC, zone 8', 'Relative humidity, %, zone 8', 'CO2, ppm (vol), zone 8', 'Local heating units, W, zone 8']]
y_test_zone8_simulated = df_test_simulated['FDD, zone 8']

In [331]:
train_x = X_train_zone1_simulated
train_y = y_train_zone1_simulated

test_x = X_test_zone8_simulated
test_y = y_test_zone8_simulated

# Define zones and variables for plots 
test_temp = 'Mean air temperature, degC, zone 8'
test_heat = 'Local heating units, W, zone 8'

### Random Forest

In [332]:
# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier()

# Train the model
rf_model.fit(train_x, train_y)

# Make predictions
y_pred_train = rf_model.predict(train_x)
y_pred = rf_model.predict(test_x)

# Evaluate the model
print("Accuracy:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))

print(rf_model.feature_importances_)

Accuracy: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6976
           1       1.00      1.00      1.00        48
           2       1.00      1.00      1.00       103
           3       1.00      1.00      1.00      1445
           4       1.00      1.00      1.00       188

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy: 0.993607305936073

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7915
           1       0.86      0.46      0.60        13
           2       1.00      0.55      0.71        71
           3       1.00      1.00      1.00       698
           4       0.80      1.00      0.89        63

    accuracy                           0.99      8760
   macro avg       0.93      0.80      0.84     

In [333]:
# Store all runs
results = []

for i in range(10):
    rf_model = RandomForestClassifier()
    rf_model.fit(train_x, train_y)

    y_pred = rf_model.predict(test_x)
    report = classification_report(test_y, y_pred, output_dict=True)

    # Store per-class metrics
    run_result = {'run': i + 1}
    for label in report:
        if label not in ('accuracy', 'macro avg', 'weighted avg'):
            run_result[f'precision_class_{label}'] = report[label]['precision']
            run_result[f'recall_class_{label}'] = report[label]['recall']
            run_result[f'f1_class_{label}'] = report[label]['f1-score']
    
    results.append(run_result)

# Results as DataFrame
results_df = pd.DataFrame(results)

# Summary (mean and std)
summary = results_df.describe().loc[['mean', 'std']]

print("\nSummary (Mean ± Std):")
print(summary)

print("\nAll Per-Class Results:")
print(results_df)


Summary (Mean ± Std):
          run  precision_class_0  recall_class_0  f1_class_0  \
mean  5.50000           0.994986        0.997840    0.996411   
std   3.02765           0.000099        0.000234    0.000113   

      precision_class_1  recall_class_1  f1_class_1  precision_class_2  \
mean           0.862500        0.484615    0.620000           0.997500   
std            0.008626        0.037157    0.032203           0.007906   

      recall_class_2  f1_class_2  precision_class_3  recall_class_3  \
mean        0.547887    0.707268                1.0             1.0   
std         0.004454    0.004053                0.0             0.0   

      f1_class_3  precision_class_4  recall_class_4  f1_class_4  
mean         1.0           0.795312        0.984127    0.879521  
std          0.0           0.018204        0.014965    0.011361  

All Per-Class Results:
   run  precision_class_0  recall_class_0  f1_class_0  precision_class_1  \
0    1           0.994961        0.997852    0.99

In [334]:
# Define all possible classes
all_classes = [0, 1, 2, 3,4]  

# Compute confusion matrix with fixed labels
cm = confusion_matrix(y_test_zone8_simulated, y_pred, labels=all_classes)

# Convert to DataFrame
df_cm = pd.DataFrame(cm, index=all_classes, columns=all_classes)

# Create interactive Plotly heatmap
fig = px.imshow(df_cm, 
                text_auto=True,  
                color_continuous_scale='Blues',  
                labels=dict(x="Predicted", y="Actual", color="Count"))

# Set axis labels and tick values to only show whole numbers (0,1,2,3)
fig.update_layout(
    title="Confusion Matrix for zone 8",
    xaxis_title="Predicted Label",
    yaxis_title="Actual Label",
    xaxis=dict(tickmode='array', tickvals=[0,1,2,3,4], ticktext=['Fault free','Open window','Heating shut down','Defect CO2 sensor','Setpoint 25 degrees']),
    yaxis=dict(tickmode='array', tickvals=[0,1,2,3,4], ticktext=['Fault free','Open window','Heating shut down','Defect CO2 sensor','Setpoint 25 degrees']), 
    width=700,  # Set width of the figure
    height=500  # Set height of the figure
)

# Show plot
fig.show()

In [335]:
fig = go.Figure()

# Base temperature curve
fig.add_trace(go.Scatter(
    x=df_simulated['Time'], 
    y=df_simulated[test_temp], 
    mode='lines', 
    name='Temperature (C°)',
    yaxis='y1'
    ))

# Convert y_pred to a Pandas Series with the correct index
y_pred_series = pd.Series(y_pred, index=test_y.index)

# Find indices for different cases
correct_1_indices = (test_y == 1) & (y_pred_series == 1)  # True 1, Pred 1 -> Green
false_negative_indices = (test_y == 1) & (y_pred_series == 0)  # True 1, Pred 0 -> Red
false_positive_indices = (test_y == 0) & (y_pred_series == 1)  # True 0, Pred 1 -> Orange

# Extract timestamps
correct_1_timestamps = df_simulated.loc[test_y.index, 'Time'][correct_1_indices]
false_negative_timestamps = df_simulated.loc[test_y.index, 'Time'][false_negative_indices]
false_positive_timestamps = df_simulated.loc[test_y.index, 'Time'][false_positive_indices]

# Extract corresponding temperatures
correct_1_temps = df_simulated.loc[test_y.index, test_temp][correct_1_indices]
false_negative_temps = df_simulated.loc[test_y.index, test_temp][false_negative_indices]
false_positive_temps = df_simulated.loc[test_y.index, test_temp][false_positive_indices]

# Add markers for correct predictions (Green)
fig.add_trace(go.Scatter(
    x=correct_1_timestamps,
    y=correct_1_temps,
    mode='markers',
    marker=dict(color='green', size=10),
    name='Correct (2 predicted as 2)'
))

# Add markers for false negatives (Red)
fig.add_trace(go.Scatter(
    x=false_negative_timestamps,
    y=false_negative_temps,
    mode='markers',
    marker=dict(color='red', size=10, symbol='x'),
    name='False Negative (2 predicted as 0)'
))

# Add markers for false positives (Orange)
fig.add_trace(go.Scatter(
    x=false_positive_timestamps,
    y=false_positive_temps,
    mode='markers',
    marker=dict(color='orange', size=10, symbol='triangle-up'),
    name='False Positive (0 predicted as 2)'
))

# Customize layout
#fig.update_layout(
#    title="Temperature with low setpoint fault simulated in Zone 8 with prediction markers",
#    xaxis_title="Time",
 #   yaxis_title="Temperature (°C)",
   # legend_title="Legend",
    #template="plotly"
#)

# Update layout with secondary y-axis
fig.update_layout(
    title="Temperature & heating consumption in zone 8 with lowered setpoint simulated",
    xaxis_title="Time",
    yaxis=dict(
        title="Temperature (°C)",
        titlefont=dict(color="black"),
        tickfont=dict(color="black")
    ),
    yaxis2=dict(
        title="Heating Consumption (kWh)",  # Adjust unit if needed
        overlaying='y',
        side='right',
        titlefont=dict(color="black"),
        tickfont=dict(color="black"),
    ),
    legend=dict(
        x=1.05,         # Pushes legend to the right (1.0 is the edge of the plot area)
        y=1,            # Top alignment
        xanchor="left", # Anchor relative to left of the legend box
        yanchor="top"
    ),
    legend_title="Legend",
    template="plotly",
    height=600,
    width=1200,
)

# Show the plot

### XGboost 

In [336]:
# Initialize the XGBoost Classifier
xgb_model = XGBClassifier()

# Train the model
xgb_model.fit(train_x, train_y)

# Make predictions
y_pred_train = xgb_model.predict(train_x)
y_pred = xgb_model.predict(test_x)

# Evaluate the model
print("Accuracy on Train Set:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy on Test Set:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))

print(xgb_model.feature_importances_)

Accuracy on Train Set: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6976
           1       1.00      1.00      1.00        48
           2       1.00      1.00      1.00       103
           3       1.00      1.00      1.00      1445
           4       1.00      1.00      1.00       188

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy on Test Set: 0.9896118721461187

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      7915
           1       0.83      0.38      0.53        13
           2       0.83      0.55      0.66        71
           3       1.00      1.00      1.00       698
           4       0.59      0.97      0.73        63

    accuracy                           0.99      8760
   macro avg       0.8

In [337]:
# Store all runs
results = []

for i in range(10):
    xgb_model = XGBClassifier()
    xgb_model.fit(train_x, train_y)

    y_pred = xgb_model.predict(test_x)
    report = classification_report(test_y, y_pred, output_dict=True)

    # Store per-class metrics
    run_result = {'run': i + 1}
    for label in report:
        if label not in ('accuracy', 'macro avg', 'weighted avg'):
            run_result[f'precision_class_{label}'] = report[label]['precision']
            run_result[f'recall_class_{label}'] = report[label]['recall']
            run_result[f'f1_class_{label}'] = report[label]['f1-score']
    
    results.append(run_result)

# Results as DataFrame
results_df = pd.DataFrame(results)

# Summary (mean and std)
summary = results_df.describe().loc[['mean', 'std']]

print("\nSummary (Mean ± Std):")
print(summary)

print("\nAll Per-Class Results:")
print(results_df)


Summary (Mean ± Std):
          run  precision_class_0  recall_class_0    f1_class_0  \
mean  5.50000       9.949406e-01        0.993809  9.943746e-01   
std   3.02765       1.170278e-16        0.000000  1.170278e-16   

      precision_class_1  recall_class_1  f1_class_1  precision_class_2  \
mean       8.333333e-01        0.384615    0.526316           0.829787   
std        1.170278e-16        0.000000    0.000000           0.000000   

      recall_class_2  f1_class_2  precision_class_3  recall_class_3  \
mean    5.492958e-01    0.661017                1.0             1.0   
std     1.170278e-16    0.000000                0.0             0.0   

      f1_class_3  precision_class_4  recall_class_4    f1_class_4  
mean         1.0       5.922330e-01    9.682540e-01  7.349398e-01  
std          0.0       1.170278e-16    1.170278e-16  1.170278e-16  

All Per-Class Results:
   run  precision_class_0  recall_class_0  f1_class_0  precision_class_1  \
0    1           0.994941        0.99

In [338]:
# Define all possible classes
all_classes = [0, 1, 2, 3, 4]  

# Compute confusion matrix with fixed labels
cm = confusion_matrix(y_test_zone8_simulated, y_pred, labels=all_classes)

# Convert to DataFrame
df_cm = pd.DataFrame(cm, index=all_classes, columns=all_classes)

# Create a copy for coloring — set [0,0] to NaN so it doesn't affect color scale
cm_for_plot = df_cm.copy()
cm_for_plot.iloc[0, 0] = np.nan  # exclude top-left from color scaling

# Create interactive Plotly heatmap
fig = px.imshow(
    cm_for_plot,
    text_auto=True,
    color_continuous_scale='Blues',
    labels=dict(x="Predicted", y="Actual", color="Count")
)

# Overlay the real value in top-left as custom annotation
fig.add_annotation(
    text=str(df_cm.iloc[0, 0]),  # Actual value
    x=0, y=0,
    showarrow=False,
    font=dict(color="red", size=12),
    bgcolor="white"
)

# Customize axis labels
fig.update_layout(
    title="Confusion Matrix for Zone 8",
    xaxis_title="Predicted Label",
    yaxis_title="Actual Label",
    xaxis=dict(
        tickmode='array',
        tickvals=[0, 1, 2, 3, 4],
        ticktext=[
            'Fault free', 'Open window', 'Heating shut down',
            'Defect CO2 sensor', 'Setpoint 25 degrees'
        ]
    ),
    yaxis=dict(
        tickmode='array',
        tickvals=[0, 1, 2, 3, 4],
        ticktext=[
            'Fault free', 'Open window', 'Heating shut down',
            'Defect CO2 sensor', 'Setpoint 25 degrees'
        ]
    ),
    width=700,
    height=500
)

# Show plot
fig.show()

In [339]:
fig = go.Figure()

# Base temperature curve
fig.add_trace(go.Scatter(
    x=df_simulated['Time'], 
    y=df_simulated[test_heat], 
    mode='lines', 
    name='Zone 8 faulty operation'
))

# Convert y_pred to a Pandas Series with the correct index
y_pred_series = pd.Series(y_pred, index=test_y.index)

# Find indices for different cases
correct_1_indices = (test_y == 2) & (y_pred_series == 2)  # True 1, Pred 1 -> Green
false_negative_indices = (test_y == 2) & (y_pred_series == 0)  # True 1, Pred 0 -> Red
false_positive_indices = (test_y == 0) & (y_pred_series == 2)  # True 0, Pred 1 -> Orange

# Extract timestamps
correct_1_timestamps = df_simulated.loc[test_y.index, 'Time'][correct_1_indices]
false_negative_timestamps = df_simulated.loc[test_y.index, 'Time'][false_negative_indices]
false_positive_timestamps = df_simulated.loc[test_y.index, 'Time'][false_positive_indices]

# Extract corresponding temperatures
correct_1_temps = df_simulated.loc[test_y.index, test_heat][correct_1_indices]
false_negative_temps = df_simulated.loc[test_y.index, test_heat][false_negative_indices]
false_positive_temps = df_simulated.loc[test_y.index, test_heat][false_positive_indices]

# Add markers for correct predictions (Green)
fig.add_trace(go.Scatter(
    x=correct_1_timestamps,
    y=correct_1_temps,
    mode='markers',
    marker=dict(color='green', size=10),
    name='Correct (2 predicted as 2)'
))

# Add markers for false negatives (Red)
fig.add_trace(go.Scatter(
    x=false_negative_timestamps,
    y=false_negative_temps,
    mode='markers',
    marker=dict(color='red', size=10, symbol='x'),
    name='False Negative (2 predicted as 0)'
))

# Add markers for false positives (Orange)
fig.add_trace(go.Scatter(
    x=false_positive_timestamps,
    y=false_positive_temps,
    mode='markers',
    marker=dict(color='orange', size=10, symbol='triangle-up'),
    name='False Positive (0 predicted as 2)'
))

# Customize layout
fig.update_layout(
    title="Heating consumption with faulty operation in zone 8 & Prediction Markers",
    xaxis_title="Time",
    yaxis_title="Heat (kWh)",
    legend_title="Legend",
    template="plotly"
)

# Show the plot