In [107]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, accuracy_score

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter

## 1) Fault 1
### Training on zone 1 binary class open window and testing on Engen Mileparken 

In [108]:
df_simulated = pd.read_csv('/Users/sararhiger/Desktop/Master thesis /Data/Processed data/Fault1_Window_dataprocessed.csv')

df_engen = pd.read_csv('/Users/sararhiger/Desktop/Master thesis /Data/Processed data/Mileparken.csv')


In [109]:
# Define the start and end times for training data
start_time_train = "2023-01-01 00:00:00"  
end_time_train = "2023-12-31 23:00:00"  

# Define the start and end times for testing data
start_time_test = "2025-03-15 00:00:00"  
end_time_test = "2025-03-31 00:00:00"

# Filter DataFrame for timestamps within the range
df_train = df_simulated[(df_simulated['Time'] >= start_time_train) & (df_simulated['Time'] <= end_time_train)]

df_test = df_engen[(df_engen['Time'] >= start_time_test) & (df_engen['Time'] <= end_time_test)]




In [110]:
train_x= df_train[['Global Horizontal Irradiance, W/m2','Outdoor air temp','Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Ideal heaters and other local u']]
train_y = df_train['FDD, zone 1']

test_x = df_test[['GHI, W/m2','Outdoor temperature','TEMP °C', 'HUMIDITY %', 'CO2 ppm', 'Heating kWh']]
test_y = df_test['FDD']

# Define zones and variables for plots 
test_temp = 'TEMP °C'
test_heat = 'Heating kWh'

### Random forest

In [111]:
# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier()

# Train the model
rf_model.fit(train_x, train_y)

# Make predictions
y_pred_train = rf_model.predict(train_x)
y_pred = rf_model.predict(test_x)

# Evaluate the model
print("Accuracy:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))

print(rf_model.feature_importances_)

Accuracy: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8712
           1       1.00      1.00      1.00        48

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy: 0.9558441558441558

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       366
           1       1.00      0.11      0.19        19

    accuracy                           0.96       385
   macro avg       0.98      0.55      0.58       385
weighted avg       0.96      0.96      0.94       385

[0.06336826 0.04717156 0.73245614 0.04750588 0.06564744 0.04385072]


In [112]:
# Store all runs
results = []

for i in range(10):
    rf_model = RandomForestClassifier()
    rf_model.fit(train_x, train_y)

    y_pred = rf_model.predict(test_x)
    report = classification_report(test_y, y_pred, output_dict=True)

    # Store per-class metrics
    run_result = {'run': i + 1}
    for label in report:
        if label not in ('accuracy', 'macro avg', 'weighted avg'):
            run_result[f'precision_class_{label}'] = report[label]['precision']
            run_result[f'recall_class_{label}'] = report[label]['recall']
            run_result[f'f1_class_{label}'] = report[label]['f1-score']
    
    results.append(run_result)

# Results as DataFrame
results_df = pd.DataFrame(results)

# Summary (mean and std)
summary = results_df.describe().loc[['mean', 'std']]

print("\nSummary (Mean ± Std):")
print(summary)

print("\nAll Per-Class Results:")
print(results_df)


Summary (Mean ± Std):
          run  precision_class_0  recall_class_0  f1_class_0  \
mean  5.50000           0.955602        0.999727    0.977166   
std   3.02765           0.000037        0.000864    0.000432   

      precision_class_1  recall_class_1  f1_class_1  
mean           0.966667        0.105263    0.189610  
std            0.105409        0.000000    0.002738  

All Per-Class Results:
   run  precision_class_0  recall_class_0  f1_class_0  precision_class_1  \
0    1           0.955497        0.997268    0.975936           0.666667   
1    2           0.955614        1.000000    0.977303           1.000000   
2    3           0.955614        1.000000    0.977303           1.000000   
3    4           0.955614        1.000000    0.977303           1.000000   
4    5           0.955614        1.000000    0.977303           1.000000   
5    6           0.955614        1.000000    0.977303           1.000000   
6    7           0.955614        1.000000    0.977303           1.

In [113]:
# Compute confusion matrix
cm = confusion_matrix(test_y, y_pred)

# Ensure labels are 0 and 1
labels = [0, 1]  

# Create the confusion matrix heatmap
fig = go.Figure(data=go.Heatmap(
    z=cm,
    x=labels,  # Predicted labels
    y=labels,  # Actual labels
    colorscale='Blues',
    showscale=True,
    text=cm,  # Display numbers in the cells
    texttemplate="%{text}",  # Ensure counts appear inside the cells
    hovertemplate='Actual: %{y}<br>Predicted: %{x}<br>Count: %{z}<extra></extra>'
))

# Update layout for binary classification (0 and 1 on axes)
fig.update_layout(
    title="Confusion Matrix for meeting room opened window",
    width=500,   # Reduce width
    height=500,
    xaxis=dict(
        title="Predicted Label",
        tickmode='array',        # Custom tick labels
        tickvals=[0, 1],         # Only show 0 and 1
        showgrid=False
    ),
    yaxis=dict(
        title="Actual Label",
        tickmode='array',
        tickvals=[0, 1],         # Only show 0 and 1
        showgrid=False
    ),
    template="plotly_white"
)

# Show the plot
fig.show()

In [114]:
fig = go.Figure()

# Base temperature curve
fig.add_trace(go.Scatter(
    x=df_engen['Time'], 
    y=df_engen[test_temp], 
    mode='lines', 
    name='Meeting room faulty operation'
))

# Convert y_pred to a Pandas Series with the correct index
y_pred_series = pd.Series(y_pred, index=test_y.index)

# Find indices for different cases
correct_1_indices = (test_y == 1) & (y_pred_series == 1)  # True 1, Pred 1 -> Green
false_negative_indices = (test_y == 1) & (y_pred_series == 0)  # True 1, Pred 0 -> Red
false_positive_indices = (test_y == 0) & (y_pred_series == 1)  # True 0, Pred 1 -> Orange

# Extract timestamps
correct_1_timestamps = df_engen.loc[test_y.index, 'Time'][correct_1_indices]
false_negative_timestamps = df_engen.loc[test_y.index, 'Time'][false_negative_indices]
false_positive_timestamps = df_engen.loc[test_y.index, 'Time'][false_positive_indices]

# Extract corresponding temperatures
correct_1_temps = df_engen.loc[test_y.index, test_temp][correct_1_indices]
false_negative_temps = df_engen.loc[test_y.index, test_temp][false_negative_indices]
false_positive_temps = df_engen.loc[test_y.index, test_temp][false_positive_indices]

# Add markers for correct predictions (Green)
fig.add_trace(go.Scatter(
    x=correct_1_timestamps,
    y=correct_1_temps,
    mode='markers',
    marker=dict(color='green', size=10),
    name='Correct (1 predicted as 1)'
))

# Add markers for false negatives (Red)
fig.add_trace(go.Scatter(
    x=false_negative_timestamps,
    y=false_negative_temps,
    mode='markers',
    marker=dict(color='red', size=10, symbol='x'),
    name='False Negative (1 predicted as 0)'
))

# Add markers for false positives (Orange)
fig.add_trace(go.Scatter(
    x=false_positive_timestamps,
    y=false_positive_temps,
    mode='markers',
    marker=dict(color='orange', size=10, symbol='triangle-up'),
    name='False Positive (0 predicted as 1)'
))

# Customize layout
fig.update_layout(
    title="Temperatures in meeting room & Prediction Markers for fault 1",
    xaxis_title="Time",
    yaxis_title="Temperature (°C)",
    legend_title="Legend",
    template="plotly", 
    width = 800,
    height = 500
)

# Show the plot
fig.show()

### XGBoost

In [115]:
# Initialize the XGBoost Classifier
xgb_model = XGBClassifier()

# Train the model
xgb_model.fit(train_x, train_y)

# Make predictions
y_pred_train = xgb_model.predict(train_x)
y_pred = xgb_model.predict(test_x)

# Evaluate the model
print("Accuracy on Train Set:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy on Test Set:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))


Accuracy on Train Set: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8712
           1       1.00      1.00      1.00        48

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy on Test Set: 0.9688311688311688

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98       366
           1       0.73      0.58      0.65        19

    accuracy                           0.97       385
   macro avg       0.86      0.78      0.82       385
weighted avg       0.97      0.97      0.97       385



In [116]:
# Store all runs
results = []

for i in range(10):
    xgb_model = XGBClassifier()
    xgb_model.fit(train_x, train_y)

    y_pred = xgb_model.predict(test_x)
    report = classification_report(test_y, y_pred, output_dict=True)

    # Store per-class metrics
    run_result = {'run': i + 1}
    for label in report:
        if label not in ('accuracy', 'macro avg', 'weighted avg'):
            run_result[f'precision_class_{label}'] = report[label]['precision']
            run_result[f'recall_class_{label}'] = report[label]['recall']
            run_result[f'f1_class_{label}'] = report[label]['f1-score']
    
    results.append(run_result)

# Results as DataFrame
results_df = pd.DataFrame(results)

# Summary (mean and std)
summary = results_df.describe().loc[['mean', 'std']]

print("\nSummary (Mean ± Std):")
print(summary)

print("\nAll Per-Class Results:")
print(results_df)


Summary (Mean ± Std):
          run  precision_class_0  recall_class_0    f1_class_0  \
mean  5.50000           0.978378    9.890710e-01  9.836957e-01   
std   3.02765           0.000000    2.340556e-16  2.340556e-16   

      precision_class_1  recall_class_1    f1_class_1  
mean           0.733333    5.789474e-01  6.470588e-01  
std            0.000000    1.170278e-16  1.170278e-16  

All Per-Class Results:
   run  precision_class_0  recall_class_0  f1_class_0  precision_class_1  \
0    1           0.978378        0.989071    0.983696           0.733333   
1    2           0.978378        0.989071    0.983696           0.733333   
2    3           0.978378        0.989071    0.983696           0.733333   
3    4           0.978378        0.989071    0.983696           0.733333   
4    5           0.978378        0.989071    0.983696           0.733333   
5    6           0.978378        0.989071    0.983696           0.733333   
6    7           0.978378        0.989071    0.983696 

In [117]:
fig = go.Figure()

# Base temperature curve
fig.add_trace(go.Scatter(
    x=df_engen['Time'], 
    y=df_engen[test_temp], 
    mode='lines', 
    name='Mean air temperature'
))

# Convert y_pred to a Pandas Series with the correct index
y_pred_series = pd.Series(y_pred, index=test_y.index)

# Find indices for different cases
correct_1_indices = (test_y == 1) & (y_pred_series == 1)  # True 1, Pred 1 -> Green
false_negative_indices = (test_y == 1) & (y_pred_series == 0)  # True 1, Pred 0 -> Red
false_positive_indices = (test_y == 0) & (y_pred_series == 1)  # True 0, Pred 1 -> Orange

# Extract timestamps
correct_1_timestamps = df_engen.loc[test_y.index, 'Time'][correct_1_indices]
false_negative_timestamps = df_engen.loc[test_y.index, 'Time'][false_negative_indices]
false_positive_timestamps = df_engen.loc[test_y.index, 'Time'][false_positive_indices]

# Extract corresponding temperatures
correct_1_temps = df_engen.loc[test_y.index, test_temp][correct_1_indices]
false_negative_temps = df_engen.loc[test_y.index, test_temp][false_negative_indices]
false_positive_temps = df_engen.loc[test_y.index, test_temp][false_positive_indices]

# Add markers for correct predictions (Green)
fig.add_trace(go.Scatter(
    x=correct_1_timestamps,
    y=correct_1_temps,
    mode='markers',
    marker=dict(color='green', size=10),
    name='Correct (1 predicted as 1)'
))

# Add markers for false negatives (Red)
fig.add_trace(go.Scatter(
    x=false_negative_timestamps,
    y=false_negative_temps,
    mode='markers',
    marker=dict(color='red', size=10, symbol='x'),
    name='False Negative (1 predicted as 0)'
))

# Add markers for false positives (Orange)
fig.add_trace(go.Scatter(
    x=false_positive_timestamps,
    y=false_positive_temps,
    mode='markers',
    marker=dict(color='orange', size=10, symbol='triangle-up'),
    name='False Positive (0 predicted as 1)'
))

# Customize layout
fig.update_layout(
    title="Temperatures in meeting room & Prediction Markers for fault 1",
    xaxis_title="Time",
    yaxis_title="Temperature (°C)",
    legend_title="Legend",
    template="plotly", 
    width = 800,
    height = 500
)

# Show the plot
fig.show()

## 2) Fault 4
### Training on zone 1 binary class high setpoint  and testing on Engen Mileparken 

In [118]:
df_simulated = pd.read_csv('/Users/sararhiger/Desktop/Master thesis /Data/Processed data/Fault4_HighSetpoint_dataprocessed.csv')

df_engen = pd.read_csv('/Users/sararhiger/Desktop/Master thesis /Data/Processed data/Mileparken.csv')


In [119]:
# Define the start and end times for training data
start_time_train = "2023-01-01 00:00:00"  
end_time_train = "2023-12-31 23:00:00"  

# Define the start and end times for testing data
start_time_test = "2025-03-11 00:00:00"  
end_time_test = "2025-03-18 07:00:00"

# Filter DataFrame for timestamps within the range
df_train = df_simulated[(df_simulated['Time'] >= start_time_train) & (df_simulated['Time'] <= end_time_train)]

df_test = df_engen[(df_engen['Time'] >= start_time_test) & (df_engen['Time'] <= end_time_test)]

In [120]:
train_x= df_train[['Global Horizontal Irradiance, W/m2','Outdoor air temp','Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Ideal heaters and other local u']]
train_y = df_train['FDD, zone 1']

test_x = df_test[['GHI, W/m2','Outdoor temperature','TEMP °C', 'HUMIDITY %', 'CO2 ppm', 'Heating kWh']]
test_y = df_test['FDD']

# Define zones and variables for plots 
test_temp = 'TEMP °C'
test_heat = 'Heating kWh'

### Random Forest

In [121]:
# Initialize the Random Forest Classifier
#rf_model = RandomForestClassifier(class_weight='balanced')
rf_model = RandomForestClassifier()

# Train the model
rf_model.fit(train_x, train_y)

# Make predictions
y_pred_train = rf_model.predict(train_x)
y_pred = rf_model.predict(test_x)

# Evaluate the model
print("Accuracy:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))

print(rf_model.feature_importances_)

Accuracy: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8572
           4       1.00      1.00      1.00       188

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy: 0.9943181818181818

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00       172
           4       1.00      0.75      0.86         4

    accuracy                           0.99       176
   macro avg       1.00      0.88      0.93       176
weighted avg       0.99      0.99      0.99       176

[0.0417537  0.04219284 0.78117635 0.05766126 0.02818913 0.04902672]


In [122]:
# Store all runs
results = []

for i in range(10):
    rf_model = RandomForestClassifier()
    rf_model.fit(train_x, train_y)

    y_pred = rf_model.predict(test_x)
    report = classification_report(test_y, y_pred, output_dict=True)

    # Store per-class metrics
    run_result = {'run': i + 1}
    for label in report:
        if label not in ('accuracy', 'macro avg', 'weighted avg'):
            run_result[f'precision_class_{label}'] = report[label]['precision']
            run_result[f'recall_class_{label}'] = report[label]['recall']
            run_result[f'f1_class_{label}'] = report[label]['f1-score']
    
    results.append(run_result)

# Results as DataFrame
results_df = pd.DataFrame(results)

# Summary (mean and std)
summary = results_df.describe().loc[['mean', 'std']]

print("\nSummary (Mean ± Std):")
print(summary)

print("\nAll Per-Class Results:")
print(results_df)


Summary (Mean ± Std):
          run  precision_class_0  recall_class_0  f1_class_0  \
mean  5.50000           0.995954             1.0    0.997971   
std   3.02765           0.002792             0.0    0.001400   

      precision_class_4  recall_class_4  f1_class_4  
mean                1.0        0.825000    0.900000  
std                 0.0        0.120761    0.069007  

All Per-Class Results:
   run  precision_class_0  recall_class_0  f1_class_0  precision_class_4  \
0    1            0.99422             1.0    0.997101                1.0   
1    2            1.00000             1.0    1.000000                1.0   
2    3            0.99422             1.0    0.997101                1.0   
3    4            0.99422             1.0    0.997101                1.0   
4    5            0.99422             1.0    0.997101                1.0   
5    6            0.99422             1.0    0.997101                1.0   
6    7            1.00000             1.0    1.000000             

In [123]:
# Compute confusion matrix
cm = confusion_matrix(test_y, y_pred)

# Ensure labels are 0 and 1
labels = [0, 1]  

# Create the confusion matrix heatmap
fig = go.Figure(data=go.Heatmap(
    z=cm,
    x=labels,  # Predicted labels
    y=labels,  # Actual labels
    colorscale='Blues',
    showscale=True,
    text=cm,  # Display numbers in the cells
    texttemplate="%{text}",  # Ensure counts appear inside the cells
    hovertemplate='Actual: %{y}<br>Predicted: %{x}<br>Count: %{z}<extra></extra>'
))

# Update layout for binary classification (0 and 1 on axes)
fig.update_layout(
    title="Confusion Matrix for meeting room opened window",
    width=500,   # Reduce width
    height=500,
    xaxis=dict(
        title="Predicted Label",
        tickmode='array',        # Custom tick labels
        tickvals=[0, 1],         # Only show 0 and 1
        showgrid=False
    ),
    yaxis=dict(
        title="Actual Label",
        tickmode='array',
        tickvals=[0, 1],         # Only show 0 and 1
        showgrid=False
    ),
    template="plotly_white"
)

# Show the plot
fig.show()

In [124]:
fig = go.Figure()

# Base temperature curve
fig.add_trace(go.Scatter(
    x=df_engen['Time'], 
    y=df_engen[test_temp], 
    mode='lines', 
    name='Meeting room faulty operation'
))

# Convert y_pred to a Pandas Series with the correct index
y_pred_series = pd.Series(y_pred, index=test_y.index)

# Find indices for different cases
correct_1_indices = (test_y == 4) & (y_pred_series == 4)  # True 1, Pred 1 -> Green
false_negative_indices = (test_y == 4) & (y_pred_series == 0)  # True 1, Pred 0 -> Red
false_positive_indices = (test_y == 0) & (y_pred_series == 4)  # True 0, Pred 1 -> Orange

# Extract timestamps
correct_1_timestamps = df_engen.loc[test_y.index, 'Time'][correct_1_indices]
false_negative_timestamps = df_engen.loc[test_y.index, 'Time'][false_negative_indices]
false_positive_timestamps = df_engen.loc[test_y.index, 'Time'][false_positive_indices]

# Extract corresponding temperatures
correct_1_temps = df_engen.loc[test_y.index, test_temp][correct_1_indices]
false_negative_temps = df_engen.loc[test_y.index, test_temp][false_negative_indices]
false_positive_temps = df_engen.loc[test_y.index, test_temp][false_positive_indices]

# Add markers for correct predictions (Green)
fig.add_trace(go.Scatter(
    x=correct_1_timestamps,
    y=correct_1_temps,
    mode='markers',
    marker=dict(color='green', size=10),
    name='Correct (4 predicted as 4)'
))

# Add markers for false negatives (Red)
fig.add_trace(go.Scatter(
    x=false_negative_timestamps,
    y=false_negative_temps,
    mode='markers',
    marker=dict(color='red', size=10, symbol='x'),
    name='False Negative (4 predicted as 0)'
))

# Add markers for false positives (Orange)
fig.add_trace(go.Scatter(
    x=false_positive_timestamps,
    y=false_positive_temps,
    mode='markers',
    marker=dict(color='orange', size=10, symbol='triangle-up'),
    name='False Positive (0 predicted as 4)'
))

# Customize layout
fig.update_layout(
    title="Temperatures in meeting room & Prediction Markers for fault 4",
    xaxis_title="Time",
    yaxis_title="Temperature (°C)",
    legend_title="Legend",
    template="plotly"
)

# Show the plot
fig.show()

### XGBoost

In [125]:
train_y = train_y.replace(4, 1)
test_y = test_y.replace(4, 1)

# Initialize the XGBoost Classifier
xgb_model = XGBClassifier()

# Train the model
xgb_model.fit(train_x, train_y)

# Make predictions
y_pred_train = xgb_model.predict(train_x)
y_pred = xgb_model.predict(test_x)

# Evaluate the model
print("Accuracy on Train Set:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy on Test Set:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))

Accuracy on Train Set: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8572
           1       1.00      1.00      1.00       188

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy on Test Set: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       172
           1       1.00      1.00      1.00         4

    accuracy                           1.00       176
   macro avg       1.00      1.00      1.00       176
weighted avg       1.00      1.00      1.00       176



In [126]:
# Store all runs
results = []

for i in range(10):
    xgb_model = XGBClassifier()
    xgb_model.fit(train_x, train_y)

    y_pred = xgb_model.predict(test_x)
    report = classification_report(test_y, y_pred, output_dict=True)

    # Store per-class metrics
    run_result = {'run': i + 1}
    for label in report:
        if label not in ('accuracy', 'macro avg', 'weighted avg'):
            run_result[f'precision_class_{label}'] = report[label]['precision']
            run_result[f'recall_class_{label}'] = report[label]['recall']
            run_result[f'f1_class_{label}'] = report[label]['f1-score']
    
    results.append(run_result)

# Results as DataFrame
results_df = pd.DataFrame(results)

# Summary (mean and std)
summary = results_df.describe().loc[['mean', 'std']]

print("\nSummary (Mean ± Std):")
print(summary)

print("\nAll Per-Class Results:")
print(results_df)


Summary (Mean ± Std):
          run  precision_class_0  recall_class_0  f1_class_0  \
mean  5.50000                1.0             1.0         1.0   
std   3.02765                0.0             0.0         0.0   

      precision_class_1  recall_class_1  f1_class_1  
mean                1.0             1.0         1.0  
std                 0.0             0.0         0.0  

All Per-Class Results:
   run  precision_class_0  recall_class_0  f1_class_0  precision_class_1  \
0    1                1.0             1.0         1.0                1.0   
1    2                1.0             1.0         1.0                1.0   
2    3                1.0             1.0         1.0                1.0   
3    4                1.0             1.0         1.0                1.0   
4    5                1.0             1.0         1.0                1.0   
5    6                1.0             1.0         1.0                1.0   
6    7                1.0             1.0         1.0             

## 3) Multiclass model på Mileparken 

In [127]:
df_simulated = pd.read_csv('/Users/sararhiger/Desktop/Master thesis /Data/Processed data/Multiclass_dataprocessed.csv')

df_engen = pd.read_csv('/Users/sararhiger/Desktop/Master thesis /Data/Processed data/Mileparken.csv')


In [128]:
# Define the start and end times for training data
start_time_train = "2023-01-01 00:00:00"  
end_time_train = "2023-12-31 23:00:00"  

# Define the start and end times for testing data
start_time_test = "2025-03-11 00:00:00"  
end_time_test = "2025-03-31 07:00:00"

# Filter DataFrame for timestamps within the range
df_train = df_simulated[(df_simulated['Time'] >= start_time_train) & (df_simulated['Time'] <= end_time_train)]

df_test = df_engen[(df_engen['Time'] >= start_time_test) & (df_engen['Time'] <= end_time_test)]




In [129]:
train_x= df_train[['Global Horizontal Irradiance, W/m2','Outdoor air temp','Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Ideal heaters and other local u']]
train_y = df_train['FDD, zone 1']

test_x = df_test[['GHI, W/m2','Outdoor temperature','TEMP °C', 'HUMIDITY %', 'CO2 ppm', 'Heating kWh']]
test_y = df_test['FDD']

# Define zones and variables for plots 
test_temp = 'TEMP °C'
test_heat = 'Heating kWh'

In [130]:
# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier()

# Train the model
rf_model.fit(train_x, train_y)

# Make predictions
y_pred_train = rf_model.predict(train_x)
y_pred = rf_model.predict(test_x)

# Evaluate the model
print("Accuracy:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))

print(rf_model.feature_importances_)

Accuracy: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6976
           1       1.00      1.00      1.00        48
           2       1.00      1.00      1.00       103
           3       1.00      1.00      1.00      1445
           4       1.00      1.00      1.00       188

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy: 0.9569672131147541

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       465
           1       1.00      0.11      0.19        19
           4       0.00      0.00      0.00         4

    accuracy                           0.96       488
   macro avg       0.65      0.37      0.39       488
weighted avg       0.95      0.96      0.94       488

[0.02754447 0.03064252 0.10878296 0.03266064 0


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [131]:
results = []
expected_labels = ['0', '1', '2', '3', '4']  # All classes in training data

for i in range(10):
    rf_model = RandomForestClassifier()
    rf_model.fit(train_x, train_y)

    y_pred = rf_model.predict(test_x)
    report = classification_report(test_y, y_pred, output_dict=True, zero_division=0)

    run_result = {'run': i + 1}
    
    for label in expected_labels:
        if label in report:
            run_result[f'precision_class_{label}'] = report[label]['precision']
            run_result[f'recall_class_{label}'] = report[label]['recall']
            run_result[f'f1_class_{label}'] = report[label]['f1-score']
        else:
            run_result[f'precision_class_{label}'] = 0.0
            run_result[f'recall_class_{label}'] = 0.0
            run_result[f'f1_class_{label}'] = 0.0

    results.append(run_result)

# Build results DataFrame
results_df = pd.DataFrame(results)

# Confirm no NaNs
print("Any NaNs?", results_df.isna().any().any())

# Summary
summary = results_df.describe().loc[['mean', 'std']]
print(summary)

Any NaNs? False
          run  precision_class_0  recall_class_0  f1_class_0  \
mean  5.50000           0.959167        0.998925    0.978624   
std   3.02765           0.006398        0.002730    0.002363   

      precision_class_1  recall_class_1  f1_class_1  precision_class_2  \
mean                1.0        0.089474    0.163333                0.0   
std                 0.0        0.025423    0.043704                0.0   

      recall_class_2  f1_class_2  precision_class_3  recall_class_3  \
mean             0.0         0.0                0.0             0.0   
std              0.0         0.0                0.0             0.0   

      f1_class_3  precision_class_4  recall_class_4  f1_class_4  
mean         0.0                0.0             0.0         0.0  
std          0.0                0.0             0.0         0.0  


In [132]:
# Define all possible classes
all_classes = [0, 1, 2, 3, 4]  

# Compute confusion matrix with fixed labels
cm = confusion_matrix(test_y, y_pred, labels=all_classes)

# Convert to DataFrame
df_cm = pd.DataFrame(cm, index=all_classes, columns=all_classes)

# Create interactive Plotly heatmap
fig = px.imshow(df_cm, 
                text_auto=True,  
                color_continuous_scale='Blues',  
                labels=dict(x="Predicted", y="Actual", color="Count"))

# Set axis labels and tick values to only show whole numbers (0,1,2,3)
fig.update_layout(
    title="Confusion Matrix for meeting room",
    xaxis_title="Predicted Label",
    yaxis_title="Actual Label",
    xaxis=dict(tickmode='array', tickvals=[0,1,2,3,4], ticktext=['Fault free','Open window','Heating shut down','Defect CO2 sensor','Setpoint 25 degrees']),
    yaxis=dict(tickmode='array', tickvals=[0,1,2,3,4], ticktext=['Fault free','Open window','Heating shut down','Defect CO2 sensor ','Setpoint 25 degrees']), 
    width=700,  # Set width of the figure
    height=500  # Set height of the figure
)

# Show plot
fig.show()


## Multiclass model på Mileparken med XGboost

In [133]:
# Initialize the XGBoost Classifier
xgb_model = XGBClassifier()

# Train the model
xgb_model.fit(train_x, train_y)

# Make predictions
y_pred_train = xgb_model.predict(train_x)
y_pred = xgb_model.predict(test_x)

# Evaluate the model
print("Accuracy on Train Set:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy on Test Set:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))

print(xgb_model.feature_importances_)

Accuracy on Train Set: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6976
           1       1.00      1.00      1.00        48
           2       1.00      1.00      1.00       103
           3       1.00      1.00      1.00      1445
           4       1.00      1.00      1.00       188

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy on Test Set: 0.9528688524590164

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       465
           1       0.67      0.11      0.18        19
           4       0.00      0.00      0.00         4

    accuracy                           0.95       488
   macro avg       0.54      0.37      0.39       488
weighted avg       0.94      0.95      0.94       488

[0.02236849 0.0250659

In [134]:
# Store all runs
results = []

for i in range(10):
    xgb_model = XGBClassifier()
    xgb_model.fit(train_x, train_y)

    y_pred = xgb_model.predict(test_x)
    report = classification_report(test_y, y_pred, output_dict=True)

    # Store per-class metrics
    run_result = {'run': i + 1}
    for label in report:
        if label not in ('accuracy', 'macro avg', 'weighted avg'):
            run_result[f'precision_class_{label}'] = report[label]['precision']
            run_result[f'recall_class_{label}'] = report[label]['recall']
            run_result[f'f1_class_{label}'] = report[label]['f1-score']
    
    results.append(run_result)

# Results as DataFrame
results_df = pd.DataFrame(results)

# Summary (mean and std)
summary = results_df.describe().loc[['mean', 'std']]

print("\nSummary (Mean ± Std):")
print(summary)

print("\nAll Per-Class Results:")
print(results_df)


Summary (Mean ± Std):
          run  precision_class_0  recall_class_0    f1_class_0  \
mean  5.50000       9.566116e-01    9.956989e-01  9.757640e-01   
std   3.02765       2.340556e-16    1.170278e-16  1.170278e-16   

      precision_class_1  recall_class_1    f1_class_1  precision_class_4  \
mean       6.666667e-01        0.105263  1.818182e-01                0.0   
std        1.170278e-16        0.000000  2.925695e-17                0.0   

      recall_class_4  f1_class_4  
mean             0.0         0.0  
std              0.0         0.0  

All Per-Class Results:
   run  precision_class_0  recall_class_0  f1_class_0  precision_class_1  \
0    1           0.956612        0.995699    0.975764           0.666667   
1    2           0.956612        0.995699    0.975764           0.666667   
2    3           0.956612        0.995699    0.975764           0.666667   
3    4           0.956612        0.995699    0.975764           0.666667   
4    5           0.956612        0.99569

In [135]:
# Define all possible classes
all_classes = [0, 1, 2, 3, 4]  

# Compute confusion matrix with fixed labels
cm = confusion_matrix(test_y, y_pred, labels=all_classes)

# Convert to DataFrame
df_cm = pd.DataFrame(cm, index=all_classes, columns=all_classes)

# Create a copy for coloring — set [0,0] to NaN so it doesn't affect color scale
cm_for_plot = df_cm.copy()
cm_for_plot.iloc[0, 0] = np.nan  # exclude top-left from color scaling

# Create interactive Plotly heatmap
fig = px.imshow(
    cm_for_plot,
    text_auto=True,
    color_continuous_scale='Blues',
    labels=dict(x="Predicted", y="Actual", color="Count")
)

# Overlay the real value in top-left as custom annotation
fig.add_annotation(
    text=str(df_cm.iloc[0, 0]),  # Actual value
    x=0, y=0,
    showarrow=False,
    font=dict(color="red", size=12),
    bgcolor="white"
)

# Customize axis labels
fig.update_layout(
    title="Confusion Matrix for Zone 8",
    xaxis_title="Predicted Label",
    yaxis_title="Actual Label",
    xaxis=dict(
        tickmode='array',
        tickvals=[0, 1, 2, 3, 4],
        ticktext=[
            'Fault free', 'Open window', 'Heating shut down',
            'Defect CO2 sensor', 'Setpoint 25 degrees'
        ]
    ),
    yaxis=dict(
        tickmode='array',
        tickvals=[0, 1, 2, 3, 4],
        ticktext=[
            'Fault free', 'Open window', 'Heating shut down',
            'Defect CO2 sensor', 'Setpoint 25 degrees'
        ]
    ),
    width=700,
    height=500
)

# Show plot
fig.show()

In [136]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=df_engen['Time'], y=df_engen['TEMP °C'], mode='lines', name='TEMP °C'))

# Make sure 'Time' is datetime
df_engen['Time'] = pd.to_datetime(df_engen['Time'])

fig.update_layout(
    title="Temperatures in meeting room March 2025",
    xaxis_title="Time",
    yaxis_title="Temperature (°C)",
    legend_title="Temperature Curves",
    template="plotly",
    width = 1800
)

# Show the plot
fig.show()

## 4) Multiclass uden CO2 fault

In [137]:
df_simulated = pd.read_csv('/Users/sararhiger/Desktop/Master thesis /Data/Processed data/Multiclass_WithoutCO2.csv')
df_engen = pd.read_csv('/Users/sararhiger/Desktop/Master thesis /Data/Processed data/Mileparken.csv')


In [138]:
# Define the start and end times for training data
start_time_train = "2023-01-01 00:00:00"  
end_time_train = "2023-12-31 23:00:00"  

# Define the start and end times for testing data
start_time_test = "2025-03-11 00:00:00"  
end_time_test = "2025-03-31 07:00:00"

# Filter DataFrame for timestamps within the range
df_train = df_simulated[(df_simulated['Time'] >= start_time_train) & (df_simulated['Time'] <= end_time_train)]

df_test = df_engen[(df_engen['Time'] >= start_time_test) & (df_engen['Time'] <= end_time_test)]

In [139]:
train_x= df_train[['Global Horizontal Irradiance, W/m2','Outdoor air temp','Mean air temperature, degC, zone 1', 'Relative humidity, %, zone 1', 'CO2, ppm (vol), zone 1', 'Ideal heaters and other local u']]
train_y = df_train['FDD, zone 1']

test_x = df_test[['GHI, W/m2','Outdoor temperature','TEMP °C', 'HUMIDITY %', 'CO2 ppm', 'Heating kWh']]
test_y = df_test['FDD']

# Define zones and variables for plots 
test_temp = 'TEMP °C'
test_heat = 'Heating kWh'

In [140]:
# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(class_weight='balanced')

# Train the model
rf_model.fit(train_x, train_y)

# Make predictions
y_pred_train = rf_model.predict(train_x)
y_pred = rf_model.predict(test_x)

# Evaluate the model
print("Accuracy:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))

print(rf_model.feature_importances_)

Accuracy: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8421
           1       1.00      1.00      1.00        48
           2       1.00      1.00      1.00       103
           4       1.00      1.00      1.00       188

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy: 0.9569672131147541

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       465
           1       1.00      0.11      0.19        19
           4       0.00      0.00      0.00         4

    accuracy                           0.96       488
   macro avg       0.65      0.37      0.39       488
weighted avg       0.95      0.96      0.94       488

[0.18398415 0.15760692 0.36454908 0.11563837 0.07183874 0.10638274]



Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [141]:
# Define all possible classes
all_classes = [0, 1, 2, 4]  

# Compute confusion matrix with fixed labels
cm = confusion_matrix(test_y, y_pred, labels=all_classes)

# Convert to DataFrame
df_cm = pd.DataFrame(cm, index=all_classes, columns=all_classes)

# Create a copy for coloring — set [0,0] to NaN so it doesn't affect color scale
cm_for_plot = df_cm.copy()
cm_for_plot.iloc[0, 0] = np.nan  # exclude top-left from color scaling

# Create interactive Plotly heatmap
fig = px.imshow(
    cm_for_plot,
    text_auto=True,
    color_continuous_scale='Blues',
    labels=dict(x="Predicted", y="Actual", color="Count")
)

# Overlay the real value in top-left as custom annotation
fig.add_annotation(
    text=str(df_cm.iloc[0, 0]),  # Actual value
    x=0, y=0,
    showarrow=False,
    font=dict(color="red", size=12),
    bgcolor="white"
)

# Customize axis labels
fig.update_layout(
    title="Confusion Matrix for Zone 8",
    xaxis_title="Predicted Label",
    yaxis_title="Actual Label",
    xaxis=dict(
        tickmode='array',
        tickvals=[0, 1, 2, 4],
        ticktext=[
            'Fault free', 'Open window', 'Heating shut down', 'Setpoint 25 degrees'
        ]
    ),
    yaxis=dict(
        tickmode='array',
        tickvals=[0, 1, 2, 4],
        ticktext=[
            'Fault free', 'Open window', 'Heating shut down', 'Setpoint 25 degrees'
        ]
    ),
    width=700,
    height=500
)

# Show plot
fig.show()

In [142]:
train_y = np.where(train_y == 4, 3, train_y)
test_y = np.where(test_y == 4, 3, test_y)

# Initialize the XGBoost Classifier
xgb_model = XGBClassifier()

# Train the model
xgb_model.fit(train_x, train_y)

# Make predictions
y_pred_train = xgb_model.predict(train_x)
y_pred = xgb_model.predict(test_x)

# Evaluate the model
print("Accuracy on Train Set:", accuracy_score(train_y, y_pred_train))
print("Train Report:\n", classification_report(train_y, y_pred_train))
print()
print("Accuracy on Test Set:", accuracy_score(test_y, y_pred))
print("\nClassification Report:\n", classification_report(test_y, y_pred))

print(xgb_model.feature_importances_)

Accuracy on Train Set: 1.0
Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8421
           1       1.00      1.00      1.00        48
           2       1.00      1.00      1.00       103
           3       1.00      1.00      1.00       188

    accuracy                           1.00      8760
   macro avg       1.00      1.00      1.00      8760
weighted avg       1.00      1.00      1.00      8760


Accuracy on Test Set: 0.9549180327868853

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       465
           1       0.67      0.11      0.18        19
           3       0.00      0.00      0.00         4

    accuracy                           0.95       488
   macro avg       0.54      0.37      0.39       488
weighted avg       0.94      0.95      0.94       488

[0.12158141 0.13850498 0.40426373 0.12625456 0.09520453 0.11419083]
