In [31]:
# need to install in terminal
# pip install torch torchtuples lifelines
# pip install pyarrow


In [32]:
# imports
import pandas as pd
import numpy as np
# import torch
import torch.nn as nn
import torchtuples as tt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from lifelines.utils import concordance_index

In [33]:
# Load the raw CSV export (malformed lines are skipped) and order rows by "time".
df = (
    pd.read_csv("../../data/raw/air_12318.csv", on_bad_lines='skip')
    .sort_values("time")
    .reset_index(drop=True)
)

# Generator state: a row is flagged as running when the summed current/power
# columns are positive, or when the pressure reading is positive.
# TODO: double-check that this definition is correct.
df['is_running'] = (
    df[['ia_A', 'ib_A', 'ic_A', 'pa_W', 'pb_W', 'pc_W']].sum(axis=1).gt(0)
    | df['pressure_Bar'].gt(0)
).astype(int)


# Columns that are compared against their threshold band, in checking order.
numeric_cols = [
    'va_V', 'vb_V', 'vc_V',
    'va-vb_V', 'vb-vc_V', 'vc-va_V',
    'mVa_V', 'mVb_V', 'mVc_v',
    'mVa-mBb_V', 'mVb-mVc_V', 'mVc-mVa_V',
    'ia_A', 'ib_A', 'ic_A',
    'pa_W', 'pb_W', 'pc_W', 'ptot_W',
    'qa_Var', 'qb_Var', 'qc_Var', 'qtot_Var',
    'sa_VA', 'sb_VA', 'sc_VA', 'stot_VA',
    'pfa_None', 'pfb_None', 'pfc_None', 'pftot_None',
    'freq_Hz*10',
    'temp_Degrees Celsius', 'pressure_Bar', 'fuel_%', 'vbat_V',
]

In [34]:
# Thresholds for generator monitoring columns: column name -> (low, high).
# Columns that share a band are grouped with dict.fromkeys; key order is kept
# identical to the original flat listing.
thresholds = {
    # Voltages (V)
    **dict.fromkeys(
        ("va_V", "vb_V", "vc_V", "mVa_V", "mVb_V", "mVc_v"),
        (180, 260),
    ),
    **dict.fromkeys(
        ("va-vb_V", "vb-vc_V", "vc-va_V", "mVa-mBb_V", "mVb-mVc_V", "mVc-mVa_V"),
        (310, 450),
    ),

    # Currents (A)
    **dict.fromkeys(("ia_A", "ib_A", "ic_A"), (0, 120)),

    # Active Power (W)
    **dict.fromkeys(("pa_W", "pb_W", "pc_W"), (0, 110000)),
    "ptot_W": (0, 330000),

    # Reactive Power (Var)
    **dict.fromkeys(("qa_Var", "qb_Var", "qc_Var"), (0, 100000)),
    "qtot_Var": (0, 300000),

    # Apparent Power (VA)
    **dict.fromkeys(("sa_VA", "sb_VA", "sc_VA"), (0, 120000)),
    "stot_VA": (0, 360000),

    # Power Factor
    **dict.fromkeys(
        ("pfa_None", "pfb_None", "pfc_None", "pftot_None"),
        (0.7, 1.0),
    ),

    # Energy (no upper bound)
    **dict.fromkeys(("expwh_Kwh*10", "expvar_Kvarh*10"), (0, float("inf"))),

    # Frequency
    "freq_Hz*10": (495, 505),  # 49.5 - 50.5 Hz

    # Environmental / Fuel / Battery
    "temp_Degrees Celsius": (0, 90),
    "pressure_Bar": (1, 2.5),
    "fuel_%": (0, 100),
    "vbat_V": (11, 14),

    # Runtime (no upper bound)
    "hours_sec": (0, float("inf")),
}


In [None]:
# Many flagged events do not make sense, so treat a 0 V reading as missing (NaN)
# to see whether any other fields are also triggering events.
voltage_cols = [
    'va_V', 'vb_V', 'vc_V',
    'va-vb_V', 'vb-vc_V', 'vc-va_V',
]
df[voltage_cols] = df[voltage_cols].replace(to_replace=0, value=np.nan)

In [46]:
# mark any event --> a running sample with any field outside its threshold band
def detect_event(row, cols=None, limits=None):
    """Return 1 if a running sample has any reading outside its band, else 0.

    Parameters
    ----------
    row : pandas.Series or dict-like
        One sample; must support ``.get`` and item access by column name.
        Rows whose ``"is_running"`` value is missing or not 1 never count
        as events.
    cols : iterable of str, optional
        Columns to check, in order. Defaults to the notebook-level
        ``numeric_cols`` so ``df.apply(detect_event, axis=1)`` keeps working.
    limits : mapping of str -> (low, high), optional
        Inclusive bounds per column. Defaults to the notebook-level
        ``thresholds``.

    Returns
    -------
    int
        1 when at least one non-null value is below ``low`` or above ``high``;
        0 otherwise.

    Raises
    ------
    KeyError
        If a checked column has no entry in ``limits`` (or is absent from a
        Series row). Left loud on purpose so a misspelled column name is
        not silently skipped.
    """
    # Resolve defaults at call time so later edits to the globals are honoured.
    if cols is None:
        cols = numeric_cols
    if limits is None:
        limits = thresholds

    if row.get("is_running", 0) != 1:
        return 0

    for col in cols:
        low, high = limits[col]
        val = row[col]
        # Missing readings (NaN/None) are ignored rather than flagged.
        if pd.notnull(val) and (val < low or val > high):
            return 1
    return 0

# Row-wise pass: flag every sample with detect_event.
df['event'] = df.apply(detect_event, axis="columns")

In [47]:
# debug the event flag: report which reading triggered it, to check that
# events are being created accurately
def detect_event_debug(row, cols=None, limits=None):
    """Describe the first out-of-band reading of a running sample.

    Applies the same rule as ``detect_event`` but returns a description
    instead of a 0/1 flag.

    Parameters
    ----------
    row : pandas.Series or dict-like
        One sample; must support ``.get`` and item access by column name.
    cols : iterable of str, optional
        Columns to check, in order. Defaults to the notebook-level
        ``numeric_cols``.
    limits : mapping of str -> (low, high), optional
        Inclusive bounds per column. Defaults to the notebook-level
        ``thresholds``.

    Returns
    -------
    str or None
        ``"<col>: <val> out of [<low>, <high>]"`` for the first offending
        column in ``cols`` order, or ``None`` when the sample is not running
        or every non-null reading is within bounds.

    Raises
    ------
    KeyError
        If a checked column has no entry in ``limits`` (or is absent from a
        Series row).
    """
    # Resolve defaults at call time so later edits to the globals are honoured.
    if cols is None:
        cols = numeric_cols
    if limits is None:
        limits = thresholds

    if row.get("is_running", 0) != 1:
        return None

    for col in cols:
        low, high = limits[col]
        val = row[col]
        # Missing readings (NaN/None) are ignored rather than reported.
        if pd.notnull(val) and (val < low or val > high):
            return f"{col}: {val} out of [{low:.2f}, {high:.2f}]"
    return None

# Show the triggering reading (or None) for each of the first 50 rows.
print(df.iloc[:50].apply(detect_event_debug, axis="columns"))

0                                     None
1                                     None
2                                     None
3                                     None
4                                     None
5                                     None
6                                     None
7                                     None
8                                     None
9                                     None
10                                    None
11                                    None
12                                    None
13                                    None
14                                    None
15                                    None
16                                    None
17                                    None
18                                    None
19                                    None
20                                    None
21                                    None
22                                    None
23    vc-va

In [38]:
# Recompute the event flag for every row, then preview the first 20 rows.
df["event"] = df.apply(detect_event, axis="columns")

print(df.loc[:, ["id", "time", "event"]].head(20))

         id                       time  event
0   2258572  2023-02-24 08:56:14+00:00      0
1   2271340  2023-02-28 11:05:31+00:00      0
2   2271341  2023-02-28 11:05:41+00:00      0
3   2271342  2023-02-28 11:05:51+00:00      0
4   2271345  2023-02-28 11:06:01+00:00      0
5   2271346  2023-02-28 11:06:11+00:00      0
6   2271347  2023-02-28 11:06:21+00:00      0
7   2271352  2023-02-28 11:06:31+00:00      0
8   2271353  2023-02-28 11:06:41+00:00      0
9   2271354  2023-02-28 11:06:51+00:00      0
10  2271356  2023-02-28 11:07:01+00:00      0
11  2271357  2023-02-28 11:07:11+00:00      0
12  2271358  2023-02-28 11:07:21+00:00      0
13  2271361  2023-02-28 11:07:31+00:00      0
14  2271362  2023-02-28 11:07:41+00:00      0
15  2271364  2023-02-28 11:07:51+00:00      0
16  2271365  2023-02-28 11:08:01+00:00      0
17  2271366  2023-02-28 11:08:11+00:00      0
18  2271367  2023-02-28 11:08:21+00:00      0
19  2271368  2023-02-28 11:08:31+00:00      0


In [39]:
# get duration of each uninterrupted run of event rows
df = df.sort_values(by="time").reset_index(drop=True)  # ensure the data is sorted by date

# A new group id starts every time the event flag differs from the previous row.
df['event_group'] = df['event'].ne(df['event'].shift()).cumsum()

# Keep only event rows, then take the first/last timestamp of each block.
events = df.loc[df['event'].eq(1)]
durations = events.groupby('event_group')['time'].agg(['min', 'max'])
durations = durations.assign(
    min=pd.to_datetime(durations['min']),
    max=pd.to_datetime(durations['max']),
)
durations['duration_seconds'] = durations['max'].sub(durations['min']).dt.total_seconds()

print(durations)


                                  min                       max  \
event_group                                                       
2           2023-02-28 11:11:18+00:00 2023-03-25 05:40:35+00:00   
4           2023-03-25 05:44:35+00:00 2023-03-25 05:52:35+00:00   
6           2023-03-25 05:58:35+00:00 2023-03-25 06:08:35+00:00   
8           2023-03-25 06:12:35+00:00 2023-03-25 06:32:35+00:00   
10          2023-03-25 06:38:35+00:00 2023-03-25 07:12:35+00:00   
...                               ...                       ...   
132         2025-07-08 07:34:29+00:00 2025-07-08 07:38:29+00:00   
134         2025-07-10 04:52:29+00:00 2025-07-10 04:54:28+00:00   
136         2025-07-17 07:45:56+00:00 2025-07-17 07:49:56+00:00   
138         2025-07-24 04:58:35+00:00 2025-07-24 04:58:35+00:00   
140         2025-07-31 04:57:08+00:00 2025-07-31 04:57:08+00:00   

             duration_seconds  
event_group                    
2                   2140157.0  
4                       480.0  


In [40]:
# preprocessing
# NOTE(review): X (feature matrix), T (durations) and E (event indicators) are
# not defined in any cell above, which is why this cell last failed with
# NameError. They still have to be built from `df` before this cell can run.

# Split first so the validation rows never influence the scaling statistics.
# The target keeps T in column 0 and E in column 1.
X_train, X_val, y_train, y_val = train_test_split(
    X, np.column_stack([T, E]), test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only and reuse those statistics for the
# validation rows. Fitting on all rows before splitting leaks validation
# information into training. X itself is left untouched, so re-running this
# cell gives the same result.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

NameError: name 'X' is not defined

In [None]:
# define deepsurv network
class DeepSurvNet(nn.Module):
    """Feed-forward network mapping a feature vector to one scalar risk score.

    Every hidden layer is ``Linear -> ReLU -> Dropout``; a final ``Linear``
    layer reduces the last hidden width to a single output per sample.

    Parameters
    ----------
    in_features : int
        Number of input features per sample.
    hidden_nodes : sequence of int, default (128, 64)
        Width of each hidden layer, in order. An empty sequence yields a
        single linear layer from ``in_features`` to 1. The default is a tuple
        rather than a list so it cannot be mutated between instantiations.
    dropout : float, default 0.3
        Dropout probability applied after every hidden activation.
    """

    def __init__(self, in_features, hidden_nodes=(128, 64), dropout=0.3):
        super().__init__()
        layers = []
        prev_nodes = in_features
        for nodes in hidden_nodes:
            layers += [
                nn.Linear(prev_nodes, nodes),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
            prev_nodes = nodes
        layers.append(nn.Linear(prev_nodes, 1))  # final risk score
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for ``x``, one value per input row."""
        return self.net(x)

# The network input width is the number of columns of the training matrix.
in_features = X_train.shape[1]
net = DeepSurvNet(in_features=in_features)


In [None]:
# Wrap in torchtuples model
# NOTE(review): this cell has never been executed. `torchtuples.practical`
# does not appear to export a `DeepSurv` class -- the Cox/DeepSurv model
# wrapper is presumably `pycox.models.CoxPH`, which this notebook does not
# import. Confirm against the installed packages before relying on this line.
model = tt.practical.DeepSurv(net, tt.optim.Adam)
# Set the optimizer's learning rate to 1e-3.
model.optimizer.set_lr(1e-3)

In [None]:
# train
batch_size = 256
epochs = 50

# (features, target) pairs for the training and validation splits
train_data = (X_train, y_train)
val_data = (X_val, y_val)

# Features and target are passed positionally, followed by batch size and
# epoch count; the validation pair is passed via val_data.
log = model.fit(*train_data, batch_size, epochs, val_data=val_data, verbose=True)

In [None]:
# evaluate concordance index
# concordance_index is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.

# Predict risk scores (flattened to one value per validation row)
risk_scores = model.predict(X_val).reshape(-1)

# y_val column 0 holds T and column 1 holds E (stacked in that order in the
# preprocessing cell). The scores are negated because a higher risk should
# correspond to a shorter time, while concordance_index ranks predictions in
# the same direction as the observed times.
c_index = concordance_index(y_val[:, 0], -risk_scores, y_val[:, 1])
print("Validation Concordance Index (C-index):", c_index)

In [None]:
# tests to compare accuracy to other models