1. Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.mixture import GaussianMixture


In [49]:
# Load the cleaned dataset, parse the 'time' column as datetimes,
# and order the rows by that column.
df = (
    pd.read_csv("air_12318.csv")
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
    .sort_values('time')
)

In [50]:
# ON/OFF detection: two-component Gaussian mixture fitted on log1p(power).
# Negative power readings are clipped to 0 and missing readings are filled
# with 0 before the log transform.
p = df['ptot_W'].clip(lower=0).fillna(0)
X = np.log1p(p.to_numpy()).reshape(-1, 1)

gmm = GaussianMixture(n_components=2, random_state=42)
gmm.fit(X)
labels = gmm.predict(X)

# The mixture component with the larger mean is treated as the ON state.
means = gmm.means_.flatten()
on_cluster = means.argmax()

# 1 where the row was assigned to the ON component, 0 otherwise.
df['is_running_gmm'] = np.where(labels == on_cluster, 1, 0)

In [51]:

# Cross-check with currents: flag rows whose summed phase currents exceed 0.1.
phase_current_sum = df[['ia_A', 'ib_A', 'ic_A']].sum(axis=1)
df['any_current'] = phase_current_sum.gt(0.1)

In [52]:
# Final ON/OFF label: a row counts as running when either the GMM label
# or the current cross-check says so.
gmm_says_on = df['is_running_gmm'].eq(1)
df['is_running'] = (gmm_says_on | df['any_current']).astype(int)

# Report the share of rows in each state.
print("ON/OFF split:", df['is_running'].value_counts(normalize=True))


ON/OFF split: is_running
0    0.872446
1    0.127554
Name: proportion, dtype: float64


In [53]:
# Keep only the rows labelled as running. The copy makes df_on an independent
# frame, so columns added to it later do not touch df.
on_mask = df['is_running'].eq(1)
df_on = df.loc[on_mask].copy()

2. Feature Engineering

In [54]:
# Current imbalance: row-wise standard deviation across the three phase
# currents, computed only when all three columns are present.
phase_current_cols = ['ia_A', 'ib_A', 'ic_A']
if set(phase_current_cols).issubset(df_on.columns):
    df_on['current_imbalance'] = df_on[phase_current_cols].std(axis=1)

In [55]:
# Voltage imbalance (if available)
# Every column whose name starts with "mv" (case-insensitive) is treated as a
# voltage channel.
voltage_cols = [c for c in df_on.columns if c.lower().startswith('mv')]

# Columns with "-" in the name (e.g. "mVb-mVc_V") appear to be phase-to-phase
# readings. Taking one std() across those and the single-phase columns would
# mostly measure the offset between the two groups rather than imbalance, so
# only the single-phase columns are used when at least three exist.
# NOTE(review): the naming interpretation is an assumption - confirm against
# the data dictionary.
phase_voltage_cols = [c for c in voltage_cols if '-' not in c]
imbalance_cols = phase_voltage_cols if len(phase_voltage_cols) >= 3 else voltage_cols

if len(imbalance_cols) >= 3:
    # Row-wise standard deviation across the selected voltage columns.
    df_on['voltage_imbalance'] = df_on[imbalance_cols].std(axis=1)

In [56]:
# Power factor anomaly: absolute distance of 'pftot_None' from the ideal
# value of 1. Skipped when the column is absent.
if 'pftot_None' in df_on.columns:
    pf_gap = 1 - df_on['pftot_None']
    df_on['pf_anomaly'] = pf_gap.abs()

In [57]:
# Temperature rate of change (difference between consecutive rows of df_on)
# Columns already ending in "_roc" are skipped: they are outputs of this cell,
# and without the guard a re-run would add "*_roc_roc" columns.
temp_cols = [
    c for c in df_on.columns
    if 'temp' in c.lower() and not c.endswith('_roc')
]
# NOTE(review): df_on holds only rows with is_running == 1, so consecutive rows
# may not be adjacent in time; diff() is per row, not per unit of time.
for col in temp_cols:
    df_on[f'{col}_roc'] = df_on[col].diff()

In [58]:
# Fuel rate of change (difference between consecutive rows of df_on)
# Columns already ending in "_roc" are skipped: they are outputs of this cell,
# and without the guard a re-run would add "*_roc_roc" columns.
fuel_cols = [
    c for c in df_on.columns
    if 'fuel' in c.lower() and not c.endswith('_roc')
]
# NOTE(review): df_on holds only rows with is_running == 1, so consecutive rows
# may not be adjacent in time; diff() is per row, not per unit of time.
for col in fuel_cols:
    df_on[f'{col}_roc'] = df_on[col].diff()

In [59]:
# Rolling mean and standard deviation over a fixed number of rows,
# for whichever of the target columns exist in df_on.
rolling_window = 60  # adjust (e.g., 60 rows ~ 1h if data is per minute)
rolling_targets = [name for name in ('ptot_W', 'ia_A', 'pf_anomaly') if name in df_on.columns]
for col in rolling_targets:
    # min_periods=1 lets the first rows use however many values are available.
    windowed = df_on[col].rolling(window=rolling_window, min_periods=1)
    df_on[f'{col}_rollmean'] = windowed.mean()
    df_on[f'{col}_rollstd'] = windowed.std()

In [60]:
# Correlation screen: list feature pairs whose absolute correlation exceeds 0.95.
# feature_cols is built here so the cell does not depend on a later cell having
# been run first (it would otherwise raise NameError on a fresh kernel).
feature_cols = [c for c in df_on.columns if c not in ['time']]

# numeric_only=True keeps corr() from failing if a non-numeric column is present.
corr = df_on[feature_cols].corr(numeric_only=True).abs()

# Each qualifying pair is listed in both orders, (i, j) and (j, i).
redundant = [
    (i, j, corr.loc[i, j])
    for i in corr.columns
    for j in corr.columns
    if i != j and corr.loc[i, j] > 0.95
]
print("Highly correlated feature pairs:", redundant[:10])


Highly correlated feature pairs: [('mVa_V', 'mVb_V', 0.9999502516651855), ('mVa_V', 'mVc_v', 0.9999637243960561), ('mVa_V', 'mVa-mBb_V', 0.999989740932252), ('mVa_V', 'mVb-mVc_V', 0.9999674695903271), ('mVa_V', 'mVc-mVa_V', 0.9999752114586884), ('mVa_V', 'va_V', 0.9857583600219197), ('mVa_V', 'vb_V', 0.985750081425482), ('mVa_V', 'vc_V', 0.9858585061662808), ('mVa_V', 'va-vb_V', 0.9856732994004986), ('mVa_V', 'vb-vc_V', 0.9858023112246473)]


3. Save Outputs

In [None]:
# Save the engineered ON-row features: the full frame as parquet and a
# random sample as CSV.
feature_cols = [c for c in df_on.columns if c not in ['time']]
df_on.to_parquet("air_12318_features_full.parquet")   # big file, keep local

# Cap the sample size at the number of available rows; sampling more rows than
# exist without replacement raises ValueError.
sample_size = min(50000, len(df_on))
df_on.sample(sample_size, random_state=42).to_csv("air_12318_features_sample.csv", index=False)  # smaller, push to Git

print("Feature engineering complete. Full saved as parquet, sample saved as CSV.")

Feature engineering complete. Full saved as parquet, sample saved as CSV.
