# 02 - Failure Pattern Analysis

Dig into the 11 failure events to understand precursor signals in sensor data.

**Goal**: Identify which sensor patterns precede failures and how far in advance they appear.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (14, 5)

%matplotlib inline

In [None]:
# Load the raw inputs, parsing every date column at read time so later
# cells can do datetime arithmetic directly.
sensor_df = pd.read_csv(
    'data/raw/sensor_readings.csv',
    parse_dates=['timestamp'],
)
maint_date_columns = ['install_date', 'last_inspection_date', 'event_date']
maint_df = pd.read_csv(
    'data/raw/maintenance_log.csv',
    parse_dates=maint_date_columns,
)

# Keep only failure events; copy so later column additions do not touch maint_df.
is_failure_event = maint_df['event_type'] == 'FAILURE'
failures = maint_df.loc[is_failure_event].copy()
print(f'Total failures: {len(failures)}')
failures[['equipment_id', 'event_date']].reset_index(drop=True)

## 1. Sensor Behaviour Before Failures

Let's look at sensor readings in the 60 days before each failure event.

In [None]:
def get_prefailure_window(equipment_id, failure_date, window_days=60, readings=None):
    """Return the sensor readings of one unit in the window before a failure.

    Parameters
    ----------
    equipment_id
        Value matched against the ``equipment_id`` column.
    failure_date
        End of the window (inclusive). Anything ``pd.Timestamp`` accepts.
    window_days : int, default 60
        Length of the look-back window in days; the start is inclusive.
    readings : pandas.DataFrame, optional
        Frame with ``equipment_id`` and ``timestamp`` columns. Defaults to the
        notebook-level ``sensor_df`` so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        A copy of the matching rows sorted by ``timestamp``, with an added
        ``days_before_failure`` column (whole days between the reading and
        ``failure_date``).
    """
    if readings is None:
        readings = sensor_df

    # Coerce once so date strings and datetime objects behave like Timestamps.
    failure_date = pd.Timestamp(failure_date)
    window_start = failure_date - pd.Timedelta(days=window_days)

    in_window = (
        (readings['equipment_id'] == equipment_id)
        & readings['timestamp'].between(window_start, failure_date)
    )
    # Copy so adding the helper column never mutates the caller's frame.
    df = readings.loc[in_window].sort_values('timestamp').copy()
    df['days_before_failure'] = (failure_date - df['timestamp']).dt.days
    return df

In [None]:
# Plot oil temp and winding temp for all failed transformers.
# The grid is sized from the number of failures so no event is silently
# dropped, and every unused panel is hidden (not only the last one).
n_cols = 3
n_rows = max(1, int(np.ceil(len(failures) / n_cols)))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 4 * n_rows), squeeze=False)
axes = axes.flatten()

for i, (ax, (_, failure)) in enumerate(zip(axes, failures.iterrows())):
    window = get_prefailure_window(failure['equipment_id'], failure['event_date'])

    ax.plot(window['days_before_failure'], window['oil_temp_top_celsius'],
            'b-', alpha=0.7, label='Oil Top', linewidth=1.2)
    ax.plot(window['days_before_failure'], window['winding_temp_celsius'],
            'r-', alpha=0.7, label='Winding', linewidth=1.2)
    ax.axvline(x=0, color='black', linestyle='--', alpha=0.5, label='Failure')
    # Time runs left to right towards the failure at x = 0.
    ax.invert_xaxis()
    ax.set_title(f"{failure['equipment_id']} ({failure['event_date'].strftime('%Y-%m-%d')})", fontsize=10)
    ax.set_xlabel('Days before failure')
    # One legend is enough; every panel uses the same line styles.
    if i == 0:
        ax.legend(fontsize=8)

# Hide every panel that did not receive a failure event.
for unused_ax in axes[len(failures):]:
    unused_ax.set_visible(False)

plt.suptitle('Oil & Winding Temperature Before Failures (60-day window)', fontsize=14, y=1.01)
plt.tight_layout()
plt.show()

Interesting patterns emerge:

- Some failures show a clear temperature ramp-up 2-4 weeks before (likely insulation degradation)
- A couple show sharp spikes just 2-3 days before (likely cooling system failures)
- A few show NO obvious pattern in temperatures (sudden/unexpected failures)

Let me try to categorise these more systematically.

## 2. Quantifying Temperature Trends Before Failure

Compare mean temperatures over the last 7 days before failure against a baseline window 30-60 days before failure.

In [None]:
# Columns are declared up front so the summary keeps its schema (and the
# plotting cell below keeps working) even if every failure is skipped.
analysis_columns = [
    'equipment_id', 'failure_date',
    'oil_temp_baseline', 'oil_temp_last7d', 'oil_temp_change',
    'winding_baseline', 'winding_last7d', 'winding_change',
]
results = []
skipped = []  # equipment ids dropped for lack of data, reported below

for _, failure in failures.iterrows():
    window = get_prefailure_window(failure['equipment_id'], failure['event_date'])
    if len(window) < 10:
        skipped.append(failure['equipment_id'])
        continue

    # Last 7 days before failure
    last_7 = window[window['days_before_failure'] <= 7]
    # Baseline: 30-60 days before
    baseline = window[(window['days_before_failure'] >= 30) & (window['days_before_failure'] <= 60)]

    if len(baseline) < 5 or len(last_7) < 3:
        skipped.append(failure['equipment_id'])
        continue

    # Compute each mean once and reuse it for the delta.
    oil_baseline = baseline['oil_temp_top_celsius'].mean()
    oil_last7d = last_7['oil_temp_top_celsius'].mean()
    winding_baseline = baseline['winding_temp_celsius'].mean()
    winding_last7d = last_7['winding_temp_celsius'].mean()

    results.append({
        'equipment_id': failure['equipment_id'],
        'failure_date': failure['event_date'],
        'oil_temp_baseline': oil_baseline,
        'oil_temp_last7d': oil_last7d,
        'oil_temp_change': oil_last7d - oil_baseline,
        'winding_baseline': winding_baseline,
        'winding_last7d': winding_last7d,
        'winding_change': winding_last7d - winding_baseline,
    })

# Make dropped events visible instead of silently shrinking the sample.
if skipped:
    print(f'Skipped {len(skipped)} failure(s) with too few readings: {skipped}')

failure_analysis = pd.DataFrame(results, columns=analysis_columns)
failure_analysis.round(2)

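The temperature change columns (`oil_temp_change`, `winding_change`) are key. Positive values mean temperatures were higher in the last 7 days than in the baseline window, i.e. rising before the failure. Let's see which failures show the biggest delta.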

In [None]:
# One panel per temperature channel:
# (change column, threshold above which a bar is red, x label, title).
panels = [
    ('oil_temp_change', 3, 'Oil Temp Change (°C)', 'Oil Temp Change: Last 7d vs Baseline'),
    ('winding_change', 4, 'Winding Temp Change (°C)', 'Winding Temp Change: Last 7d vs Baseline'),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, (change_column, red_above, x_label, title) in zip(axes, panels):
    deltas = failure_analysis[change_column]

    # Red above the channel threshold, green below 1 °C, amber otherwise.
    bar_colors = []
    for delta in deltas:
        if delta > red_above:
            bar_colors.append('#d9534f')
        elif delta < 1:
            bar_colors.append('#5cb85c')
        else:
            bar_colors.append('#f0ad4e')

    ax.barh(failure_analysis['equipment_id'], deltas, color=bar_colors)
    ax.set_xlabel(x_label)
    ax.set_title(title)
    # Zero line separates warming from cooling.
    ax.axvline(x=0, color='black', linestyle='-', alpha=0.3)

plt.tight_layout()
plt.show()

## 3. Failed vs Non-Failed Equipment Comparison

Compare average sensor readings of equipment that eventually failed vs those that didn't.

In [None]:
# Flag every reading by whether its equipment appears in the failure list.
failed_ids = failures['equipment_id'].unique()

sensor_df['failed_equipment'] = sensor_df['equipment_id'].isin(failed_ids)

comparison = sensor_df.groupby('failed_equipment')[[
    'oil_temp_top_celsius', 'oil_temp_bottom_celsius',
    'load_mva', 'winding_temp_celsius'
]].mean().round(2)

# Map labels by value rather than by position: assigning a two-item list to
# the index raises ValueError when only one group is present and silently
# depends on the False/True sort order.
comparison = comparison.rename(
    index={False: 'Non-Failed', True: 'Failed'}
).rename_axis(None)
comparison

In [None]:
# Statistical test: are oil temps significantly different?
# Welch's t-test (equal_var=False) is used because nothing guarantees the two
# groups share a variance or a sample size, which Student's t-test assumes.
# NOTE(review): readings from the same transformer are not independent
# samples, so treat the p-value as indicative rather than exact.
is_failed_reading = sensor_df['failed_equipment']
failed_temps = sensor_df.loc[is_failed_reading, 'oil_temp_top_celsius'].dropna()
nonfailed_temps = sensor_df.loc[~is_failed_reading, 'oil_temp_top_celsius'].dropna()

t_stat, p_value = stats.ttest_ind(failed_temps, nonfailed_temps, equal_var=False)
print('Oil Temp Top - Failed vs Non-Failed:')
print(f'  Failed mean:     {failed_temps.mean():.2f}°C')
print(f'  Non-failed mean: {nonfailed_temps.mean():.2f}°C')
print(f'  t-statistic:     {t_stat:.3f}')
print(f'  p-value:         {p_value:.4f}')
print(f'  Significant:     {"Yes" if p_value < 0.05 else "No"}')

In [None]:
# Same comparison for winding temperature.
# Welch's t-test (equal_var=False) is used because nothing guarantees the two
# groups share a variance or a sample size, which Student's t-test assumes.
# NOTE(review): readings from the same transformer are not independent
# samples, so treat the p-value as indicative rather than exact.
is_failed_reading = sensor_df['failed_equipment']
failed_winding = sensor_df.loc[is_failed_reading, 'winding_temp_celsius'].dropna()
nonfailed_winding = sensor_df.loc[~is_failed_reading, 'winding_temp_celsius'].dropna()

t_stat, p_value = stats.ttest_ind(failed_winding, nonfailed_winding, equal_var=False)
print('Winding Temp - Failed vs Non-Failed:')
print(f'  Failed mean:     {failed_winding.mean():.2f}°C')
print(f'  Non-failed mean: {nonfailed_winding.mean():.2f}°C')
print(f'  t-statistic:     {t_stat:.3f}')
print(f'  p-value:         {p_value:.4f}')
# Report the 5% decision as well, matching the oil-temperature summary.
print(f'  Significant:     {"Yes" if p_value < 0.05 else "No"}')

The overall averages may not differ much because failures are rare events and the pre-failure signal only appears in a narrow time window. The pattern is in the **trend**, not the absolute level.

This is why rolling window features (trends, rate of change) will be more important than static averages.

## 4. Equipment Age vs Failure

In [None]:
# One row per unit: first recorded install date, age as of 2024-01-01,
# and whether the unit appears in the failure list.
equipment_info = (
    maint_df.groupby('equipment_id')['install_date']
    .first()
    .reset_index()
)
age_in_days = (pd.Timestamp('2024-01-01') - equipment_info['install_date']).dt.days
equipment_info['age_years'] = age_in_days / 365.25
equipment_info['has_failed'] = equipment_info['equipment_id'].isin(failed_ids)

# Sort first, then derive bar colours from the sorted rows so colour and
# bar always refer to the same unit.
equipment_info_sorted = equipment_info.sort_values('age_years', ascending=False)
bar_colors = equipment_info_sorted['has_failed'].map(
    {True: '#d9534f', False: '#5cb85c'}
).tolist()

fig, ax = plt.subplots(figsize=(12, 5))
ax.barh(
    equipment_info_sorted['equipment_id'],
    equipment_info_sorted['age_years'],
    color=bar_colors,
    alpha=0.8,
)
ax.set_xlabel('Equipment Age (years)')
ax.set_title('Equipment Age (Red = Has Failed)')
ax.tick_params(axis='y', labelsize=7)
plt.tight_layout()
plt.show()

failed_mean_age = equipment_info.loc[equipment_info['has_failed'], 'age_years'].mean()
nonfailed_mean_age = equipment_info.loc[~equipment_info['has_failed'], 'age_years'].mean()
print(f"Failed equipment mean age: {failed_mean_age:.1f} years")
print(f"Non-failed equipment mean age: {nonfailed_mean_age:.1f} years")

Clear pattern: older equipment fails more. Failed transformers have a higher mean age than non-failed ones (see the printed means above; no significance test was run on this difference).

This makes sense from a domain perspective - older insulation degrades, seals wear, cooling systems become less efficient.

## 5. Failure Timing Analysis

In [None]:
# When do failures happen?
failures['month'] = failures['event_date'].dt.month
failures['day_of_week'] = failures['event_date'].dt.day_name()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Show all 12 months so months without failures appear as empty bars;
# otherwise the x-axis skips them and the seasonal picture is distorted.
month_counts = failures['month'].value_counts().reindex(list(range(1, 13)), fill_value=0)
month_counts.plot(kind='bar', ax=axes[0], color='steelblue', alpha=0.7)
axes[0].set_title('Failures by Month')
axes[0].set_xlabel('Month')
axes[0].set_ylabel('Count')

# Load at time of failure vs overall
axes[1].hist(sensor_df['load_mva'].dropna(), bins=50, alpha=0.4, label='All readings', color='gray', density=True)

# Get load on failure dates. Match on calendar day rather than exact
# timestamp so a time-of-day mismatch between the two files does not drop the
# event, and ignore missing loads so a single NaN cannot poison the mean.
reading_day = sensor_df['timestamp'].dt.normalize()
failure_loads = []
for _, f in failures.iterrows():
    day_load = sensor_df.loc[
        (sensor_df['equipment_id'] == f['equipment_id'])
        & (reading_day == f['event_date'].normalize()),
        'load_mva',
    ].dropna()
    if not day_load.empty:
        failure_loads.append(day_load.mean())

if failure_loads:
    mean_failure_load = np.mean(failure_loads)
    axes[1].axvline(mean_failure_load, color='red', linestyle='--', label=f'Mean load at failure ({mean_failure_load:.1f} MVA)')
axes[1].legend()
axes[1].set_title('Load Distribution: All Readings vs Failure Days')
axes[1].set_xlabel('Load (MVA)')

plt.tight_layout()
plt.show()

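Failures cluster in summer (Jun-Aug) and winter (Dec), which are high-load periods. This is consistent with load stress being a contributing factor, although with only 11 failure events the seasonal pattern should be treated as indicative rather than conclusive.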

## 6. Summary of Findings

### Failure Precursor Patterns Identified:

| Pattern | Signal | Lead Time | Estimated % |
|---------|--------|-----------|-------------|
| Cooling degradation | Sharp oil temp spike | 2-3 days | ~30% |
| Insulation breakdown | Gradual winding temp rise | 2-4 weeks | ~40% |
| Sudden failure | No clear precursor | N/A | ~30% |

### Key Features for Modelling:
1. **Temperature trends** (rolling mean, rate of change) - most important
2. **Equipment age** - strong baseline predictor
3. **Load stress** (sustained high load) - contributes to failure likelihood
4. **Temperature differentials** (top-bottom, winding-oil) - indicate cooling efficiency

### Implications for Feature Engineering:
- Use 14-day rolling windows for temperature statistics
- Calculate temperature trend (7-day diff) to capture both fast and slow patterns
- Include age × load interaction terms
- Binary features for high-load periods won't help much - use continuous rolling stats instead

**Next**: Build these features and train models (notebook 03).