In [None]:
# Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path

# Plot settings: seaborn grid style plus default figure size and font size
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (14, 6),
    'font.size': 11,
})

# Data directory with the raw CSV files, given relative to the working directory
DATA_DIR = Path('..', '..', 'data', 'raw', '1sec')
print(f'Data directory: {DATA_DIR.resolve()}')

---
## 1️⃣ Load and Inventory All Files

In [None]:
# List all CSV files in the data directory (file names only, sorted by name)
files = sorted(f for f in os.listdir(DATA_DIR) if f.endswith('.csv'))

print('='*80)
print('üìÇ FILE INVENTORY')
print('='*80)
print(f'\nFound {len(files)} files:\n')

# One record per file: name, number of data rows, number of columns
file_info = []
for f in files:
    path = DATA_DIR / f
    # Read only the first few rows; that is enough to learn the column layout
    df_sample = pd.read_csv(path, nrows=5)
    # Count rows by streaming over the lines instead of parsing the whole file.
    # The encoding is stated explicitly (UTF-8, pandas' read_csv default) so the
    # count does not depend on the platform's locale encoding; errors='replace'
    # keeps a stray undecodable byte from aborting what is only a line count.
    # NOTE(review): assumes one record per physical line (no quoted newlines).
    with open(path, 'r', encoding='utf-8', errors='replace') as file:
        row_count = sum(1 for _ in file) - 1  # minus header

    file_info.append({
        'file': f,
        'rows': row_count,
        'columns': len(df_sample.columns)
    })
    print(f'   üìÑ {f}: {row_count:,} rows, {len(df_sample.columns)} columns')

# Hoisted out of the f-string so the loop variable `f` is not shadowed
total_rows = sum(info['rows'] for info in file_info)
print(f'\nüìä Total rows across all files: {total_rows:,}')

---
## 2️⃣ Analyze Each File: Building (Aggregate) Availability

In [None]:
print('='*80)
print('üîç BUILDING (AGGREGATE) AVAILABILITY PER FILE')
print('='*80)
print()

# One summary record per CSV file; turned into a table in a later cell
file_analysis = []

for f in files:
    path = DATA_DIR / f
    df = pd.read_csv(path)
    df['_time'] = pd.to_datetime(df['_time'])

    # Time analysis: first/last timestamp and the span between them
    time_start = df['_time'].min()
    time_end = df['_time'].max()
    time_range = time_end - time_start

    # Resolution: the most frequent gap between consecutive timestamps.
    # Timestamps are sorted first so out-of-order rows cannot yield negative
    # gaps and distort the mode.
    time_diffs = df['_time'].sort_values().diff().dt.total_seconds().dropna()
    diff_modes = time_diffs.mode()
    resolution = diff_modes.iloc[0] if len(diff_modes) > 0 else time_diffs.median()

    # Building analysis: a file counts as usable when less than half of the
    # aggregate ('Building') readings are missing
    building_null_pct = df['Building'].isna().sum() / len(df) * 100
    building_usable = building_null_pct < 50

    # Completeness: N evenly spaced samples span N - 1 intervals, so a gap-free
    # file holds (span / resolution) + 1 rows. Without the +1 a perfect file
    # would report slightly more than 100%.
    expected_rows = time_range.total_seconds() / resolution + 1
    completeness = len(df) / expected_rows * 100 if expected_rows > 0 else 0

    file_analysis.append({
        'file': f,
        'rows': len(df),
        'time_start': time_start,
        'time_end': time_end,
        'days': time_range.days + time_range.seconds/86400,
        'resolution_sec': resolution,
        'building_null_pct': building_null_pct,
        'building_usable': building_usable,
        'completeness_pct': completeness
    })

    status = '‚úÖ' if building_usable else '‚ùå'
    res_str = f'{resolution:.0f}sec'
    print(f'{status} {f}')
    print(f'   Rows: {len(df):,} | Resolution: {res_str} | Days: {time_range.days}')
    print(f'   Building NULL: {building_null_pct:.1f}% | Completeness: {completeness:.1f}%')
    print()

In [None]:
# Summary table: one row per analysed file
summary_df = pd.DataFrame(file_analysis)
# Same rule as the per-file analysis: usable when < 50% of 'Building' is NULL
summary_df['usable'] = summary_df['building_null_pct'] < 50

print('='*80)
print('üìä SUMMARY TABLE')
print('='*80)
print()
print(summary_df[['file', 'rows', 'resolution_sec', 'building_null_pct', 'completeness_pct', 'usable']].to_string(index=False))

usable_files = summary_df.loc[summary_df['usable'], 'file'].tolist()
# Report against the number of files actually analysed, not a hard-coded 10,
# so the ratio stays correct when the data directory changes
print(f'\n‚úÖ Usable files: {len(usable_files)}/{len(summary_df)}')
for f in usable_files:
    print(f'   ‚Ä¢ {f}')

---
## 3️⃣ Load Usable Files Only

In [None]:
# Usable months only (March, April, May 2024).
# NOTE(review): this fixed list overrides any value `usable_files` held before.
usable_files = ['samengevoegd_2024-03.csv', 'samengevoegd_2024-04.csv', 'samengevoegd_2024-05.csv']

print('='*80)
print('üìÇ LOADING USABLE FILES')
print('='*80)

# Read each month, parse its timestamps and tag every row with its source file
dfs = []
for f in usable_files:
    df = pd.read_csv(DATA_DIR / f).assign(
        _time=lambda frame: pd.to_datetime(frame['_time']),
        source_file=f,
    )
    dfs.append(df)
    print(f'   ‚úÖ Loaded {f}: {len(df):,} rows')

# Stack the months into one frame ordered by time with a fresh 0..n-1 index
df_combined = (
    pd.concat(dfs, ignore_index=True)
    .sort_values('_time')
    .reset_index(drop=True)
)

first_ts = df_combined["_time"].min()
last_ts = df_combined["_time"].max()
print(f'\nüìä Combined dataset: {len(df_combined):,} rows')
print(f'   Time range: {first_ts} ‚Üí {last_ts}')

---
## 4️⃣ Dataset Structure

In [None]:
print('='*80)
print('üèóÔ∏è DATASET STRUCTURE')
print('='*80)

# Numbered listing of every column with its dtype and share of missing values
total_rows = len(df_combined)
print(f'\nüìã Columns ({len(df_combined.columns)}):')
for position, (name, column) in enumerate(df_combined.items(), start=1):
    missing_pct = column.isna().sum() / total_rows * 100
    print(f'   {position:2d}. {name:25s} - {str(column.dtype):15s} (NULL: {missing_pct:.1f}%)')

---
## 5️⃣ Numerical Features Analysis

In [None]:
print('='*80)
print('üìà NUMERICAL FEATURES ANALYSIS')
print('='*80)

# Every column except the timestamp and the source-file tag
non_measurement = ('_time', 'source_file')
appliance_cols = [c for c in df_combined.columns if c not in non_measurement]

print(f'\n{"Column":<20} {"Mean":>10} {"Std":>10} {"Min":>10} {"Max":>10} {"NULL%":>8} {"NEG%":>8}')
print('-' * 80)

for col in appliance_cols:
    series = df_combined[col]
    valid = series.dropna()

    null_pct = series.isna().sum() / len(series) * 100

    if len(valid) > 0:
        # Statistics over the non-missing readings only
        neg_pct = (valid < 0).sum() / len(valid) * 100
        mean_val, std_val = valid.mean(), valid.std()
        min_val, max_val = valid.min(), valid.max()
    else:
        # Column is entirely NULL: report zeros instead of NaN
        neg_pct = mean_val = std_val = min_val = max_val = 0

    print(f'{col:<20} {mean_val:>10.4f} {std_val:>10.4f} {min_val:>10.4f} {max_val:>10.4f} {null_pct:>7.1f}% {neg_pct:>7.1f}%')

### 💡 Key Insights: Negative Values

Several appliances show high percentages of negative values:
- **Fornuis (Stove)**: ~99% negative - CT sensor offset
- **Oven**: ~98% negative - CT sensor offset  
- **Vaatwasser (Dishwasher)**: ~92% negative - CT sensor offset
- **Wasmachine (Washing Machine)**: ~96% negative - CT sensor offset

**Solution**: Apply `clip(lower=0)` during preprocessing (same as 15min pipeline).

---
## 6️⃣ Resolution Analysis

In [None]:
print('='*80)
print('‚è±Ô∏è RESOLUTION ANALYSIS')
print('='*80)

# Per source file: gaps (in seconds) between consecutive timestamps
for f in usable_files:
    from_this_file = df_combined['source_file'] == f
    timestamps = df_combined.loc[from_this_file, '_time']
    time_diffs = timestamps.diff().dt.total_seconds().dropna()

    most_common_gap = time_diffs.mode().iloc[0]
    print(f'\nüìÑ {f}')
    print(f'   Median diff: {time_diffs.median():.1f} sec')
    print(f'   Mode diff:   {most_common_gap:.1f} sec')

    # Distribution: the three most frequent gaps and their share of all gaps
    print(f'   Top 3 intervals:')
    top_intervals = time_diffs.value_counts().head(3)
    for diff, count in top_intervals.items():
        pct = count / len(time_diffs) * 100
        print(f'      {diff:.0f}sec: {count:,} ({pct:.1f}%)')

---
## 7️⃣ Ghost Load Analysis

In [None]:
print('='*80)
print('üëª GHOST LOAD ANALYSIS')
print('='*80)

# Keep only the rows where the aggregate ('Building') reading is present
has_building = df_combined['Building'].notna()
df_valid = df_combined[has_building].copy()

# Appliance columns: everything except timestamp, source tag and the aggregate
excluded = ('_time', 'source_file', 'Building')
appliance_only = [c for c in df_valid.columns if c not in excluded]

# Per-row appliance total, with negative readings floored at zero first
clipped_appliances = df_valid[appliance_only].clip(lower=0)
df_valid['sum_appliances'] = clipped_appliances.sum(axis=1)

# Ghost load: the part of the aggregate not explained by the appliance total
df_valid['ghost_load'] = df_valid['Building'] - df_valid['sum_appliances']

building_mean = df_valid['Building'].mean()
sum_mean = df_valid['sum_appliances'].mean()
ghost_mean = df_valid['ghost_load'].mean()
if building_mean > 0:
    ghost_pct = ghost_mean / building_mean * 100
else:
    # No positive aggregate mean to relate to: report 0%
    ghost_pct = 0

print(f'\nüìä Overall Statistics:')
print(f'   Building (Aggregate) mean: {building_mean:.4f} kW')
print(f'   Sum(Appliances) mean:      {sum_mean:.4f} kW')
print(f'   Ghost Load mean:           {ghost_mean:.4f} kW ({ghost_pct:.1f}%)')

# Correlation between the aggregate and the appliance total
corr = df_valid['Building'].corr(df_valid['sum_appliances'])
print(f'\n   Correlation Building vs Sum: {corr:.4f}')

# Warn when more than 30% of the aggregate is unexplained
if ghost_pct > 30:
    print(f'\n‚ö†Ô∏è HIGH GHOST LOAD ({ghost_pct:.0f}%): Missing appliances not measured')
    print('   Missing in 1sec: Kast garage (~0.27 kW), Laadpaal_stopcontact, Warmtepomp-Sturing')

---
## 8️⃣ Visualization: Aggregate vs Sum of Appliances

In [None]:
# Plot a reproducible random subset (at most 10,000 rows), ordered by time
sample_size = min(10000, len(df_valid))
df_sample = df_valid.sample(sample_size, random_state=42).sort_values('_time')

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 8))

# Plot 1: both series over time
series_to_draw = [
    ('Building', 'Building (Aggregate)'),
    ('sum_appliances', 'Sum(Appliances)'),
]
for column, legend_label in series_to_draw:
    ax1.plot(df_sample['_time'], df_sample[column], alpha=0.7, label=legend_label)
ax1.set(
    xlabel='Time',
    ylabel='Power (kW)',
    title='Building vs Sum of Appliances (Sample)',
)
ax1.legend()
ax1.grid(True, alpha=0.3)

# Plot 2: scatter of aggregate against appliance total, with the y=x diagonal
ax2.scatter(df_sample['sum_appliances'], df_sample['Building'], alpha=0.3, s=5)
max_val = max(df_sample['Building'].max(), df_sample['sum_appliances'].max())
ax2.plot([0, max_val], [0, max_val], 'r--', label='Perfect match (y=x)')
ax2.set(
    xlabel='Sum of Appliances (kW)',
    ylabel='Building (kW)',
    title=f'Correlation: {corr:.3f}',
)
ax2.legend()
ax2.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

---
## 9️⃣ Per-Appliance Power Distribution

In [None]:
# Appliance readings with negative values floored at zero, one box per column
appliance_data = df_valid.loc[:, appliance_only].clip(lower=0)

fig, ax = plt.subplots(figsize=(14, 6))
appliance_data.boxplot(ax=ax, vert=True, showfliers=False)
ax.set(
    xlabel='Appliance',
    ylabel='Power (kW)',
    title='Power Distribution per Appliance (outliers hidden)',
)
# Slant the column names so they do not overlap
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

---
## 🔟 Temporal Patterns

In [None]:
# Hourly pattern for main appliances
df_valid['hour'] = df_valid['_time'].dt.hour

# Select top consumers (only those actually present in the data)
top_appliances = ['Building', 'Warmtepomp', 'Wasmachine', 'Vaatwasser', 'Fornuis']
existing_appliances = [a for a in top_appliances if a in df_valid.columns]

# Floor negative appliance readings at zero per sample, BEFORE averaging.
# Clipping the hourly mean afterwards lets negative readings cancel real
# consumption and can flatten a mostly-negative appliance to zero. Per-sample
# clipping matches how the appliance total and the boxplot are built.
# 'Building' (the aggregate) is left unclipped, as before.
hourly_input = df_valid[existing_appliances].copy()
for name in existing_appliances:
    if name != 'Building':
        hourly_input[name] = hourly_input[name].clip(lower=0)

# Mean per hour of day; grouping by the 'hour' Series keeps the index name 'hour'
hourly_mean = hourly_input.groupby(df_valid['hour']).mean()

fig, ax = plt.subplots(figsize=(14, 6))
for col in existing_appliances:
    if col == 'Building':
        # Emphasise the aggregate with a thicker line and markers
        ax.plot(hourly_mean.index, hourly_mean[col], linewidth=2, label=col, marker='o')
    else:
        ax.plot(hourly_mean.index, hourly_mean[col], label=col, alpha=0.7)

ax.set_xlabel('Hour of Day')
ax.set_ylabel('Mean Power (kW)')
ax.set_title('Average Power Consumption by Hour')
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left')
ax.set_xticks(range(24))
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

---
## 📊 Summary & Conclusions

### ✅ Usable Data
| File | Period | Resolution | Rows | Building | Status |
|------|--------|------------|------|----------|--------|
| 2024-03 | March | 10sec | 154K | 97% valid | ✅ |
| 2024-04 | April | 10sec | 219K | 100% valid | ✅ |
| 2024-05 | May | 1sec | 1.08M | 100% valid | ✅ |

### ❌ Unusable Data (June-December)
- Building (Aggregate) is 100% NULL
- NILM requires Aggregate as input

### 🔧 Preprocessing Steps Needed
1. **Resample to 10sec** (unify resolution)
2. **Clip negative values** (CT sensor offset)
3. **Remove Smappee_laadpaal** (NULL for usable months)
4. **Add temporal features** (hour, day of week, month)
5. **Export** to NILM-ready format

### ⚠️ Limitations
- Ghost load ~50% (missing: Kast garage, Laadpaal_stopcontact, Warmtepomp-Sturing)
- Only ~80 days of usable data (vs 365 for 15min)
- 8 appliances instead of 12