# NOx Data Exploration
An overview of the machines in the dataset and a comparison of their measured NOx emissions with the expected eBay values.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Read the NOx interval export into a DataFrame, then report its size and columns
csv_path = 'data/NOx_intervals - 2026-01-20T164805.872.csv'
df = pd.read_csv(csv_path)

row_count = len(df)
print(f"Total rows: {row_count:,}")

column_names = list(df.columns)
print(f"\nColumns: {column_names}")

# Last expression of the cell, so the notebook renders the first rows
df.head()

## Overview of Machines

In [None]:
# How many distinct machines (device_id values) the dataset contains
device_ids = df['device_id']
n_machines = device_ids.nunique()
print(f"Number of unique machines: {n_machines}")

# List every distinct ID in ascending order
machine_ids = sorted(device_ids.unique())
print(f"\nMachine IDs: {machine_ids}")

In [None]:
# Count distinct devices within each MainGroupLabel (machine type), largest group first
devices_per_type = df.groupby('MainGroupLabel')['device_id'].nunique()
machines_by_type = devices_per_type.sort_values(ascending=False)
print("Machines per type:")
print(machines_by_type)

# One bar per machine type, in the same order as the printed table
fig, ax = plt.subplots(figsize=(10, 6))
machines_by_type.plot.bar(ax=ax, color='steelblue')
ax.set_title('Number of Machines per Type', fontsize=14)
ax.set(xlabel='Machine Type', ylabel='Number of Machines')

# Slant the type names so long labels do not run into each other
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
    tick_label.set_ha('right')

fig.tight_layout()
plt.show()

In [None]:
# Count how many records fall into each machine state
state_counts = df['machine_staat'].value_counts()
print("Records per machine state:")
print(state_counts)

# Fixed colour per known state; any other state falls back to blue
state_palette = {'Uit': 'gray', 'Stationair': 'orange', 'Werkend': 'green'}
bar_colors = [state_palette.get(state, 'blue') for state in state_counts.index]

fig, ax = plt.subplots(figsize=(8, 5))
state_counts.plot.bar(ax=ax, color=bar_colors)
ax.set_title('Records per Machine State', fontsize=14)
ax.set(xlabel='State', ylabel='Number of Records')

# State names are short, so keep the tick labels horizontal
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(0)

fig.tight_layout()
plt.show()

## NOx Emissions: Measured vs eBay (AUB6)
Compare the measured NOx emissions (column `NOx_mass_flow_kg`) with the expected eBay values (column `NOx_AUB6_kg`).

In [None]:
# Keep only records in the 'Werkend' state that have every column the
# comparisons below rely on; rows missing any of them are dropped.
is_working = df['machine_staat'].eq('Werkend')
required_columns = ['NOx_mass_flow_kg', 'NOx_AUB6_kg', 'motorbelasting']

# .copy() detaches the selection from df before further processing
df_working = df.loc[is_working].copy().dropna(subset=required_columns)

print(f"Working records with complete data: {len(df_working):,}")

In [None]:
# Per-device roll-up: summed measured and eBay NOx, plus the first
# MainGroupLabel and Power value seen for each device
per_device_spec = {
    'NOx_mass_flow_kg': 'sum',
    'NOx_AUB6_kg': 'sum',
    'MainGroupLabel': 'first',
    'Power': 'first',
}
machine_totals = df_working.groupby('device_id').agg(per_device_spec).reset_index()

# Gap between measured and eBay totals, absolute and relative to the eBay total
measured_total = machine_totals['NOx_mass_flow_kg']
ebay_total = machine_totals['NOx_AUB6_kg']
machine_totals['difference'] = measured_total - ebay_total
machine_totals['diff_percent'] = (machine_totals['difference'] / ebay_total) * 100

print("Per machine totals (NOx in kg):")
print(machine_totals.to_string())

In [None]:
# Grouped bar chart: for every device, measured NOx next to eBay NOx
fig, ax = plt.subplots(figsize=(14, 6))

bar_width = 0.35
half_width = bar_width / 2
positions = np.arange(len(machine_totals))

# Shift the two series half a bar left/right so each pair straddles its tick
measured_bars = ax.bar(positions - half_width, machine_totals['NOx_mass_flow_kg'],
                       bar_width, label='Measured NOx', color='coral')
ebay_bars = ax.bar(positions + half_width, machine_totals['NOx_AUB6_kg'],
                   bar_width, label='eBay NOx (AUB6)', color='steelblue')

ax.set_title('Measured NOx vs eBay NOx per Machine', fontsize=14)
ax.set(xlabel='Device ID', ylabel='Total NOx (kg)')

# One tick per device, labelled with its device_id
ax.set_xticks(positions)
ax.set_xticklabels(machine_totals['device_id'], rotation=45, ha='right')

ax.grid(axis='y', alpha=0.3)
ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Scatter of per-machine totals: eBay NOx on x, measured NOx on y
fig, ax = plt.subplots(figsize=(10, 8))

# One tab10 colour per machine type, sampled evenly across the colormap
machine_types = machine_totals['MainGroupLabel'].unique()
palette = plt.cm.tab10(np.linspace(0, 1, len(machine_types)))

# Draw each type as its own series so it gets a legend entry
for machine_type, rgba in zip(machine_types, palette):
    of_type = machine_totals['MainGroupLabel'] == machine_type
    ebay_values = machine_totals.loc[of_type, 'NOx_AUB6_kg']
    measured_values = machine_totals.loc[of_type, 'NOx_mass_flow_kg']
    ax.scatter(ebay_values, measured_values,
               c=[rgba], label=machine_type, s=100, alpha=0.7)

# Reference diagonal: points above it measured more NOx than the eBay value
upper = max(machine_totals[column].max() for column in ('NOx_AUB6_kg', 'NOx_mass_flow_kg'))
diagonal = [0, upper]
ax.plot(diagonal, diagonal, 'k--', alpha=0.5, label='1:1 line')

ax.set_title('Measured vs eBay NOx per Machine', fontsize=14)
ax.set_xlabel('eBay NOx (AUB6) [kg]', fontsize=12)
ax.set_ylabel('Measured NOx [kg]', fontsize=12)
ax.grid(alpha=0.3)
ax.legend(loc='upper left')

fig.tight_layout()
plt.show()

In [None]:
# Roll the working records up to machine type: number of distinct devices,
# summed measured and eBay NOx, and mean motorbelasting
per_type_spec = {
    'device_id': 'nunique',
    'NOx_mass_flow_kg': 'sum',
    'NOx_AUB6_kg': 'sum',
    'motorbelasting': 'mean',
}
type_summary = (
    df_working.groupby('MainGroupLabel')
    .agg(per_type_spec)
    .rename(columns={'device_id': 'n_machines'})
)

# Relative gap between measured and eBay totals, as a percentage of the eBay total
measured_by_type = type_summary['NOx_mass_flow_kg']
ebay_by_type = type_summary['NOx_AUB6_kg']
type_summary['diff_percent'] = ((measured_by_type - ebay_by_type) / ebay_by_type) * 100

# Types with the highest measured NOx total come first
type_summary = type_summary.sort_values('NOx_mass_flow_kg', ascending=False)

print("Summary by machine type:")
print(type_summary.round(2).to_string())

In [None]:
# NOx against motorbelasting, drawn from a reproducible sample of at most
# 5000 working records to limit overplotting
sample_size = min(5000, len(df_working))
sample = df_working.sample(n=sample_size, random_state=42)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (NOx column, y-axis label, title, extra scatter options) for the left and right panel
panels = [
    ('NOx_mass_flow_kg', 'Measured NOx (kg)', 'Measured NOx vs Engine Load', {}),
    ('NOx_AUB6_kg', 'eBay NOx (AUB6) (kg)', 'eBay NOx vs Engine Load', {'color': 'orange'}),
]

for panel_ax, (nox_column, y_label, panel_title, extra_options) in zip(axes, panels):
    panel_ax.scatter(sample['motorbelasting'], sample[nox_column],
                     alpha=0.3, s=10, **extra_options)
    panel_ax.set_xlabel('Motorbelasting')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.grid(alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Histogram of the per-record difference percentage, followed by summary statistics
diff_pct = df_working['verschil_percentage'].dropna()

# Extremes are clipped (not removed) to [-200, 200] for the plot only, so
# out-of-range records pile up in the two edge bins; the statistics below
# use the unclipped values.
diff_pct_clipped = diff_pct.clip(lower=-200, upper=200)
median_pct = diff_pct.median()

fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(diff_pct_clipped, bins=50, edgecolor='black', alpha=0.7)

# Reference lines: zero difference and the median of the unclipped values
ax.axvline(0, color='red', linestyle='--', linewidth=2, label='Zero difference')
ax.axvline(median_pct, color='green', linestyle='-', linewidth=2,
           label=f'Median: {median_pct:.1f}%')

ax.set_title('Distribution of NOx Difference Percentage')
ax.set(xlabel='Difference % (Measured - eBay) / eBay * 100', ylabel='Frequency')
ax.grid(alpha=0.3)
ax.legend()

fig.tight_layout()
plt.show()

# Boolean masks: .sum() counts the records, .mean() gives their share
above_zero = diff_pct > 0
below_zero = diff_pct < 0

print("\nDifference statistics:")
print(f"  Mean: {diff_pct.mean():.1f}%")
print(f"  Median: {median_pct:.1f}%")
print(f"  Std: {diff_pct.std():.1f}%")
print(f"  Records above 0% (higher than eBay): {above_zero.sum():,} ({above_zero.mean()*100:.1f}%)")
print(f"  Records below 0% (lower than eBay): {below_zero.sum():,} ({below_zero.mean()*100:.1f}%)")

In [None]:
# Boxplot of difference percentage by machine type
# Boxes follow the row order of type_summary. Values are clipped to
# [-200, 200] so a few extreme records do not flatten the boxes.
fig, ax = plt.subplots(figsize=(12, 6))

types_ordered = type_summary.index.tolist()
data_by_type = [
    df_working.loc[df_working['MainGroupLabel'] == t, 'verschil_percentage']
    .dropna()
    .clip(-200, 200)
    for t in types_ordered
]

# The boxes are labelled through the axis ticks rather than via
# boxplot(labels=...): that keyword is deprecated since Matplotlib 3.9
# (renamed to tick_labels), while explicit tick labels work on both old
# and new releases.
bp = ax.boxplot(data_by_type, patch_artist=True)
for patch in bp['boxes']:
    patch.set_facecolor('lightblue')

# boxplot places the i-th box at x = i + 1 by default
ax.set_xticks(list(range(1, len(types_ordered) + 1)))
ax.set_xticklabels(types_ordered, rotation=45, ha='right')

# Zero line: boxes above it measured more NOx than the eBay value
ax.axhline(0, color='red', linestyle='--', linewidth=1)
ax.set_xlabel('Machine Type')
ax.set_ylabel('Difference % (clipped to [-200, 200])')
ax.set_title('NOx Difference Distribution by Machine Type')
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()