# Equipment Data Analysis
Exploratory analysis of equipment operational data, covering NOx emissions, fuel consumption, engine load, and power metrics.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

## 1. Load and Explore Data

In [None]:
# Load the data
# NOTE(review): the path is relative to the kernel's working directory — confirm
# the notebook is always started from the expected folder.
df = pd.read_csv('Exploratory/untitled - 2025-11-18T141129.591.csv')

# read_csv leaves `date` as plain strings. Later cells subtract dates and use
# the `.dt` accessor, both of which need a real datetime column, so convert
# once here (raises if a value cannot be parsed).
df['date'] = pd.to_datetime(df['date'])

# Display basic information
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
df.head()

In [None]:
# Check data types and missing values
print("Data Info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
df.info()

print("\nMissing values:")
# Count nulls once, then show only the columns that actually have some.
missing_counts = df.isnull().sum()
print(missing_counts[missing_counts > 0])

In [None]:
# Statistical summary via DataFrame.describe(); kept as the cell's last
# expression so the notebook renders the resulting table.
df.describe()

## 2. Equipment Overview

In [None]:
# Histogram of rows per equipment type, with bars split by the `Pilot` column.
equipment_hist_labels = {'TypeOfEquipment': 'Equipment Type', 'count': 'Count'}
fig = px.histogram(
    df,
    x='TypeOfEquipment',
    color='Pilot',
    title='Equipment Type Distribution',
    labels=equipment_hist_labels,
)
# Tilt the category labels so long type names stay readable.
fig.update_layout(height=500, xaxis_tickangle=-45)
fig.show()

In [None]:
# Ten most frequent brands, drawn as bars whose colour encodes the same count.
brand_counts = df['BrandLabel'].value_counts().head(10)
fig = px.bar(
    x=brand_counts.index,
    y=brand_counts.values,
    color=brand_counts.values,
    color_continuous_scale='viridis',
    title='Top 10 Equipment Brands',
    labels={'x': 'Brand', 'y': 'Count'},
)
fig.show()

In [None]:
# Box plot of `Power` per equipment type; boxes are coloured by
# `EngineClassificationLabel`.
power_box_options = dict(
    x='TypeOfEquipment',
    y='Power',
    color='EngineClassificationLabel',
    title='Power Distribution by Equipment Type',
    labels={'Power': 'Power (kW)', 'TypeOfEquipment': 'Equipment Type'},
)
fig = px.box(df, **power_box_options)
fig.update_layout(height=600, xaxis_tickangle=-45)
fig.show()

## 3. NOx Emissions Analysis

In [None]:
# NOx per liter, one box per measurement source, each in its own subplot column.
# Each entry: (trace name / title prefix, dataframe column, box colour).
nox_liter_sources = [
    ('FF', 'FF_NOxPerLiter', 'lightblue'),
    ('CANBUS', 'CANBUS_NOxPerLiter', 'lightgreen'),
    ('NOxMAF', 'NOxMAF_NOxPerLiter', 'lightcoral'),
]

fig = make_subplots(
    rows=1,
    cols=3,
    subplot_titles=tuple(f'{source} NOx/Liter' for source, column, shade in nox_liter_sources),
)

for position, (source, column, shade) in enumerate(nox_liter_sources, start=1):
    fig.add_trace(
        go.Box(y=df[column], name=source, marker_color=shade),
        row=1,
        col=position,
    )

fig.update_layout(
    title_text='NOx Emissions per Liter by Measurement Source',
    height=500,
    showlegend=False,
)
# Only the left-most subplot carries the y-axis title.
fig.update_yaxes(title_text='NOx per Liter', row=1, col=1)
fig.show()

In [None]:
# Box plot of `NOxTotal` per equipment type; boxes are coloured by
# `EngineClassificationLabel`.
nox_box_options = dict(
    x='TypeOfEquipment',
    y='NOxTotal',
    color='EngineClassificationLabel',
    title='Total NOx Emissions by Equipment Type',
    labels={'NOxTotal': 'Total NOx', 'TypeOfEquipment': 'Equipment Type'},
)
fig = px.box(df, **nox_box_options)
fig.update_layout(height=600, xaxis_tickangle=-45)
fig.show()

In [None]:
# Scatter: Power vs NOx Total, marker size scaled by `duration_from_rows`.
# Plotly Express rejects NaN marker sizes, and points without an x or y value
# cannot be drawn anyway, so keep only rows where all three values are present.
power_nox_points = df.dropna(subset=['Power', 'NOxTotal', 'duration_from_rows'])

fig = px.scatter(power_nox_points, x='Power', y='NOxTotal',
                 color='TypeOfEquipment',
                 size='duration_from_rows',
                 hover_data=['name', 'BrandLabel', 'EngineClassificationLabel'],
                 title='Power vs Total NOx Emissions',
                 labels={'Power': 'Power (kW)', 'NOxTotal': 'Total NOx'},
                 opacity=0.6)
fig.update_layout(height=600)
fig.show()

## 4. Fuel Consumption Analysis

In [None]:
# Fuel consumption per equipment type, comparing the two validated fuel columns.
fuel_cols = ['FF_validated_fuel', 'CANBUS_validated_fuel']
# Keep only rows where both fuel readings and the equipment type are present.
df_fuel = df[[*fuel_cols, 'TypeOfEquipment']].dropna()

# Long format: one row per (equipment type, source, fuel value) so the source
# can drive the box colour.
fuel_long = df_fuel.melt(
    id_vars=['TypeOfEquipment'],
    value_vars=fuel_cols,
    var_name='Source',
    value_name='Fuel',
)

fig = px.box(
    fuel_long,
    x='TypeOfEquipment',
    y='Fuel',
    color='Source',
    title='Fuel Consumption by Equipment Type and Measurement Source',
    labels={'Fuel': 'Fuel Consumption', 'TypeOfEquipment': 'Equipment Type'},
)
fig.update_layout(height=600, xaxis_tickangle=-45)
fig.show()

In [None]:
# Fuel per hour, one box per measurement source, each in its own subplot column.
# Each entry: (trace name / title prefix, dataframe column, box colour).
fph_sources = [
    ('FF', 'FF_FPH', 'lightblue'),
    ('CANBUS', 'CANBUS_FPH', 'lightgreen'),
    ('NOxMAF', 'NOxMAF_FPH', 'lightcoral'),
]

fig = make_subplots(
    rows=1,
    cols=3,
    subplot_titles=tuple(f'{source} FPH' for source, column, shade in fph_sources),
)

for position, (source, column, shade) in enumerate(fph_sources, start=1):
    fig.add_trace(
        go.Box(y=df[column], name=source, marker_color=shade),
        row=1,
        col=position,
    )

fig.update_layout(
    title_text='Fuel Per Hour by Measurement Source',
    height=500,
    showlegend=False,
)
# Only the left-most subplot carries the y-axis title.
fig.update_yaxes(title_text='Fuel per Hour', row=1, col=1)
fig.show()

In [None]:
# Power vs Fuel consumption, marker size scaled by `FF_motorbelasting`.
# The size column must be free of NaN as well: Plotly Express rejects NaN
# marker sizes, so it is included in the dropna subset alongside x and y.
power_fph_points = df.dropna(subset=['FF_FPH', 'Power', 'FF_motorbelasting'])

# NOTE(review): trendline='ols' relies on the optional statsmodels package —
# confirm it is installed in the notebook environment.
fig = px.scatter(power_fph_points,
                 x='Power', y='FF_FPH',
                 color='TypeOfEquipment',
                 size='FF_motorbelasting',
                 hover_data=['name', 'BrandLabel'],
                 title='Power vs Fuel Per Hour (FF Source)',
                 labels={'Power': 'Power (kW)', 'FF_FPH': 'Fuel Per Hour'},
                 opacity=0.6,
                 trendline='ols')
fig.update_layout(height=600)
fig.show()

## 5. Engine Load Analysis

In [None]:
# Violin plot of engine load for the three measurement sources.
load_cols = ['FF_motorbelasting', 'CANBUS_motorbelasting', 'NOxMAF_motorbelasting']
# Only rows with a reading from all three sources are kept.
df_load = df[load_cols].dropna()

# One violin per column; the trace name is the column name without its
# '_motorbelasting' suffix.
load_violins = [
    go.Violin(
        y=df_load[load_col],
        name=load_col.replace('_motorbelasting', ''),
        box_visible=True,
        meanline_visible=True,
    )
    for load_col in load_cols
]
fig = go.Figure(data=load_violins)

fig.update_layout(
    title='Engine Load Distribution by Measurement Source',
    yaxis_title='Motor Load',
    height=500,
)
fig.show()

In [None]:
# Engine load vs Duration scatter plot with trend per machine.
# Layers, in drawing order: one marker series per machine, a dotted
# least-squares line per machine, and a binned overall average.
df_plot = df.dropna(subset=['NOxMAF_motorbelasting', 'duration_from_rows'])

fig = go.Figure()

# Get unique machines. Rows without a machine name are left out: NaN never
# compares equal to itself, so such a "machine" would select zero rows and the
# `.iloc[0]` lookup below would raise.
machines = df_plot['name'].dropna().unique()
rows_by_machine = df_plot.groupby('name', sort=False)

# Color palette for equipment types; the combined palette is cycled when there
# are more types than colours.
equipment_types = df_plot['TypeOfEquipment'].unique()
colors = px.colors.qualitative.Plotly + px.colors.qualitative.Set2 + px.colors.qualitative.Pastel
color_map = {eq_type: colors[i % len(colors)] for i, eq_type in enumerate(equipment_types)}

# Equipment types that already own a legend entry. Tracking them in a set
# avoids re-scanning every existing trace for each machine.
legend_types_shown = set()

# Plot scatter points for each machine
for machine in machines:
    machine_data = rows_by_machine.get_group(machine)
    equipment_type = machine_data['TypeOfEquipment'].iloc[0]

    # Add scatter points; only the first machine of each type is listed in the legend.
    fig.add_trace(go.Scatter(
        x=machine_data['duration_from_rows'],
        y=machine_data['NOxMAF_motorbelasting'],
        mode='markers',
        name=equipment_type,
        legendgroup=equipment_type,
        showlegend=equipment_type not in legend_types_shown,
        marker=dict(
            size=6,
            color=color_map[equipment_type],
            opacity=0.5,
            line=dict(width=0)
        ),
        hovertemplate=f'<b>{machine}</b><br>Duration: %{{x:.1f}} hrs<br>Load: %{{y:.3f}}<extra></extra>'
    ))
    legend_types_shown.add(equipment_type)

    # Add trend line for machines with enough data points. A straight-line fit
    # also needs at least two distinct durations; otherwise the slope is undefined.
    has_duration_spread = machine_data['duration_from_rows'].nunique() > 1
    if len(machine_data) >= 3 and has_duration_spread:
        # Sort by duration for smooth line
        machine_data = machine_data.sort_values('duration_from_rows')

        x_vals = machine_data['duration_from_rows'].values
        y_vals = machine_data['NOxMAF_motorbelasting'].values

        # Simple linear fit: coeffs[0] is the slope, coeffs[1] the intercept.
        coeffs = np.polyfit(x_vals, y_vals, 1)
        trend_y = coeffs[0] * x_vals + coeffs[1]

        # Add trend line. It shares the legend group of its points so that
        # toggling an equipment type in the legend hides both together.
        fig.add_trace(go.Scatter(
            x=x_vals,
            y=trend_y,
            mode='lines',
            name=f'{machine} trend',
            legendgroup=equipment_type,
            line=dict(
                color=color_map[equipment_type],
                width=1,
                dash='dot'
            ),
            showlegend=False,
            hoverinfo='skip',
            opacity=0.3
        ))

# Add overall average line: mean load inside 20 equal-width duration bins,
# plotted at each bin's midpoint.
df_plot_sorted = df_plot.sort_values('duration_from_rows')
df_plot_sorted['duration_bin'] = pd.cut(df_plot_sorted['duration_from_rows'], bins=20)
# observed=False keeps empty bins (as NaN means), matching the historical
# default explicitly instead of relying on a deprecated implicit value.
avg_by_duration = (
    df_plot_sorted
    .groupby('duration_bin', observed=False)['NOxMAF_motorbelasting']
    .mean()
    .reset_index()
)
avg_by_duration['duration_mid'] = avg_by_duration['duration_bin'].apply(lambda x: x.mid)

fig.add_trace(go.Scatter(
    x=avg_by_duration['duration_mid'],
    y=avg_by_duration['NOxMAF_motorbelasting'],
    mode='lines',
    name='Overall Average',
    line=dict(color='red', width=3, dash='dash'),
    showlegend=True
))

fig.update_layout(
    title='NOxMAF Engine Load vs Operational Duration (with Individual Machine Trends)',
    xaxis_title='Duration (hours)',
    yaxis_title='NOxMAF Engine Load',
    height=600,
    hovermode='closest'
)

fig.show()

In [None]:
# Temporal correlation AND duration correlation per individual machine
MIN_POINTS = 4        # at least 4 paired observations for a meaningful correlation
CORR_THRESHOLD = 0.3  # |r| above this is reported as a positive/negative relationship


def _print_corr_stats(values, positive_label, negative_label, neutral_label):
    """Print summary statistics for a Series of correlation coefficients.

    NaN entries are dropped first, so every percentage uses the number of
    defined correlations as its denominator and the three buckets add up
    to 100%.

    Args:
        values: Series of correlation coefficients (may contain NaN).
        positive_label: Caption for values above CORR_THRESHOLD.
        negative_label: Caption for values below -CORR_THRESHOLD.
        neutral_label: Caption for values within +/-CORR_THRESHOLD (inclusive).
    """
    values = values.dropna().astype(float)
    total = len(values)

    def _share(mask):
        # "count (percent%)"; the percentage is 0.0 when nothing is defined.
        count = int(mask.sum())
        pct = count / total * 100 if total else 0.0
        return f"{count} ({pct:.1f}%)"

    print(f"  • Average: {values.mean():.4f}")
    print(f"  • Median: {values.median():.4f}")
    print(f"  • Std deviation: {values.std():.4f}")
    print(f"  • {positive_label} (>{CORR_THRESHOLD}): {_share(values > CORR_THRESHOLD)}")
    print(f"  • {negative_label} (<-{CORR_THRESHOLD}): {_share(values < -CORR_THRESHOLD)}")
    print(f"  • {neutral_label} (-{CORR_THRESHOLD} to {CORR_THRESHOLD}): {_share(values.abs() <= CORR_THRESHOLD)}")


print("\n" + "=" * 60)
print("CORRELATION ANALYSIS BY INDIVIDUAL MACHINE")
print("=" * 60)

# The date arithmetic below needs real datetimes. Converting on a copy keeps
# this cell independent of how `date` was loaded and leaves `df` untouched.
df_dated = df.assign(date=pd.to_datetime(df['date']))

# Calculate correlations for each machine
machine_correlations = []

# sort=False keeps machines in order of first appearance.
for machine_name, machine_data in df_dated.groupby('name', sort=False):
    if len(machine_data) < MIN_POINTS:
        continue

    # Days elapsed since the machine's first recorded date.
    days_since_first = (machine_data['date'] - machine_data['date'].min()).dt.days
    machine_data = machine_data.assign(date_numeric=days_since_first)

    # Paired observations for NOxMAF_motorbelasting vs Date
    valid_temporal_data = machine_data[['NOxMAF_motorbelasting', 'date_numeric']].dropna()
    if len(valid_temporal_data) < MIN_POINTS:
        continue

    # Paired observations for NOxMAF_motorbelasting vs Duration
    valid_duration_data = machine_data[['NOxMAF_motorbelasting', 'duration_from_rows']].dropna()

    temporal_corr = valid_temporal_data['date_numeric'].corr(valid_temporal_data['NOxMAF_motorbelasting'])

    # Duration correlation only when enough pairs exist; NaN (not None) keeps
    # the resulting column numeric.
    duration_corr = np.nan
    if len(valid_duration_data) >= MIN_POINTS:
        duration_corr = valid_duration_data['duration_from_rows'].corr(valid_duration_data['NOxMAF_motorbelasting'])

    machine_correlations.append({
        'name': machine_name,
        'equipment_type': machine_data['TypeOfEquipment'].iloc[0],
        'brand': machine_data['BrandLabel'].iloc[0],
        'temporal_correlation': temporal_corr,
        'duration_correlation': duration_corr,
        'data_points': len(valid_temporal_data),
        'date_range_days': machine_data['date_numeric'].max()
    })

# Convert to DataFrame and sort by absolute correlation. Explicit columns keep
# the frame well-formed even when no machine qualified.
corr_df = pd.DataFrame(
    machine_correlations,
    columns=['name', 'equipment_type', 'brand', 'temporal_correlation',
             'duration_correlation', 'data_points', 'date_range_days'],
)
corr_df['abs_temporal_corr'] = corr_df['temporal_correlation'].astype(float).abs()
corr_df['abs_duration_corr'] = corr_df['duration_correlation'].astype(float).abs()
corr_df_temporal = corr_df.sort_values('abs_temporal_corr', ascending=False)
corr_df_duration = corr_df.sort_values('abs_duration_corr', ascending=False)

# Undefined (NaN) correlations sort last and would otherwise fill the
# "WEAKEST" tables, so the printed rankings exclude them.
temporal_ranked = corr_df_temporal.dropna(subset=['temporal_correlation'])
duration_ranked = corr_df_duration.dropna(subset=['duration_correlation'])
temporal_cols = ['name', 'equipment_type', 'temporal_correlation', 'data_points', 'date_range_days']
duration_cols = ['name', 'equipment_type', 'duration_correlation', 'data_points']

print(f"\nTotal machines analyzed: {len(corr_df)}")

print("\n" + "=" * 60)
print("TEMPORAL CORRELATION (Date vs Engine Load)")
print("=" * 60)
print("\nTop 10 machines with STRONGEST temporal correlation:")
print(temporal_ranked[temporal_cols].head(10).to_string(index=False))

print("\n\nMachines with WEAKEST temporal correlation (most stable over time):")
print(temporal_ranked[temporal_cols].tail(10).to_string(index=False))

print("\n" + "=" * 60)
print("DURATION CORRELATION (Duration vs Engine Load)")
print("=" * 60)
print("\nTop 10 machines with STRONGEST duration correlation:")
print(duration_ranked[duration_cols].head(10).to_string(index=False))

print("\n\nMachines with WEAKEST duration correlation:")
print(duration_ranked[duration_cols].tail(10).to_string(index=False))

# Statistics
print("\n" + "=" * 60)
print("STATISTICS SUMMARY")
print("=" * 60)

print("\nTemporal Correlation (Date vs Load):")
_print_corr_stats(corr_df['temporal_correlation'],
                  'Positive trend', 'Negative trend', 'Stable')

print("\nDuration Correlation (Duration vs Load):")
valid_duration_corr = corr_df['duration_correlation'].dropna()
_print_corr_stats(valid_duration_corr,
                  'Positive correlation', 'Negative correlation', 'Weak correlation')

# Store for visualization. The time-series cells pick machines through
# 'correlation' / 'abs_correlation' columns, so the temporal values are exposed
# under those names as well as their original ones.
df_machine_corr = corr_df.assign(
    correlation=corr_df['temporal_correlation'],
    abs_correlation=corr_df['abs_temporal_corr'],
)

In [None]:
# Grouped bar chart: Average load by Main Group.
# One row per MainGroupLabel: NOxMAF load mean/std/count plus the mean FF load,
# CANBUS load, power and duration, rounded to 4 decimals and ordered by the
# NOxMAF mean (highest first).
main_group_summary = (
    df.groupby('MainGroupLabel')
    .agg(**{
        'NOxMAF Avg': ('NOxMAF_motorbelasting', 'mean'),
        'NOxMAF Std': ('NOxMAF_motorbelasting', 'std'),
        'Count': ('NOxMAF_motorbelasting', 'count'),
        'FF Avg': ('FF_motorbelasting', 'mean'),
        'CANBUS Avg': ('CANBUS_motorbelasting', 'mean'),
        'Avg Power': ('Power', 'mean'),
        'Avg Duration': ('duration_from_rows', 'mean'),
    })
    .round(4)
    .reset_index()
    .rename(columns={'MainGroupLabel': 'Main Group'})
    .sort_values('NOxMAF Avg', ascending=False)
)

# Create grouped bar chart.
# Each entry: (legend name, value column, bar colour, std column for error bars or None).
source_bars = [
    ('NOxMAF', 'NOxMAF Avg', 'steelblue', 'NOxMAF Std'),
    ('FF', 'FF Avg', 'coral', None),
    ('CANBUS', 'CANBUS Avg', 'green', None),
]

fig = go.Figure()
for source_name, avg_col, bar_color, std_col in source_bars:
    # Only the NOxMAF series has a standard deviation to draw as error bars.
    error_bar = {}
    if std_col is not None:
        error_bar = {'error_y': dict(type='data', array=main_group_summary[std_col])}
    fig.add_trace(go.Bar(
        name=source_name,
        x=main_group_summary['Main Group'],
        y=main_group_summary[avg_col],
        marker_color=bar_color,
        text=main_group_summary[avg_col].round(3),
        textposition='outside',
        **error_bar
    ))

fig.update_layout(
    title='Average Engine Load by Main Group (Comparison of Measurement Sources)',
    xaxis_title='Main Equipment Group',
    yaxis_title='Average Engine Load',
    legend_title='Measurement Source',
    barmode='group',
    xaxis_tickangle=-45,
    height=600,
)

fig.show()

print("\nMain Group Summary:")
print(main_group_summary.to_string(index=False))

In [None]:
# Box plot showing load distribution by Main Group
fig = px.box(df.dropna(subset=['NOxMAF_motorbelasting', 'MainGroupLabel']),
             x='MainGroupLabel',
             y='NOxMAF_motorbelasting',
             color='MainGroupLabel',
             points='outliers',
             title='Engine Load Distribution by Main Equipment Group',
             labels={
                 'MainGroupLabel': 'Main Equipment Group',
                 'NOxMAF_motorbelasting': 'NOxMAF Engine Load'
             },
             color_discrete_map=main_group_colors)

fig.update_layout(
    height=600,
    xaxis_tickangle=-45,
    showlegend=True,
    legend_title='Main Group'
)

fig.show()

In [None]:
# Visualize with Main Group color coding
fig = go.Figure()

# Add bars for each equipment type, colored by Main Group
for idx, row in equipment_load_detailed.iterrows():
    fig.add_trace(go.Bar(
        x=[row['Equipment Type']],
        y=[row['NOxMAF Avg Load']],
        name=row['Main Group'],
        legendgroup=row['Main Group'],
        showlegend=row['Main Group'] not in [trace.legendgroup for trace in fig.data if hasattr(trace, 'legendgroup')],
        marker_color=main_group_colors[row['Main Group']],
        error_y=dict(type='data', array=[row['NOxMAF Std Dev']]),
        text=[f"{row['NOxMAF Avg Load']:.3f}"],
        textposition='outside',
        hovertemplate=f"<b>{row['Equipment Type']}</b><br>" +
                     f"Main Group: {row['Main Group']}<br>" +
                     f"Avg Load: %{{y:.4f}}<br>" +
                     f"Data Points: {row['Data Points']:.0f}<br>" +
                     f"Avg Power: {row['Avg Power (kW)']:.1f} kW<extra></extra>"
    ))

fig.update_layout(
    title='Average NOxMAF Engine Load by Equipment Type (Color-coded by Main Group)',
    xaxis_title='Equipment Type',
    yaxis_title='Average NOxMAF Engine Load',
    height=700,
    xaxis_tickangle=-45,
    barmode='group',
    legend_title='Main Group',
    hovermode='closest'
)

fig.show()

In [None]:
# Enhanced overview with MainGroup color coding
# Calculate average engine load per equipment type with MainGroup
equipment_load_detailed = df.groupby(['TypeOfEquipment', 'MainGroupLabel']).agg({
    'NOxMAF_motorbelasting': ['mean', 'std', 'count', 'min', 'max'],
    'FF_motorbelasting': 'mean',
    'CANBUS_motorbelasting': 'mean',
    'Power': 'mean',
    'duration_from_rows': 'mean'
}).round(4)

# Flatten column names
equipment_load_detailed.columns = ['_'.join(col).strip() for col in equipment_load_detailed.columns.values]
equipment_load_detailed = equipment_load_detailed.reset_index()

# Rename columns
equipment_load_detailed.columns = [
    'Equipment Type',
    'Main Group',
    'NOxMAF Avg Load',
    'NOxMAF Std Dev',
    'Data Points',
    'NOxMAF Min',
    'NOxMAF Max',
    'FF Avg Load',
    'CANBUS Avg Load',
    'Avg Power (kW)',
    'Avg Duration (hrs)'
]

# Sort by Main Group first, then by NOxMAF Avg Load within each group
equipment_load_detailed = equipment_load_detailed.sort_values(['Main Group', 'NOxMAF Avg Load'], 
                                                               ascending=[True, False])

# Create color map for Main Groups
main_groups = equipment_load_detailed['Main Group'].unique()
color_palette = px.colors.qualitative.Set3 + px.colors.qualitative.Pastel
main_group_colors = {group: color_palette[i % len(color_palette)] for i, group in enumerate(main_groups)}

print("=" * 90)
print("AVERAGE ENGINE LOAD BY EQUIPMENT TYPE (Grouped by Main Category)")
print("=" * 90)
print("\nSorted by Main Group, then by NOxMAF Average Load:\n")
print(equipment_load_detailed[['Main Group', 'Equipment Type', 'NOxMAF Avg Load', 
                                'NOxMAF Std Dev', 'Data Points', 'Avg Power (kW)', 
                                'Avg Duration (hrs)']].to_string(index=False))

print(f"\n\nBreakdown by Main Group:")
for group in equipment_load_detailed['Main Group'].unique():
    group_data = equipment_load_detailed[equipment_load_detailed['Main Group'] == group]
    print(f"\n  {group}:")
    print(f"    ‚Ä¢ Equipment types: {len(group_data)}")
    print(f"    ‚Ä¢ Avg NOxMAF load: {group_data['NOxMAF Avg Load'].mean():.4f}")
    print(f"    ‚Ä¢ Total data points: {group_data['Data Points'].sum():.0f}")
    print(f"    ‚Ä¢ Avg power: {group_data['Avg Power (kW)'].mean():.1f} kW")

In [None]:
# Heatmap showing load vs power relationship by equipment type
fig = px.scatter(equipment_load_summary,
                 x='Avg Power (kW)',
                 y='NOxMAF Avg Load',
                 size='Data Points',
                 color='NOxMAF Avg Load',
                 hover_name='Equipment Type',
                 hover_data={
                     'Avg Power (kW)': ':.1f',
                     'NOxMAF Avg Load': ':.4f',
                     'Data Points': True,
                     'Avg Duration (hrs)': ':.2f'
                 },
                 color_continuous_scale='RdYlGn_r',
                 title='Equipment Type: Power vs Engine Load',
                 labels={
                     'Avg Power (kW)': 'Average Power (kW)',
                     'NOxMAF Avg Load': 'Average NOxMAF Engine Load',
                     'Data Points': 'Number of Measurements'
                 })

# Add equipment type labels
for idx, row in equipment_load_summary.iterrows():
    fig.add_annotation(
        x=row['Avg Power (kW)'],
        y=row['NOxMAF Avg Load'],
        text=row['Equipment Type'][:15] + ('...' if len(row['Equipment Type']) > 15 else ''),
        showarrow=False,
        font=dict(size=8),
        xshift=0,
        yshift=15
    )

fig.update_layout(height=600)
fig.show()

In [None]:
# Visualize average engine load by equipment type
fig = make_subplots(rows=2, cols=1,
                    subplot_titles=('Average NOxMAF Engine Load by Equipment Type',
                                   'Comparison: NOxMAF vs FF vs CANBUS Load'),
                    vertical_spacing=0.15,
                    row_heights=[0.6, 0.4])

# Top chart - NOxMAF load with error bars
fig.add_trace(go.Bar(
    x=equipment_load_summary['Equipment Type'],
    y=equipment_load_summary['NOxMAF Avg Load'],
    error_y=dict(type='data', array=equipment_load_summary['NOxMAF Std Dev']),
    marker_color='steelblue',
    name='NOxMAF Load',
    text=equipment_load_summary['NOxMAF Avg Load'].round(3),
    textposition='outside',
    hovertemplate='<b>%{x}</b><br>Avg Load: %{y:.4f}<br>Data Points: %{customdata}<extra></extra>',
    customdata=equipment_load_summary['Data Points']
), row=1, col=1)

# Bottom chart - Comparison of all three load measurements
x_pos = list(range(len(equipment_load_summary)))

fig.add_trace(go.Scatter(
    x=equipment_load_summary['Equipment Type'],
    y=equipment_load_summary['NOxMAF Avg Load'],
    mode='lines+markers',
    name='NOxMAF',
    line=dict(color='steelblue', width=2),
    marker=dict(size=8)
), row=2, col=1)

fig.add_trace(go.Scatter(
    x=equipment_load_summary['Equipment Type'],
    y=equipment_load_summary['FF Avg Load'],
    mode='lines+markers',
    name='FF',
    line=dict(color='coral', width=2),
    marker=dict(size=8)
), row=2, col=1)

fig.add_trace(go.Scatter(
    x=equipment_load_summary['Equipment Type'],
    y=equipment_load_summary['CANBUS Avg Load'],
    mode='lines+markers',
    name='CANBUS',
    line=dict(color='green', width=2),
    marker=dict(size=8)
), row=2, col=1)

# Update axes
fig.update_xaxes(tickangle=-45, row=1, col=1)
fig.update_xaxes(tickangle=-45, row=2, col=1)
fig.update_yaxes(title_text="Average Engine Load", row=1, col=1)
fig.update_yaxes(title_text="Average Load", row=2, col=1)

fig.update_layout(
    title_text='Average Engine Load Analysis by Equipment Type',
    height=900,
    showlegend=True,
    legend=dict(x=1.02, y=0.5)
)

fig.show()

In [None]:
# Calculate average engine load per equipment type
equipment_load_summary = df.groupby('TypeOfEquipment').agg({
    'NOxMAF_motorbelasting': ['mean', 'std', 'count', 'min', 'max'],
    'FF_motorbelasting': 'mean',
    'CANBUS_motorbelasting': 'mean',
    'Power': 'mean',
    'duration_from_rows': 'mean'
}).round(4)

# Flatten column names
equipment_load_summary.columns = ['_'.join(col).strip() for col in equipment_load_summary.columns.values]
equipment_load_summary = equipment_load_summary.reset_index()

# Rename columns for clarity
equipment_load_summary.columns = [
    'Equipment Type',
    'NOxMAF Avg Load',
    'NOxMAF Std Dev',
    'Data Points',
    'NOxMAF Min',
    'NOxMAF Max',
    'FF Avg Load',
    'CANBUS Avg Load',
    'Avg Power (kW)',
    'Avg Duration (hrs)'
]

# Sort by NOxMAF average load
equipment_load_summary = equipment_load_summary.sort_values('NOxMAF Avg Load', ascending=False)

print("=" * 80)
print("AVERAGE ENGINE LOAD BY EQUIPMENT TYPE")
print("=" * 80)
print("\nSorted by NOxMAF Average Load (Highest to Lowest):\n")
print(equipment_load_summary.to_string(index=False))

print(f"\n\nKey Insights:")
print(f"  ‚Ä¢ Highest average load: {equipment_load_summary.iloc[0]['Equipment Type']} ({equipment_load_summary.iloc[0]['NOxMAF Avg Load']:.4f})")
print(f"  ‚Ä¢ Lowest average load: {equipment_load_summary.iloc[-1]['Equipment Type']} ({equipment_load_summary.iloc[-1]['NOxMAF Avg Load']:.4f})")
print(f"  ‚Ä¢ Overall average NOxMAF load: {df['NOxMAF_motorbelasting'].mean():.4f}")
print(f"  ‚Ä¢ Equipment types analyzed: {len(equipment_load_summary)}")

## 5c. Average Engine Load by Equipment Type

In [None]:
# Line plot filtered for days with > 1 hour operation
# Select machines with strongest trends (positive and negative) and most stable

# Get representative machines. `df_machine_corr` stores the per-machine
# date-vs-load correlation in 'temporal_correlation' and its absolute value in
# 'abs_temporal_corr'; it has no plain 'correlation' column as built by the
# correlation cell.
top_5_positive = df_machine_corr.nlargest(5, 'temporal_correlation')['name'].tolist()
top_5_negative = df_machine_corr.nsmallest(5, 'temporal_correlation')['name'].tolist()
most_stable = df_machine_corr.nsmallest(5, 'abs_temporal_corr')['name'].tolist()

# Filter for days with > 1 hour
df_filtered = df[df['duration_from_rows'] > 1].copy()

# Create subplots
fig = make_subplots(rows=3, cols=1,
                    subplot_titles=('Machines with Increasing Load (Filtered: >1 hour/day)',
                                   'Machines with Decreasing Load (Filtered: >1 hour/day)',
                                   'Machines with Stable Load (Filtered: >1 hour/day)'),
                    vertical_spacing=0.08)

colors = ['red', 'orange', 'gold', 'yellowgreen', 'green',
          'blue', 'purple', 'pink', 'brown', 'gray',
          'steelblue', 'teal', 'navy', 'maroon', 'olive']

# One subplot row per machine selection: increasing, decreasing, stable.
# Each row uses its own block of five colours from `colors`.
machine_panels = [top_5_positive, top_5_negative, most_stable]
for panel_row, panel_machines in enumerate(machine_panels, start=1):
    color_offset = (panel_row - 1) * 5
    for idx, machine in enumerate(panel_machines):
        machine_data = df_filtered[df_filtered['name'] == machine].sort_values('date')
        fig.add_trace(go.Scatter(
            x=machine_data['date'],
            y=machine_data['NOxMAF_motorbelasting'],
            mode='lines+markers',
            name=machine,
            line=dict(width=2, color=colors[color_offset + idx]),
            marker=dict(size=5),
            hovertemplate='<b>%{text}</b><br>Date: %{x}<br>Load: %{y:.3f}<br>Hours: %{customdata:.1f}<extra></extra>',
            text=[machine] * len(machine_data),
            # Carries the duration so the hover text can show it.
            customdata=machine_data['duration_from_rows'],
            showlegend=True
        ), row=panel_row, col=1)

fig.update_xaxes(title_text="Date", row=3, col=1)
for panel_row in (1, 2, 3):
    fig.update_yaxes(title_text="NOxMAF Engine Load", row=panel_row, col=1)

# Add text annotation showing filter
fig.add_annotation(
    text="Filter: Only days with >1 hour operation shown",
    xref="paper", yref="paper",
    x=0.5, y=1.02,
    showarrow=False,
    font=dict(size=12, color="red"),
    xanchor='center'
)

fig.update_layout(
    title_text='Engine Load Over Time - Individual Machines (>1 Hour Operation)',
    height=1200,
    hovermode='x unified'
)

fig.show()

# Print statistics about filtering
total_rows_before = len(df)
total_rows_after = len(df_filtered)
print(f"\nFiltering Statistics:")
print(f"  Total data points before filter: {total_rows_before:,}")
print(f"  Total data points after filter (>1 hour): {total_rows_after:,}")
print(f"  Data points removed: {total_rows_before - total_rows_after:,} ({(total_rows_before - total_rows_after)/total_rows_before*100:.1f}%)")
print(f"  Data points retained: {total_rows_after/total_rows_before*100:.1f}%")
In [None]:
# Line plot with color-coded markers based on engine hours
# Select machines with strongest trends (positive and negative) and most stable

# Get representative machines. `df_machine_corr` stores the per-machine
# date-vs-load correlation in 'temporal_correlation' and its absolute value in
# 'abs_temporal_corr'; it has no plain 'correlation' column as built by the
# correlation cell.
top_5_positive = df_machine_corr.nlargest(5, 'temporal_correlation')['name'].tolist()
top_5_negative = df_machine_corr.nsmallest(5, 'temporal_correlation')['name'].tolist()
most_stable = df_machine_corr.nsmallest(5, 'abs_temporal_corr')['name'].tolist()

# Create subplots
fig = make_subplots(rows=3, cols=1,
                    subplot_titles=('Machines with Increasing Load (Top 5) - Colored by Engine Hours',
                                   'Machines with Decreasing Load (Top 5) - Colored by Engine Hours',
                                   'Machines with Stable Load (Top 5) - Colored by Engine Hours'),
                    vertical_spacing=0.08)

# One subplot row per machine selection, paired with the vertical position of
# that row's colorbar.
hour_panels = [
    (top_5_positive, 0.85),
    (top_5_negative, 0.5),
    (most_stable, 0.15),
]

for panel_row, (panel_machines, colorbar_y) in enumerate(hour_panels, start=1):
    # Use one colour range for every trace in the panel. Without it each trace
    # auto-scales to its own min/max while only the first trace shows a
    # colorbar, so the same colour would mean different hours per machine.
    panel_hours = df.loc[df['name'].isin(panel_machines), 'duration_from_rows']
    color_range = {}
    if pd.notna(panel_hours.min()) and pd.notna(panel_hours.max()):
        color_range = dict(cmin=float(panel_hours.min()), cmax=float(panel_hours.max()))

    for idx, machine in enumerate(panel_machines):
        machine_data = df[df['name'] == machine].sort_values('date')

        fig.add_trace(go.Scatter(
            x=machine_data['date'],
            y=machine_data['NOxMAF_motorbelasting'],
            mode='lines+markers',
            name=machine,
            line=dict(width=1.5, color='lightgray'),
            marker=dict(
                size=10,
                color=machine_data['duration_from_rows'],
                colorscale='Viridis',
                showscale=(idx == 0),  # Show colorbar only for first trace
                colorbar=dict(
                    title="Engine<br>Hours",
                    x=1.12,
                    len=0.3,
                    y=colorbar_y
                ),
                line=dict(width=0.5, color='white'),
                **color_range
            ),
            hovertemplate='<b>%{text}</b><br>Date: %{x}<br>Load: %{y:.3f}<br>Hours: %{marker.color:.1f}<extra></extra>',
            text=[machine] * len(machine_data),
            showlegend=True
        ), row=panel_row, col=1)

fig.update_xaxes(title_text="Date", row=3, col=1)
for panel_row in (1, 2, 3):
    fig.update_yaxes(title_text="NOxMAF Engine Load", row=panel_row, col=1)

fig.update_layout(
    title_text='Engine Load Over Time - Individual Machines (Colored by Operating Hours)',
    height=1200,
    hovermode='closest'
)

fig.show()

In [None]:
# Engine load over time for representative machines:
# the strongest upward trends, the strongest downward trends and the flattest.

# Representative machines, ranked by their date-vs-load correlation
top_5_positive = list(df_machine_corr.nlargest(5, 'correlation')['name'])
top_5_negative = list(df_machine_corr.nsmallest(5, 'correlation')['name'])
most_stable = list(df_machine_corr.nsmallest(5, 'abs_correlation')['name'])

# All selected machines in one list
selected_machines = [*top_5_positive, *top_5_negative, *most_stable]

# One subplot row per group
panel_titles = (
    'Machines with Increasing Load (Top 5 Positive Correlation)',
    'Machines with Decreasing Load (Top 5 Negative Correlation)',
    'Machines with Stable Load (Top 5 Most Stable)',
)
fig = make_subplots(rows=3, cols=1,
                    subplot_titles=panel_titles,
                    vertical_spacing=0.08)

# Five colours per panel: entries 0-4, 5-9 and 10-14
colors = ['red', 'orange', 'gold', 'yellowgreen', 'green',
          'blue', 'purple', 'pink', 'brown', 'gray',
          'steelblue', 'teal', 'navy', 'maroon', 'olive']

machine_groups = [top_5_positive, top_5_negative, most_stable]
for panel_row, group in enumerate(machine_groups, start=1):
    color_offset = 5 * (panel_row - 1)
    for idx, machine in enumerate(group):
        machine_data = df[df['name'] == machine].sort_values('date')
        trace = go.Scatter(
            x=machine_data['date'],
            y=machine_data['NOxMAF_motorbelasting'],
            mode='lines+markers',
            name=machine,
            line=dict(width=2, color=colors[color_offset + idx]),
            marker=dict(size=4),
            showlegend=True,
        )
        fig.add_trace(trace, row=panel_row, col=1)

# Axis titles: the date label only on the bottom panel
fig.update_xaxes(title_text="Date", row=3, col=1)
for panel_row in (1, 2, 3):
    fig.update_yaxes(title_text="NOxMAF Engine Load", row=panel_row, col=1)

fig.update_layout(
    title_text='Engine Load Over Time - Individual Machine Trends',
    height=1200,
    hovermode='x unified',
)

fig.show()

In [None]:
# Horizontal bar charts of the ten most positive and the ten most negative
# date-vs-load correlations.
top_positive = df_machine_corr.nlargest(10, 'correlation')
top_negative = df_machine_corr.nsmallest(10, 'correlation')

fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Top 10 Increasing Load Over Time',
                    'Top 10 Decreasing Load Over Time'),
)

# (ranked machines, bar colour, trace name), one entry per subplot column
bar_panels = [
    (top_positive, 'green', 'Positive'),
    (top_negative, 'red', 'Negative'),
]
for panel_col, (ranked, bar_color, trace_name) in enumerate(bar_panels, start=1):
    bars = go.Bar(
        y=ranked['name'],
        x=ranked['correlation'],
        orientation='h',
        marker_color=bar_color,
        text=ranked['correlation'].round(3),
        textposition='outside',
        name=trace_name,
    )
    fig.add_trace(bars, row=1, col=panel_col)
    fig.update_xaxes(title_text="Correlation", row=1, col=panel_col)

fig.update_yaxes(title_text="Machine", row=1, col=1)

fig.update_layout(
    title_text='Machines with Strongest Temporal Trends',
    height=600,
    showlegend=False,
)

fig.show()

In [None]:
# Per-machine correlation plotted against how much data backs it:
# number of data points (left) and covered date range in days (right).
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Correlation vs Data Points',
                    'Correlation vs Date Range (days)'),
)

# (x column, hover template, extra marker options), one entry per column.
# Only the left panel draws the colorbar; both panels use the same colour scale.
scatter_panels = [
    ('data_points',
     '<b>%{text}</b><br>Data Points: %{x}<br>Correlation: %{y:.3f}<extra></extra>',
     dict(showscale=True, colorbar=dict(title="Correlation", x=1.15))),
    ('date_range_days',
     '<b>%{text}</b><br>Date Range: %{x} days<br>Correlation: %{y:.3f}<extra></extra>',
     dict(showscale=False)),
]
for panel_col, (x_column, hover, scale_options) in enumerate(scatter_panels, start=1):
    points = go.Scatter(
        x=df_machine_corr[x_column],
        y=df_machine_corr['correlation'],
        mode='markers',
        marker=dict(
            size=8,
            color=df_machine_corr['correlation'],
            colorscale='RdYlGn',
            **scale_options,
        ),
        text=df_machine_corr['name'],
        hovertemplate=hover,
        name='Machines',
    )
    fig.add_trace(points, row=1, col=panel_col)

# Horizontal reference lines at +0.3, -0.3 and 0 in both panels
reference_lines = [
    dict(y=0.3, line_dash="dash", line_color="green", opacity=0.5),
    dict(y=-0.3, line_dash="dash", line_color="red", opacity=0.5),
    dict(y=0, line_dash="dot", line_color="gray", opacity=0.3),
]
for panel_col in (1, 2):
    for line_options in reference_lines:
        fig.add_hline(row=1, col=panel_col, **line_options)

fig.update_xaxes(title_text="Number of Data Points", row=1, col=1)
fig.update_xaxes(title_text="Date Range (days)", row=1, col=2)
fig.update_yaxes(title_text="Correlation", row=1, col=1)

fig.update_layout(
    title_text='Machine-Level Correlation Analysis',
    height=500,
    showlegend=False,
)

fig.show()

In [None]:
# Histogram of the per-machine date-vs-load correlations
correlation_hist = go.Histogram(
    x=df_machine_corr['correlation'],
    nbinsx=30,
    name='Correlation Distribution',
    marker_color='steelblue',
    opacity=0.7,
)
fig = go.Figure(data=[correlation_hist])

# Vertical markers at the -0.3 / +0.3 thresholds and at zero
threshold_lines = [
    dict(x=-0.3, line_dash="dash", line_color="red",
         annotation_text="Strong Negative", annotation_position="top left"),
    dict(x=0.3, line_dash="dash", line_color="green",
         annotation_text="Strong Positive", annotation_position="top right"),
    dict(x=0, line_dash="dot", line_color="gray",
         annotation_text="No Correlation"),
]
for line_options in threshold_lines:
    fig.add_vline(**line_options)

fig.update_layout(
    title='Distribution of Temporal Correlations Across Individual Machines',
    xaxis_title='Correlation (Date vs NOxMAF Engine Load)',
    yaxis_title='Number of Machines',
    height=500,
    showlegend=False,
)

fig.show()

In [None]:
# Temporal correlation per individual machine.
# For every machine, correlate "days since that machine's first record" with
# NOxMAF_motorbelasting and collect the results in `corr_df` / `df_machine_corr`.
MIN_POINTS_PER_MACHINE = 4  # Need at least 4 data points for meaningful correlation
TREND_THRESHOLD = 0.3       # |correlation| above this is reported as a trend

print("\n" + "=" * 60)
print("TEMPORAL CORRELATION BY INDIVIDUAL MACHINE")
print("=" * 60)

# Calculate correlation for each machine (one pass over the grouped frame
# instead of re-filtering `df` once per machine name)
machine_correlations = []

for machine_name, machine_data in df.groupby('name', sort=False):
    if len(machine_data) < MIN_POINTS_PER_MACHINE:
        continue

    date_numeric = (machine_data['date'] - machine_data['date'].min()).dt.days
    valid_data = pd.DataFrame({
        'NOxMAF_motorbelasting': machine_data['NOxMAF_motorbelasting'],
        'date_numeric': date_numeric,
    }).dropna()

    if len(valid_data) < MIN_POINTS_PER_MACHINE:
        continue

    corr = valid_data['date_numeric'].corr(valid_data['NOxMAF_motorbelasting'])
    if pd.isna(corr):
        # Undefined correlation (e.g. constant load or a single date). Skip it:
        # NaN sorts last and would otherwise be listed among the "weakest"
        # correlations below.
        continue

    machine_correlations.append({
        'name': machine_name,
        'equipment_type': machine_data['TypeOfEquipment'].iloc[0],
        'brand': machine_data['BrandLabel'].iloc[0],
        'correlation': corr,
        'data_points': len(valid_data),
        'date_range_days': date_numeric.max()
    })

# Convert to DataFrame and sort by absolute correlation.
# Explicit columns keep the frame usable even when no machine qualified.
corr_df = pd.DataFrame(
    machine_correlations,
    columns=['name', 'equipment_type', 'brand', 'correlation',
             'data_points', 'date_range_days']
)
corr_df['correlation'] = corr_df['correlation'].astype(float)
corr_df['abs_correlation'] = corr_df['correlation'].abs()
corr_df = corr_df.sort_values('abs_correlation', ascending=False)

report_columns = ['name', 'equipment_type', 'correlation', 'data_points', 'date_range_days']
n_machines = len(corr_df)

print(f"\nTotal machines analyzed: {n_machines}")
print("\nTop 10 machines with STRONGEST correlation (positive or negative):")
print(corr_df[report_columns].head(10).to_string(index=False))

print("\n\nMachines with WEAKEST correlation (most stable over time):")
print(corr_df[report_columns].tail(10).to_string(index=False))

# Statistics
print("\n\nStatistics:")
if n_machines:
    n_positive = (corr_df['correlation'] > TREND_THRESHOLD).sum()
    n_negative = (corr_df['correlation'] < -TREND_THRESHOLD).sum()
    n_stable = corr_df['correlation'].between(-TREND_THRESHOLD, TREND_THRESHOLD).sum()

    print(f"  • Average correlation: {corr_df['correlation'].mean():.4f}")
    print(f"  • Median correlation: {corr_df['correlation'].median():.4f}")
    print(f"  • Std deviation: {corr_df['correlation'].std():.4f}")
    print(f"  • Machines with positive trend: {n_positive} ({n_positive / n_machines * 100:.1f}%)")
    print(f"  • Machines with negative trend: {n_negative} ({n_negative / n_machines * 100:.1f}%)")
    print(f"  • Machines with stable load: {n_stable} ({n_stable / n_machines * 100:.1f}%)")
else:
    print("  • No machine has enough valid data points to compute a correlation")

# Store for visualization
df_machine_corr = corr_df.copy()

In [None]:
# Summary of temporal correlation findings.
# Every qualitative statement is derived from the computed values in
# `correlations` and `df`, so the text cannot contradict the numbers.
NEGLIGIBLE_CORR = 0.1  # below this the correlation is reported as "very weak"
WEAK_CORR = 0.3        # below this the correlation is reported as "weak"


def _describe_correlation(r):
    """Return a short qualitative label (strength and sign) for correlation `r`."""
    if pd.isna(r):
        return "not available"
    if abs(r) < NEGLIGIBLE_CORR:
        return "very weak"
    strength = "weak" if abs(r) < WEAK_CORR else "moderate to strong"
    return f"{strength} {'positive' if r > 0 else 'negative'}"


print("\n" + "=" * 60)
print("TEMPORAL CORRELATION SUMMARY")
print("=" * 60)

# (column in `correlations`, short label used in the narrative lines)
load_signals = [
    ('FF_motorbelasting', 'FF'),
    ('CANBUS_motorbelasting', 'CANBUS'),
    ('NOxMAF_motorbelasting', 'NOxMAF'),
]

print("\n📅 Date vs Engine Load Correlations:")
for column, _ in load_signals:
    r = correlations.get(column, float('nan'))
    print(f"  • {column}: {r:.4f} ({_describe_correlation(r)})")

known_corrs = [r for r in correlations.values() if pd.notna(r)]
all_weak = bool(known_corrs) and all(abs(r) < WEAK_CORR for r in known_corrs)

print("\n📊 Interpretation:")
if not known_corrs:
    print("  • No correlations available")
elif all_weak:
    print("  • Very weak to weak correlations indicate minimal linear relationship")
    print("  • Engine load does NOT strongly depend on date/time")
else:
    print("  • At least one load signal shows a noticeable linear trend over time")
for column, label in load_signals:
    r = correlations.get(column)
    if r is None or pd.isna(r) or abs(r) < NEGLIGIBLE_CORR:
        continue
    size = "slight" if abs(r) < WEAK_CORR else "clear"
    trend, sign = ("increase", "positive") if r > 0 else ("decrease", "negative")
    print(f"  • {label} load shows {size} {trend} over time ({sign} correlation)")

print("\n📆 Day of Week Patterns:")
dow_means = df.groupby('day_of_week')['NOxMAF_motorbelasting'].mean()
# Select by day number (0=Mon ... 6=Sun), not by position, so a missing day
# cannot shift a weekend day into the weekday average.
weekday_avg = dow_means[dow_means.index < 5].mean()
weekend_avg = dow_means[dow_means.index >= 5].mean()
print(f"  • Weekday average (Mon-Fri): {weekday_avg:.4f}")
print(f"  • Weekend average (Sat-Sun): {weekend_avg:.4f}")
if pd.notna(weekday_avg) and pd.notna(weekend_avg) and weekday_avg != 0:
    weekend_delta_pct = (weekend_avg / weekday_avg - 1) * 100
    direction = "higher" if weekend_delta_pct >= 0 else "lower"
    print(f"  • Weekend loads are ~{abs(weekend_delta_pct):.1f}% {direction}")
else:
    print("  • Weekend vs weekday comparison not available")

if all_weak:
    print("\n✓ Conclusion: Equipment usage patterns vary by day of week")
    print("  but show minimal correlation with calendar date progression")
else:
    print("\n✓ Conclusion: Calendar-date trends are not uniformly weak;")
    print("  review the individual correlations above")
print("=" * 60)

In [None]:
# Average engine load per weekday (0 = Monday ... 6 = Sunday)
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

dow_aggregations = {
    'NOxMAF_motorbelasting': ['mean', 'std', 'count'],
    'FF_motorbelasting': 'mean',
}
dow_stats = df.groupby('day_of_week').agg(dow_aggregations).reset_index()

# Translate the weekday number into its name
dow_stats['day_name'] = dow_stats['day_of_week'].map(day_names.__getitem__)

fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Average NOxMAF Load by Day of Week',
                    'Average FF Load by Day of Week'),
)

# (source column, trace name, colour, extra Bar options), one entry per panel.
# Only the NOxMAF panel carries standard-deviation error bars.
weekday_panels = [
    ('NOxMAF_motorbelasting', 'NOxMAF Load', 'coral',
     dict(error_y=dict(type='data', array=dow_stats[('NOxMAF_motorbelasting', 'std')]))),
    ('FF_motorbelasting', 'FF Load', 'steelblue', {}),
]
for panel_col, (load_column, trace_name, bar_color, extra) in enumerate(weekday_panels, start=1):
    fig.add_trace(
        go.Bar(x=dow_stats['day_name'],
               y=dow_stats[(load_column, 'mean')],
               name=trace_name,
               marker_color=bar_color,
               **extra),
        row=1, col=panel_col,
    )

for panel_col in (1, 2):
    fig.update_xaxes(tickangle=-45, row=1, col=panel_col)
for panel_col in (1, 2):
    fig.update_yaxes(title_text="Engine Load", row=1, col=panel_col)

fig.update_layout(title_text='Weekly Pattern Analysis',
                  height=500, showlegend=False)
fig.show()

In [None]:
# Mean engine load per calendar day, plus the number of records that day
daily_aggregations = {
    'NOxMAF_motorbelasting': 'mean',
    'FF_motorbelasting': 'mean',
    'CANBUS_motorbelasting': 'mean',
    'name': 'count',
}
daily_load = df.groupby('date').agg(daily_aggregations).reset_index()

fig = make_subplots(
    rows=2, cols=1,
    subplot_titles=('NOxMAF Engine Load Over Time', 'FF Engine Load Over Time'),
    vertical_spacing=0.12,
)

# (column, trace name, line colour, y-axis title), one entry per subplot row
trend_panels = [
    ('NOxMAF_motorbelasting', 'NOxMAF Load', 'coral', "NOxMAF Engine Load"),
    ('FF_motorbelasting', 'FF Load', 'steelblue', "FF Engine Load"),
]
for panel_row, (load_column, trace_name, line_color, _) in enumerate(trend_panels, start=1):
    series = go.Scatter(
        x=daily_load['date'],
        y=daily_load[load_column],
        mode='lines+markers',
        name=trace_name,
        line=dict(color=line_color, width=2),
        opacity=0.7,
    )
    fig.add_trace(series, row=panel_row, col=1)

fig.update_xaxes(title_text="Date", row=2, col=1)
for panel_row, (_, _, _, y_title) in enumerate(trend_panels, start=1):
    fig.update_yaxes(title_text=y_title, row=panel_row, col=1)

fig.update_layout(title_text='Engine Load Trends Over Time',
                  height=700, showlegend=True)
fig.show()

In [None]:
# Check correlation between date and engine load.
# Produces `correlations` (column -> correlation with elapsed days) and adds
# day_of_week / week_of_year / month columns to `df` for later cells.
print("Temporal Correlation Analysis")
print("=" * 60)

# The `date` column is otherwise only created in the "Time-based Analysis"
# cell further down; derive it here if that cell has not been run yet, so this
# cell also works on a fresh kernel executed top to bottom.
if 'date' not in df.columns:
    df['date'] = pd.to_datetime(df['datekey'], format='%Y%m%d')

# Create numeric date column (days since the first date in the data set)
df_corr = df.assign(date_numeric=(df['date'] - df['date'].min()).dt.days)

# Calculate correlations
correlations = {}
for col in ['FF_motorbelasting', 'CANBUS_motorbelasting', 'NOxMAF_motorbelasting']:
    valid_data = df_corr[[col, 'date_numeric']].dropna()
    if len(valid_data) > 1:  # a correlation needs at least two observations
        corr = valid_data['date_numeric'].corr(valid_data[col])
        correlations[col] = corr
        print(f"{col} vs Date: {corr:.4f}")

print(f"\nDate range: {df['date'].min().date()} to {df['date'].max().date()}")
print(f"Number of unique dates: {df['date'].nunique()}")

# Add day of week for pattern analysis
df['day_of_week'] = df['date'].dt.dayofweek
df['week_of_year'] = df['date'].dt.isocalendar().week
df['month'] = df['date'].dt.month

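## 5b. Temporal Analysis - Date vs Engine Load Correlation

> **Note:** the code cells for this section appear *above* this heading, in reverse execution order (the last cell listed has to run first). They use the `date` column that section 12 derives from `datekey`.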

## 6. CO2 Emissions Analysis

In [None]:
# Box plot of total CO2 per equipment type, split by pilot.
# Note: the column name is spelled 'C02Total' (with a zero) in the data.
co2_box_options = dict(
    x='TypeOfEquipment',
    y='C02Total',
    color='Pilot',
    title='Total CO2 Emissions by Equipment Type',
    labels={'C02Total': 'Total CO2', 'TypeOfEquipment': 'Equipment Type'},
)
fig = px.box(df, **co2_box_options)
fig.update_layout(xaxis_tickangle=-45, height=600)
fig.show()

In [None]:
# NOx vs CO2 relationship, one point per record, bubble size = engine power.
# Plotly rejects NaN marker sizes, so rows missing Power (or either emission
# value) are dropped first, as the efficiency cell already does for its chart.
emissions_df = df.dropna(subset=['C02Total', 'NOxTotal', 'Power'])

fig = px.scatter(emissions_df, x='C02Total', y='NOxTotal',
                 color='TypeOfEquipment',
                 size='Power',
                 hover_data=['name', 'BrandLabel', 'EngineClassificationLabel'],
                 title='NOx vs CO2 Emissions',
                 labels={'C02Total': 'Total CO2', 'NOxTotal': 'Total NOx'},
                 opacity=0.6,
                 trendline='ols')  # an OLS trendline is fitted per colour group
fig.update_layout(height=600)
fig.show()

## 7. Operational Duration Analysis

In [None]:
# Compare the three duration measurements per equipment type.
# Only rows where all three durations and the equipment type are present are kept.
duration_cols = ['FF_validated_duration', 'CANBUS_validated_duration', 'duration_from_rows']
df_duration = df[[*duration_cols, 'TypeOfEquipment']].dropna()

# Long format: one row per (record, duration source)
duration_long = df_duration.melt(
    id_vars=['TypeOfEquipment'],
    value_vars=duration_cols,
    var_name='Source',
    value_name='Duration',
)

fig = px.box(
    duration_long,
    x='TypeOfEquipment', y='Duration', color='Source',
    title='Operational Duration by Equipment Type and Source',
    labels={'Duration': 'Duration (hours)', 'TypeOfEquipment': 'Equipment Type'},
)
fig.update_layout(xaxis_tickangle=-45, height=600)
fig.show()

## 8. Engine Classification Comparison

In [None]:
# NOx per liter (NOxMAF source) by engine classification, coloured by equipment type.
# Rows lacking either the NOx value or the classification are left out.
nox_per_liter_df = df.dropna(subset=['NOxMAF_NOxPerLiter', 'EngineClassificationLabel'])
nox_axis_labels = {
    'NOxMAF_NOxPerLiter': 'NOx per Liter',
    'EngineClassificationLabel': 'Engine Classification',
}

fig = px.box(
    nox_per_liter_df,
    x='EngineClassificationLabel',
    y='NOxMAF_NOxPerLiter',
    color='TypeOfEquipment',
    title='NOx per Liter by Engine Classification',
    labels=nox_axis_labels,
)
fig.update_layout(height=600)
fig.show()

In [None]:
# Mean NOx, CO2, NOx-per-liter and power for each engine classification
engine_metric_cols = ['NOxTotal', 'C02Total', 'FF_NOxPerLiter', 'Power']
engine_stats = (
    df.groupby('EngineClassificationLabel')[engine_metric_cols]
      .mean()
      .reset_index()
)

# Grouped bars: only the two emission totals are plotted
fig = px.bar(
    engine_stats,
    x='EngineClassificationLabel',
    y=['NOxTotal', 'C02Total'],
    title='Average Emissions by Engine Classification',
    labels={'value': 'Average Emissions', 'variable': 'Emission Type'},
    barmode='group',
)
fig.update_layout(height=500)
fig.show()

## 9. Pilot Program Comparison

In [None]:
# Per-pilot averages plus the number of records in each pilot
pilot_stats = (
    df.groupby('Pilot')
      .agg(
          Avg_NOx=('NOxTotal', 'mean'),
          Avg_CO2=('C02Total', 'mean'),
          Avg_Fuel=('FF_validated_fuel', 'mean'),
          Avg_Power=('Power', 'mean'),
          Count=('name', 'count'),
      )
      .reset_index()
)

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Avg NOx', 'Avg CO2', 'Avg Fuel', 'Equipment Count'),
)

# (column, trace name, colour, subplot row, subplot col)
pilot_panels = [
    ('Avg_NOx', 'Avg NOx', 'indianred', 1, 1),
    ('Avg_CO2', 'Avg CO2', 'lightseagreen', 1, 2),
    ('Avg_Fuel', 'Avg Fuel', 'lightsalmon', 2, 1),
    ('Count', 'Count', 'lightblue', 2, 2),
]
for stat_column, trace_name, bar_color, panel_row, panel_col in pilot_panels:
    fig.add_trace(
        go.Bar(x=pilot_stats['Pilot'], y=pilot_stats[stat_column],
               name=trace_name, marker_color=bar_color),
        row=panel_row, col=panel_col,
    )

fig.update_layout(title_text='Comparison Across Pilot Programs',
                  height=700, showlegend=False)
fig.show()

## 10. Correlation Analysis

In [None]:
# Correlation heatmap of the key numeric metrics.
# Rows with a missing value in any of the selected columns are excluded.
numeric_cols = [
    'Power', 'NOxTotal', 'C02Total', 'FF_validated_fuel',
    'FF_NOxPerLiter', 'FF_FPH', 'FF_motorbelasting',
    'duration_from_rows', 'CANBUS_validated_fuel',
]

corr_df = df.loc[:, numeric_cols].dropna()
correlation_matrix = corr_df.corr()

metric_names = correlation_matrix.columns
fig = px.imshow(
    correlation_matrix,
    labels=dict(color="Correlation"),
    x=metric_names,
    y=metric_names,
    color_continuous_scale='RdBu_r',
    aspect="auto",
    title='Correlation Matrix of Key Metrics',
)
fig.update_layout(height=700, width=800)
fig.show()

## 11. Summary Statistics by Equipment Type

In [None]:
# Summary table per equipment type: record count and mean of the key metrics,
# rounded to two decimals and ordered by record count (largest first).
summary_aggregations = {
    'name': 'count',
    'Power': 'mean',
    'NOxTotal': 'mean',
    'C02Total': 'mean',
    'FF_validated_fuel': 'mean',
    'FF_NOxPerLiter': 'mean',
    'FF_motorbelasting': 'mean',
}
summary_headers = {
    'name': 'Count',
    'Power': 'Avg Power (kW)',
    'NOxTotal': 'Avg NOx',
    'C02Total': 'Avg CO2',
    'FF_validated_fuel': 'Avg Fuel',
    'FF_NOxPerLiter': 'Avg NOx/L',
    'FF_motorbelasting': 'Avg Engine Load',
}

equipment_summary = (
    df.groupby('TypeOfEquipment')
      .agg(summary_aggregations)
      .round(2)
      .rename(columns=summary_headers)
      .sort_values('Count', ascending=False)
)

print("\nSummary Statistics by Equipment Type:")
equipment_summary

## 12. Time-based Analysis

In [None]:
# Parse the YYYYMMDD `datekey` into a datetime column
df['date'] = pd.to_datetime(df['datekey'], format='%Y%m%d')

# Per-day totals and number of records
daily_metrics = (
    df.groupby('date')
      .agg({
          'NOxTotal': 'sum',
          'C02Total': 'sum',
          'FF_validated_fuel': 'sum',
          'name': 'count',
      })
      .reset_index()
      .rename(columns={
          'date': 'Date',
          'NOxTotal': 'Total NOx',
          'C02Total': 'Total CO2',
          'FF_validated_fuel': 'Total Fuel',
          'name': 'Equipment Count',
      })
)

# 2x2 grid: three line charts and one bar chart
panel_specs = [[{"secondary_y": False} for _ in range(2)] for _ in range(2)]
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Daily NOx Emissions', 'Daily CO2 Emissions',
                    'Daily Fuel Consumption', 'Equipment Count'),
    specs=panel_specs,
)

# (column, trace name, line colour, subplot row, subplot col)
line_panels = [
    ('Total NOx', 'NOx', 'red', 1, 1),
    ('Total CO2', 'CO2', 'green', 1, 2),
    ('Total Fuel', 'Fuel', 'orange', 2, 1),
]
for metric, trace_name, line_color, panel_row, panel_col in line_panels:
    fig.add_trace(
        go.Scatter(x=daily_metrics['Date'], y=daily_metrics[metric],
                   mode='lines+markers', name=trace_name,
                   line=dict(color=line_color)),
        row=panel_row, col=panel_col,
    )

fig.add_trace(
    go.Bar(x=daily_metrics['Date'], y=daily_metrics['Equipment Count'],
           name='Count', marker_color='lightblue'),
    row=2, col=2,
)

fig.update_layout(title_text='Daily Metrics Over Time',
                  height=700, showlegend=False)
fig.show()

## 13. Equipment Efficiency Analysis

In [None]:
# Calculate efficiency metrics (emissions and fuel per kW of engine power).
# Non-positive Power is masked to NaN before dividing: a zero would produce
# +/-inf, which dropna() below does not remove.
valid_power = df['Power'].where(df['Power'] > 0)
df['NOx_per_kW'] = df['NOxTotal'] / valid_power
df['CO2_per_kW'] = df['C02Total'] / valid_power
df['Fuel_per_kW'] = df['FF_validated_fuel'] / valid_power

# Bubble chart: Efficiency overview (only rows where every metric is defined)
efficiency_df = df.dropna(subset=['NOx_per_kW', 'CO2_per_kW', 'Fuel_per_kW', 'Power'])

fig = px.scatter(efficiency_df,
                 x='NOx_per_kW',
                 y='CO2_per_kW',
                 size='Power',
                 color='TypeOfEquipment',
                 hover_data=['name', 'BrandLabel', 'EngineClassificationLabel'],
                 title='Equipment Efficiency: NOx and CO2 per kW',
                 labels={'NOx_per_kW': 'NOx per kW', 'CO2_per_kW': 'CO2 per kW'},
                 opacity=0.6)
fig.update_layout(height=600)
fig.show()

## 14. Load Type Analysis

In [None]:
# Load type (Belastingtype) analysis; skipped when the column is absent
has_load_type = 'Belastingtype' in df.columns
if has_load_type:
    load_aggregations = {
        'name': 'count',
        'NOxTotal': 'mean',
        'C02Total': 'mean',
        'FF_motorbelasting': 'mean',
        'Power': 'mean',
    }
    load_summary = df.groupby('Belastingtype').agg(load_aggregations).reset_index()

    # Number of records per load type ('name' holds the count after aggregation)
    fig = px.bar(
        load_summary,
        x='Belastingtype',
        y='name',
        title='Equipment Count by Load Type',
        labels={'name': 'Count', 'Belastingtype': 'Load Type'},
        color='name',
        color_continuous_scale='viridis',
    )
    fig.update_layout(height=500)
    fig.show()

    # NOx distribution per load type, split by equipment type
    fig = px.box(
        df,
        x='Belastingtype',
        y='NOxTotal',
        color='TypeOfEquipment',
        title='NOx Distribution by Load Type',
        labels={'NOxTotal': 'Total NOx', 'Belastingtype': 'Load Type'},
    )
    fig.update_layout(height=600)
    fig.show()

## 15. Key Insights and Conclusions

In [None]:
# Generate key statistics: a plain-text recap of the data set.
# The section icons were stored as mis-decoded UTF-8 (mojibake) and are
# restored here; strings without placeholders no longer use an f-prefix.
print("=" * 60)
print("KEY INSIGHTS")
print("=" * 60)

print("\n📊 Dataset Overview:")
print(f"  - Total records: {len(df):,}")
print(f"  - Unique equipment: {df['name'].nunique()}")
print(f"  - Date range: {df['date'].min().date()} to {df['date'].max().date()}")
print(f"  - Equipment types: {df['TypeOfEquipment'].nunique()}")

print("\n⚡ Power Statistics:")
print(f"  - Average power: {df['Power'].mean():.1f} kW")
print(f"  - Power range: {df['Power'].min():.0f} - {df['Power'].max():.0f} kW")

print("\n🏭 Emissions Summary:")
print(f"  - Average NOx total: {df['NOxTotal'].mean():.2f}")
print(f"  - Average CO2 total: {df['C02Total'].mean():.2f}")
print(f"  - Average NOx per liter (FF): {df['FF_NOxPerLiter'].mean():.6f}")

print("\n⛽ Fuel Consumption:")
print(f"  - Average fuel (FF): {df['FF_validated_fuel'].mean():.2f}")
print(f"  - Average FPH (FF): {df['FF_FPH'].mean():.2f}")

# Shares are relative to all records, including those without a classification
print("\n🔧 Engine Classification Distribution:")
for engine_type, count in df['EngineClassificationLabel'].value_counts().items():
    print(f"  - {engine_type}: {count} ({count/len(df)*100:.1f}%)")

print("\n🏆 Top Equipment by Count:")
for equipment, count in df['TypeOfEquipment'].value_counts().head(5).items():
    print(f"  - {equipment}: {count}")

print("\n" + "=" * 60)