# EV Charging Data Exploration

Quick sanity checks against the synthetic TimescaleDB data: counts, utilization, and fault/uptime trends.


In [1]:
import os
import sys
from pathlib import Path

import pandas as pd
from sqlalchemy import create_engine, text

# Ensure project root on path so we can import src.config
ROOT = Path.cwd().resolve().parents[0]
os.chdir(ROOT)
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from src.config import DATABASE_URL  # noqa: E402

engine = create_engine(DATABASE_URL)
DATABASE_URL


'postgresql://postgres:postgres@127.0.0.1:5433/ev_charging'

In [2]:
# Basic table counts
with engine.connect() as conn:
    counts = {tbl: conn.execute(text(f"select count(*) from {tbl}")).scalar() for tbl in ['sites','chargers','charging_sessions','charger_status']}
counts


{'sites': 10,
 'chargers': 20,
 'charging_sessions': 424,
 'charger_status': 65191}

In [3]:
# Charger utilization (hours used per hour) over the last 7 days
util_query = text(
    """
    SELECT date_trunc('hour', start_time) AS hour,
           SUM(duration_minutes) / 60.0 AS hours_used
    FROM charging_sessions
    WHERE start_time >= now() - interval '7 days'
    GROUP BY 1
    ORDER BY 1
    """
)
util_df = pd.read_sql(util_query, engine)
util_df.head()


Unnamed: 0,hour,hours_used
0,2025-11-27 23:00:00+00:00,1.033333
1,2025-11-28 00:00:00+00:00,5.266667
2,2025-11-28 01:00:00+00:00,0.633333
3,2025-11-28 02:00:00+00:00,1.216667
4,2025-11-28 03:00:00+00:00,0.983333


In [4]:
import plotly.express as px
fig = px.line(util_df, x='hour', y='hours_used', title='Fleet utilization (hours used per hour, last 7 days)')
fig.update_layout(xaxis_title='Hour (UTC)', yaxis_title='Charger hours in use')
fig.show()


In [5]:
# Fault / offline rate over the last 14 days
fault_query = text(
    """
    SELECT date_trunc('day', time) AS day,
           SUM(CASE WHEN status IN ('FAULTED','OFFLINE') THEN 1 ELSE 0 END)::float / COUNT(*) AS fault_rate,
           COUNT(*) AS samples
    FROM charger_status
    WHERE time >= now() - interval '14 days'
    GROUP BY 1
    ORDER BY 1
    """
)
fault_df = pd.read_sql(fault_query, engine)
fault_df.head()


Unnamed: 0,day,fault_rate,samples
0,2025-11-27 00:00:00+00:00,0.135179,7982
1,2025-11-28 00:00:00+00:00,0.175716,9356
2,2025-11-29 00:00:00+00:00,0.173181,9262
3,2025-11-30 00:00:00+00:00,0.163901,9341
4,2025-12-01 00:00:00+00:00,0.147106,9313


In [6]:
fig = px.bar(fault_df, x='day', y='fault_rate', title='Fault/Offline rate (last 14 days)')
fig.update_layout(xaxis_title='Day (UTC)', yaxis_title='Fault/Offline fraction')
fig.show()


## Top reliability offenders

Top 10 chargers by fault rate (status pings) and by lost session minutes (failed sessions).

In [7]:

# Top 10 chargers by fault rate (last 14 days)
faulty_chargers = pd.read_sql(
    text(
        '''
        SELECT
            c.external_id,
            s.name AS site_name,
            c.model,
            c.connector_type,
            COUNT(*) AS samples,
            SUM(CASE WHEN cs.status IN ('FAULTED','OFFLINE') THEN 1 ELSE 0 END) AS faults,
            SUM(CASE WHEN cs.status IN ('FAULTED','OFFLINE') THEN 1 ELSE 0 END)::float / COUNT(*) AS fault_rate
        FROM charger_status cs
        JOIN chargers c ON cs.charger_id = c.charger_id
        JOIN sites s ON c.site_id = s.site_id
        WHERE cs.time >= now() - interval '14 days'
        GROUP BY c.external_id, s.name, c.model, c.connector_type
        HAVING COUNT(*) > 0
        ORDER BY fault_rate DESC
        LIMIT 10
        '''
    ),
    engine,
)
faulty_chargers


Unnamed: 0,external_id,site_name,model,connector_type,samples,faults,fault_rate
0,CHR-0001,Central Mall,ABB-Terra,CHAdeMO,3216,1259,0.39148
1,CHR-0004,Harbor Station,EVgo-Fast,CHAdeMO,3248,1259,0.387623
2,CHR-0006,University Quad,ABB-Terra,CHAdeMO,3252,483,0.148524
3,CHR-0009,Innovation Center,EVgo-Fast,CCS,3231,467,0.144537
4,CHR-0018,Lakeside,EVgo-Fast,CHAdeMO,3239,463,0.142945
5,CHR-0007,Tech Park,EVgo-Fast,CCS,3257,460,0.141234
6,CHR-0017,Central Mall,ABB-Terra,NACS,3279,460,0.140287
7,CHR-0005,Transit Hub,Delta-50,CHAdeMO,3277,458,0.139762
8,CHR-0008,University Quad,ChargePoint-Express,CCS,3243,440,0.135677
9,CHR-0014,Central Mall,ABB-Terra,CHAdeMO,3275,444,0.135573


In [8]:

# Top 10 chargers by lost session minutes (last 30 days)
lost_minutes = pd.read_sql(
    text(
        '''
        SELECT
            c.external_id,
            s.name AS site_name,
            c.model,
            c.connector_type,
            COUNT(*) AS sessions,
            SUM(duration_minutes) AS total_minutes,
            SUM(CASE WHEN success = false THEN duration_minutes ELSE 0 END) AS lost_minutes
        FROM charging_sessions cs
        JOIN chargers c ON cs.charger_id = c.charger_id
        JOIN sites s ON c.site_id = s.site_id
        WHERE cs.start_time >= now() - interval '30 days'
        GROUP BY c.external_id, s.name, c.model, c.connector_type
        HAVING SUM(CASE WHEN success = false THEN duration_minutes ELSE 0 END) > 0
        ORDER BY lost_minutes DESC
        LIMIT 10
        '''
    ),
    engine,
)
lost_minutes


Unnamed: 0,external_id,site_name,model,connector_type,sessions,total_minutes,lost_minutes
0,CHR-0003,Tech Park,ABB-Terra,CCS,15,787,173
1,CHR-0006,University Quad,ABB-Terra,CHAdeMO,23,1280,133
2,CHR-0004,Harbor Station,EVgo-Fast,CHAdeMO,22,1026,120
3,CHR-0007,Tech Park,EVgo-Fast,CCS,16,891,113
4,CHR-0012,Tech Park,ChargePoint-Express,Type2,25,1474,107
5,CHR-0002,Innovation Center,EVgo-Fast,CCS,24,1354,88
6,CHR-0017,Central Mall,ABB-Terra,NACS,27,1543,79
7,CHR-0001,Central Mall,ABB-Terra,CHAdeMO,21,1147,74
8,CHR-0011,Downtown Hub,ChargePoint-Express,CCS,23,1128,69
9,CHR-0016,Downtown Hub,Delta-50,NACS,21,1179,66


## Data Story: Charger Reliability Snapshot

**Goal:** Summarize fleet health and reliability patterns for the synthetic dataset.

**What we looked at (run the cells above):**
- Volume: table counts (sites, chargers, sessions, status pings) to size the dataset.
- Utilization: hours-in-use per hour over the last 7 days to see load patterns.
- Fault/Offline rate: daily fault fraction over the last 14 days.
- Offenders: top 10 chargers by fault rate (status pings) and by lost session minutes (failed sessions).

**Suggested reading of the results:**
- Check if fault/offline spikes align with utilization peaks—could indicate load-related instability.
- See which models/connector types appear most in the offenders list; that can inform vendor follow-ups.
- Lost session minutes highlights user-facing pain; prioritize those chargers even if their fault rate is modest.

**Questions to answer next:**
- Do certain sites/cities show higher fault rates (e.g., weather, maintenance)?
- Are OFFLINE windows clustered at specific times of day (e.g., network maintenance)?
- How quickly do chargers recover (MTTR) and how long do they run before next fault (MTBF)? Use `/api/reliability`.

**Actionable next steps:**
- Create alerts for sustained OFFLINE/FAULTED (>10–15 minutes) and for chargers in the top-10 lists.
- Drill into top offenders with status timelines to confirm whether faults interrupt active sessions.
- Compare model/connector hotspots with installation dates to see if aging hardware correlates with faults.


### Do faults align with utilization peaks?
Compare hourly utilization vs fault/offline rate over the last 7 days to spot load-related instability.

In [9]:

# Hourly utilization vs fault/offline rate (last 7 days)
fault_hourly = pd.read_sql(
    text(
        '''
        SELECT date_trunc('hour', time) AS hour,
               SUM(CASE WHEN status IN ('FAULTED','OFFLINE') THEN 1 ELSE 0 END)::float / COUNT(*) AS fault_rate,
               COUNT(*) AS samples
        FROM charger_status
        WHERE time >= now() - interval '7 days'
        GROUP BY 1
        ORDER BY 1
        '''
    ),
    engine,
)

util_hourly = pd.read_sql(
    text(
        '''
        SELECT date_trunc('hour', start_time) AS hour,
               SUM(duration_minutes) / 60.0 AS hours_used
        FROM charging_sessions
        WHERE start_time >= now() - interval '7 days'
        GROUP BY 1
        ORDER BY 1
        '''
    ),
    engine,
)

merged = fault_hourly.merge(util_hourly, on='hour', how='outer').fillna(0)
merged['fault_rate_pct'] = merged['fault_rate'] * 100
corr = merged[['fault_rate', 'hours_used']].corr().iloc[0,1]
print(f"Correlation (fault_rate vs hours_used): {corr:.3f}")
merged.tail()


Correlation (fault_rate vs hours_used): -0.087


Unnamed: 0,hour,fault_rate,samples,hours_used,fault_rate_pct
144,2025-12-03 23:00:00+00:00,0.090452,398,3.15,9.045226
145,2025-12-04 00:00:00+00:00,0.139896,386,1.966667,13.989637
146,2025-12-04 01:00:00+00:00,0.158442,385,0.0,15.844156
147,2025-12-04 02:00:00+00:00,0.132432,370,0.0,13.243243
148,2025-12-04 03:00:00+00:00,0.08046,174,0.0,8.045977


### Which models/connectors are most common in offenders?
Rank models/connectors by fault rate and by lost session minutes to guide vendor follow-ups.

In [10]:

# Fault rates by model/connector (last 14 days)
model_faults = pd.read_sql(
    text(
        '''
        SELECT
            c.model,
            c.connector_type,
            COUNT(*) AS samples,
            SUM(CASE WHEN cs.status IN ('FAULTED','OFFLINE') THEN 1 ELSE 0 END) AS faults,
            SUM(CASE WHEN cs.status IN ('FAULTED','OFFLINE') THEN 1 ELSE 0 END)::float / COUNT(*) AS fault_rate
        FROM charger_status cs
        JOIN chargers c ON cs.charger_id = c.charger_id
        WHERE cs.time >= now() - interval '14 days'
        GROUP BY c.model, c.connector_type
        HAVING COUNT(*) > 0
        ORDER BY fault_rate DESC
        '''
    ),
    engine,
)
model_faults.head(10)


Unnamed: 0,model,connector_type,samples,faults,fault_rate
0,EVgo-Fast,CHAdeMO,6487,1722,0.265454
1,ABB-Terra,CHAdeMO,12947,2500,0.193095
2,ABB-Terra,NACS,3279,460,0.140287
3,Delta-50,CHAdeMO,3277,458,0.139762
4,EVgo-Fast,CCS,9742,1309,0.134367
5,Delta-50,NACS,3327,444,0.133454
6,ChargePoint-Express,NACS,3281,425,0.129534
7,ChargePoint-Express,CCS,6559,844,0.128678
8,ChargePoint-Express,Type2,6502,820,0.126115
9,ChargePoint-Express,CHAdeMO,6548,815,0.124465


### Lost session minutes vs fault rate
Highlight chargers causing the most lost session minutes, even if their fault rate isn’t extreme.

In [11]:

# Lost session minutes and fault rate per charger
fault_rate_charger = pd.read_sql(
    text(
        '''
        SELECT
            c.external_id,
            s.name AS site_name,
            c.model,
            c.connector_type,
            COUNT(*) AS samples,
            SUM(CASE WHEN cs.status IN ('FAULTED','OFFLINE') THEN 1 ELSE 0 END)::float / COUNT(*) AS fault_rate
        FROM charger_status cs
        JOIN chargers c ON cs.charger_id = c.charger_id
        JOIN sites s ON c.site_id = s.site_id
        WHERE cs.time >= now() - interval '30 days'
        GROUP BY c.external_id, s.name, c.model, c.connector_type
        '''
    ),
    engine,
)

lost_minutes_charger = pd.read_sql(
    text(
        '''
        SELECT
            c.external_id,
            s.name AS site_name,
            c.model,
            c.connector_type,
            COUNT(*) AS sessions,
            SUM(duration_minutes) AS total_minutes,
            SUM(CASE WHEN success = false THEN duration_minutes ELSE 0 END) AS lost_minutes
        FROM charging_sessions cs
        JOIN chargers c ON cs.charger_id = c.charger_id
        JOIN sites s ON c.site_id = s.site_id
        WHERE cs.start_time >= now() - interval '30 days'
        GROUP BY c.external_id, s.name, c.model, c.connector_type
        HAVING SUM(CASE WHEN success = false THEN duration_minutes ELSE 0 END) > 0
        '''
    ),
    engine,
)

combo = lost_minutes_charger.merge(fault_rate_charger, on=['external_id','site_name','model','connector_type'], how='left')
combo = combo.sort_values('lost_minutes', ascending=False)
combo.head(10)


Unnamed: 0,external_id,site_name,model,connector_type,sessions,total_minutes,lost_minutes,samples,fault_rate
15,CHR-0003,Tech Park,ABB-Terra,CCS,15,787,173,3242,0.123689
3,CHR-0006,University Quad,ABB-Terra,CHAdeMO,23,1280,133,3252,0.148524
12,CHR-0004,Harbor Station,EVgo-Fast,CHAdeMO,22,1026,120,3248,0.387623
9,CHR-0007,Tech Park,EVgo-Fast,CCS,16,891,113,3257,0.141234
0,CHR-0012,Tech Park,ChargePoint-Express,Type2,25,1474,107,3255,0.133026
10,CHR-0002,Innovation Center,EVgo-Fast,CCS,24,1354,88,3254,0.117394
2,CHR-0017,Central Mall,ABB-Terra,NACS,27,1543,79,3279,0.140287
14,CHR-0001,Central Mall,ABB-Terra,CHAdeMO,21,1147,74,3216,0.39148
4,CHR-0011,Downtown Hub,ChargePoint-Express,CCS,23,1128,69,3316,0.121834
6,CHR-0016,Downtown Hub,Delta-50,NACS,21,1179,66,3327,0.133454


## Business-friendly reliability snapshots
Quick visuals for stakeholders: utilization vs faults, top offender chargers, lost minutes, and model/connector fault rates.

In [12]:

# Utilization vs fault/offline rate (last 7 days)
fault_hourly = pd.read_sql(
    text(
        '''
        SELECT date_trunc('hour', time) AS hour,
               SUM(CASE WHEN status IN ('FAULTED','OFFLINE') THEN 1 ELSE 0 END)::float / COUNT(*) AS fault_rate
        FROM charger_status
        WHERE time >= now() - interval '7 days'
        GROUP BY 1
        ORDER BY 1
        '''
    ),
    engine,
)

util_hourly = pd.read_sql(
    text(
        '''
        SELECT date_trunc('hour', start_time) AS hour,
               SUM(duration_minutes) / 60.0 AS hours_used
        FROM charging_sessions
        WHERE start_time >= now() - interval '7 days'
        GROUP BY 1
        ORDER BY 1
        '''
    ),
    engine,
)

merged = fault_hourly.merge(util_hourly, on='hour', how='outer').fillna(0)
merged['fault_rate_pct'] = merged['fault_rate'] * 100
corr = merged[['fault_rate', 'hours_used']].corr().iloc[0,1]

import plotly.express as px
fig = px.line(merged, x='hour', y=['fault_rate_pct','hours_used'],
              labels={'value':'Value','variable':'Metric'},
              title=f"Fault rate vs utilization (last 7 days) | corr={corr:.3f}")
fig.update_layout(yaxis_title='Fault rate (%) / Hours used', legend_title='')
fig.show()


In [13]:

# Top 10 chargers by fault rate (last 14 days)
top_fault = pd.read_sql(
    text(
        '''
        SELECT
            c.external_id,
            s.name AS site_name,
            c.model,
            c.connector_type,
            COUNT(*) AS samples,
            SUM(CASE WHEN cs.status IN ('FAULTED','OFFLINE') THEN 1 ELSE 0 END)::float / COUNT(*) AS fault_rate
        FROM charger_status cs
        JOIN chargers c ON cs.charger_id = c.charger_id
        JOIN sites s ON c.site_id = s.site_id
        WHERE cs.time >= now() - interval '14 days'
        GROUP BY c.external_id, s.name, c.model, c.connector_type
        HAVING COUNT(*) > 0
        ORDER BY fault_rate DESC
        LIMIT 10
        '''
    ), engine)

fig = px.bar(top_fault, x='external_id', y='fault_rate', color='site_name',
             hover_data=['model','connector_type','samples'],
             title='Top 10 chargers by fault rate (last 14 days)')
fig.update_layout(yaxis_title='Fault rate', xaxis_title='Charger')
fig.show()


In [14]:

# Top 10 chargers by lost session minutes (last 30 days)
lost = pd.read_sql(
    text(
        '''
        SELECT
            c.external_id,
            s.name AS site_name,
            c.model,
            c.connector_type,
            COUNT(*) AS sessions,
            SUM(duration_minutes) AS total_minutes,
            SUM(CASE WHEN success = false THEN duration_minutes ELSE 0 END) AS lost_minutes
        FROM charging_sessions cs
        JOIN chargers c ON cs.charger_id = c.charger_id
        JOIN sites s ON c.site_id = s.site_id
        WHERE cs.start_time >= now() - interval '30 days'
        GROUP BY c.external_id, s.name, c.model, c.connector_type
        HAVING SUM(CASE WHEN success = false THEN duration_minutes ELSE 0 END) > 0
        ORDER BY lost_minutes DESC
        LIMIT 10
        '''
    ), engine)

fig = px.bar(lost, x='external_id', y='lost_minutes', color='site_name',
             hover_data=['model','connector_type','sessions','total_minutes'],
             title='Top 10 chargers by lost session minutes (last 30 days)')
fig.update_layout(yaxis_title='Lost minutes', xaxis_title='Charger')
fig.show()


In [15]:

# Fault rate by model/connector (last 14 days)
model_faults = pd.read_sql(
    text(
        '''
        SELECT
            c.model,
            c.connector_type,
            COUNT(*) AS samples,
            SUM(CASE WHEN cs.status IN ('FAULTED','OFFLINE') THEN 1 ELSE 0 END) AS faults,
            SUM(CASE WHEN cs.status IN ('FAULTED','OFFLINE') THEN 1 ELSE 0 END)::float / COUNT(*) AS fault_rate
        FROM charger_status cs
        JOIN chargers c ON cs.charger_id = c.charger_id
        WHERE cs.time >= now() - interval '14 days'
        GROUP BY c.model, c.connector_type
        HAVING COUNT(*) > 0
        ORDER BY fault_rate DESC
        '''
    ), engine)

fig = px.bar(model_faults, x='model', y='fault_rate', color='connector_type', barmode='group',
             title='Fault rate by model / connector (last 14 days)')
fig.update_layout(yaxis_title='Fault rate', xaxis_title='Model')
fig.show()
