## Historic demand data (2001-2025) (Author: Sander Wijns)

## Loading data

In this section we will load the Historic demand data

In [193]:
from pathlib import Path
import pandas as pd

# Locate the data directory (normally ../Data/demand_prices). Only one location is
# checked today; further fallbacks can simply be appended to `candidates`.
candidates = [Path('../Data/demand_prices')]
# Path.is_dir() is already False for a path that does not exist.
data_dir = next((candidate for candidate in candidates if candidate.is_dir()), None)

if data_dir is None:
    raise FileNotFoundError('Could not find the Data/demand_prices directory. Checked candidates: ' + ', '.join(str(p) for p in candidates))

# The export step at the end of this notebook writes its gzip-compressed result into this
# same directory under a .csv name. Skip that file here, otherwise a re-run of the notebook
# would try to ingest its own output as if it were one of the raw yearly files.
EXPORTED_FILE_NAME = 'cleaned_data_neso_demand_2001_2025.csv'
files = sorted(f for f in data_dir.glob('*.csv') if f.name != EXPORTED_FILE_NAME)
if not files:
    raise FileNotFoundError(f'No CSV files found in {data_dir}')

dfs = []           # one full DataFrame per successfully read file
headers_list = []  # first data row of each file (one-row DataFrame), used to compare formats across files
for f in files:
    try:
        df = pd.read_csv(f)
        # Lowercase the column names right away so every file uses the same names
        df.columns = df.columns.str.lower()
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the others
        print(f'Failed to read {f}: {e}')
        continue

    if len(df) > 0:
        # Keep the first data row, tagged with the file it came from
        header_row = df.iloc[[0]].copy()
        header_row['source_file'] = f.name
        headers_list.append(header_row)
    else:
        # No data rows: still record the file so it shows up in headers_demand
        headers_list.append(pd.DataFrame({'source_file': [f.name]}))

    # Tag every row with its source file for traceability
    df['source_file'] = f.name
    dfs.append(df)

# Stack all files into one frame; columns missing from a file are filled with NaN
demand_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()

# One row per file (union of the columns seen in the first rows)
if headers_list:
    headers_demand = pd.concat(headers_list, ignore_index=True, sort=False)
    headers_demand.columns = headers_demand.columns.str.lower()
else:
    headers_demand = pd.DataFrame()

print(f'Combined {len(dfs)} files from {data_dir} — total rows: {len(demand_df)}')
print(f'Collected first-row headers from {len(headers_list)} files — headers_demand shape: {headers_demand.shape}')

# Show a quick preview and summary
print('\nSample of combined demand_df:')
display(demand_df.head())
demand_df.info()

print('\nSample of headers_demand (first row from each CSV):')
display(headers_demand.head(25))
headers_demand.info()

Combined 25 files from ..\Data\demand_prices — total rows: 434014
Collected first-row headers from 25 files — headers_demand shape: (25, 23)

Sample of combined demand_df:


Unnamed: 0,settlement_date,settlement_period,nd,tsd,england_wales_demand,embedded_wind_generation,embedded_wind_capacity,embedded_solar_generation,embedded_solar_capacity,non_bm_stor,...,ifa2_flow,britned_flow,moyle_flow,east_west_flow,nemo_flow,nsl_flow,eleclink_flow,viking_flow,greenlink_flow,source_file
0,2001-01-01,1,38631,,34060,,,,,0,...,,,,,,,,,,demanddata_2001.csv
1,2001-01-01,2,39808,,35370,,,,,0,...,,,,,,,,,,demanddata_2001.csv
2,2001-01-01,3,40039,,35680,,,,,0,...,,,,,,,,,,demanddata_2001.csv
3,2001-01-01,4,39339,,35029,,,,,0,...,,,,,,,,,,demanddata_2001.csv
4,2001-01-01,5,38295,,34047,,,,,0,...,,,,,,,,,,demanddata_2001.csv


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434014 entries, 0 to 434013
Data columns (total 23 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   settlement_date            434014 non-null  object 
 1   settlement_period          434014 non-null  int64  
 2   nd                         434014 non-null  int64  
 3   tsd                        363886 non-null  float64
 4   england_wales_demand       434014 non-null  int64  
 5   embedded_wind_generation   328846 non-null  float64
 6   embedded_wind_capacity     328846 non-null  float64
 7   embedded_solar_generation  293758 non-null  float64
 8   embedded_solar_capacity    293758 non-null  float64
 9   non_bm_stor                434014 non-null  int64  
 10  pump_storage_pumping       434014 non-null  int64  
 11  scottish_transfer          48334 non-null   float64
 12  ifa_flow                   434014 non-null  int64  
 13  ifa2_flow                  29

Unnamed: 0,settlement_date,settlement_period,nd,tsd,england_wales_demand,embedded_wind_generation,embedded_wind_capacity,embedded_solar_generation,embedded_solar_capacity,non_bm_stor,...,ifa2_flow,britned_flow,moyle_flow,east_west_flow,nemo_flow,nsl_flow,eleclink_flow,viking_flow,greenlink_flow,source_file
0,2001-01-01,1,38631,,34060,,,,,0,...,,,,,,,,,,demanddata_2001.csv
1,2002-01-01,1,39544,,35034,,,,,0,...,,,,,,,,,,demanddata_2002.csv
2,2003-01-01,1,36977,,32817,,,,,14,...,,,,,,,,,,demanddata_2003.csv
3,2004-01-01,1,39462,,35168,,,,,0,...,,,,,,,,,,demanddata_2004.csv
4,2005-01-01,1,37313,0.0,33019,,,,,0,...,,,0.0,,,,,,,demanddata_2005.csv
5,2006-01-01,1,38596,39660.0,34982,,,,,0,...,,,-169.0,,,,,,,demanddata_2006.csv
6,2007-01-01,1,35628,36409.0,32120,0.0,0.0,,,0,...,,,16.0,,,,,,,demanddata_2007.csv
7,2008-01-01,1,36164,36847.0,32466,59.0,1163.0,,,0,...,,,-75.0,,,,,,,demanddata_2008.csv
8,01-JAN-2009,1,37910,38704.0,33939,54.0,1403.0,0.0,0.0,0,...,0.0,0.0,-161.0,0.0,0.0,,,,,demanddata_2009.csv
9,01-JAN-2010,1,36453,37593.0,32391,906.0,1786.0,0.0,0.0,0,...,0.0,0.0,-234.0,0.0,0.0,,,,,demanddata_2010.csv


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   settlement_date            25 non-null     object 
 1   settlement_period          25 non-null     int64  
 2   nd                         25 non-null     int64  
 3   tsd                        21 non-null     float64
 4   england_wales_demand       25 non-null     int64  
 5   embedded_wind_generation   19 non-null     float64
 6   embedded_wind_capacity     19 non-null     float64
 7   embedded_solar_generation  17 non-null     float64
 8   embedded_solar_capacity    17 non-null     float64
 9   non_bm_stor                25 non-null     int64  
 10  pump_storage_pumping       25 non-null     int64  
 11  scottish_transfer          3 non-null      float64
 12  ifa_flow                   25 non-null     int64  
 13  ifa2_flow                  17 non-null     float64
 

Above we can see that the date formats used in the CSV files are not always the same. This can lead to issues, as we will see below:

In [194]:
# Column names were lowercased on load, so the date column is always 'settlement_date'
date_col = 'settlement_date'
year = '2008'
n = 10

# Plain string-prefix match on the raw date text. Build the mask once and reuse it for
# both the preview and the total count; na=False keeps it strictly boolean if a date is missing.
year_mask = demand_df[date_col].str.startswith(year, na=False)
demand_year = demand_df[year_mask].head(n)
print(f"\nFirst {n} rows from {year} ({len(demand_year)} shown out of {int(year_mask.sum())} total {year} rows):")
display(demand_year)


First 10 rows from 2008 (10 shown out of 17568 total 2008 rows):


Unnamed: 0,settlement_date,settlement_period,nd,tsd,england_wales_demand,embedded_wind_generation,embedded_wind_capacity,embedded_solar_generation,embedded_solar_capacity,non_bm_stor,...,ifa2_flow,britned_flow,moyle_flow,east_west_flow,nemo_flow,nsl_flow,eleclink_flow,viking_flow,greenlink_flow,source_file
122688,2008-01-01,1,36164,36847.0,32466,59.0,1163.0,,,0,...,,,-75.0,,,,,,,demanddata_2008.csv
122689,2008-01-01,2,36368,37052.0,32689,50.0,1163.0,,,0,...,,,-75.0,,,,,,,demanddata_2008.csv
122690,2008-01-01,3,36138,36822.0,32452,50.0,1163.0,,,0,...,,,-75.0,,,,,,,demanddata_2008.csv
122691,2008-01-01,4,35017,35700.0,31339,29.0,1163.0,,,0,...,,,-75.0,,,,,,,demanddata_2008.csv
122692,2008-01-01,5,33927,34610.0,30297,29.0,1163.0,,,0,...,,,-75.0,,,,,,,demanddata_2008.csv
122693,2008-01-01,6,33276,33974.0,29617,20.0,1163.0,,,0,...,,,-75.0,,,,,,,demanddata_2008.csv
122694,2008-01-01,7,31956,33453.0,28277,20.0,1163.0,,,0,...,,,-75.0,,,,,,,demanddata_2008.csv
122695,2008-01-01,8,30537,32169.0,27054,29.0,1163.0,,,0,...,,,-75.0,,,,,,,demanddata_2008.csv
122696,2008-01-01,9,29346,30972.0,26052,29.0,1163.0,,,0,...,,,-75.0,,,,,,,demanddata_2008.csv
122697,2008-01-01,10,28714,30345.0,25559,33.0,1163.0,,,0,...,,,-75.0,,,,,,,demanddata_2008.csv


In [195]:
# Column names were lowercased on load, so the date column is always 'settlement_date'
date_col = 'settlement_date'
year = '2015'
n = 10

# Plain string-prefix match on the raw date text. Build the mask once and reuse it for
# both the preview and the total count; na=False keeps it strictly boolean if a date is missing.
year_mask = demand_df[date_col].str.startswith(year, na=False)
demand_year = demand_df[year_mask].head(n)
print(f"\nFirst {n} rows from {year} ({len(demand_year)} shown out of {int(year_mask.sum())} total {year} rows):")
display(demand_year)


First 10 rows from 2015 (0 shown out of 0 total 2015 rows):


Unnamed: 0,settlement_date,settlement_period,nd,tsd,england_wales_demand,embedded_wind_generation,embedded_wind_capacity,embedded_solar_generation,embedded_solar_capacity,non_bm_stor,...,ifa2_flow,britned_flow,moyle_flow,east_west_flow,nemo_flow,nsl_flow,eleclink_flow,viking_flow,greenlink_flow,source_file


When showing the records from the year 2008, there is no problem if we look for the records where the date starts with '2008'. If we try to find the records for 2015 in the same way, no records are found because of the different date formatting. This will be solved below in the cleaning part.

Meaning columns:

*Dates*:
- SETTLEMENT_DATE: The date the historic outturn occurred.
- SETTLEMENT_PERIOD: The half-hourly period in which the historic outturn occurred.

*Measures (all in MW)*:
- ND (National Demand): The sum of metered generation (customer demand).
- TSD (Transmission System Demand): Equal to ND + the additional generation required to meet station load, pump storage pumping and interconnector exports.
- ENGLAND_WALES_DEMAND: ND of England and Wales.

*Sustainable generation*:
- EMBEDDED_WIND_GENERATION: Estimate of GB wind generation from wind farms which do NOT have Transmission System metering installed (more wind = less demand on net).
- EMBEDDED_WIND_CAPACITY: Installed embedded wind capacity in GB.
- EMBEDDED_SOLAR_GENERATION: Estimate of GB solar generation from PV panels.
- EMBEDDED_SOLAR_CAPACITY: Installed embedded solar capacity in GB.

*Other operational factors*:
- NON_BM_STOR (Non-Balancing Mechanism Short-Term Operating Reserve): For units not included in ND generator definition (generation or demand reduction).
- PUMP_STORAGE_PUMPING: Additional demand on system due to pumping at hydro pump storage units.

*Regional transfer*:
- SCOTTISH_TRANSFER: Power flow from Scotland to England and Wales. Negative values mean flow from England and Wales to Scotland.

*Interconnector flows (all in MW; -ve = export from GB, +ve = import to GB)*:
- IFA_FLOW: Power flow on the IFA interconnector (GB ↔ France).
- IFA2_FLOW: Power flow on the IFA2 interconnector (GB ↔ France).
- BRITNED_FLOW: Power flow on the BritNed interconnector (GB ↔ Netherlands).
- MOYLE_FLOW: Power flow on the Moyle interconnector (GB ↔ Northern Ireland).
- EAST_WEST_FLOW: Power flow on the East West interconnector (GB ↔ Ireland).
- NEMO_FLOW: Power flow on the Nemo interconnector (GB ↔ Belgium).
- NSL_FLOW: Power flow on the North Sea Link interconnector (GB ↔ Norway).
- ELECLINK_FLOW: Power flow on the ElecLink interconnector (GB ↔ France via the Channel Tunnel).
- VIKING_FLOW: Power flow on the Viking interconnector (GB ↔ Denmark).
- GREENLINK_FLOW: Power flow on the Greenlink interconnector (GB ↔ Ireland).

## Cleaning

**settlement_date**:
The settlement date is stored as yyyy-mm-dd (e.g. 2008-01-01) in the 2001-2008 files, but as dd-MMM-yyyy (e.g. 01-JAN-2009) from 2009 onwards (exceptions are 2023 and 2025). These dates need to be stored in the same format before we can use them.

We'll use a function to clean the settlement_date so they are in the same format and show the headers containing the first record of each csv file to check if the dates are now correctly formatted:

In [196]:
def clean_settlement_date(df, col='settlement_date'):
    """Normalise a mixed-format date column to ``YYYY-MM-DD`` strings.

    Every value is tried against the known layouts in order, and the first
    layout that parses it wins:

    * ``%Y-%m-%d`` (e.g. ``2008-01-01``)
    * ``%d-%b-%y`` (e.g. ``01-JAN-09``)
    * ``%d-%b-%Y`` (e.g. ``01-JAN-2009``)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the date column. It is modified in place: ``df[col]``
        is overwritten with the normalised strings.
    col : str, default 'settlement_date'
        Name of the column to normalise.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Warns
    -----
    UserWarning
        If a non-missing value matches none of the layouts. Such values end up
        as NaN in the result, so the warning is the only sign that dates were lost.
    """
    import warnings

    raw = df[col]

    # Parse with each layout; later layouts only fill the gaps left by earlier ones.
    parsed = None
    for fmt in ('%Y-%m-%d', '%d-%b-%y', '%d-%b-%Y'):
        attempt = pd.to_datetime(raw, format=fmt, errors='coerce')
        parsed = attempt if parsed is None else parsed.fillna(attempt)

    # Values that were present in the input but could not be parsed by any layout.
    unparsed = raw.notna() & parsed.isna()
    if unparsed.any():
        examples = raw[unparsed].astype(str).unique()[:5].tolist()
        warnings.warn(
            f"{int(unparsed.sum())} value(s) in '{col}' could not be parsed "
            f"and were set to NaN; examples: {examples}",
            stacklevel=2,
        )

    df[col] = parsed.dt.strftime('%Y-%m-%d')
    return df

# Trial run on the small per-file sample: after this call every settlement_date
# in headers_demand should read YYYY-MM-DD.
headers_demand = clean_settlement_date(headers_demand, 'settlement_date')

# Show up to 25 rows (one per source file) followed by the column summary
display(headers_demand.iloc[:25])
headers_demand.info()

Unnamed: 0,settlement_date,settlement_period,nd,tsd,england_wales_demand,embedded_wind_generation,embedded_wind_capacity,embedded_solar_generation,embedded_solar_capacity,non_bm_stor,...,ifa2_flow,britned_flow,moyle_flow,east_west_flow,nemo_flow,nsl_flow,eleclink_flow,viking_flow,greenlink_flow,source_file
0,2001-01-01,1,38631,,34060,,,,,0,...,,,,,,,,,,demanddata_2001.csv
1,2002-01-01,1,39544,,35034,,,,,0,...,,,,,,,,,,demanddata_2002.csv
2,2003-01-01,1,36977,,32817,,,,,14,...,,,,,,,,,,demanddata_2003.csv
3,2004-01-01,1,39462,,35168,,,,,0,...,,,,,,,,,,demanddata_2004.csv
4,2005-01-01,1,37313,0.0,33019,,,,,0,...,,,0.0,,,,,,,demanddata_2005.csv
5,2006-01-01,1,38596,39660.0,34982,,,,,0,...,,,-169.0,,,,,,,demanddata_2006.csv
6,2007-01-01,1,35628,36409.0,32120,0.0,0.0,,,0,...,,,16.0,,,,,,,demanddata_2007.csv
7,2008-01-01,1,36164,36847.0,32466,59.0,1163.0,,,0,...,,,-75.0,,,,,,,demanddata_2008.csv
8,2009-01-01,1,37910,38704.0,33939,54.0,1403.0,0.0,0.0,0,...,0.0,0.0,-161.0,0.0,0.0,,,,,demanddata_2009.csv
9,2010-01-01,1,36453,37593.0,32391,906.0,1786.0,0.0,0.0,0,...,0.0,0.0,-234.0,0.0,0.0,,,,,demanddata_2010.csv


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   settlement_date            25 non-null     object 
 1   settlement_period          25 non-null     int64  
 2   nd                         25 non-null     int64  
 3   tsd                        21 non-null     float64
 4   england_wales_demand       25 non-null     int64  
 5   embedded_wind_generation   19 non-null     float64
 6   embedded_wind_capacity     19 non-null     float64
 7   embedded_solar_generation  17 non-null     float64
 8   embedded_solar_capacity    17 non-null     float64
 9   non_bm_stor                25 non-null     int64  
 10  pump_storage_pumping       25 non-null     int64  
 11  scottish_transfer          3 non-null      float64
 12  ifa_flow                   25 non-null     int64  
 13  ifa2_flow                  17 non-null     float64
 

As we can see above, the dates now all use the same format ('%Y-%m-%d'). We only applied the function to the headers to quickly confirm that it works. Now we can apply the function to the entire demand_df:

In [197]:
# Normalise the dates of the full dataset (the helper also modifies the frame it is given)
demand_df = clean_settlement_date(demand_df, col='settlement_date')

# Guard against silent data loss: a value that matches none of the known date
# formats comes back as NaN, which the string-based year filters would then miss.
n_unparsed = int(demand_df['settlement_date'].isna().sum())
assert n_unparsed == 0, f'{n_unparsed} settlement_date values could not be parsed'

display(demand_df.head())
demand_df.info()

Unnamed: 0,settlement_date,settlement_period,nd,tsd,england_wales_demand,embedded_wind_generation,embedded_wind_capacity,embedded_solar_generation,embedded_solar_capacity,non_bm_stor,...,ifa2_flow,britned_flow,moyle_flow,east_west_flow,nemo_flow,nsl_flow,eleclink_flow,viking_flow,greenlink_flow,source_file
0,2001-01-01,1,38631,,34060,,,,,0,...,,,,,,,,,,demanddata_2001.csv
1,2001-01-01,2,39808,,35370,,,,,0,...,,,,,,,,,,demanddata_2001.csv
2,2001-01-01,3,40039,,35680,,,,,0,...,,,,,,,,,,demanddata_2001.csv
3,2001-01-01,4,39339,,35029,,,,,0,...,,,,,,,,,,demanddata_2001.csv
4,2001-01-01,5,38295,,34047,,,,,0,...,,,,,,,,,,demanddata_2001.csv


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434014 entries, 0 to 434013
Data columns (total 23 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   settlement_date            434014 non-null  object 
 1   settlement_period          434014 non-null  int64  
 2   nd                         434014 non-null  int64  
 3   tsd                        363886 non-null  float64
 4   england_wales_demand       434014 non-null  int64  
 5   embedded_wind_generation   328846 non-null  float64
 6   embedded_wind_capacity     328846 non-null  float64
 7   embedded_solar_generation  293758 non-null  float64
 8   embedded_solar_capacity    293758 non-null  float64
 9   non_bm_stor                434014 non-null  int64  
 10  pump_storage_pumping       434014 non-null  int64  
 11  scottish_transfer          48334 non-null   float64
 12  ifa_flow                   434014 non-null  int64  
 13  ifa2_flow                  29

If we now try to find the records from 2008 or 2015 in the same way, this should work without problems:

In [198]:
# Column names were lowercased on load, so the date column is always 'settlement_date'
date_col = 'settlement_date'
year = '2008'
n = 10

# The cleaned dates are 'YYYY-MM-DD' strings, so a prefix match selects a whole year.
# Build the mask once and reuse it for both the preview and the total count;
# na=False keeps it strictly boolean if a date is missing.
year_mask = demand_df[date_col].str.startswith(year, na=False)
demand_year = demand_df[year_mask].head(n)
print(f"\nFirst {n} rows from {year} ({len(demand_year)} shown out of {int(year_mask.sum())} total {year} rows):")
display(demand_year)


First 10 rows from 2008 (10 shown out of 17568 total 2008 rows):


Unnamed: 0,settlement_date,settlement_period,nd,tsd,england_wales_demand,embedded_wind_generation,embedded_wind_capacity,embedded_solar_generation,embedded_solar_capacity,non_bm_stor,...,ifa2_flow,britned_flow,moyle_flow,east_west_flow,nemo_flow,nsl_flow,eleclink_flow,viking_flow,greenlink_flow,source_file
122688,2008-01-01,1,36164,36847.0,32466,59.0,1163.0,,,0,...,,,-75.0,,,,,,,demanddata_2008.csv
122689,2008-01-01,2,36368,37052.0,32689,50.0,1163.0,,,0,...,,,-75.0,,,,,,,demanddata_2008.csv
122690,2008-01-01,3,36138,36822.0,32452,50.0,1163.0,,,0,...,,,-75.0,,,,,,,demanddata_2008.csv
122691,2008-01-01,4,35017,35700.0,31339,29.0,1163.0,,,0,...,,,-75.0,,,,,,,demanddata_2008.csv
122692,2008-01-01,5,33927,34610.0,30297,29.0,1163.0,,,0,...,,,-75.0,,,,,,,demanddata_2008.csv
122693,2008-01-01,6,33276,33974.0,29617,20.0,1163.0,,,0,...,,,-75.0,,,,,,,demanddata_2008.csv
122694,2008-01-01,7,31956,33453.0,28277,20.0,1163.0,,,0,...,,,-75.0,,,,,,,demanddata_2008.csv
122695,2008-01-01,8,30537,32169.0,27054,29.0,1163.0,,,0,...,,,-75.0,,,,,,,demanddata_2008.csv
122696,2008-01-01,9,29346,30972.0,26052,29.0,1163.0,,,0,...,,,-75.0,,,,,,,demanddata_2008.csv
122697,2008-01-01,10,28714,30345.0,25559,33.0,1163.0,,,0,...,,,-75.0,,,,,,,demanddata_2008.csv


In [199]:
# Column names were lowercased on load, so the date column is always 'settlement_date'
date_col = 'settlement_date'
year = '2015'
n = 10

# The cleaned dates are 'YYYY-MM-DD' strings, so a prefix match selects a whole year.
# Build the mask once and reuse it for both the preview and the total count;
# na=False keeps it strictly boolean if a date is missing.
year_mask = demand_df[date_col].str.startswith(year, na=False)
demand_year = demand_df[year_mask].head(n)
print(f"\nFirst {n} rows from {year} ({len(demand_year)} shown out of {int(year_mask.sum())} total {year} rows):")
display(demand_year)


First 10 rows from 2015 (10 shown out of 17520 total 2015 rows):


Unnamed: 0,settlement_date,settlement_period,nd,tsd,england_wales_demand,embedded_wind_generation,embedded_wind_capacity,embedded_solar_generation,embedded_solar_capacity,non_bm_stor,...,ifa2_flow,britned_flow,moyle_flow,east_west_flow,nemo_flow,nsl_flow,eleclink_flow,viking_flow,greenlink_flow,source_file
245424,2015-01-01,1,28726,29490.0,26351,2211.0,4039.0,0.0,5994.0,0,...,0.0,854.0,-25.0,-121.0,0.0,,,,,demanddata_2015.csv
245425,2015-01-01,2,29391,30135.0,27001,2177.0,4039.0,0.0,5994.0,0,...,0.0,903.0,3.0,-121.0,0.0,,,,,demanddata_2015.csv
245426,2015-01-01,3,29231,30448.0,26874,2224.0,4039.0,0.0,5994.0,0,...,0.0,888.0,-53.0,-121.0,0.0,,,,,demanddata_2015.csv
245427,2015-01-01,4,28095,29381.0,25824,2270.0,4039.0,0.0,5994.0,0,...,0.0,864.0,-40.0,-56.0,0.0,,,,,demanddata_2015.csv
245428,2015-01-01,5,26896,28667.0,24752,2246.0,4039.0,0.0,5994.0,0,...,0.0,881.0,-82.0,-7.0,0.0,,,,,demanddata_2015.csv
245429,2015-01-01,6,26355,28175.0,24236,2222.0,4039.0,0.0,5994.0,0,...,0.0,876.0,-82.0,-6.0,0.0,,,,,demanddata_2015.csv
245430,2015-01-01,7,25319,27248.0,23239,2240.0,4039.0,0.0,5994.0,0,...,0.0,877.0,-90.0,35.0,0.0,,,,,demanddata_2015.csv
245431,2015-01-01,8,24237,26366.0,22270,2259.0,4039.0,0.0,5994.0,0,...,0.0,876.0,-150.0,113.0,0.0,,,,,demanddata_2015.csv
245432,2015-01-01,9,23388,26473.0,21543,2160.0,4039.0,0.0,5994.0,0,...,0.0,877.0,-151.0,211.0,0.0,,,,,demanddata_2015.csv
245433,2015-01-01,10,23035,26221.0,21249,2060.0,4039.0,0.0,5994.0,0,...,0.0,876.0,-151.0,269.0,0.0,,,,,demanddata_2015.csv


After the dates are cleaned, we can easily find the records for any year in the same way. We can now export the cleaned data for use in other notebooks (note that the file is written gzip-compressed, so it must be read with compression='gzip'):

In [203]:
# source_file was only needed for traceability while loading; leave it out of the export.
# errors='ignore' keeps this cell re-runnable: without it, a second run raises KeyError
# because the column has already been dropped.
demand_df = demand_df.drop(columns=['source_file'], errors='ignore')

# NOTE: the output is gzip-compressed although its name ends in .csv, so readers must pass
# compression='gzip' to pd.read_csv (extension-based inference will not detect it).
demand_df.to_csv("../Data/demand_prices/cleaned_data_neso_demand_2001_2025.csv", index=False, compression='gzip')