# Power Plant Annual Output

<br>

### Imports

In [1]:
import numpy as np
import pandas as pd

import dask
import dask.dataframe as dd

import os
from tqdm import tqdm

<br>

### User Inputs

In [2]:
# Input locations.
#   static_data_dir : root directory; stream CSVs are read from
#                     <static_data_dir>/<source_name>/<stream>/
#   powerdict_fp    : path to the power station dictionary's ids.csv
# Each can be overridden with an environment variable so the notebook can run on
# another machine without editing this cell; the defaults are the original paths.
static_data_dir = os.environ.get('STATIC_DATA_DIR', 'C:/Users/Ayrto/Desktop/Freelance Work/FEA/work/data-hub/data/static')
powerdict_fp = os.environ.get('POWERDICT_FP', 'C:/Users/Ayrto/Desktop/Side Projects/Power-Station-Dictionary/data/dictionary/ids.csv')

In [3]:
@dask.delayed
def read_B1610_file(filename, columns):
    """Read one B1610 CSV file and conform it to a fixed column layout.

    The function is wrapped in ``dask.delayed``, so calling it returns a delayed
    object; the file is only read when that object is computed.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    columns : list of str
        Column labels the returned frame must have, in this order.

    Returns
    -------
    pandas.DataFrame
        The file's data with exactly ``columns`` as its columns. Labels in
        ``columns`` that the file lacks become all-NaN columns; columns in the
        file that are not listed are dropped.
    """
    df_B1610_week = pd.read_csv(filename)

    # A single reindex adds the absent columns (NaN-filled) and selects/orders
    # the rest. This avoids `np.NaN` (removed in NumPy 2.0) and the column-by-
    # column insertion that made pandas emit warnings.
    df_B1610_week = df_B1610_week.reindex(columns=list(columns))

    return df_B1610_week

def get_B1610_columns(static_dir, source_name='bmrs', stream='B1610'):
    """Collect the union of column labels across all CSV files of a stream.

    Parameters
    ----------
    static_dir : str
        Root data directory; files are looked up in
        ``{static_dir}/{source_name}/{stream}``.
    source_name : str, default 'bmrs'
        Sub-directory of ``static_dir``.
    stream : str, default 'B1610'
        Sub-directory of ``source_name`` holding the CSV files.

    Returns
    -------
    list of str
        ``'datetime'`` first, followed by every other column label found in any
        file, sorted. ``'datetime'`` is always included, even if no file has it.
    """
    stream_dir = f'{static_dir}/{source_name}/{stream}'
    B1610_files = [f for f in os.listdir(stream_dir) if '.csv' in f]

    columns = set()

    for B1610_file in tqdm(B1610_files):
        # Only the labels are needed, so parse just the header row (nrows=0)
        # instead of loading every file in full.
        header = pd.read_csv(f'{stream_dir}/{B1610_file}', nrows=0)
        columns.update(header.columns)

    columns = ['datetime'] + sorted(columns - {'datetime'})

    return columns

def load_B1610_dask_stream_df(static_dir, source_name, stream, dt_col='datetime'):
    """Build a lazy dask DataFrame from every CSV file of a stream.

    Parameters
    ----------
    static_dir : str
        Root data directory; files are read from
        ``{static_dir}/{source_name}/{stream}``.
    source_name : str
        Sub-directory of ``static_dir``.
    stream : str
        Sub-directory of ``source_name`` holding the CSV files.
    dt_col : str or None, default 'datetime'
        Column to parse as UTC datetimes and use as the index. Pass ``None`` to
        leave the columns and index untouched.

    Returns
    -------
    dask.dataframe.DataFrame
        One partition per file, all sharing the column layout returned by
        ``get_B1610_columns``. When ``dt_col`` is given it becomes the index;
        values not matching ``'%Y-%m-%d %H:%M:%S'`` are coerced to NaT.
    """
    stream_dir = f'{static_dir}/{source_name}/{stream}'

    # Identifying columns -- scan the same source/stream that is loaded below
    # (previously the defaults were always used, whatever the caller passed)
    columns = get_B1610_columns(static_dir, source_name=source_name, stream=stream)

    # Loading data
    B1610_files = [f for f in os.listdir(stream_dir) if '.csv' in f]
    df_B1610 = dd.from_delayed([read_B1610_file(f'{stream_dir}/{B1610_file}', columns) for B1610_file in B1610_files])

    # Formatting date index
    if dt_col is not None:
        # Parse a whole partition per call rather than one value at a time
        df_B1610[dt_col] = df_B1610[dt_col].map_partitions(
            pd.to_datetime,
            format='%Y-%m-%d %H:%M:%S',
            errors='coerce',
            utc=True,
            meta=(dt_col, 'datetime64[ns, UTC]'),
        )
        df_B1610 = df_B1610.set_index(dt_col)

    return df_B1610

In [4]:
# Build the lazy dask frame for the chosen source/stream and preview its first rows
source_name, stream = 'bmrs', 'B1610'

df_B1610 = load_B1610_dask_stream_df(
    static_data_dir,
    source_name=source_name,
    stream=stream,
)

df_B1610.head(3)

100%|████████████████████████████████████████████████████████████████████████████████| 288/288 [00:04<00:00, 61.76it/s]
  self[col] = value
  self[col] = value


Unnamed: 0_level_0,ABRBO-1,ABRTW-1,ABTH7,ABTH7G,ABTH8,ABTH8G,ABTH9,ABTH9G,ACHRW-1,AKGLW-2,...,WILCT-1,WLNYO-2,WLNYO-3,WLNYO-4,WLNYW-1,WTMSO-1,WYLF-1,WYLF-2,WYLF-3,WYLF-4
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-10-04 23:00:00+00:00,,,421.692,,0.0,,,,,,...,,132.9,,,95.602,9.858,227.38,212.41,,
2015-10-04 23:30:00+00:00,,,425.096,,0.0,,,,,,...,,143.02,,,107.324,14.89,227.47,212.53,,
2015-10-05 00:00:00+00:00,,,423.292,,0.0,,,,,,...,,158.98,,,118.342,23.074,227.666,212.72,,


In [5]:
# Power station dictionary: its ngc_bmu_id column is matched against the
# B1610 column labels further down
df_powerdict = pd.read_csv(filepath_or_buffer=powerdict_fp)

df_powerdict.head(n=3)

Unnamed: 0,osuked_id,gppd_idnr,esail_id,name,sett_bmu_id,ngc_bmu_id,4c_offshore_id,windpowernet_id,wikidata_id,wikipedia_id,power_technology_id,eutl_id
0,10000,,MARK,Rothes Bio-Plant CHP,"E_MARK-1, E_MARK-2","MARK-1, MARK-2",,,,,,
1,10001,"GBR1000377, GBR1000369",DIDC,Didcot,"T_DIDC1, T_DIDC2, T_DIDC4, T_DIDC3, T_DIDC1G, ...","DIDC1, DIDC2, DIDC4, DIDC3, DIDC1G, DIDC2G, DI...",,,,,,97165.0
2,10002,"GBR1000374, GBR1000375",ABTH,Aberthaw B,"T_ABTH7, T_ABTH8, T_ABTH9, T_ABTH7G, T_ABTH8G,...","ABTH7, ABTH8, ABTH9, ABTH7G, ABTH8G, ABTH9G",,,,,,97175.0


In [6]:
def flatten_list(list_):
    """Flatten one level of nesting: an iterable of iterables becomes one list."""
    flattened = []
    for sublist in list_:
        flattened.extend(sublist)
    return flattened

# Each non-null ngc_bmu_id cell holds one or more IDs separated by ', ';
# split them, pool them and keep the sorted unique values
powerdict_ngc_bmu_ids = sorted(set(flatten_list(
    df_powerdict['ngc_bmu_id']
    .dropna()
    .str.split(', ')
    .to_list()
)))

len(powerdict_ngc_bmu_ids)

444

In [7]:
# IDs that appear as columns in the B1610 data
B1610_ngc_bmu_ids = sorted(df_B1610.columns)

# ... and the subset of them that the dictionary's ngc_bmu_id field does not list
missing_ngc_bmu_ids_from_powerdict = sorted(
    set(B1610_ngc_bmu_ids).difference(powerdict_ngc_bmu_ids)
)

missing_ngc_bmu_ids_from_powerdict

['GRAI1G', 'GRAI4G', 'KNLCV-1', 'LCSMH-1', 'SEEL-1', 'WILCT-1']

In [8]:
%%time

common_ngc_bmu_ids = sorted(list(set(B1610_ngc_bmu_ids).intersection(set(powerdict_ngc_bmu_ids))))

start_dt = pd.to_datetime('2016', utc=True)
end_dt = pd.to_datetime('2020-12-31-23:30', utc=True)

df_B1610_annual_totals = (df_B1610
                          .loc[start_dt:end_dt, common_ngc_bmu_ids]
                          .compute()
                          .resample('Y')
                          .sum())

df_B1610_annual_totals.index = df_B1610_annual_totals.index.year

df_B1610_annual_totals

  self[col] = value


Wall time: 30.7 s


Unnamed: 0_level_0,ABRBO-1,ABRTW-1,ABTH7,ABTH7G,ABTH8,ABTH8G,ABTH9,ABTH9G,ACHRW-1,AKGLW-2,...,WHILW-2,WLNYO-2,WLNYO-3,WLNYO-4,WLNYW-1,WTMSO-1,WYLF-1,WYLF-2,WYLF-3,WYLF-4
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016,0.0,0.0,4773455.212,60.988,4072688.248,54.616,4953762.588,60.488,0.0,0.0,...,568804.272,1020563.502,0.0,0.0,1212042.748,1661921.95,0.0,0.0,0.0,0.0
2017,0.0,0.0,1781808.064,0.0,1870113.436,10.62,1755050.164,49.08,0.0,0.0,...,574340.278,1598147.57,323384.332,0.066,1401498.666,1834894.012,0.0,0.0,0.0,0.0
2018,0.0,0.0,346680.444,250.912,355742.248,154.848,358133.512,163.14,0.0,0.0,...,656634.394,1439868.308,2487479.998,1786502.736,1217166.504,1746317.078,0.0,0.0,18.218,50.746
2019,580494.43,175057.484,297628.44,108.912,267689.204,103.116,871648.0,124.028,205695.972,474725.21,...,638690.354,1525665.418,2789203.376,2624407.566,1247539.556,1752546.228,27.602,32.362,0.0,0.0
2020,625901.122,194075.514,0.0,49.428,0.0,37.264,0.0,52.824,217333.422,434032.238,...,658479.61,1508376.002,2855488.434,2181718.642,1329573.086,1904411.46,0.0,0.0,0.0,0.0


In [9]:
# Reshape the year x ID table into one row per (ngc_bmu_id, year).
# Naming both axes up front lets reset_index produce the final column names
# directly, with no rename afterwards.
df_B1610_annual_totals_long = (
    df_B1610_annual_totals
    .divide(2) # convert MW to MWh
    .rename_axis(index='year', columns='ngc_bmu_id')
    .unstack()
    .reset_index(name='output_MWh')
)

df_B1610_annual_totals_long.head()

Unnamed: 0,ngc_bmu_id,year,output_MWh
0,ABRBO-1,2016,0.0
1,ABRBO-1,2017,0.0
2,ABRBO-1,2018,0.0
3,ABRBO-1,2019,290247.215
4,ABRBO-1,2020,312950.561


In [10]:
output_fp = '../datasets/annual-output/annual-output.csv'

# Output layout written to output_fp:
#   'long' -> one row per (ngc_bmu_id, year) with an output_MWh column
#   'wide' -> one row per ngc_bmu_id with one column per year
#   any other value -> nothing is written
save = 'long'

if save in ('long', 'wide'):
    # to_csv does not create missing parent directories, so make sure the
    # target directory exists before writing
    output_dir = os.path.dirname(output_fp)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

if save == 'long':
    df_B1610_annual_totals_long.to_csv(output_fp, index=False)

elif save == 'wide':
    (df_B1610_annual_totals
     .divide(2) # convert MW to MWh, so both layouts are written in the same units
     .T
     .reset_index()
     .rename(columns={'index': 'ngc_bmu_id'})
     .to_csv(output_fp, index=False)
    )
else:
    # Deliberate no-op: an unrecognised `save` value skips writing
    pass