# Defining Outputs & Inputs

<br>

### Imports

In [1]:
import json
import pandas as pd

import os
from typing import Any

In [2]:
from IPython.display import JSON

<br>

### User Inputs

In [3]:
# Directory the raw source datasets (e.g. ESAIL.csv) are read from
raw_data_dir = '../data/raw'
# Directory the JSON definition files are written to
definitions_dir = '../data/definitions'

<br>

### Output Definitions

Before we collate the data sources into a single dataset, we first want to define the attributes we want to have in our outputs. Categorical attributes list their allowed values; for attributes that are not categorical we specify the expected data type instead (e.g. `'float'` or `'str'`).

In [4]:
# Output attribute definitions: categorical attributes carry the list of
# values they may take, the remaining attributes carry a type-name string.
outputs = dict(
    fuel_type=[
        'gas', 'coal', 'wind', 'solar', 'oil', 'hydro', 'nuclear', 'biomass',
        'other',  # e.g. for batteries or aggregators
    ],
    plant_type=[
        'ccgt', 'ocgt', 'coal',
        'onshore_wind', 'offshore_wind', 'floating_wind',
        'conc_solar', 'pv_solar',
        'oil',
        'run_of_river', 'pumped_storage',
        'nuclear', 'aggregator', 'battery', 'biomass',
        'other',  # ideally no plants should come under this, the preference is to create a new category
    ],
    capacity='float',
    sett_bmu_id='str',
    longitude='float',
    latitude='float',
)

JSON(outputs)

<IPython.core.display.JSON object>

<br>

We'll save this to the definitions directory

In [5]:
# Write the output definitions to `outputs.json` inside the definitions directory
outputs_path = f'{definitions_dir}/outputs.json'

with open(outputs_path, 'w') as f:
    json.dump(outputs, f)

<br>

### Source Definitions

#### ESAIL

We'll start by loading the dataset in

In [6]:
# Name of the raw ESAIL file (also recorded later in the source definition)
filename = 'ESAIL.csv'

# Read the file from the raw data directory and preview the first rows
esail_path = f'{raw_data_dir}/{filename}'
df_ESAIL = pd.read_csv(esail_path)

df_ESAIL.head()

Unnamed: 0,sett_bmu_id,ngc_bmu_id,bmu_root,name,primary_fuel_type,detailed_fuel_type,longitude,latitude
0,E_MARK-1,MARK-1,MARK,Rothes Bio-Plant CHP 1,biomass,bone,-3.603516,57.480403
1,E_MARK-2,MARK-2,MARK,Rothes Bio-Plant CHP 2,biomass,bone,-3.603516,57.480403
2,T_DIDC1,DIDC1,DIDC,Didcot A (G) 1,coal,coalgas_opt_out,-1.26757,51.62363
3,T_DIDC2,DIDC2,DIDC,Didcot A (G) 2,coal,coalgas_opt_out,-1.26757,51.62363
4,T_DIDC4,DIDC4,DIDC,Didcot A (G) 4,coal,coalgas_opt_out,-1.26757,51.62363


<br>

We'll then define the key column, as well as the mapping from the OSUKED key to the ESAIL key

In [7]:
def check_key_input_col(df, key_input_col):
    """Assert that the key column of a source dataset has no missing values.

    Intended for use inside a ``.pipe`` chain, so the dataframe is handed
    back unchanged when the check passes.

    Parameters:
        df: dataframe containing the key column
        key_input_col: name of the column holding the key

    Returns:
        The same dataframe object that was passed in.

    Raises:
        AssertionError: if any value in `key_input_col` is null.
    """
    # The `assert` keyword is essential here: a bare `condition, message`
    # expression only builds a tuple and never performs the check.
    assert df[key_input_col].isnull().sum() == 0, f'{key_input_col} can not contain missing values'
    return df

# Input/output key column names, both recorded in the ESAIL source definition
key_input_col = 'bmu_root'
key_output_col = 'bmu_root'

# Unique ESAIL keys in order of first appearance, taken after the key column check
unique_keys = (
    check_key_input_col(df_ESAIL, key_input_col)[key_input_col]
    .drop_duplicates()
)

# Number the unique keys consecutively from 10000: {10000: <first key>, 10001: <second key>, ...}
key_map = {10000 + position: key for position, key in enumerate(unique_keys)}

JSON([key_map])

<IPython.core.display.JSON object>

<br>

We'll also define how we want to extract data from the attribute columns. Specifically, we provide the name of the new output column, the rank of the source with regard to this column, and the value mapping necessary for that column's content to match the desired output.

In [8]:
# How each ESAIL attribute column feeds the outputs: the output column it is
# written to, its rank, and an optional mapping of its values.
attr_cols = {
    'primary_fuel_type': {
        'output_col': 'fuel_type',
        'output_rank': 0,  # rank to determine which input to use when multiple are provided, 0 is highest
        'value_map': dict(
            wind='wind',
            gas='gas',
            coal='coal',
            fuel_oil='oil',
            nuclear='nuclear',
            run_of_river='hydro',
            pumped_storage='hydro',
            aggregator='other',
            other='other',
            rgt='gas',
            biomass='biomass',
            battery='other',
        ),
    },
    # These columns keep their name, take rank 0 and need no value mapping
    **{
        passthrough_col: {
            'output_col': passthrough_col,
            'output_rank': 0,
            'value_map': None,
        }
        for passthrough_col in ('sett_bmu_id', 'longitude', 'latitude')
    },
}

<br>

We can now combine this into a single metadata object for the source

In [9]:
# Bundle everything defined for the ESAIL source into one metadata object
ESAIL_def = dict(
    filename=filename,
    key_input_col=key_input_col,
    key_output_col=key_output_col,
    key_map=key_map,
    attr_cols=attr_cols,
)

JSON(ESAIL_def)

<IPython.core.display.JSON object>

<br>

We'll save this source definition before moving on

In [10]:
# Write the ESAIL source definition to `ESAIL.json` inside the definitions directory
esail_def_path = f'{definitions_dir}/ESAIL.json'

with open(esail_def_path, 'w') as f:
    json.dump(ESAIL_def, f)

<br>

#### GPPDB

In [11]:
# TODO: add the GPPDB source definition

In [None]:
#hide
# Import only the function that is called instead of every name in `nbdev.export`
from nbdev.export import notebook2script

# Run nbdev's notebook-to-module export
notebook2script()