# Database Construction

<br>

### Imports

In [1]:
#exports
import json
import pandas as pd

import os

In [2]:
from IPython.display import JSON

<br>

### User Inputs

In [35]:
# Directory holding the raw source CSV files read by `get_primary_key_to_attr`
raw_data_dir = '../data/raw'
# Directory holding the per-source definition JSON files and `outputs.json`
definitions_dir = '../data/definitions'
# Directory the combined `power_stations.csv` is written to
intermediate_data_dir = '../data/intermediate'

<br>

### Constructing the Database

We'll begin by loading and combining the definitions of each source.

In [14]:
#exports
def load_source_definitions(definitions_dir):
    """Load every source definition JSON file found in `definitions_dir`.

    Only files whose name ends in '.json' are read, and 'outputs.json' is
    skipped because it is not a source definition. Files are processed in
    sorted order so the returned dictionary has a deterministic ordering
    (`os.listdir` makes no ordering guarantee).

    Parameters
    ----------
    definitions_dir : str
        Path of the directory containing the definition files.

    Returns
    -------
    dict
        Maps the source name (the filename without its '.json' suffix) to
        the parsed JSON content of that file.
    """
    json_suffix = '.json'
    source_definitions = dict()

    # `endswith` rather than a substring test, so that files such as
    # 'source.json.bak' or 'source.jsonl' are not mistaken for definitions
    source_def_filenames = sorted(
        filename
        for filename in os.listdir(definitions_dir)
        if filename.endswith(json_suffix) and filename != 'outputs.json'
    )

    for filename in source_def_filenames:
        # Strip only the trailing suffix, leaving any earlier '.json' in the name intact
        source_name = filename[:-len(json_suffix)]

        with open(os.path.join(definitions_dir, filename), 'r') as f:
            source_definitions[source_name] = json.load(f)

    return source_definitions

In [15]:
# Load every source definition and display the combined dictionary
source_definitions = load_source_definitions(definitions_dir)
        
JSON(source_definitions)

<IPython.core.display.JSON object>

In [16]:
#exports
def identify_primary_keys(source_definitions):
    """Return the sorted, de-duplicated primary keys used across all sources.

    The primary keys are the keys of each source definition's 'key_map'.
    """
    unique_keys = set()

    for definition in source_definitions.values():
        unique_keys.update(definition['key_map'].keys())

    return sorted(unique_keys)

In [17]:
# Collect the primary keys from every source and preview the first five
primary_keys = identify_primary_keys(source_definitions)
    
primary_keys[:5]

['10000', '10001', '10002', '10003', '10004']

In [20]:
#exports
def check_source_for_disallowed_cols(attr_cols, outputs):
    """Verify that a source only writes to permitted output columns.

    Parameters
    ----------
    attr_cols : dict
        Mapping whose values are dicts containing an 'output_col' entry.
    outputs : dict
        Mapping whose keys are the permitted output column names.

    Raises
    ------
    AssertionError
        If any 'output_col' in `attr_cols` is not a key of `outputs`. The
        offending columns are listed in sorted order in the message.
    """
    output_cols_to_be_added = {attr_col['output_col'] for attr_col in attr_cols.values()}
    output_cols_allowed = set(outputs.keys())

    # Sorted so the error message is deterministic (set ordering is not)
    disallowed_output_cols = sorted(output_cols_to_be_added - output_cols_allowed, key=str)

    # Raised explicitly rather than via `assert`, which is stripped when Python
    # runs with -O and would silently disable this validation
    if disallowed_output_cols:
        raise AssertionError(f"The following columns are not allowed in the output dataset: {', '.join(map(str, disallowed_output_cols))}")
    
def check_sources_for_disallowed_cols(source_definitions, definitions_dir):
    """Check every source's attribute columns against the allowed outputs.

    The allowed outputs are read from 'outputs.json' inside `definitions_dir`;
    each definition's 'attr_cols' is then passed to
    `check_source_for_disallowed_cols`, which raises if a column is not allowed.
    """
    outputs_fp = f'{definitions_dir}/outputs.json'

    with open(outputs_fp, 'r') as outputs_file:
        allowed_outputs = json.load(outputs_file)

    for definition in source_definitions.values():
        check_source_for_disallowed_cols(definition['attr_cols'], allowed_outputs)

In [21]:
# Raises an AssertionError if any source maps to an output column that is not a key of outputs.json
check_sources_for_disallowed_cols(source_definitions, definitions_dir)

In [22]:
#exports
def identify_inputs_for_output_cols(source_definitions):
    """Map each output column to the (source, input_col) pairs that feed it.

    Parameters
    ----------
    source_definitions : dict
        Maps source name to its definition. Each definition's 'attr_cols'
        maps an input column name to a dict with 'output_col' and
        'output_rank' entries.

    Returns
    -------
    dict
        Maps each output column to a list of (source, input_col) tuples
        ordered by descending 'output_rank'. `add_attr_cols` applies the
        inputs in list order and later inputs overwrite earlier ones, so the
        input with the smallest rank value is applied last.
    """
    output_cols = [
        (source, input_col, attrs['output_col'], attrs['output_rank'])
        for source, definition in source_definitions.items()
        for input_col, attrs in definition['attr_cols'].items()
    ]

    df_output_cols = pd.DataFrame(output_cols, columns=['source', 'input_col', 'output_col', 'output_rank'])
    inputs_for_output_cols = dict()

    # `unique` keeps first-appearance order and avoids recomputing the same
    # output column once per contributing input
    for output_col in df_output_cols['output_col'].unique():
        # Filter on the loop variable. The previous query string compared the
        # column with itself, so every output column was assigned every input.
        df_inputs = (df_output_cols
                     .loc[df_output_cols['output_col'] == output_col]
                     .sort_values('output_rank', ascending=False, kind='stable')
                    )

        inputs_for_output_cols[output_col] = list(zip(df_inputs['source'], df_inputs['input_col']))

    return inputs_for_output_cols

In [23]:
# Work out which (source, input column) pairs feed each output column and display the mapping
inputs_for_output_cols = identify_inputs_for_output_cols(source_definitions)

JSON(inputs_for_output_cols)

<IPython.core.display.JSON object>

In [24]:
#exports
def get_primary_key_to_attr(source, input_col, source_definitions, raw_data_dir):
    """Build a primary key -> attribute value mapping for one source column.

    The source's CSV (its 'filename' inside `raw_data_dir`) is read, its
    'key_input_col' is asserted to be unique, and each value of the source's
    'key_map' is then looked up in that column to fetch the matching
    `input_col` value.

    Returns a dict keyed by the keys of 'key_map'.
    """
    definition = source_definitions[source]
    filename = definition['filename']
    key_input_col = definition['key_input_col']
    key_map = definition['key_map']

    df_input = pd.read_csv(f"{raw_data_dir}/{filename}")

    # Every row must carry a distinct key, otherwise the lookup below is ambiguous
    unique_key_ratio = df_input[key_input_col].nunique() / len(df_input)
    assert unique_key_ratio == 1, f'The key column: {key_input_col}, is not unique for: {filename}'

    s_source_key_to_attr = df_input.set_index(key_input_col)[input_col]
    s_primary_key_to_attr = pd.Series(key_map).map(s_source_key_to_attr)

    return s_primary_key_to_attr.to_dict()

In [25]:
# Worked example: look up a single input column from a single source
source = 'ESAIL'
input_col = 'primary_fuel_type'

primary_key_to_attr = get_primary_key_to_attr(source, input_col, source_definitions, raw_data_dir)

JSON([primary_key_to_attr])

<IPython.core.display.JSON object>

In [None]:
#exports
def update_df_col(df, update_col, update_dict):
    """Write the values of `update_dict` into `update_col`, matched on the index.

    Index labels that have no entry in `update_dict` (or map to a null value)
    are left untouched. `df` is modified in place and also returned.
    """
    mapped_values = df.index.map(update_dict)
    s_non_null = pd.Series(mapped_values, df.index).dropna()

    # Only the rows with a mapped value are assigned, so existing values elsewhere survive
    df.loc[s_non_null.index, update_col] = s_non_null.values

    return df

def add_attr_cols(df, source_definitions, raw_data_dir):
    """Populate the attribute output columns of `df` from every source.

    For each output column, its inputs are applied in the order given by
    `identify_inputs_for_output_cols`; each input only writes the rows it has
    a non-null value for, so later inputs overwrite earlier ones on those rows.
    """
    output_col_to_inputs = identify_inputs_for_output_cols(source_definitions)

    for output_col in output_col_to_inputs:
        for source, input_col in output_col_to_inputs[output_col]:
            attr_lookup = get_primary_key_to_attr(source, input_col, source_definitions, raw_data_dir)
            df = update_df_col(df, output_col, attr_lookup)

    return df

In [30]:
# Start from an empty frame indexed by the primary keys, then fill in the attribute columns
df = pd.DataFrame(index=primary_keys)
df = add_attr_cols(df, source_definitions, raw_data_dir)

df.head()

Unnamed: 0,fuel_type
10000,biomass
10001,biomass
10002,coal
10003,coal
10004,coal


In [26]:
#exports
def add_key_cols(df, source_definitions):
    """Add each source's key column to `df`.

    For every source, the values of its 'key_map' are written into the column
    named by its 'key_output_col', matched against the index of `df`.
    """
    for definition in source_definitions.values():
        source_key_map = definition['key_map']
        target_col = definition['key_output_col']
        df = update_df_col(df, target_col, source_key_map)

    return df

In [31]:
# Start again from an empty frame indexed by the primary keys, then fill in the key columns
df = pd.DataFrame(index=primary_keys)
df = add_key_cols(df, source_definitions)

df.head()

Unnamed: 0,bmu_id
10000,E_MARK-1
10001,E_MARK-2
10002,T_DIDC1
10003,T_DIDC2
10004,T_DIDC4


In [32]:
#exports
def construct_output_df(source_definitions, raw_data_dir):
    """Build the combined output table from all source definitions.

    The frame is indexed by the primary keys (index name 'osuked_id'); the
    key columns are added first, followed by the attribute columns.
    """
    df = pd.DataFrame(index=identify_primary_keys(source_definitions))

    df = add_key_cols(df, source_definitions)
    df = add_attr_cols(df, source_definitions, raw_data_dir)

    df.index.name = 'osuked_id'

    return df

In [33]:
# Build the full output table (key columns plus attribute columns) and preview it
df = construct_output_df(source_definitions, raw_data_dir)

df.head()

Unnamed: 0_level_0,bmu_id,fuel_type
osuked_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10000,E_MARK-1,biomass
10001,E_MARK-2,biomass
10002,T_DIDC1,coal
10003,T_DIDC2,coal
10004,T_DIDC4,coal


In [37]:
# Save the combined table to the intermediate data directory
df.to_csv(f'{intermediate_data_dir}/power_stations.csv')