In [1]:
# default_exp update

# Data Checks, Updates & Cleaning

<br>

### Imports

In [2]:
#exports
import json
import pandas as pd

import os
from warnings import warn

from powerdict import construct

In [3]:
from IPython.display import JSON

<br>

### User Inputs

In [12]:
# Relative directory paths used by the cells below
intermediate_data_dir = '../data/intermediate'  # power_stations.csv is read from here
updates_data_dir = '../data/updates'  # handed to apply_updates as the folder of JSON updates
definitions_dir = '../data/definitions'  # outputs.json is read from here

<br>

### Loading in Intermediate Data

In [5]:
# Every column is cast to a string after parsing, then rows are keyed by the plant id
df = (
    pd.read_csv(f'{intermediate_data_dir}/power_stations.csv')
    .astype(str)
    .set_index('osuked_id')
)

df.head()

Unnamed: 0_level_0,bmu_root,fuel_type,sett_bmu_id,longitude,latitude
osuked_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10000,MARK,biomass,"E_MARK-1, E_MARK-2",-3.603516,57.480403
10001,DIDC,coal,"T_DIDC1, T_DIDC2, T_DIDC4, T_DIDC3",-1.26757,51.62363
10002,ABTH,coal,"T_ABTH7, T_ABTH8, T_ABTH9",-3.404866,51.387312
10003,COTPS,coal,"T_COTPS-1, T_COTPS-2, T_COTPS-3, T_COTPS-4",-0.648193,53.245495
10004,DRAXX,coal,"T_DRAXX-1, T_DRAXX-2, T_DRAXX-3, T_DRAXX-4, T_...",-0.626221,53.748711


<br>

### Data Checks

We'll start by confirming that none of the plants have multiple locations assigned to them, i.e. that no `longitude` or `latitude` value is a `', '`-separated list.

In [6]:
#exports
def filter_cols_for_one_to_many(df, cols=('longitude', 'latitude')):
    """
    Returns the rows of `df` where at least one of `cols` holds a ', '-separated value.

    Parameters:
        df: DataFrame containing every column named in `cols`
        cols: Iterable of column names to inspect, each value is cast
              to a string before being searched

    Returns:
        df_one_to_many_filt: The subset of `df` (all columns, original row order)
                             whose value in any of `cols` contains ', '
    """
    is_one_to_many = pd.Series(False, index=df.index)

    # Each column is searched on its own - concatenating them first would invent
    # a ', ' wherever one value ends in ',' and the next starts with ' '
    for col in cols:
        # regex=False so the separator is always treated as a literal string
        col_has_separator = df[col].astype(str).str.contains(', ', regex=False)
        is_one_to_many = is_one_to_many | col_has_separator.astype(bool)

    df_one_to_many_filt = df[is_one_to_many]

    return df_one_to_many_filt

In [7]:
# Any row returned here has a ', '-separated longitude or latitude
df_multiple_locs = filter_cols_for_one_to_many(df)

assert df_multiple_locs.empty, 'There should not be multiple locations for a single site'

In [8]:
#exports
def apply_updates(df, updates_data_dir):
    """
    Applies each `<column>.json` file found in `updates_data_dir` to the matching column of `df`.

    Parameters:
        df: DataFrame whose columns may have update files
        updates_data_dir: Directory that is scanned for update files

    Returns:
        df: The result of passing `df` through `construct.update_df_col` once
            for every matched column, or `df` itself when nothing matches
    """
    # Only genuine `<column>.json` entries count as updates - a file or folder that
    # merely shares a column's name is skipped rather than handed to the JSON parser.
    # Sorted so the order of application doesn't depend on the filesystem listing.
    update_cols = sorted(
        col
        for col, ext in map(os.path.splitext, os.listdir(updates_data_dir))
        if ext == '.json' and col in df.columns
    )

    for update_col in update_cols:
        with open(os.path.join(updates_data_dir, f'{update_col}.json'), 'r') as f:
            update_dict = json.load(f)

        # The structure expected of `update_dict` is defined by powerdict, not here
        df = construct.update_df_col(df, update_col, update_dict)

    return df

In [9]:
# Rebinds `df` to the updated frame, so re-running this cell feeds the
# already-updated data back through apply_updates
df = apply_updates(df, updates_data_dir)

df.head()

Unnamed: 0_level_0,bmu_root,fuel_type,sett_bmu_id,longitude,latitude
osuked_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10000,MARK,biomass,"E_MARK-1, E_MARK-2",-3.603516,57.480403
10001,DIDC,coal,"T_DIDC1, T_DIDC2, T_DIDC4, T_DIDC3",-1.26757,51.62363
10002,ABTH,coal,"T_ABTH7, T_ABTH8, T_ABTH9",-3.404866,51.387312
10003,COTPS,coal,"T_COTPS-1, T_COTPS-2, T_COTPS-3, T_COTPS-4",-0.648193,53.245495
10004,DRAXX,"coal, biomass","T_DRAXX-1, T_DRAXX-2, T_DRAXX-3, T_DRAXX-4, T_...",-0.626221,53.748711


In [14]:
# Parse the output definitions and render them with IPython's JSON display
with open(f'{definitions_dir}/outputs.json') as outputs_file:
    outputs = json.load(outputs_file)

JSON(outputs)

<IPython.core.display.JSON object>

In [None]:
# TODO: apply the output definition checks, then apply the dtypes