In [1]:
# default_exp idupdate

In [2]:
#exports
import json
import numpy as np
import pandas as pd

import os
import typer
from typing import Any
from dataclasses import dataclass

import gspread
from oauth2client.service_account import ServiceAccountCredentials

In [3]:
from IPython.display import JSON

In [6]:
#exports
@dataclass
class SheetManager:
    """Thin wrapper around a single Google Sheets worksheet.

    On construction the manager authorises with a Google service account,
    opens the spreadsheet called `sheet_name` and keeps the worksheet at
    position `sheet_index` in `self.sheet_instance`.

    Credentials are taken from the key file at `creds_fp` when that path
    exists, otherwise from the JSON document stored in the `GCP_SA_KEY`
    environment variable.

    Raises:
        ValueError: if neither the key file nor `GCP_SA_KEY` is available.
    """
    # Path to the service-account JSON key file
    creds_fp: str='../gcloud/power-station-dictionary-f6814eb419e1.json'
    # Title of the spreadsheet to open
    sheet_name: str='Power Station Dictionary - ID Submission (Responses)'
    # Position of the worksheet (tab) within the spreadsheet
    sheet_index: int=0

    def __post_init__(self):
        scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive']

        if os.path.exists(self.creds_fp):
            creds = ServiceAccountCredentials.from_json_keyfile_name(self.creds_fp, scope)
        elif 'GCP_SA_KEY' in os.environ:
            # The scope has to be passed here as well, otherwise the credentials
            # built from the environment are created without any OAuth scopes
            keyfile_dict = json.loads(os.environ['GCP_SA_KEY'])
            creds = ServiceAccountCredentials.from_json_keyfile_dict(keyfile_dict, scope)
        else:
            raise ValueError(f"No valid credentials (or filepath) was passed, GCP_SA_KEY: {'GCP_SA_KEY' in os.environ.keys()}")

        self.client = gspread.authorize(creds)

        sheet = self.client.open(self.sheet_name)
        self.sheet_instance = sheet.get_worksheet(self.sheet_index)

        return

    def load_sheet_df(self) -> pd.DataFrame:
        """Returns every record of the worksheet as a dataframe."""
        return pd.DataFrame.from_dict(self.sheet_instance.get_all_records())

    def update_sheet_col_values(
        self,
        df_sheet: pd.DataFrame,
        col_name: str='Processed'
    ):
        """Replaces one worksheet column with the values held in `df_sheet`.

        The column is deleted from the worksheet and re-inserted at the same
        position, with `col_name` as the header cell followed by the values.

        Args:
            df_sheet: dataframe whose `col_name` column holds the new values
            col_name: name of the column to overwrite

        Raises:
            KeyError: if `col_name` is not a column of `df_sheet`.
        """
        col_insert = [[col_name] + df_sheet[col_name].to_list()]

        # Worksheet columns are 1-indexed. The position is taken from the dataframe,
        # NOTE(review): this assumes the dataframe columns are in the same order as
        # the worksheet columns - confirm if the sheet layout ever changes
        col_idx = list(df_sheet.columns).index(col_name) + 1

        self.sheet_instance.delete_columns(col_idx)
        self.sheet_instance.insert_cols(col_insert, col=col_idx)

        return

In [8]:
# Instantiating SheetManager authorises with Google and opens the worksheet
# (see SheetManager.__post_init__), so this cell needs valid credentials.
sheet_manager = SheetManager()
df_sheet = sheet_manager.load_sheet_df()

df_sheet.head()

Unnamed: 0,Timestamp,Primary Key,ID Type,ID Value,Title,Name,Description,Value Type,URL Format,Processed
0,03/11/2021 11:18:39,10167,WindPowerNet ID,windfarm_en_22062_windy-standard-extension,,,,,,1
1,03/11/2021 11:20:42,10167,REPD ID (Old),B1290,,,,,,1
2,03/11/2021 11:20:59,10167,REPD ID (New),4615,,,,,,1
3,2021-11-03 14:49:01,10150,REPD ID (Old),EN00273,,,,,,1
4,2021-11-04 14:49:01,10150,REPD ID (New),4110,,,,,,1


In [9]:
#exports
def load_powerdict_data(
    ids_fp: str='../data/dictionary/ids.csv',
    metadata_fp: str='../data/dictionary/datapackage.json'
):
    """Loads the dictionary ID table and its accompanying metadata.

    Args:
        ids_fp: path to the ids csv, which must contain a `dictionary_id` column
        metadata_fp: path to the JSON metadata file

    Returns:
        A tuple of the ids dataframe (indexed by `dictionary_id`) and the
        parsed metadata.
    """
    df_powerdict_ids = pd.read_csv(ids_fp).set_index('dictionary_id')

    # JSON is read as UTF-8 explicitly so the result does not depend on the
    # locale's default encoding of the machine running the code
    with open(metadata_fp, encoding='utf-8') as f:
        powerdict_metadata = json.load(f)

    return df_powerdict_ids, powerdict_metadata

In [10]:
# Load the ID table and its metadata from the default relative paths
df_powerdict_ids, powerdict_metadata = load_powerdict_data()

JSON(powerdict_metadata)

<IPython.core.display.JSON object>

In [11]:
#exports
def construct_field_title_to_name_map(powerdict_metadata: dict):
    """Maps each field title to its field name.

    The fields are read from the schema of the first resource listed in
    `powerdict_metadata`. When a title occurs more than once the last
    field with that title wins.
    """
    schema_fields = powerdict_metadata['resources'][0]['schema']['fields']

    title_to_name = {}

    for field_spec in schema_fields:
        title = field_spec['title']
        name = field_spec['name']
        title_to_name[title] = name

    return title_to_name

In [12]:
# Look up the column name that corresponds to a title as it appears in the sheet
field_title_to_name_map = construct_field_title_to_name_map(powerdict_metadata)

new_id_title = 'REPD ID (New)'
new_id_name = field_title_to_name_map[new_id_title]

new_id_name

'new_repd_id'

In [15]:
#exports
def _id_to_text(id_value: Any) -> str:
    """Renders an ID as text, writing whole-number floats without the trailing '.0'."""
    if isinstance(id_value, (float, np.floating)) and float(id_value).is_integer():
        return str(int(id_value))

    return str(id_value)

def assign_new_id_value(
    df_powerdict_ids: pd.DataFrame,
    dictionary_id: int,
    new_id_title: str,
    new_id_value: Any,
    field_title_to_name_map: dict,
):
    """Adds a single ID to the row of `dictionary_id`.

    The column to update is found by translating `new_id_title` through
    `field_title_to_name_map`. Multiple IDs in one cell are stored as a
    ', ' separated string:

    * if the ID is already listed in the cell nothing changes
    * if the cell is empty (missing value, '' or 'np.nan') the ID is written as-is
    * otherwise the ID is appended to the existing value(s)

    Args:
        df_powerdict_ids: ids dataframe indexed by dictionary id, modified in place
        dictionary_id: index label of the row to update
        new_id_title: title of the ID type, a key of `field_title_to_name_map`
        new_id_value: the ID to add
        field_title_to_name_map: mapping from ID titles to column names

    Returns:
        The same `df_powerdict_ids` object that was passed in.

    Raises:
        KeyError: if the title, the column or the dictionary id are unknown.
    """
    new_id_name = field_title_to_name_map[new_id_title]
    current_id = df_powerdict_ids.loc[dictionary_id, new_id_name]

    # Columns containing missing values are read as floats, so an existing ID of 4615
    # arrives here as 4615.0 - normalising both sides stops it being appended again
    current_id_text = _id_to_text(current_id)
    new_id_text = _id_to_text(new_id_value)
    new_id_in_old_ids = new_id_text in current_id_text.split(', ')

    if isinstance(current_id, str):
        no_previous_id = current_id in ('', 'np.nan')
    else:
        # pd.isna also recognises None and pd.NA, not only float NaN
        no_previous_id = bool(pd.isna(current_id))

    if new_id_in_old_ids:
        return df_powerdict_ids

    if no_previous_id:
        value_to_store = new_id_value
    else:
        value_to_store = f'{current_id_text}, {new_id_text}'

    # A numeric column cannot hold text, it is converted explicitly
    # rather than relying on pandas silently upcasting on assignment
    if isinstance(value_to_store, str) and pd.api.types.is_numeric_dtype(df_powerdict_ids[new_id_name]):
        df_powerdict_ids[new_id_name] = df_powerdict_ids[new_id_name].astype(object)

    df_powerdict_ids.loc[dictionary_id, new_id_name] = value_to_store

    return df_powerdict_ids

def assign_all_new_id_values(
    df_powerdict_ids: pd.DataFrame,
    df_sheet: pd.DataFrame,
    field_title_to_name_map: dict,
    processed_col_name: str='Processed'
):
    """Applies every unprocessed sheet submission to the ids dataframe.

    A row of `df_sheet` counts as unprocessed when its `processed_col_name`
    value is missing or an empty string. For each of those rows the
    'ID Value' is assigned to the dictionary entry given by 'Primary Key',
    in the column identified by 'ID Type'. Afterwards the handled rows are
    marked with 1 in `processed_col_name`.

    Args:
        df_powerdict_ids: ids dataframe indexed by dictionary id
        df_sheet: submissions with the columns 'Primary Key', 'ID Type',
            'ID Value' and `processed_col_name`, modified in place
        field_title_to_name_map: mapping from ID titles to column names
        processed_col_name: name of the column flagging handled rows

    Returns:
        A tuple of the updated ids dataframe and the updated sheet dataframe.
    """
    processed_flags = df_sheet[processed_col_name]
    is_unprocessed = processed_flags.isnull() | processed_flags.isin([''])
    idxs_to_process = df_sheet.index[is_unprocessed]

    pending_submissions = df_sheet.loc[idxs_to_process, ['Primary Key', 'ID Type', 'ID Value']]

    for dictionary_id, new_id_title, new_id_value in pending_submissions.itertuples(index=False, name=None):
        df_powerdict_ids = assign_new_id_value(
            df_powerdict_ids,
            dictionary_id,
            new_id_title,
            new_id_value,
            field_title_to_name_map,
        )

    # Only reached when every pending submission was applied without error
    df_sheet.loc[idxs_to_process, processed_col_name] = 1

    return df_powerdict_ids, df_sheet

In [16]:
# Apply the pending submissions, the handled rows of df_sheet are marked as processed
df_powerdict_ids, df_sheet = assign_all_new_id_values(df_powerdict_ids, df_sheet, field_title_to_name_map)
    
# Inspect the dictionary entry referenced by the first submission
dictionary_id = df_sheet.loc[0, 'Primary Key']
df_powerdict_ids.loc[dictionary_id]

gppd_idnr                                              GBR0004431
esail_id                                                    WISTW
name                                     Brockloch Rig 2 Windfarm
sett_bmu_id                                             T_WISTW-2
ngc_bmu_id                                                WISTW-2
4c_offshore_id                                                NaN
windpowernet_id        windfarm_en_22062_windy-standard-extension
wikidata_id                                                   NaN
wikipedia_id                                                  NaN
power_technology_id                                           NaN
eutl_id                                                       NaN
eic_id                                                        NaN
cfd_id                                                        NaN
jrc_id                                                        NaN
iaea_id                                                       NaN
old_repd_i

In [17]:
# Writes the 'Processed' column of df_sheet back to the remote worksheet
sheet_manager.update_sheet_col_values(df_sheet)

In [20]:
#exports
app = typer.Typer()

@app.command()
def update_powerdict_ids_df(
    creds_fp: str ='gcloud/power-station-dictionary-f6814eb419e1.json',
    sheet_name: str ='Power Station Dictionary - ID Submission (Responses)',
    sheet_index: int=0,
    ids_fp: str ='data/dictionary/ids.csv',
    metadata_fp: str ='data/dictionary/datapackage.json',
    processed_col_name: str='Processed'
):
    """Applies the unprocessed ID submissions from the Google Sheet to the ids csv.

    The submissions are read from the sheet, merged into the ids table found
    at `ids_fp`, the table is saved back to `ids_fp` and finally the handled
    rows are flagged in the sheet's `processed_col_name` column.

    Returns the updated ids dataframe.
    """
    default_processed_col = 'Processed'

    sheet_manager = SheetManager(creds_fp=creds_fp, sheet_name=sheet_name, sheet_index=sheet_index)
    df_sheet = sheet_manager.load_sheet_df()

    df_powerdict_ids, powerdict_metadata = load_powerdict_data(ids_fp=ids_fp, metadata_fp=metadata_fp)
    field_title_to_name_map = construct_field_title_to_name_map(powerdict_metadata)

    # assign_all_new_id_values is called without a column-name argument, so the
    # processed column is presented under its default name for that call and
    # restored afterwards - this makes `processed_col_name` apply to both steps
    uses_custom_col = processed_col_name != default_processed_col

    if uses_custom_col:
        df_sheet = df_sheet.rename(columns={processed_col_name: default_processed_col})

    df_powerdict_ids, df_sheet = assign_all_new_id_values(df_powerdict_ids, df_sheet, field_title_to_name_map)

    if uses_custom_col:
        df_sheet = df_sheet.rename(columns={default_processed_col: processed_col_name})

    # Save the ids before flagging the submissions as processed: if the write
    # fails the sheet is left untouched and the submissions are picked up again
    df_powerdict_ids.to_csv(ids_fp)
    sheet_manager.update_sheet_col_values(df_sheet, col_name=processed_col_name)

    return df_powerdict_ids

In [21]:
%%time

# Runs the full update: this overwrites the csv at `ids_fp` and updates the remote sheet.
# The paths are overridden with '../' prefixed versions of the function defaults,
# presumably because the notebook is run from a sub-directory - confirm
df_powerdict_ids = update_powerdict_ids_df(
    creds_fp='../gcloud/power-station-dictionary-f6814eb419e1.json',
    ids_fp='../data/dictionary/ids.csv',
    metadata_fp='../data/dictionary/datapackage.json'
)

df_powerdict_ids.head(2)

Wall time: 2.24 s


Unnamed: 0_level_0,gppd_idnr,esail_id,name,sett_bmu_id,ngc_bmu_id,4c_offshore_id,windpowernet_id,wikidata_id,wikipedia_id,power_technology_id,eutl_id,eic_id,cfd_id,jrc_id,iaea_id,old_repd_id,new_repd_id
dictionary_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
10000,,MARK,Rothes Bio-Plant CHP,"E_MARK-1, E_MARK-2","MARK-1, MARK-2",,,,,,,48W000000MARK-1D,,,,,
10001,"GBR1000377, GBR1000369",DIDC,Didcot,"T_DIDC1, T_DIDC2, T_DIDC4, T_DIDC3, T_DIDC1G, ...","DIDC1, DIDC2, DIDC4, DIDC3, DIDC1G, DIDC2G, DI...",,,,,,97165.0,"48W00000DIDC01G1, 48W00000DIDC02GZ, 48W00000DI...",,,,,


In [22]:
#exports
# Launch the CLI only when executed as a script.
# NOTE(review): the `'__file__' in globals()` check presumably stops the app from
# starting when this cell is run inside the notebook, where `__file__` is not defined - confirm
if __name__ == '__main__' and '__file__' in globals():
    app()

In [23]:
#hide
# Import only the function that is used rather than star-importing the module
from nbdev.export import notebook2script
notebook2script()

Converted 00-documentation.ipynb.
Converted 01-dictionary-page.ipynb.
Converted 02-attribute extraction.ipynb.
Converted 03-page-population.ipynb.
Converted 04-cli.ipynb.
Converted 05-carbon-intensity.ipynb.
Converted 06-cfd-capture-price-comparison.ipynb.
Converted 07-dataset-pages.ipynb.
Converted 08-papers.ipynb.
Converted 09-id-submission.ipynb.
