# Test Transform Module

In [None]:
# To DO :


### Setup

In [59]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [60]:
# Standard libraries
import logging
import sys
import os
import pathlib
import random

import pandas as pd
import numpy as np
import sqlalchemy as sa

# Local libraries
import pudl
from pudl.analysis.fill_ferc1_fuel_gaps import *
from pudl.analysis.flag_ferc1_totals import *
from pudl.analysis.clean_combine_ferc1 import *

# Enable viewing of logging outputs
logger=logging.getLogger()
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
logger.handlers = [handler]

In [61]:
# Establish connection to pudl database
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings['pudl_db'])
pudl_out = pudl.output.pudltabl.PudlTabl(
    pudl_engine=pudl_engine,
    freq='AS'
)

ferc_engine = sa.create_engine(pudl_settings['ferc1_db'])

In [62]:
# View random utility groups! (thanks, Trenton)
def view_random_utility(df):
    df = df.reset_index(drop=True)
    util_groups = df.groupby(['utility_id_ferc1'])
    random_utility = random.choice(list(util_groups.groups.keys()))
    return df.iloc[util_groups.groups[random_utility]]

-------------

### Basic Transform Stuff

In [83]:
# # Establish table connections
steam = pudl_out.plants_steam_ferc1().assign(primary_fuel=np.nan)
hydro = pudl_out.plants_hydro_ferc1()
pumped_storage = pudl_out.plants_pumped_storage_ferc1()
fbp = pudl_out.fbp_ferc1()
small_plants_clean = pudl_out.plants_small_ferc1()#.dropna(subset=['plant_name_ferc1'])
eia = pudl_out.gens_eia860().copy()


# Now because we don't want all the header rows dropped, we need to pull in a raw version of the small plants table
# and run it manually through the transform process.
# Get raw version of small plants
small_plants_raw = pd.read_sql("f1_gnrt_plant", ferc_engine)
# Here we create a fake raw dfs dictionary with just the small plants df to run it through
# Zane's existing transform feature.
fake_dict = {'plants_small_ferc1': small_plants_raw}
new_dict = {}
small_plants_dict = pudl.transform.ferc1.plants_small(fake_dict, new_dict)
small_plants = small_plants_dict['plants_small_ferc1']
# Add pudl id column
from pudl.output.ferc1 import plants_utils_ferc1
small_plants = pd.merge(
    small_plants, plants_utils_ferc1(pudl_engine), 
    on=['utility_id_ferc1', 'plant_name_ferc1'], how='left')
small_plants.loc[:,'plant_id_pudl'] = small_plants.plant_id_pudl.astype('Int64')
small_plants.loc[:,'utility_id_pudl'] = small_plants.utility_id_pudl.astype('Int64')

first_cols = ['report_year', 'utility_id_ferc1', 'utility_id_pudl',
              'utility_name_ferc1', 'plant_id_pudl', 'plant_name_ferc1', 'plant_name_original']

small_plants = pudl.helpers.organize_cols(small_plants, first_cols)

# Drop rows with no plant name because we can't use that
# small_plants = small_plants.dropna(subset=['plant_name_original'])

  warn(msg)


In [64]:
ferc1_tables = {'steam': steam, 'small': small_plants, 'hydro': hydro, 'pumped': pumped_storage}

In [87]:
steam_ferc = SteamTable('steam', ferc1_tables['steam'].copy(), steam_value_cols, pudl_out)
small_ferc = SmallTable('small', ferc1_tables['small'].copy(), small_value_cols)
hydro_ferc = HydroTable('hydro', ferc1_tables['hydro'].copy(), hydro_value_cols)
ps_ferc = HydroTable('pumped-storage', ferc1_tables['pumped'].copy(), pumped_value_cols)
table_list = [steam_ferc, small_ferc, hydro_ferc, ps_ferc]

In [88]:
for table in table_list:
    table.transform()


*** TRANSFORMING STEAM TABLE ***
Cleaning steam table
Flagging totals rows for steam table
 - flagging specific totals
 - adding manual totals
 - backfilling totals by capacity
Labeling fuel types for steam table
 - loading EIA table
 - adding fuel types
  * filling fuels with obvious names
    27072 / 29102 rows left unfilled
  * filling in primary fuel by mmbtu
    3872 / 29102 rows left unfilled
  * filling in eia plants with one reported fuel
    2527 / 29104 rows left unfilled
  * filling in primary fuel by cost
    2258 / 29104 rows left unfilled
  * filling in raw ferc1 fuels
    2114 / 29104 rows left unfilled
  * filling in ferc plants with one fuel
    1316 / 29104 rows left unfilled
  * filling in pudl plants with one fuel
    1187 / 29104 rows left unfilled
  * filling in manually mapped fuels
    1098 / 29104 rows left unfilled
  * front and backfilling values with the same ferc1 id
    842 / 29104 rows left unfilled
  * flipping single fuel outliers for plant_id_ferc1
  

100%|██████████| 1880/1880 [00:20<00:00, 90.90it/s] 

 - validating clump findings
Assigning headers to groups
 - likely headers that have not been mapped: ['hydraulic' 'other:' 'other production:' 'lewiston canal facilities:'
 'other' 'hydraulic (1):' 'hydraulic:'
 'other general ops. supervision & engineering' 'other-leased:'
 'renewables' 'renewables:']
 - creating header groups
 - assigning headers to groups





Labeling all obvious headers
 - labeling all headers with a tech name in their name
 - labeling all records with a ferc license hydro

header matches manual plant type: 4902
total manual plant types: 6534
total headers mapped: 14892
total headers with manual: 15837
total rows: 17356


*** TRANSFORMING HYDRO TABLE ***
Cleaning hydro table
Flagging totals rows for hydro table
 - using basic total flag
Labeling fuel types for hydro table

*** TRANSFORMING PUMPED-STORAGE TABLE ***
Cleaning pumped-storage table
Flagging totals rows for pumped-storage table
 - using basic total flag
Labeling fuel types for pumped-storage table


In [78]:
test = small_ferc.df.copy()

In [85]:
print(len(test))
print(len(test[test['plant_id_pudl'].isna()]))
sgp_id = test['plant_id_pudl'].unique()
eia_ids = eia['plant_id_pudl'].unique()

17356
2201


In [86]:
# PUDL IDS that also appear in EIA
len([x for x in sgp_id if x in eia_ids])

648

### Totals and Aggregation

In [48]:
steam_ferc.df.total_type.unique()

array([None, 'plant total', 'utility owned extra', 'utility owned total',
       'utility owned steam extra',
       'utility owned combustion turbine extra',
       'utility owned nuclear extra'], dtype=object)

In [38]:
aa = pd.read_excel('/Users/aesharpe/Desktop/manual_total_types.xlsx')

In [39]:
bb = aa.assign(report_year=lambda x: ([list(range(start, end)) for start, end in x[['start_year', 'end_year']].values]))

In [41]:
cc = bb.explode('report_year')

In [45]:
dd = cc.assign(record_id=lambda x: ('f1_steam_' + x.report_year.astype('str') + x.id_suffix))[['record_id', 'total_type_manual']].copy()

In [47]:
dd.total_type_manual.unique()

array(['utility owned plant total', 'plant total', 'unit total',
       'utility owned plant total steam', 'utility owned total',
       'utility owned subtotal', 'utility owned total nuclear',
       'utility owned plant extra'], dtype=object)

### Show change in technology description in EIA records 

In [16]:
eia['report_year'] = eia['report_date'].dt.year

In [17]:
eia['tech_diff'] = (
    eia.groupby(['plant_id_eia', 'generator_id',])['technology_description']
    .transform(lambda x: len(x.dropna().unique()) > 1)
)

In [46]:
# View plants that change technology description
import random

regs = ['report_date','plant_id_eia','plant_id_pudl','plant_name_eia', 'generator_id', 'technology_description', 'operating_date',
        'current_planned_operating_date', 'energy_source_code_1', 'fuel_type_code_pudl', 'energy_source_code_2', 'multiple_fuels', 'planned_modifications',
        'operating_switch', 'operational_status', 'planned_modifications', 'planned_repower_date']

aa = eia[eia['tech_diff']==True].copy()
bb = aa.drop_duplicates(subset=['plant_id_eia', 'generator_id']).copy().reset_index()

idx = bb.iloc[random.randint(0,len(bb)-1)]['index']
plt_id = eia.loc[eia.index==idx]['plant_id_eia'].item()
gen_id = eia.loc[eia.index==idx]['generator_id'].item()
eia[(eia['plant_id_eia']==plt_id) & (eia['generator_id']==gen_id)][regs]

Unnamed: 0,report_date,plant_id_eia,plant_id_pudl,plant_name_eia,generator_id,technology_description,operating_date,current_planned_operating_date,energy_source_code_1,fuel_type_code_pudl,energy_source_code_2,multiple_fuels,planned_modifications,operating_switch,operational_status,planned_modifications.1,planned_repower_date
354707,2004-01-01,56134,4820,Stockton Regional Water Control Facility,101,,2000-11-01,,OBG,gas,NG,,,,existing,,
335129,2005-01-01,56134,4820,Stockton Regional Water Control Facility,101,,2000-11-01,,OBG,gas,NG,,,,existing,,
315225,2006-01-01,56134,4820,Stockton Regional Water Control Facility,101,,2000-11-01,,OBG,gas,NG,True,,,existing,,
294796,2007-01-01,56134,4820,Stockton Regional Water Control Facility,101,,2000-11-01,,OBG,gas,NG,True,True,,existing,True,
273487,2008-01-01,56134,4820,Stockton Regional Water Control Facility,101,,2000-11-01,,OBG,gas,NG,True,False,,existing,False,
251659,2009-01-01,56134,4820,Stockton Regional Water Control Facility,101,,2000-11-01,,OBG,gas,NG,True,False,,existing,False,
230969,2010-01-01,56134,4820,Stockton Regional Water Control Facility,101,,2000-11-01,,OBG,gas,NG,True,False,,existing,False,
209570,2011-01-01,56134,4820,Stockton Regional Water Control Facility,101,,2000-11-01,,OBG,gas,NG,True,False,,existing,False,
187764,2012-01-01,56134,4820,Stockton Regional Water Control Facility,101,,2000-11-01,,OBG,gas,NG,True,False,,existing,False,
165181,2013-01-01,56134,4820,Stockton Regional Water Control Facility,101,,2000-11-01,,NG,gas,OBG,,,,existing,,


### MUL Stuff

In [None]:
# Read in MUL
import pickle

with open('/Users/aesharpe/Desktop/ferc1_to_eia_full.pkl', 'rb') as handle:
    mul_ferc = pickle.load(handle)