# Test Transform Module

In [None]:
# To DO :


### Setup

In [5]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Standard libraries
import logging
import sys
import os
import pathlib
import random

import pandas as pd
import numpy as np
import sqlalchemy as sa

# Local libraries
import pudl
from pudl.analysis.fill_ferc1_fuel_gaps import *
from pudl.analysis.flag_ferc1_totals import *
from pudl.analysis.clean_combine_ferc1 import *

# Enable viewing of logging outputs
logger=logging.getLogger()
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
logger.handlers = [handler]

In [7]:
# Establish connection to pudl database
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings['pudl_db'])
ferc_engine = sa.create_engine(pudl_settings['ferc1_db'])
pudl_out = pudl.output.pudltabl.PudlTabl(
    pudl_engine=pudl_engine,
    #ferc_engine=ferc_engine,
    freq='AS'
)

ferc_engine = sa.create_engine(pudl_settings['ferc1_db'])

In [8]:
# View random utility groups! (thanks, Trenton)
def view_random_utility(df):
    df = df.reset_index(drop=True)
    util_groups = df.groupby(['utility_id_ferc1'])
    random_utility = random.choice(list(util_groups.groups.keys()))
    return df.iloc[util_groups.groups[random_utility]]

-------------

### Basic Transform Stuff

In [9]:
# # Establish table connections
steam = pudl_out.plants_steam_ferc1().assign(primary_fuel=np.nan)
hydro = pudl_out.plants_hydro_ferc1()
pumped_storage = pudl_out.plants_pumped_storage_ferc1()
fbp = pudl_out.fbp_ferc1()
small_plants_clean = pudl_out.plants_small_ferc1()#.dropna(subset=['plant_name_ferc1'])
eia = pudl_out.gens_eia860().copy()


# Now because we don't want all the header rows dropped, we need to pull in a raw version of the small plants table
# and run it manually through the transform process.
# Get raw version of small plants
small_plants_raw = pd.read_sql("f1_gnrt_plant", ferc_engine)
# Here we create a fake raw dfs dictionary with just the small plants df to run it through
# Zane's existing transform feature.
fake_dict = {'plants_small_ferc1': small_plants_raw}
new_dict = {}
small_plants_dict = pudl.transform.ferc1.plants_small(fake_dict, new_dict)
small_plants = small_plants_dict['plants_small_ferc1']
# Add pudl id column
from pudl.output.ferc1 import plants_utils_ferc1
small_plants = pd.merge(
    small_plants, plants_utils_ferc1(pudl_engine), 
    on=['utility_id_ferc1', 'plant_name_ferc1'], how='left')
small_plants.loc[:,'plant_id_pudl'] = small_plants.plant_id_pudl.astype('Int64')
small_plants.loc[:,'utility_id_pudl'] = small_plants.utility_id_pudl.astype('Int64')

first_cols = ['report_year', 'utility_id_ferc1', 'utility_id_pudl',
              'utility_name_ferc1', 'plant_id_pudl', 'plant_name_ferc1', 'plant_name_original']

small_plants = pudl.helpers.organize_cols(small_plants, first_cols)

# Drop rows with no plant name because we can't use that
# small_plants = small_plants.dropna(subset=['plant_name_original'])

  warn(msg)


In [10]:
ferc1_tables = {'steam': steam, 'small': small_plants, 'hydro': hydro, 'pumped': pumped_storage}

In [11]:
steam_ferc = SteamTable('steam', ferc1_tables['steam'].copy(), steam_value_cols, pudl_out)
small_ferc = SmallTable('small', ferc1_tables['small'].copy(), small_value_cols)
hydro_ferc = HydroTable('hydro', ferc1_tables['hydro'].copy(), hydro_value_cols)
ps_ferc = HydroTable('pumped-storage', ferc1_tables['pumped'].copy(), pumped_value_cols)
table_list = [steam_ferc, small_ferc, hydro_ferc, ps_ferc]

In [12]:
for table in table_list:
    table.transform()


*** TRANSFORMING STEAM TABLE ***
Cleaning steam table
Flagging totals rows for steam table
 - flagging specific totals
 - adding manual totals
 - backfilling totals by capacity
Labeling fuel types for steam table
 - loading EIA table
 - adding fuel types
  * filling fuels with obvious names
    27072 / 29102 rows left unfilled
  * filling in primary fuel by mmbtu
    3872 / 29102 rows left unfilled
  * filling in eia plants with one reported fuel
    2527 / 29104 rows left unfilled
  * filling in primary fuel by cost
    2258 / 29104 rows left unfilled
  * filling in raw ferc1 fuels
    2114 / 29104 rows left unfilled
  * filling in ferc plants with one fuel
    1316 / 29104 rows left unfilled
  * filling in pudl plants with one fuel
    1187 / 29104 rows left unfilled
  * filling in manually mapped fuels
    1098 / 29104 rows left unfilled
  * front and backfilling values with the same ferc1 id
    842 / 29104 rows left unfilled
  * flipping single fuel outliers for plant_id_ferc1
  

100%|██████████| 1880/1880 [00:22<00:00, 83.24it/s] 

 - validating clump findings
Assigning headers to groups
 - likely headers that have not been mapped: ['hydraulic' 'other:' 'other production:' 'lewiston canal facilities:'
 'other' 'hydraulic (1):' 'hydraulic:'
 'other general ops. supervision & engineering' 'other-leased:'
 'renewables' 'renewables:']
 - creating header groups





 - assigning headers to groups
Labeling all obvious headers
 - labeling all headers with a tech name in their name
 - labeling all records with a ferc license hydro

header matches manual plant type: 4902
total manual plant types: 6534
total headers mapped: 14892
total headers with manual: 15837
total rows: 17356


*** TRANSFORMING HYDRO TABLE ***
Cleaning hydro table
Flagging totals rows for hydro table
 - using basic total flag
Labeling fuel types for hydro table

*** TRANSFORMING PUMPED-STORAGE TABLE ***
Cleaning pumped-storage table
Flagging totals rows for pumped-storage table
 - using basic total flag
Labeling fuel types for pumped-storage table


In [21]:
test = ps_ferc.df.copy()
test[test['is_total']==False].capacity_mw.sum()

288298.48

In [39]:
print(len(test))
print(len(test[test['plant_id_pudl'].isna()]))
sgp_id = test['plant_id_pudl'].unique()
eia_ids = eia['plant_id_pudl'].unique()

17356
2201


In [41]:
in_eia = [x for x in sgp_id if x in eia_ids]

In [42]:
bb = eia[eia['plant_id_pudl'].isin(in_eia)]

In [40]:
# PUDL IDS that also appear in EIA
aa['header'] = aa.header.fillna('unknown')
print(len([x for x in sgp_id if x in eia_ids]))
print(len(test))
print(len(aa[~aa['header'].str.contains('hydro|solar_pv')]))

648
17356
3822


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aa['header'] = aa.header.fillna('unknown')


In [38]:
9737/17356*100

56.10163632173312

In [20]:
aa = test[~test['plant_id_pudl'].isin(eia['plant_id_pudl'].unique())]

In [21]:
aa['header'].value_counts()

hydro                  2732
solar_pv               1065
internal_combustion     960
diesel_turbine          583
wind                    226
gas_turbine             135
steam_heat              124
waste_heat              111
fuel_cell                36
combustion_turbine        7
nuclear                   1
combined_cycle            1
Name: header, dtype: int64

In [23]:
test['header'].value_counts()

hydro                  8462
internal_combustion    1983
solar_pv               1429
diesel_turbine         1340
wind                    864
steam_heat              291
gas_turbine             277
waste_heat              137
combustion_turbine       54
fuel_cell                44
nuclear                   9
combined_cycle            2
Name: header, dtype: int64

### Totals and Aggregation

In [48]:
steam_ferc.df.total_type.unique()

array([None, 'plant total', 'utility owned extra', 'utility owned total',
       'utility owned steam extra',
       'utility owned combustion turbine extra',
       'utility owned nuclear extra'], dtype=object)

In [38]:
aa = pd.read_excel('/Users/aesharpe/Desktop/manual_total_types.xlsx')

In [39]:
bb = aa.assign(report_year=lambda x: ([list(range(start, end)) for start, end in x[['start_year', 'end_year']].values]))

In [41]:
cc = bb.explode('report_year')

In [45]:
dd = cc.assign(record_id=lambda x: ('f1_steam_' + x.report_year.astype('str') + x.id_suffix))[['record_id', 'total_type_manual']].copy()

In [47]:
dd.total_type_manual.unique()

array(['utility owned plant total', 'plant total', 'unit total',
       'utility owned plant total steam', 'utility owned total',
       'utility owned subtotal', 'utility owned total nuclear',
       'utility owned plant extra'], dtype=object)

### Show change in technology description in EIA records 

In [16]:
eia['report_year'] = eia['report_date'].dt.year

In [17]:
eia['tech_diff'] = (
    eia.groupby(['plant_id_eia', 'generator_id',])['technology_description']
    .transform(lambda x: len(x.dropna().unique()) > 1)
)

In [46]:
# View plants that change technology description
import random

regs = ['report_date','plant_id_eia','plant_id_pudl','plant_name_eia', 'generator_id', 'technology_description', 'operating_date',
        'current_planned_operating_date', 'energy_source_code_1', 'fuel_type_code_pudl', 'energy_source_code_2', 'multiple_fuels', 'planned_modifications',
        'operating_switch', 'operational_status', 'planned_modifications', 'planned_repower_date']

aa = eia[eia['tech_diff']==True].copy()
bb = aa.drop_duplicates(subset=['plant_id_eia', 'generator_id']).copy().reset_index()

idx = bb.iloc[random.randint(0,len(bb)-1)]['index']
plt_id = eia.loc[eia.index==idx]['plant_id_eia'].item()
gen_id = eia.loc[eia.index==idx]['generator_id'].item()
eia[(eia['plant_id_eia']==plt_id) & (eia['generator_id']==gen_id)][regs]

Unnamed: 0,report_date,plant_id_eia,plant_id_pudl,plant_name_eia,generator_id,technology_description,operating_date,current_planned_operating_date,energy_source_code_1,fuel_type_code_pudl,energy_source_code_2,multiple_fuels,planned_modifications,operating_switch,operational_status,planned_modifications.1,planned_repower_date
354707,2004-01-01,56134,4820,Stockton Regional Water Control Facility,101,,2000-11-01,,OBG,gas,NG,,,,existing,,
335129,2005-01-01,56134,4820,Stockton Regional Water Control Facility,101,,2000-11-01,,OBG,gas,NG,,,,existing,,
315225,2006-01-01,56134,4820,Stockton Regional Water Control Facility,101,,2000-11-01,,OBG,gas,NG,True,,,existing,,
294796,2007-01-01,56134,4820,Stockton Regional Water Control Facility,101,,2000-11-01,,OBG,gas,NG,True,True,,existing,True,
273487,2008-01-01,56134,4820,Stockton Regional Water Control Facility,101,,2000-11-01,,OBG,gas,NG,True,False,,existing,False,
251659,2009-01-01,56134,4820,Stockton Regional Water Control Facility,101,,2000-11-01,,OBG,gas,NG,True,False,,existing,False,
230969,2010-01-01,56134,4820,Stockton Regional Water Control Facility,101,,2000-11-01,,OBG,gas,NG,True,False,,existing,False,
209570,2011-01-01,56134,4820,Stockton Regional Water Control Facility,101,,2000-11-01,,OBG,gas,NG,True,False,,existing,False,
187764,2012-01-01,56134,4820,Stockton Regional Water Control Facility,101,,2000-11-01,,OBG,gas,NG,True,False,,existing,False,
165181,2013-01-01,56134,4820,Stockton Regional Water Control Facility,101,,2000-11-01,,NG,gas,OBG,,,,existing,,


### MUL Stuff

In [6]:
# Read in MUL
import pickle

path1 = '/Users/aesharpe/Desktop/Work/Catalyst_Coop/rmi-ferc1-eia/outputs/steam_to_eia.pkl'
path2 = '/Users/aesharpe/Desktop/Work/Catalyst_Coop/rmi-ferc1-eia/outputs/ferc1_to_eia.pkl'
# '/Users/aesharpe/Desktop/ferc1_to_eia_full.pkl'

with open(path2, 'rb') as handle:
    mul_steam = pickle.load(handle)

In [7]:
mul_steam.duplicated(subset=['record_id_ferc1'], keep=False).value_counts()

False    29296
dtype: int64

In [8]:
len(mul_steam[mul_steam['technology_description'].notna()]) / len(mul_steam) * 100

14.022392135445111

In [11]:
print(len(mul_steam))
print(len(steam))

29296
29270


In [12]:
[x for x in mul_steam.record_id_ferc1.unique() if x not in steam.record_id.unique()]

['f1_hydro_2018_12_144_0_1',
 'f1_hydro_2018_12_177_0_2',
 'f1_gnrt_plant_2018_12_79_0_2',
 'f1_gnrt_plant_2018_12_79_0_3',
 'f1_gnrt_plant_2018_12_148_0_1',
 'f1_gnrt_plant_2018_12_148_0_9',
 'f1_hydro_2018_12_6_0_4',
 'f1_hydro_2018_12_6_0_1',
 'f1_hydro_2018_12_6_0_2',
 'f1_hydro_2018_12_195_0_1',
 'f1_hydro_2018_12_195_0_3',
 'f1_hydro_2018_12_195_0_2',
 'f1_gnrt_plant_2018_12_148_0_4',
 'f1_gnrt_plant_2018_12_294_0_2',
 'f1_gnrt_plant_2018_12_294_0_1',
 'f1_gnrt_plant_2018_12_294_0_5',
 'f1_gnrt_plant_2018_12_132_0_9',
 'f1_gnrt_plant_2018_12_132_0_11',
 'f1_gnrt_plant_2018_12_132_0_3',
 'f1_gnrt_plant_2018_12_132_0_4',
 'f1_gnrt_plant_2018_12_132_0_7',
 'f1_gnrt_plant_2018_12_294_0_4',
 'f1_hydro_2018_12_51_0_1',
 'f1_hydro_2018_12_6_0_3',
 'f1_hydro_2018_12_6_1_1',
 'f1_gnrt_plant_2018_12_294_0_3']

In [51]:
mul_steam[mul_steam['record_id_ferc1'].str.contains('2013')]['technology_description'].unique() # see if tech-desc in report years below 2013uu

array([nan], dtype=object)

In [10]:
[x for x in small_plants.columns if x not in steam.columns]

['plant_name_original',
 'ferc_license_id',
 'fuel_cost_per_mmbtu',
 'fuel_type',
 'opex_maintenance',
 'opex_total',
 'total_cost_of_plant']

In [58]:
len(dd[dd['record_id_eia'].notna()]) / len(dd) * 100

54.17284961367241

In [56]:
dd = mul_steam[mul_steam['report_year']>2000]

Unnamed: 0,record_id_ferc1,record_id_eia,plant_name_new,plant_part,report_year,ownership,plant_name_eia,plant_id_eia,generator_id,unit_id_pudl,...,water_limited_capacity_mw,total_fuel_cost_ferc1,total_mmbtu,fuel_type_code_pudl,fuel_cost_per_mmbtu,heat_rate_mmbtu_mwh,plant_id_report_year,plant_id_report_year_util_id,_merge,report_date
0,f1_steam_2004_12_100_0_1,6641_2004_plant_owned_12685,Independence Steam Electric Station,plant,2004,owned,Independence Steam Electric Station,6641.0,,,...,413.0,4.009889e+07,2.833470e+07,coal,1.415187,10.574106,283_2004,283_2004_109,both,2004-01-01
1,f1_steam_2004_12_100_0_2,8054_2004_plant_total_12685,Gerald Andrus,plant,2004,total,Gerald Andrus,8054.0,1,,...,741.0,1.094172e+08,2.329099e+07,oil,4.697834,10.619719,216_2004,216_2004_109,both,2004-01-01
2,f1_steam_2004_12_100_0_3,2053_st_2004_plant_prime_mover_total_12685,Rex Brown ST,plant_prime_mover,2004,total,Rex Brown,2053.0,,,...,302.0,1.942178e+07,3.071118e+06,gas,6.324011,14.907543,491_2004,491_2004_109,both,2004-01-01
3,f1_steam_2004_12_100_0_4,2052_2004_plant_total_12685,Natchez,plant,2004,total,Natchez,2052.0,1,,...,73.0,0.000000e+00,0.000000e+00,,,-0.000000,406_2004,406_2004_109,both,2004-01-01
4,f1_steam_2004_12_100_0_5,2051_2004_plant_total_12685,Delta,plant,2004,total,Delta,2051.0,,,...,192.0,1.301322e+06,2.379874e+05,oil,5.468030,16.915726,154_2004,154_2004_109,both,2004-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29291,f1_steam_2019_12_89_3_1,,,,2019,,,,,,...,,0.000000e+00,0.000000e+00,,,0.000000,14154_2019,14154_2019_171,right_only,2019-01-01
29292,f1_steam_2019_12_531_1_1,,,,2019,,,,,,...,225.0,2.037197e+07,8.535292e+06,gas,2.386792,9.902719,6112_2019,6112_2019_582,right_only,2019-01-01
29293,f1_steam_2019_12_531_1_2,,,,2019,,,,,,...,45.0,7.568640e+04,8.409600e+03,gas,9.000000,16.172308,15162_2019,15162_2019_582,right_only,2019-01-01
29294,f1_steam_2019_12_531_2_2,,,,2019,,,,,,...,127.0,6.251420e+05,2.232650e+05,gas,2.800000,28.165132,15161_2019,15161_2019_582,right_only,2019-01-01
