# **Create a Technology Type Column for FERC Steam Table**
---------------

## Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [241]:
# Standard libraries
import logging
import sys
import os
import pathlib
import random

# 3rd party libraries
import geopandas as gpd
import dask.dataframe as dd
from dask.distributed import Client
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy as sa
#import pickle

# Local libraries
import pudl
from pudl.analysis.fill_ferc1_fuel_gaps import *
from pudl.analysis.flag_ferc1_totals import *

# Enable viewing of logging outputs
logger=logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
logger.handlers = [handler]

# Display settings
sns.set()
%matplotlib inline
mpl.rcParams['figure.dpi'] = 75
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [3]:
# Establish connection to pudl database
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings['pudl_db'])
pudl_out = pudl.output.pudltabl.PudlTabl(
    pudl_engine=pudl_engine,
    freq='AS'
)

In [4]:
value_cols_no_cap = [
    'net_generation_mwh','avg_num_employees',
    'capex_land', 'capex_equipment', 'capex_structures', 'capex_total', 'asset_retirement_cost',
    'opex_operations', 'opex_fuel', 'opex_coolants', 'opex_steam', 'opex_steam_other', 'opex_transfer', 
    'opex_electric', 'opex_misc_power', 'opex_rents', 'opex_allowances', 'opex_engineering', 'opex_structures', 
    'opex_boiler', 'opex_plants', 'opex_misc_steam', 'opex_production_total'
]

value_cols = value_cols_no_cap + ['capacity_mw']

test_view = ['report_year', 'utility_name_ferc1', 'plant_name_ferc1', 'plant_id_pudl', 
             'plant_id_ferc1', 'primary_fuel', 'plant_type', 
             'tech_desc', 'tech_desc_flag','record_id', 'capacity_mw']

total_view = test_view + ['total_type']

ferc_merge_cols = ['report_year', 'utility_id_ferc1', 'plant_name_ferc1']
eia_merge_cols = ['report_date', 'plant_id_pudl', 'generator_id']

In [5]:
# Establish table connections
steam = pudl_out.plants_steam_ferc1().assign(primary_fuel=np.nan)
fbp = pudl_out.fbp_ferc1()
fbp_small = fbp[ferc_merge_cols + ['primary_fuel_by_mmbtu']] #, 'primary_fuel_by_cost']]
eia = pudl_out.gens_eia860()#.assign(report_year=lambda x: x.report_date.dt.year)
small_plants = pudl_out.plants_small_ferc1()#.dropna(subset=['plant_name_ferc1'])
eia_one_plant = eia[eia['plant_name_eia'].notna()].drop_duplicates(subset='plant_name_eia').reset_index()
small_plants_one_plant = small_plants[small_plants['plant_name_ferc1'].notna()].drop_duplicates(subset='plant_name_ferc1').reset_index()
fuel = pudl_out.fuel_ferc1()

## **1. Generate Tech Descriptions**

In [172]:
# Add fuel type and update plant type field in FERC
fuel_plant_df = (
    steam.pipe(impute_fuel_type, pudl_out)
    .pipe(impute_plant_type)
)

**** ADDING FUEL TYPES ****
filling fuels with obvious names
26496 / 28518 rows left unfilled
filling in primary fuel by mmbtu
3429 / 28518 rows left unfilled
filling in eia plants with one reported fuel
2193 / 28521 rows left unfilled
filling in primary fuel by cost
1940 / 28521 rows left unfilled
filling in raw ferc1 fuels
1788 / 28521 rows left unfilled
filling in ferc plants with one fuel
1058 / 28521 rows left unfilled
filling in pudl plants with one fuel
959 / 28521 rows left unfilled
filling in manually mapped fuels
870 / 28521 rows left unfilled
front and backfilling values with the same ferc1 id
645 / 28521 rows left unfilled
flipping single fuel outliers for plant_id_ferc1
645 / 28521 rows left unfilled
flipping multiple fuel outliers for groups under 7
645 / 28521 rows left unfilled
**** ADDING PLANT TYPES ****
filling plants with obvious names
783 / 28521 rows left unfilled
filling in manually mapped plant types


In [173]:
# Merge EIA technology description with FERC
tech_df = impute_tech_desc(fuel_plant_df, eia)

**** ADDING TECH TYPES ****
merging single-tech EIA technology_description with FERC
25500 / 28521 rows left unfilled
backfilling EIA technology_description by year if no new units installed
15148 / 28521 rows left unfilled
combining primary_fuel and plant_type columns
713 / 28521 rows left unfilled
making uniform tech description col


**List of plants to double check:**

In [None]:
# - 241
# - 1282 Waterside - not a lot of information on what this is or when it stopped...
# - 2259 hunterstown - con-ed petro unclear...capacities don't match what I found online at alll
# - 2260 Mountain - another weird con-ed petro plant....no idea
# - 1157 Joppa - confusing subunits
# - 1149 hopewell there is another hopewell in eia (3335) but the capacity from 1149 matches....:/
# - 45 bellmeade - some of it's right some not
# - 176 elizabeth river - plant type just wrong
# - 1132 gibson city - some weird outlier fuels and plant types
# - 488 remington - some solar plants thrown in at the end! need to fix
# - 5930 - airport - seems that the primary fuel is right (rather than petro from eia)
# - 121 coit - unknown valeus causing a problem
# - 1656 --> False is the result of a NA in the construction_year field which should probably be ignored?


# - any that are waste
# --- 15 Altavista - went from coal to biomass in 2013
# --- 646 wilmarth - waste
# --- 548 southampton - went from coal to biomass in 2013
# --- 517 rothschild - waste
# - any that are marked as coal but not coal_steam
# - any with a weird fuel flip flop that wasn't accounted for in the fixes
# - any plants with more than one technology type

-----------

### Test accuracy of fuel + plant type

In [43]:
# grab rows that were already present in eia and make a few tweaks for the analysis
tech_df_copy = (
    tech_df.loc[tech_df['tech_desc_flag'].isin([
        'direct from eia860', 'backfill from eia year', 'backfill from other year'])].copy()
    .assign(
        primary_fuel=lambda x: x.primary_fuel.replace({'oil': 'petroleum'}, regex=True),
        similar_plant=np.nan,
        similar_fuel=np.nan))

no_null_plant = tech_df_copy['plant_type'].notna()
no_null_fuel = tech_df_copy['primary_fuel'].notna()
no_null_tech = tech_df_copy['tech_desc_no_map'].notna()

mini_plant = (
    tech_df_copy[no_null_plant & no_null_tech].copy()
    .assign(similar_plant=lambda x: x.apply(lambda x: x.plant_type in x.tech_desc_no_map, axis=1))
)

mini_fuel = (
    tech_df_copy[no_null_fuel & no_null_tech].copy()
    .assign(similar_fuel=lambda x: x.apply(lambda x: x.primary_fuel in x.tech_desc_no_map, axis=1))
)

tech_df_copy.update(mini_plant, overwrite=True)
tech_df_copy.update(mini_fuel, overwrite=True)

#test2.loc[(test2['similar_plant'].notna()) & (test2['similar_fuel'].notna()), 'similar'] =  
tech_df_copy['similar'] = tech_df_copy['similar_plant'] & tech_df_copy['similar_fuel']

test = tech_df_copy[no_null_plant & no_null_fuel & no_null_tech].copy()
print(len(test), ' -- total rows with an eia description')
print(len(test[test['similar']]), ' -- rows where fuel + plant type matches the eia description')

13174  -- total rows with an eia description
10468  -- rows where fuel + plant type matches the eia description


In [49]:
n1 = test['similar_fuel'] == False
n2 = test['similar_plant'] == False

print('pudl plants with non-matching fuel+plant and eia technology descriptions:')
print('*note that many of these are waste')
test[n1 & n2][test_view].plant_id_pudl.astype('int').unique()

pudl plants with non-matching fuel+plant and eia technology descriptions:
*note that many of these are waste


array([ 242, 1282,  364, 2259, 2260,  283,  647, 1135, 1156,   15, 1148,
        549, 2628,  518,  489, 5931, 1525,    5])

### Look at plants that still have more than one technology type

In [None]:
more_than_one_fuel = tech_df.groupby(['plant_id_pudl']).filter(lambda x: len(x.tech_desc.dropna().unique()) > 1)
len(more_than_one_fuel.plant_id_pudl.unique())
more_than_one_fuel.plant_id_pudl.unique()

In [26]:
# Look for ferc1 ids that have more than one fuel associated with them
more_than_one_fuel = test2.groupby(['plant_id_ferc1']).filter(lambda x: len(x.tech_type.dropna().unique()) > 1)
len(more_than_one_fuel.plant_id_ferc1.unique())
more_than_one_fuel.plant_id_ferc1.unique().astype('int')

array([   2,    3,    5,   15,   44,   59,   64,   65,   73,   74,   75,
         76,   77,   78,   79,   80,   91,  136,  142,  175,  189,  190,
        202,  209,  210,  214,  216,  223,  246,  247,  255,  295,  296,
        327,  329,  330,  347,  356,  359,  363,  365,  372,  397,  398,
        423,  478,  481,  507,  540,  553,  554,  565,  575,  576,  578,
        601,  632,  674,  681,  708,  711,  712,  713,  729,  747,  750,
        752,  845,  897,  901,  903,  904,  908,  911,  925,  934,  940,
        946,  947,  954,  958,  978, 1009, 1020, 1034, 1056, 1057, 1062,
       1078, 1092, 1093, 1094, 1098, 1099, 1104, 1107, 1129, 1139, 1157,
       1159, 1160, 1162, 1181, 1191, 1195, 1208, 1215, 1218, 1224, 1235,
       1294, 1318, 1332, 1342, 1355, 1356, 1456, 1479, 1484, 1494, 1503,
       1532, 1543, 1545, 1554, 1556, 1557, 1605, 1618, 1619, 1623, 1659,
       1712, 1723, 1828, 1886])

In [46]:
# Shows that of the plant_id_ferc1 values with more than one fuel over time, there are none that start
# and end with the same fuel type (meaning, hopefully, there are no pockets in the middle.)
more_than_one_fuel.sort_values(['report_year'])
same_first_last = more_than_one_fuel.groupby(['plant_id_ferc1'])['tech_type'].agg(lambda x: x.max==x.min)
more_than_one_fuel['same_first_last'] = more_than_one_fuel.plant_id_ferc1.map(same_first_last)
more_than_one_fuel[more_than_one_fuel['same_first_last']][test_view + ['same_first_last']].plant_id_ferc1.unique().astype('int')

array([], dtype=int64)

In [66]:
dd = test2[(test2['tech_type'].str.contains('coal')) & (test2['tech_type'].str.contains('steam')==False)][test_view + ['tech_type']]
len(dd.plant_id_pudl.astype('int').unique())

91

## **2. Flag Totals**

In [174]:
flagged_tots_df = flag_totals(tech_df)

flagging specific totals
adding manual totals
backfilling totals by capacity


#### Total Values:

* **plant total:** pudl plant totals where the plant is owned by more than one utility.
* **utility owned total:** all of the assets owned by a utility
* **utility owned plant total:** a utility's owned portion of a pudl plant
* **utility owned plant total steam:** all of the utility's steam assets within a given pudl plant
* **utility owned plant total nuclear:** all of the utility's nuclear assets within a given pudl plant
* **utility owned subtotal:** the sum of several units within a pudl plant owned by one utility
* **utility owned plant extra:** any extra amounts that are associated with a plant
* **utility owned extra:** any extra amounts that are associated with all a utility's assets
* **unit total:** the sum of a co-owned unit (sub-pudl plant id)

### Test total groups

In [None]:
# plant total 
# -- compare total values if multiple utilities reporting these totals
# -- use total value from one utility for another (if needed)


In [175]:
pudl_id_list = flagged_tots_df[flagged_tots_df['total_type']=='plant total']['plant_id_pudl'].unique().tolist()
test_df = flagged_tots_df[flagged_tots_df['plant_id_pudl'].isin(pudl_id_list)]

In [176]:
test_df.plant_id_pudl.unique()

array([ 288,  250,   16,  451, 1208,  612,  289,  317,  296, 1087,  383,
        337,  308, 1665, 1165,  124,  530,  344,  611,  531,  168,  104,
        653])

In [187]:
one_plant = test_df[test_df['plant_id_pudl']==383]
test = one_plant.sort_values(['report_year'])[test_view + ['primary_fuel_flag', 'total_type']]

In [None]:
#flagged_tots_df.query(f"plant_id_pudl=={random.choice(outlier_plants)}")[test_view]

### Find more fuel rows to flip

In [516]:
# Figure out how to incorporate NANs

def show_year_outliers(df):
    """Check for fuel outliers that are not consistent over multiple years.
    
    This function displays the problem plants.
    
    """
    # Look at the number of fuels in a given year per plant id pudl
    fuel_count = (
        df.groupby(['report_year', 'plant_id_pudl'])
        .apply(lambda x: ', '.join(x.primary_fuel.sort_values().unique()))
        .reset_index()
        .rename(columns={0: 'unique_fuels'})
    )
    
    # Check how consistent these fuel type totals are across years
    fuel_appearances = (
        fuel_count.groupby(['plant_id_pudl', 'unique_fuels']).size()
        .reset_index()
        .rename(columns={0:'fuel_appearances'})
        .assign(
            total_appearances=lambda x: x.groupby('plant_id_pudl')['fuel_appearances'].transform(lambda x: x.sum()),
            unique_fuel_groups=lambda x: x.groupby('plant_id_pudl')['fuel_appearances'].transform('count'))
    )
    
    # Only show instances where the a fuel or fuel pairing only appears in one year. Also show that year.
    low_appearances = fuel_appearances.query("fuel_appearances < 2 and total_appearances > 1")
    out_df = pd.merge(fuel_count, low_appearances, on=['plant_id_pudl', 'unique_fuels'], how='inner')
    
    return out_df

In [517]:
show_year_outliers(test)

Unnamed: 0,report_year,plant_id_pudl,unique_fuels,fuel_appearances,total_appearances,unique_fuel_groups
0,1995,383,"coal, oil",1,26,2


In [518]:
nona = flagged_tots_df.copy()
nona['primary_fuel'] = nona.primary_fuel.fillna('unknown')
outliers = show_year_outliers(nona)

In [558]:
# Note that some pudl ids appear twice!
outliers2 = outliers.query("unique_fuel_groups==2")
outliers2.sort_values('total_appearances').reset_index()[10:15]

Unnamed: 0,index,report_year,plant_id_pudl,unique_fuels,fuel_appearances,total_appearances,unique_fuel_groups
10,46,2008,212,oil,1,15,2
11,7,1994,1064,gas,1,19,2
12,28,1999,475,coal,1,21,2
13,20,1997,345,oil,1,22,2
14,3,1994,277,"gas, oil",1,24,2


In [560]:
#flagged_tots_df.tech_desc_flag.unique()

In [562]:
# How many of these outliers aren't covered by EIA tech descriptions already?
outliers_list = list(outliers2['plant_id_pudl'].unique())
is_outlier = flagged_tots_df['plant_id_pudl'].isin(outliers_list)
no_eia_tech = ~flagged_tots_df['tech_desc'].isin(['direct from eia860', 'backfill from other year'])
flagged_tots_df[is_outlier & no_eia_tech][test_view]

Unnamed: 0,report_year,utility_name_ferc1,plant_name_ferc1,plant_id_pudl,plant_id_ferc1,primary_fuel,plant_type,tech_desc,tech_desc_flag,record_id,capacity_mw
266,1994,"Entergy New Orleans, Inc.",a.b. paterson,1064,14,gas,steam,gas_steam_turbine,FPT flipped pockets of fuel outliers in ferc1 ...,f1_steam_1994_12_114_0_2,133.00
267,1995,"Entergy New Orleans, Inc.",*a.b. paterson,1064,14,gas,steam,gas_steam_turbine,FPT flipped pockets of fuel outliers in ferc1 ...,f1_steam_1995_12_114_0_2,133.00
268,1996,"Entergy New Orleans, Inc.",*a.b. paterson,1064,14,gas,steam,gas_steam_turbine,FPT flipped pockets of fuel outliers in ferc1 ...,f1_steam_1996_12_114_0_2,133.00
269,1997,"Entergy New Orleans, Inc.",*a.b. paterson,1064,14,gas,steam,gas_steam_turbine,FPT flipped pockets of fuel outliers in ferc1 ...,f1_steam_1997_12_114_0_2,133.00
270,1998,"Entergy New Orleans, Inc.",a. b. paterson 3 & 4,1064,14,gas,steam,gas_steam_turbine,FPT flipped pockets of fuel outliers in ferc1 ...,f1_steam_1998_12_114_0_2,133.00
...,...,...,...,...,...,...,...,...,...,...,...
28432,2019,Tucson Electric Power Company,four corners,197,6677,coal,steam,coal,direct from eia860,f1_steam_2019_12_176_0_3,114.53
28464,2019,"Entergy Louisiana, LLC",waterford 3,622,6712,nuclear,nuclear,nuclear,FPT fuel in technology name,f1_steam_2019_12_454_0_3,225.00
28475,2019,"Entergy Louisiana, LLC",waterford 1 & 2,622,6723,gas,steam,gas_steam_turbine,FPT primary fuel by mmbtu,f1_steam_2019_12_454_2_5,577.00
28492,2019,Arizona Public Service Company,four corners 4,197,6741,coal,steam,coal,direct from eia860,f1_steam_2019_12_7_0_3,515.40


In [566]:
# For most of these, we should be able to do a pudl_id-wide subsitute
# For plants like 11650 - eastlake and 1229 - salem (where there is more than one fuel type per plant per year),
# there will be a need to vet not just by pudl id by also capacity to match fuel...

test2 = flagged_tots_df.query("plant_id_pudl==11650").sort_values('report_year')[test_view + ['tech_desc']].head(10)

In [567]:
test2

Unnamed: 0,report_year,utility_name_ferc1,plant_name_ferc1,plant_id_pudl,plant_id_ferc1,primary_fuel,plant_type,tech_desc,tech_desc_flag,record_id,capacity_mw,tech_desc.1
5577,1994,"Cleveland Electric Illuminating Company, The",eastlake,11650,383,coal,steam,coal,FPT primary fuel by mmbtu,f1_steam_1994_12_30_1_3,1045.0,coal
24303,1994,"Cleveland Electric Illuminating Company, The",eastlake gas,11650,2241,,combustion_turbine,,,f1_steam_1994_12_30_2_4,32.0,
5578,1995,"Cleveland Electric Illuminating Company, The",eastlake,11650,383,coal,steam,coal,FPT primary fuel by mmbtu,f1_steam_1995_12_30_2_2,1045.0,coal
18385,1995,"Cleveland Electric Illuminating Company, The",east lake,11650,1302,coal,steam,coal,FPT primary fuel by mmbtu,f1_steam_1995_12_30_1_3,1044.8,coal
24546,1995,"Cleveland Electric Illuminating Company, The",eastlake gas,11650,2495,oil,combustion_turbine,petroleum_liquids,FPT primary fuel by mmbtu,f1_steam_1995_12_30_2_3,32.0,petroleum_liquids
24549,1995,"Cleveland Electric Illuminating Company, The",eastlake gas,11650,2498,oil,combustion_turbine,petroleum_liquids,FPT primary fuel by mmbtu,f1_steam_1995_12_30_4_2,32.0,petroleum_liquids
5579,1996,"Cleveland Electric Illuminating Company, The",eastlake,11650,383,coal,steam,coal,FPT primary fuel by mmbtu,f1_steam_1996_12_30_1_3,1044.8,coal
24758,1996,"Cleveland Electric Illuminating Company, The",eastlake gas,11650,2729,oil,combustion_turbine,petroleum_liquids,FPT primary fuel by mmbtu,f1_steam_1996_12_30_2_3,32.0,petroleum_liquids
5580,1997,"Cleveland Electric Illuminating Company, The",eastlake,11650,383,coal,steam,coal,FPT primary fuel by mmbtu,f1_steam_1997_12_30_1_5,1044.8,coal
24960,1997,"Cleveland Electric Illuminating Company, The",eastlake gas,11650,2946,oil,combustion_turbine,petroleum_liquids,FPT primary fuel by mmbtu,f1_steam_1997_12_30_2_3,32.0,petroleum_liquids


## **3. Aggregate to Utility-Tech Level**

In [139]:
def agg_n_flag(flag_df, agg_col):
    """Aggregate specified column and create flag column based on total rows.
    
    """
    # For each report_year, utility_id_ferc1, plant_id_pudl, tech_desc group...
    
    # If all non-ttl rows have values and there are more than 0 just add 'em up
    if (flag_df.loc[flag_df['total_type'].isna()][agg_col].notna().all()) & (len(flag_df.loc[flag_df['total_type'].isna()]) > 0): 
        flag = 'actual values provided'#None
        agg_value = flag_df.loc[flag_df['total_type'].isna()][agg_col].sum()
        return [agg_value, flag]
    
    # If there aren't any total rows, just take the sum cuz that's all you can do
    elif flag_df['total_type'].isna().all():
        flag = 'no total rows' #None
        agg_value = flag_df[agg_col].sum()
        return [agg_value, flag]
    
    # Now there ARE total rows and there ARE value rows with NA:
    # If the total rows are all NA for the agg col, just take the sum cuz that's all you can do
    elif flag_df[flag_df['total_type'].notna()][agg_col].isna().all():
        flag = 'totals are NA'
        agg_value = flag_df[agg_col].sum()
        return [agg_value, flag]
    
    # If there are only total rows there's probably an issue with plant/fuel type mapping
    elif flag_df['total_type'].notna().all():
        flag = 'all rows only totals...'
        agg_value = flag_df[agg_col].sum() # FIX 
        return [agg_value, flag]
    
    # If non-total value cols are all NA (and by default there are total rows with values):....FIX
    elif flag_df[flag_df['total_type'].isna()][agg_col].isna().all():
        flag = flag_df[flag_df[agg_col].notna()]['total_type'].unique()[0] # this might be bad...
        agg_value = flag_df[flag_df['total_type'].notna()][agg_col].sum()
        return [agg_value, flag]

    # If some non-total value cols are NA but not all (and by default there are total rows with values):
    else:
        flag = 'some values are NA but not all'
        agg_value = flag_df[agg_col].sum() # FIX
        return [agg_value, flag]

In [140]:
def run_the_thing(df, agg_col):
    print(f'starting aggregation for {agg_col}')
    plant_group = df.groupby(['report_year', 'utility_id_ferc1', 'plant_id_pudl', 'tech_desc'])
    col_agg_series = plant_group.apply(lambda x: agg_n_flag(x, agg_col))
    col_agg_df = pd.DataFrame(col_agg_series).reset_index()
    col_agg_df[[agg_col, f'{agg_col}_flag']] = pd.DataFrame(col_agg_df[0].tolist(), index=col_agg_df.index)
    col_agg_df = col_agg_df.drop(columns=[0])
    
    util_agg_df = (
        col_agg_df.groupby(['report_year', 'utility_id_ferc1', 'tech_desc'])
        .agg({agg_col: 'sum', f'{agg_col}_flag': lambda x: ', '.join(list(x.dropna().unique()))})
    )
    return util_agg_df

In [142]:
def run_the_whole_thing(df):
    agg_df = pd.DataFrame(columns=['report_year', 'utility_id_ferc1', 'tech_desc'])
    for col in value_cols:
        one_col_df = run_the_thing(df, col)
        #df_list.append(one_col_df)
        agg_df = pd.merge(agg_df, one_col_df, on=['report_year', 'utility_id_ferc1', 'tech_desc'], how='outer')
    return agg_df

In [151]:
whole_enchilada = run_the_whole_thing(flagged_tots_df)

starting aggregation for net_generation_mwh
starting aggregation for avg_num_employees
starting aggregation for capex_land
starting aggregation for capex_equipment
starting aggregation for capex_structures
starting aggregation for capex_total
starting aggregation for asset_retirement_cost
starting aggregation for opex_operations
starting aggregation for opex_fuel
starting aggregation for opex_coolants
starting aggregation for opex_steam
starting aggregation for opex_steam_other
starting aggregation for opex_transfer
starting aggregation for opex_electric
starting aggregation for opex_misc_power
starting aggregation for opex_rents
starting aggregation for opex_allowances
starting aggregation for opex_engineering
starting aggregation for opex_structures
starting aggregation for opex_boiler
starting aggregation for opex_plants
starting aggregation for opex_misc_steam
starting aggregation for opex_production_total
starting aggregation for capacity_mw


In [146]:
whole_enchilada.to_excel('/Users/aesharpe/Desktop/full_tech_aggregation.xlsx')

## Test Small Generators Table

In [None]:
from fuzzywuzzy import process, fuzz

In [None]:
plant_name_eia = []
similarity = []
for i in small_plants_one_plant.plant_name_ferc1:
        ratio = process.extract( i, eia_one_plant.plant_name_eia, limit=1)
        plant_name_eia.append(ratio[0][0])
        similarity.append(ratio[0][1])
small_plants_one_plant['plant_name_eia'] = pd.Series(plant_name_eia)
small_plants_one_plant['similarity'] = pd.Series(similarity)