In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Standard libraries
import logging
import sys
import os
import pathlib

# 3rd party libraries
import geopandas as gpd
import dask.dataframe as dd
from dask.distributed import Client
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy as sa
import pickle

# Local libraries
import pudl
from pudl.analysis.fill_ferc1_fuel_gaps import *

In [3]:
# Enable viewing of logging outputs
logger=logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
logger.handlers = [handler]

In [4]:
# Display settings
sns.set()
%matplotlib inline
mpl.rcParams['figure.dpi'] = 75
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [5]:
# Establish connection to pudl database
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings['pudl_db'])
pudl_out = pudl.output.pudltabl.PudlTabl(
    pudl_engine=pudl_engine,
    freq='AS'
)

In [6]:
value_cols_no_cap = [
    'net_generation_mwh','avg_num_employees',
    'capex_land', 'capex_equipment', 'capex_structures', 'capex_total', 'asset_retirement_cost',
    'opex_operations', 'opex_fuel', 'opex_coolants', 'opex_steam', 'opex_steam_other', 'opex_transfer', 
    'opex_electric', 'opex_misc_power', 'opex_rents', 'opex_allowances', 'opex_engineering', 'opex_structures', 
    'opex_boiler', 'opex_plants', 'opex_misc_steam', 'opex_production_total'
]

value_cols = value_cols_no_cap + ['capacity_mw']

test_view = ['report_year', 'utility_name_ferc1', 'plant_name_ferc1', 'plant_id_pudl', 
             'plant_id_ferc1', 'primary_fuel', 'plant_type', 'record_id', 'capacity_mw']

total_view = test_view + ['total_type']

ferc_merge_cols = ['report_year', 'utility_id_ferc1', 'plant_name_ferc1']
eia_merge_cols = ['report_date', 'plant_id_pudl', 'generator_id']

In [7]:
with open('/Users/aesharpe/Desktop/ferc1_transformed.pickle', 'rb') as handle:
    ferc1_transformed_dfs = pickle.load(handle)

with open('/Users/aesharpe/Desktop/steam_w_eia.pkl', 'rb') as handle:
    steam_w_eia = pickle.load(handle)

In [8]:
# Load the tables you'll need with some basic alterations
steam = ferc1_transformed_dfs['plants_steam_ferc1'].copy()
glue_dicts = pudl.glue.ferc1_eia.glue(ferc1=True) # For steam you'll have to mimic the glue process to get plant_id_pudl and add a column for primary fuel
steam = pd.merge(steam, glue_dicts['plants_ferc1'], on=['plant_name_ferc1', 'utility_id_ferc1'], how='left')
steam = pd.merge(steam, glue_dicts['utilities_ferc1'][['utility_id_ferc1', 'utility_name_ferc1']], on=['utility_id_ferc1'], how='left')
steam = steam.assign(primary_fuel=np.nan)

fbp = pudl_out.fbp_ferc1()
fbp_small = fbp[ferc_merge_cols + ['primary_fuel_by_mmbtu']] #, 'primary_fuel_by_cost']]
eia = pudl_out.gens_eia860()#.assign(report_year=lambda x: x.report_date.dt.year)

### Test Module Functionality

In [12]:
impute_tech_desc(steam, eia)

merging single-tech EIA technology_description with FERC
26220 / 29270
backfilling EIA technology_description by year if no new units installed
15830 / 29270
filling fuels with obvious names
15154 / 29270


Unnamed: 0,utility_id_ferc1,report_year,plant_name_ferc1,plant_type,construction_type,construction_year,installation_year,capacity_mw,peak_demand_mw,plant_hours_connected_while_generating,plant_capability_mw,not_water_limited_capacity_mw,water_limited_capacity_mw,avg_num_employees,capex_land,capex_structures,capex_equipment,capex_total,opex_operations,opex_fuel,opex_coolants,opex_steam,opex_steam_other,opex_transfer,opex_electric,opex_misc_power,opex_rents,opex_allowances,opex_engineering,opex_structures,opex_boiler,opex_plants,opex_misc_steam,opex_production_total,asset_retirement_cost,record_id,capex_per_mw,opex_per_mwh,net_generation_mwh,plant_id_ferc1,plant_id_pudl,utility_name_ferc1,primary_fuel,tech_desc,technology_description,dup,same_tech,tech_desc_flag,retired_unit,new_unit,backfill_by_year,backfill_by_eia_year,name_based
0,1,1994,rockport unit 1,steam,conventional,1984,1984,650.00,650.0,,,650.0,,,6395551.0,84467746.0,4.906841e+08,5.815474e+08,1032559.0,51694529.0,,442763.0,,,353599.0,1040610.0,7559.0,,427906.0,396788.0,3185935.0,631598.0,781181.0,59995027.0,,f1_steam_1994_12_1_0_1,894688.3,12.9,4668184.0,1108,530,AEP Generating Company,,Conventional Steam Coal,,,Conventional Steam Coal,backfill from other year,False,False,Conventional Steam Coal,,
1,1,1994,rockport unit 2,steam,conventional,1989,1989,650.00,650.0,,,650.0,,,74411.0,4249136.0,3.933937e+07,4.366292e+07,1026248.0,48990225.0,,446454.0,,,384283.0,1028788.0,67311927.0,,427747.0,230300.0,3374827.0,518870.0,255391.0,123995060.0,,f1_steam_1994_12_1_0_2,67173.7,27.9,4451312.0,1109,530,AEP Generating Company,,Conventional Steam Coal,,,Conventional Steam Coal,backfill from other year,False,False,Conventional Steam Coal,,
2,1,1994,rockport,steam,conventional,1984,1989,1300.00,1300.0,,,1300.0,,,6469962.0,88716882.0,5.300235e+08,6.252103e+08,2058807.0,100684754.0,,889217.0,,,737882.0,2069398.0,67319486.0,,855653.0,627088.0,6560762.0,1150468.0,1036572.0,183990087.0,,f1_steam_1994_12_1_0_3,480931.0,20.2,9119496.0,2211,530,AEP Generating Company,,Conventional Steam Coal,,,Conventional Steam Coal,backfill from other year,False,False,Conventional Steam Coal,,
3,1,1994,rockport total plant,steam,conventional,1984,1989,2600.00,2600.0,,,2600.0,,462.0,12969249.0,175466216.0,1.049180e+09,1.237616e+09,4117640.0,196297854.0,,1778431.0,,,1475766.0,4138807.0,134884608.0,,1711307.0,1254169.0,13121517.0,2300937.0,2073142.0,363154178.0,,f1_steam_1994_12_1_0_4,476006.1,20.4,17793158.0,1142,530,AEP Generating Company,,Conventional Steam Coal,,,Conventional Steam Coal,backfill from other year,False,False,Conventional Steam Coal,,
4,2,1994,gorgas,steam,conventional,1929,1972,1417.00,1294.0,8760.0,,1302.0,,438.0,312098.0,63796151.0,3.273578e+08,3.914661e+08,3065839.0,118304925.0,,2692720.0,,,1391099.0,7506206.0,,,2276025.0,1451092.0,17760784.0,5957567.0,645822.0,161052079.0,,f1_steam_1994_12_2_0_1,276264.0,21.7,7412375.0,1,230,ALABAMA POWER COMPANY,,Conventional Steam Coal,,,Conventional Steam Coal,backfill from other year,False,False,Conventional Steam Coal,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29265,51,2019,iatan (1&2),steam,semioutdoor,1980,2010,210.47,189.0,,190.0,190.0,190.0,29.0,128856.0,43381409.0,3.582827e+08,4.017929e+08,506436.0,15641733.0,,1342873.0,,,382347.0,722211.0,25589.0,,324856.0,613897.0,1944758.0,405914.0,30142.0,21940756.0,,f1_steam_2019_12_51_0_5,1909027.1,21.9,1001478.0,206,295,The Empire District Electric Company,,Conventional Steam Coal,Conventional Steam Coal,False,Conventional Steam Coal,direct from eia860,False,True,Conventional Steam Coal,,
29266,51,2019,state line,combustion_turbine,conventional,1995,1995,123.30,164.0,,96.0,,,,11897.0,1111584.0,4.165686e+07,4.278034e+07,9638.0,3466353.0,,54891.0,,,,16311.0,,,21714.0,21737.0,,68322.0,421.0,3659387.0,,f1_steam_2019_12_51_1_1,346961.4,29.1,125540.0,6729,560,The Empire District Electric Company,,,,,,,False,False,,,
29267,51,2019,sl combined cycle,combined_cycle,unknown,2001,2001,340.47,325.0,,295.0,295.0,295.0,29.0,929529.0,12270681.0,1.554461e+08,1.686463e+08,184340.0,27709275.0,,1704245.0,,,,642868.0,,,257082.0,83818.0,,5434891.0,343371.0,36359890.0,,f1_steam_2019_12_51_1_2,495333.8,24.1,1505701.0,1625,560,The Empire District Electric Company,,,,,,,True,True,,,
29268,51,2019,slcc tolling,combined_cycle,unknown,,,0.00,,,,,,,,,,,,,,,,,,,,,,,,,,,,f1_steam_2019_12_51_1_3,0.0,0.0,,1626,560,The Empire District Electric Company,,,,,,,True,True,,,


### Merge with EIA technology description and backfill

In [14]:
# Get eia plants with only one technology description (besides NA)
eia_one_tech = (
    eia.groupby('plant_id_pudl').filter(lambda x: len(x.technology_description.dropna().unique())==1)
    [['report_year', 'plant_id_pudl', 'technology_description']].drop_duplicates()
    .assign(dup=lambda x: x.duplicated(subset=['report_year', 'plant_id_pudl'], keep=False))
)
# Drop None cases when there is a None and a Tech Desc. for the same plant and year (so there is one per plant-year)
eia_one_tech = eia_one_tech.drop(eia_one_tech[(eia_one_tech['dup']==True) & (eia_one_tech['technology_description'].isna())].index)
eia_one_tech_list = list(eia_one_tech.plant_id_pudl.unique())

# Get the technology description associated with the plant regardless of year...
plant_id_tech_type = (
    eia_one_tech.groupby(['plant_id_pudl']).agg({'technology_description':lambda x: x.dropna().unique().item()})
    .rename(columns={'technology_description': 'same_tech'})
)

In [15]:
# Number of total plants in the steam table
print(len(steam.plant_id_pudl.unique()))
# VS number of plants in the steam table that coorespond to EIA plants with a single technology description
print(len([x for x in eia_one_tech_list if x in steam.plant_id_pudl.unique()]))

1482
706


In [16]:
# Check the construction_year and installation_year fields
def check_for_new_units(df, year_col, bool_col):
    init_years = df[df['report_year']==df.report_year.min()][year_col].unique()
    
    df[f'{bool_col}'] = ~df[f'{year_col}'].isin(init_years)
    
    return df

In [17]:
# Make columns to see if there have been any unit additions or retirements
steam2 = (
    steam.groupby(['plant_id_pudl']).apply(lambda x: check_for_new_units(x, 'construction_year', 'retired_unit'))
    .groupby(['plant_id_pudl']).apply(lambda x: check_for_new_units(x, 'installation_year', 'new_unit'))
)

In [13]:
# Merge the eia single technology descriptions with ferc1
merge_df = (
    pd.merge(steam2, eia_one_tech[['report_year', 'plant_id_pudl', 'technology_description']].drop_duplicates(), 
             on=['report_year', 'plant_id_pudl'], how='left')
)
merge_df2 = pd.merge(merge_df, plant_id_tech_type, on=['plant_id_pudl'], how='left')

ferc_eia_tech_desc = merge_df2[['report_year', 'plant_id_pudl', 'utility_id_ferc1', 'plant_name_ferc1', 'plant_type', 
                                'technology_description', 'same_tech', 'construction_year', 'retired_unit', 
                                'installation_year', 'new_unit', 'primary_fuel']].copy()

In [14]:
flag1 = 'direct from eia860'
flag2 = 'backfill from other year'
flag3 = 'backfill from eia year'
flag4 = 'primary fuel by mmbtu no dups'

In [15]:
# Add technology description to primary fuel
f1 = add_new_fuel_and_flag(ferc_eia_tech_desc, 'technology_description', flag1)
show_unfilled_rows(f1, 'primary_fuel')

26220 / 29270


In [None]:
# Add obvious names
f2 = fill_obvious_names_fuel(f1)

In [17]:
# Backfill based on same year (or matching year from EIA)
# Things to deal with still:
# - NA years
# - close but not quite years
# 

def backfill_if_matching_year(df):
    """Backfill years where there a technology description from EIA based on matching latest install years."""
    # If there is a technology type taken directly from eia in the ferc data:
    if df.technology_description.notna().any():
        install_years = list(df[df['technology_description'].notna()]['installation_year'].unique())
        assert len(df['technology_description'].dropna().unique()) == 1, 'backfilling only works when there is one tech description per plant...'
        tech_type = df['technology_description'].dropna().unique().item() #only works b/c there was only one per plant!!!
        df.loc[df['installation_year'].isin(install_years), 'backfill_by_year'] = tech_type
        return df
    # Else if there is a technology type but it is only in EIA data (other years not present in FERC):
    elif df.same_tech.notna().any():
        # Get the eia rows for this plant that have a tech descrip
        plant_eia = eia[(eia['plant_id_pudl']==df.plant_id_pudl.unique().item()) & eia['technology_description'].notna()]
        # Make sure there is only one technology description in the given plant group
        assert len(plant_eia['technology_description'].unique()) == 1, 'backfilling only works when there is one tech description per plant...'
        # Convert the op date to a year
        plant_eia = plant_eia.assign(operating_date=pd.to_datetime(plant_eia.operating_date).dt.year)
        install_years = list(plant_eia['operating_date'].unique())
        tech_type = plant_eia['technology_description'].unique().item()
        df.loc[df['installation_year'].isin(install_years), 'backfill_by_eia_year'] = tech_type
        return df
    else:
        return df

In [18]:
f3 = (
    f2.groupby(['plant_id_pudl']).apply(lambda x: backfill_if_matching_year(x))
    .pipe(add_new_fuel_and_flag, 'backfill_by_year', flag2)
    .pipe(add_new_fuel_and_flag, 'backfill_by_eia_year', flag3)
)

show_unfilled_rows(f3, 'primary_fuel')

15154 / 29270


In [162]:
# List of plants where there is a technology description but not for certain ferc years
# AND there has been a shift in retired units or new units. This is helpful in testing that
# the changes from backfill_if_missing_year() worked.

ff = f3[(test['same_tech'].notna()) & (f2['technology_description'].isna())]
ff[(ff['retired_unit']) | (ff['new_unit'])].plant_id_pudl.unique()

array([1148, 113, 13, 133, 2297, 1656, 2532, 190, 2107, 394, 1620, 351,
       1268, 1225, 117, 147, 136, 1187, 122, 556, 1032, 2740, 457, 330,
       440, 638, 283, 1525, 128, 546, 550, 627, 1177, 778, 464, 4741,
       8327, 1381, 1128, 1067, 343, 1069, 1157, 471, 1230, 215, 424, 1135,
       205, 439, 545, 275, 2960, 224, 267, 1108, 420, 182, 334, 11, 583,
       7616, 2561, 8547, 200, 15, 548, 1149, 221, 527, 199, 235, 1083,
       380, 18, 602, 1144, 140, 63, 109, 196, 317, 1169, 4, 26, 374, 1180,
       180, 1141, 1280, 258, 312, 354, 295, 624, 22, 585, 125, 589, 99,
       634, 654, 183, 629, 56, 1297, 385, 239, 566, 1085, 241, 649],
      dtype=object)

In [205]:
# Problems:
# - 2297 --> gap between ferc years and eia years so can't use the construction type col to see whether there was a new unit added or not...
#            Can use this as a backfill IF the operating_date from eia matches the closest one from FERC
# - 1656 --> False is the result of a NA in the construction_year field which should probably be ignored?

#ferc_eia_tech_desc[ferc_eia_tech_desc['plant_id_pudl']==190]

In [256]:
f3['dup'] = f3.duplicated(subset=ferc_merge_cols, keep=False)

In [338]:
def primary_fuel_by_mmbtu(df, fbp_small):
    """Blah."""
    print('filling in primary fuel by mmbtu')

    out_df = (
        pd.merge(df, fbp_small, on=ferc_merge_cols, how='left')
        .assign(primary_fuel_by_mmbtu=lambda x: (
            x.primary_fuel_by_mmbtu.replace({'': np.nan, 'unknown': np.nan})))
    )
    # Only add fuel to rows that have unique year-plant_name-utility combos
#     no_dups = (
#         out_df[~out_df['dup']].copy()
#         .pipe(add_new_fuel_and_flag, 'primary_fuel_by_mmbtu', flag4)
#     )
    
#     out_df.update(no_dups)

#     show_unfilled_rows(out_df, 'primary_fuel')
    
    return out_df

In [339]:
f4 = primary_fuel_by_mmbtu(f3, fbp_small)
f4.loc[(f4['primary_fuel'].notna()) & (f4['plant_type'].notna()), 'tech_type'] = pfm['primary_fuel'] + '_' + pfm['plant_type']

filling in primary fuel by mmbtu


In [332]:
f4.technology_description.unique()

array([nan, None, 'Conventional Steam Coal',
       'Natural Gas Fired Combustion Turbine', 'Onshore Wind Turbine',
       'Wood/Wood Waste Biomass', 'Natural Gas Fired Combined Cycle',
       'Natural Gas Steam Turbine', 'Nuclear', 'Petroleum Liquids',
       'Municipal Solid Waste', 'Natural Gas Internal Combustion Engine',
       'Solar Photovoltaic', 'Conventional Hydroelectric', 'Landfill Gas',
       'Geothermal'], dtype=object)

In [None]:
fuel_map = {
    
}

In [341]:
len(f4[f4['primary_fuel'].notna()].plant_id_pudl.unique())

730

In [304]:
pfm = f4[f4['primary_fuel_flag']==flag4].copy()

In [306]:
pfm['tech_type'] = pfm['primary_fuel'] + '_' + pfm['plant_type']

In [337]:
pfm['tech_type'].unique()
#pfm[pfm['tech_type']=='waste_steam']
#steam[steam['plant_id_pudl']==12299]

array(['coal_steam', 'gas_steam', 'coal_unknown',
       'gas_combustion_turbine', 'gas_combined_cycle',
       'oil_combustion_turbine', 'oil_steam', 'oil_internal_combustion',
       'oil_unknown', 'gas_unknown', 'gas_internal_combustion',
       'coal_internal_combustion', 'waste_steam', 'waste_unknown',
       'oil_combined_cycle', 'coal_combined_cycle'], dtype=object)

In [330]:
eia2 = eia.dropna(subset=['plant_name_eia'])
eia[eia['plant_id_pudl']==108]
fuel[fuel['plant_name_ferc1'].str.contains('ickasaw')]
steam[steam['plant_name_ferc1'].str.contains('esbitt')]

Unnamed: 0,utility_id_ferc1,report_year,plant_name_ferc1,plant_type,construction_type,construction_year,installation_year,capacity_mw,peak_demand_mw,plant_hours_connected_while_generating,plant_capability_mw,not_water_limited_capacity_mw,water_limited_capacity_mw,avg_num_employees,capex_land,capex_structures,capex_equipment,capex_total,opex_operations,opex_fuel,opex_coolants,opex_steam,opex_steam_other,opex_transfer,opex_electric,opex_misc_power,opex_rents,opex_allowances,opex_engineering,opex_structures,opex_boiler,opex_plants,opex_misc_steam,opex_production_total,asset_retirement_cost,record_id,capex_per_mw,opex_per_mwh,net_generation_mwh,plant_id_ferc1,plant_id_pudl,utility_name_ferc1,primary_fuel
20010,22,2010,nesbitt unit 1,steam,outdoor,1975,1975,445.5,419.0,7390.0,,422.0,,116.0,1810392.0,12221268.0,62669019.0,76700679.0,79812.0,56107431.0,,296778.0,,,298638.0,308731.0,,-622.0,315549.0,463012.0,987616.0,291272.0,262827.0,59411044.0,,f1_steam_2010_12_22_0_2,172167.6,73.4,809148.0,1803,413,Cleco Power LLC,
20996,22,2011,nesbitt unit 1,steam,outdoor,1975,1975,445.5,389.0,6590.0,,422.0,,125.0,1810392.0,14121823.0,63883179.0,79815394.0,66536.0,40730854.0,,319809.0,,,298650.0,350214.0,,-128.0,264271.0,99556.0,559016.0,315596.0,271004.0,43275378.0,,f1_steam_2011_12_22_0_2,179159.1,61.3,705984.0,1803,413,Cleco Power LLC,
22259,22,2012,nesbitt unit 1,steam,outdoor,1975,1975,445.5,417.0,5014.0,,422.0,,164.0,1810392.0,14084102.0,63967922.0,79862416.0,42966.0,21756926.0,,404378.0,,,330555.0,403712.0,,1485.0,232794.0,115899.0,1185429.0,640312.0,359676.0,25474132.0,,f1_steam_2012_12_22_0_2,179264.7,40.9,623504.0,1803,413,Cleco Power LLC,
23367,22,2013,nesbitt unit 1,steam,outdoor,1975,1975,445.5,427.0,5425.0,,421.0,,165.0,1810392.0,14128466.0,66057404.0,81996262.0,51718.0,28632475.0,,335261.0,,,326277.0,263334.0,,203.0,263683.0,448658.0,1259531.0,3346071.0,547589.0,35474800.0,,f1_steam_2013_12_22_0_2,184054.5,59.0,601032.0,1803,413,Cleco Power LLC,
24223,22,2014,nesbitt unit 1,steam,outdoor,1975,1975,445.5,427.0,1171.0,,427.0,,164.0,1810392.0,14248420.0,66800124.0,82858936.0,67493.0,9698412.0,,341808.0,,,327778.0,194618.0,,-130.0,239066.0,237076.0,995596.0,296884.0,184361.0,12582962.0,,f1_steam_2014_12_22_0_2,185990.9,75.4,166791.0,1803,413,Cleco Power LLC,
24627,22,2015,nesbitt unit 1,steam,outdoor,1975,1975,445.5,420.0,1068.0,,419.0,,162.0,1810392.0,14472075.0,69837872.0,86120339.0,73568.0,5797476.0,,464936.0,,,373329.0,152554.0,,102.0,248220.0,178491.0,1285238.0,375532.0,153581.0,9103027.0,,f1_steam_2015_12_22_0_2,193311.6,52.2,174290.0,1803,413,Cleco Power LLC,
25657,22,2016,nesbitt unit 1,steam,outdoor,1975,1975,445.5,420.0,3004.0,,421.0,,162.0,1810392.0,15127260.0,70841512.0,87779164.0,94053.0,16406374.0,,385888.0,,,364543.0,373300.0,,-4.0,304066.0,257127.0,1370567.0,626980.0,227647.0,20410541.0,,f1_steam_2016_12_22_0_2,197035.2,39.4,518662.0,1803,413,Cleco Power LLC,
26475,22,2017,nesbitt unit 1,steam,outdoor,1975,1975,445.5,424.0,2300.0,,422.0,,163.0,1810392.0,16047921.0,73000191.0,90858504.0,108470.0,13474739.0,,394589.0,,,451747.0,155949.0,,,304033.0,177740.0,629811.0,209553.0,259975.0,16166606.0,,f1_steam_2017_12_22_0_2,203947.3,46.1,350577.0,1803,413,Cleco Power LLC,
27523,22,2018,nesbitt unit 1,steam,outdoor,1975,1975,445.5,417.0,1456.0,,416.0,,160.0,1810392.0,16366967.0,74657529.0,92834888.0,433282.0,9231472.0,,393096.0,,,386543.0,404221.0,,,138673.0,133655.0,978459.0,224561.0,219460.0,12543422.0,,f1_steam_2018_12_22_0_2,208383.6,52.7,237973.0,1803,413,Cleco Power LLC,
29244,22,2019,nesbitt unit 1,steam,outdoor,1975,1975,445.5,424.0,3510.0,,424.0,,161.0,2279491.0,17114841.0,76997172.0,96420985.0,155292.0,15426822.0,,410606.0,,,437831.0,364183.0,,-3.0,323624.0,103239.0,1152622.0,279446.0,373315.0,19026977.0,29481.0,f1_steam_2019_12_22_0_2,216433.2,35.8,531456.0,1803,413,Cleco Power LLC,


In [294]:
# Make the plant type and primary fuel columns more compatible for comparison
test = (
    f4.copy()
    .assign(
        plant_type = lambda x: x.plant_type.replace('_', ' ', regex=True),
        primary_fuel = lambda x: x.primary_fuel.str.lower(),
        same_tech = lambda x: x.same_tech.str.lower())
)

In [295]:
test2 = test.dropna(subset=['plant_type', 'same_tech']).copy()
test2['similar'] = test2.apply(lambda x: x.plant_type in x.same_tech, axis=1)

In [302]:
len(test2[test2['similar']==False])

3242

In [303]:
test2[test2['similar']==False]

Unnamed: 0,report_year,plant_id_pudl,utility_id_ferc1,plant_name_ferc1,plant_type,technology_description,same_tech,construction_year,retired_unit,installation_year,new_unit,primary_fuel,primary_fuel_flag,name_based,backfill_by_year,backfill_by_eia_year,dup,primary_fuel_by_mmbtu,similar
37,1994,162,7,douglas,combustion turbine,,petroleum liquids,1972,False,1972,False,petroleum liquids,backfill from other year,,Petroleum Liquids,,False,oil,False
52,1994,2078,9,missouri,combustion turbine,,petroleum liquids,1969,False,1969,False,petroleum liquids,backfill from eia year,,,Petroleum Liquids,False,oil,False
53,1994,2077,9,middle sta,combustion turbine,,petroleum liquids,1970,False,1971,False,petroleum liquids,backfill from eia year,,,Petroleum Liquids,False,oil,False
54,1994,2076,9,cedar stat,combustion turbine,,petroleum liquids,1972,False,1972,False,petroleum liquids,backfill from eia year,,,Petroleum Liquids,False,oil,False
60,1994,1835,10,westport,steam,,natural gas fired combustion turbine,1906,False,1950,False,,,,,,True,gas,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29219,2019,14368,45,clemson chp,unknown,Natural Gas Fired Combustion Turbine,natural gas fired combustion turbine,2019,False,2019,False,natural gas fired combustion turbine,fuel in technology name,,Natural Gas Fired Combustion Turbine,,False,gas,False
29224,2019,5152,531,deer creek station,unknown,Natural Gas Fired Combined Cycle,natural gas fired combined cycle,2012,False,2012,False,natural gas fired combined cycle,fuel in technology name,,Natural Gas Fired Combined Cycle,,False,gas,False
29229,2019,2527,531,spirit mound st,unknown,Petroleum Liquids,petroleum liquids,1978,False,1978,False,petroleum liquids,fuel in technology name,,Petroleum Liquids,,False,oil,False
29247,2019,199,22,franklin,internal combustion,Natural Gas Fired Combustion Turbine,natural gas fired combustion turbine,1973,False,1973,True,natural gas fired combustion turbine,fuel in technology name,,Natural Gas Fired Combustion Turbine,,False,,False
