# Calculation of per facility ICIS emissions given related IHS processes

Assumptions -> Conversion factor used for ICIS facility is mean of conversion factors for all corresponding IHS processes
-> For Ethylene, feedstock conversion factor is mean of conversion factors linked to feedstock

In [1]:
# Import packages
import re
import numpy as np
import pandas as pd
from tqdm import tqdm

# File paths
input_path = '../data/'
output_path = '../data/combined/'

production_file = input_path+'processed/icisFacilityProduction_dedoubled.csv'
conversion_factor_file = input_path+'combined/processConversionFactors_allgases_allalloc_dedoubled.csv'
matching_file = input_path+'extra_inputs/all_icis_to_ihs_matches.csv'
matching_on = ['PRODUCT', 'ROUTE', 'TECHNOLOGY', 'LICENSOR']

## Import data

In [2]:
# Import data
icis_ihs_matches = pd.read_csv(matching_file, index_col=0)
facility_production = pd.read_csv(production_file, index_col=0)

conv_factors = pd.read_csv(conversion_factor_file)
conv_factors = conv_factors.dropna(subset=['ihs_match'])

In [3]:
## Weight ammonia conversion factor
sr_percentage = 0.8

ammonia_processes = pd.read_csv('C:/Users\lukec\PycharmProjects\petrochemical-data\data\extra_inputs/ammonia_processes_used.csv', index_col=0)
grouped_amm = conv_factors[conv_factors['Product']=='AMMONIA'].merge(ammonia_processes, on='ihs_match').groupby('Type').mean()
amm_weighted = (1-sr_percentage)*grouped_amm.iloc[0, :]+sr_percentage*grouped_amm.iloc[1, :]

amm_df = pd.DataFrame(amm_weighted).transpose().drop(columns=['Total']).astype(float)
amm_df['Product'], amm_df['ihs_match'] = 'AMMONIA', 'WEIGHTED AMMONIA'
amm_df.index = [3000]
conv_factors = pd.concat((conv_factors[conv_factors['Product']!='AMMONIA'], amm_df))

TypeError: agg function failed [how->mean,dtype->object]

In [4]:
## Get Raw Materials columns
for gas in ['CO2e_20a', 'CO2e_100a', 'Carbon dioxide', 'Carbon monoxide', 'Chloroform', 'Dinitrogen monoxide', 'Ethane', 'Methane', 'Nitric oxide','Nitrogen fluoride', 'Perfluoropentane','Sulfur hexafluoride', 'Other']:
    for col_type in [', mass allocation factor', ', mass allocation sigma', ', energy allocation factor', ', energy allocation sigma', ', economic allocation factor', ', economic allocation sigma']:
        if 'energy' in col_type:
            columns = ['Feedstock '+gas+col_type, 'Organic chemicals '+gas+col_type, 'Primary chemicals '+gas+col_type]
        else:
            columns = ['Feedstock '+gas+col_type, 'Organic chemicals '+gas+col_type, 'Primary chemicals '+gas+col_type, 'Other intermediates '+gas+col_type]
        conv_factors['Raw Material '+gas+col_type] = conv_factors[columns].sum(axis=1)

In [5]:
## Filter out outlying possible processes
poss_processes = icis_ihs_matches.merge(conv_factors, left_on=['ihs_match'], right_on=['ihs_match'], how='left')

# define a function to exclude outliers
def exclude_outliers(group, col='ihs_cradle-to-out-gate CO2e_20a, mass allocation factor'):
    #print('l'+str(len(group)))
    if len(group) > 3:  # only exclude outliers if the group has more than 3 rows
        mean = np.mean(group[col])
        std = np.std(group[col])
        max_distance = 2.5*std  # maximum distance from the mean to be considered an outlier
        distances = np.abs(group[col] - mean)  # calculate distances of each value to the mean
        filtered_group = group[distances <= max_distance]  # keep only values within the maximum distance
        #print('f'+str(len(filtered_group)))
        if len(filtered_group) < 3:  # if less than 3 rows remain, take the 3 closest to the mean
            group['dist'] = np.abs(group[col] - mean)
            closest_rows = group.nsmallest(3, 'dist', keep='all')
            print(closest_rows)
            return closest_rows.drop(columns=['dist'])
        else:
            return filtered_group
    else:
        return group

cols = ['PRODUCT', 'ROUTE', 'TECHNOLOGY', 'LICENSOR']
keep_rows = poss_processes[cols+['ihs_match', 'ihs_cradle-to-out-gate CO2e_20a, mass allocation factor']].groupby(cols).apply(exclude_outliers)
filt_processes = poss_processes.iloc[list(keep_rows.index.get_level_values(4))].reset_index(drop=True)
icis_ihs_matches = filt_processes[['ihs_match']+cols]

In [9]:
poss_processes[['PRODUCT', 'ihs_match']].drop_duplicates().groupby('PRODUCT').count()

Unnamed: 0_level_0,ihs_match
PRODUCT,Unnamed: 1_level_1
"1,4-BUTANEDIOL",8
2-ETHYLHEXANOL,6
ABS,6
ACETALDEHYDE,1
ACETIC ACID,18
...,...
UPE RESINS,0
VACUUM,1
VCM,0
VINYL ACETATE M.,9


In [28]:
poss_processes[poss_processes['ihs_match']=='METHANOL FROM ENERKEM FEEDSTOCK (PLASTICS) CHEMICAL RECYCLING PROCESS']

Unnamed: 0,ihs_match,PRODUCT,ROUTE,TECHNOLOGY,LICENSOR,Product,ei_match,ei_CO2e_20a_cradle-to-gate,ei_CO2e_20a_cradle-to-gate_sigma,ei_CO2e_20a_conv_factor,...,"Raw Material Sulfur hexafluoride, energy allocation factor","Raw Material Sulfur hexafluoride, energy allocation sigma","Raw Material Sulfur hexafluoride, economic allocation factor","Raw Material Sulfur hexafluoride, economic allocation sigma","Raw Material Other, mass allocation factor","Raw Material Other, mass allocation sigma","Raw Material Other, energy allocation factor","Raw Material Other, energy allocation sigma","Raw Material Other, economic allocation factor","Raw Material Other, economic allocation sigma"
5871,METHANOL FROM ENERKEM FEEDSTOCK (PLASTICS) CHE...,METHANOL,COAL TAR,SYNTHESIS,LURGI,METHANOL,methanol,0.057083,0.008562,1.507538,...,0.0,0.0,0.0,0.0,2.580998e-11,4.016344e-12,0.0,0.0,0.0,0.0
5897,METHANOL FROM ENERKEM FEEDSTOCK (PLASTICS) CHE...,METHANOL,COAL TAR,SYNTHESIS,n.a.,METHANOL,methanol,0.057083,0.008562,1.507538,...,0.0,0.0,0.0,0.0,2.580998e-11,4.016344e-12,0.0,0.0,0.0,0.0
5933,METHANOL FROM ENERKEM FEEDSTOCK (PLASTICS) CHE...,METHANOL,HEAVY FUEL OIL,SYNTHESIS,LURGI,METHANOL,methanol,0.057083,0.008562,1.507538,...,0.0,0.0,0.0,0.0,2.580998e-11,4.016344e-12,0.0,0.0,0.0,0.0
5959,METHANOL FROM ENERKEM FEEDSTOCK (PLASTICS) CHE...,METHANOL,HEAVY FUEL OIL,n.a.,n.a.,METHANOL,methanol,0.057083,0.008562,1.507538,...,0.0,0.0,0.0,0.0,2.580998e-11,4.016344e-12,0.0,0.0,0.0,0.0
5985,METHANOL FROM ENERKEM FEEDSTOCK (PLASTICS) CHE...,METHANOL,HEAVY FUEL OIL / COAL,SYNTHESIS,LURGI,METHANOL,methanol,0.057083,0.008562,1.507538,...,0.0,0.0,0.0,0.0,2.580998e-11,4.016344e-12,0.0,0.0,0.0,0.0
6011,METHANOL FROM ENERKEM FEEDSTOCK (PLASTICS) CHE...,METHANOL,NAPHTHAS,SYNTHESIS,HALDOR TOPSOE,METHANOL,methanol,0.057083,0.008562,1.507538,...,0.0,0.0,0.0,0.0,2.580998e-11,4.016344e-12,0.0,0.0,0.0,0.0
6037,METHANOL FROM ENERKEM FEEDSTOCK (PLASTICS) CHE...,METHANOL,NAPHTHAS,SYNTHESIS,ICI,METHANOL,methanol,0.057083,0.008562,1.507538,...,0.0,0.0,0.0,0.0,2.580998e-11,4.016344e-12,0.0,0.0,0.0,0.0
6063,METHANOL FROM ENERKEM FEEDSTOCK (PLASTICS) CHE...,METHANOL,NAPHTHAS,SYNTHESIS,LURGI,METHANOL,methanol,0.057083,0.008562,1.507538,...,0.0,0.0,0.0,0.0,2.580998e-11,4.016344e-12,0.0,0.0,0.0,0.0
6089,METHANOL FROM ENERKEM FEEDSTOCK (PLASTICS) CHE...,METHANOL,NAPHTHAS,SYNTHESIS,n.a.,METHANOL,methanol,0.057083,0.008562,1.507538,...,0.0,0.0,0.0,0.0,2.580998e-11,4.016344e-12,0.0,0.0,0.0,0.0
6115,METHANOL FROM ENERKEM FEEDSTOCK (PLASTICS) CHE...,METHANOL,NAPHTHAS,n.a.,n.a.,METHANOL,methanol,0.057083,0.008562,1.507538,...,0.0,0.0,0.0,0.0,2.580998e-11,4.016344e-12,0.0,0.0,0.0,0.0


In [27]:
filt_processes

Unnamed: 0,ihs_match,PRODUCT,ROUTE,TECHNOLOGY,LICENSOR,Product,ei_match,ei_CO2e_20a_cradle-to-gate,ei_CO2e_20a_cradle-to-gate_sigma,ei_CO2e_20a_conv_factor,...,"Raw Material Sulfur hexafluoride, energy allocation factor","Raw Material Sulfur hexafluoride, energy allocation sigma","Raw Material Sulfur hexafluoride, economic allocation factor","Raw Material Sulfur hexafluoride, economic allocation sigma","Raw Material Other, mass allocation factor","Raw Material Other, mass allocation sigma","Raw Material Other, energy allocation factor","Raw Material Other, energy allocation sigma","Raw Material Other, economic allocation factor","Raw Material Other, economic allocation sigma"
0,"1,4-BUTANEDIOL FROM ACETYLENE AND FORMALDEHYDE","1,4-BUTANEDIOL",ACETYLENE,REPPE,LINDE,"1,4-BUTANEDIOL",butane,1.41024,0.211536,1.241409,...,0.0,0.0,4.069767e-07,1.424520e-07,5.243664e-10,7.873657e-11,0.0,0.0,5.257408e-10,1.840224e-10
1,"1,4-BUTANEDIOL FROM ACETYLENE AND FORMALDEHYDE...","1,4-BUTANEDIOL",ACETYLENE,REPPE,LINDE,"1,4-BUTANEDIOL",butane,1.41024,0.211536,1.241409,...,0.0,0.0,4.064575e-07,1.423271e-07,5.153565e-10,7.782506e-11,0.0,0.0,5.242698e-10,1.835808e-10
2,"1,4-BUTANEDIOL FROM ACETYLENE AND FORMALDEHYDE","1,4-BUTANEDIOL",ACETYLENE,REPPE,n.a.,"1,4-BUTANEDIOL",butane,1.41024,0.211536,1.241409,...,0.0,0.0,4.069767e-07,1.424520e-07,5.243664e-10,7.873657e-11,0.0,0.0,5.257408e-10,1.840224e-10
3,"1,4-BUTANEDIOL FROM ACETYLENE AND FORMALDEHYDE...","1,4-BUTANEDIOL",ACETYLENE,REPPE,n.a.,"1,4-BUTANEDIOL",butane,1.41024,0.211536,1.241409,...,0.0,0.0,4.064575e-07,1.423271e-07,5.153565e-10,7.782506e-11,0.0,0.0,5.242698e-10,1.835808e-10
4,"1,4-BUTANEDIOL FROM ACETYLENE AND FORMALDEHYDE","1,4-BUTANEDIOL",ACETYLENE,n.a.,n.a.,"1,4-BUTANEDIOL",butane,1.41024,0.211536,1.241409,...,0.0,0.0,4.069767e-07,1.424520e-07,5.243664e-10,7.873657e-11,0.0,0.0,5.257408e-10,1.840224e-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10665,,VISBREAKING,VISBREAKING,LUMMUS,n.a.,,,,,,...,,,,,,,,,,
10666,,VISBREAKING,VISBREAKING,SHELL,n.a.,,,,,,...,,,,,,,,,,
10667,,VISBREAKING,VISBREAKING,SNAM PROGETTI,n.a.,,,,,,...,,,,,,,,,,
10668,,VISBREAKING,VISBREAKING,UNIVERSAL OIL PRODUCTS CO,n.a.,,,,,,...,,,,,,,,,,


In [25]:
## Find values in one list that are not in the other
#

poss_processes['ihs_match'].unique()[~pd.Series(poss_processes['ihs_match'].unique()).isin(pd.Series(filt_processes['ihs_match'].unique()))]

array(['ETHYLENE FROM ATOL™ PROCESS BY AXENS',
       'POLYETHYLENE, HD, BY LOW PRESSURE SOLUTION PROCESS(STAMICARBON TECHNOLOGY)',
       'METHANOL FROM ENERKEM FEEDSTOCK (PLASTICS) CHEMICAL RECYCLING PROCESS'],
      dtype=object)

In [6]:
## Add IFA production
fert_production_file = input_path+'extracted/IFA_production_w_uncertainties.csv'
ifa_production = pd.read_csv(fert_production_file)

name_conversions = {
    'NH3': 'AMMONIA',
    'AN': 'AMMONIUM NITRATE',
    'Ammonium nitrate (33.5-0-0) granulated': 'AMMONIUM NITRATE',
    'AS': 'AMMONIUM SULPHATE',
    'CAN': 'CALCIUM AMMONIUM NITRATE',
    'Calcium ammonium nitrate (27-0-0)': 'CALCIUM AMMONIUM NITRATE',
    'Urea (46-0-0)': 'UREA'
}

ifa_ihs_matches = {
    'AMMONIA':'AMMONIA',
    'AMMONIUM NITRATE': 'AMMONIUM NITRATE FERTILIZER',
    'AMMONIUM SULPHATE': 'HYDROXYLAMMONIUM SULFATE',
    'CALCIUM AMMONIUM NITRATE':'AMMONIUM NITRATE FERTILIZER',
    'UREA': 'UREA, AGRICULTURAL GRADE'
}

ifa_production['PRODUCT'] = ifa_production['PRODUCT'].replace(name_conversions)
ifa_production.rename(columns={'Region':'COUNTRY/TERRITORY'}, inplace=True)
ifa_production['Conv_name'] = ifa_production['PRODUCT'].replace(ifa_ihs_matches)

In [7]:
conv_factors

Unnamed: 0,Product,ei_match,ei_CO2e_20a_cradle-to-gate,ei_CO2e_20a_cradle-to-gate_sigma,ei_CO2e_20a_conv_factor,ei_CO2e_20a_conv_factor_sigma,ei_CO2e_100a_cradle-to-gate,ei_CO2e_100a_cradle-to-gate_sigma,ei_CO2e_100a_conv_factor,ei_CO2e_100a_conv_factor_sigma,...,"Raw Material Sulfur hexafluoride, energy allocation factor","Raw Material Sulfur hexafluoride, energy allocation sigma","Raw Material Sulfur hexafluoride, economic allocation factor","Raw Material Sulfur hexafluoride, economic allocation sigma","Raw Material Other, mass allocation factor","Raw Material Other, mass allocation sigma","Raw Material Other, energy allocation factor","Raw Material Other, energy allocation sigma","Raw Material Other, economic allocation factor","Raw Material Other, economic allocation sigma"
0,"1,12-DODECANEDIAMINE",,,,,,,,,,...,0.0,0.0,0.000000e+00,0.000000e+00,1.116213e-10,1.674320e-11,0.0,0.0,0.000000e+00,0.000000e+00
1,"1,12-DODECANEDIOIC ACID",,,,,,,,,,...,0.0,0.0,1.334469e-08,4.670642e-09,3.075138e-11,4.612707e-12,0.0,0.0,3.075138e-11,1.076298e-11
2,"1,12-DODECANEDIOIC ACID",,,,,,,,,,...,0.0,0.0,4.549688e-09,1.592391e-09,4.162631e-11,6.243946e-12,0.0,0.0,4.162631e-11,1.456921e-11
3,"1,3-BUTADIENE",,,,,,,,,,...,0.0,0.0,0.000000e+00,0.000000e+00,2.055011e-13,3.893346e-14,0.0,0.0,0.000000e+00,0.000000e+00
4,"1,3-BUTADIENE",,,,,,,,,,...,0.0,0.0,0.000000e+00,0.000000e+00,2.435475e-13,4.583293e-14,0.0,0.0,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2541,"ZEOLITE A, DETERGENT GRADE",,,,,,,,,,...,0.0,0.0,0.000000e+00,0.000000e+00,1.218056e-09,1.827084e-10,0.0,0.0,0.000000e+00,0.000000e+00
2545,ZINC DITHIOPHOSPHATE,,,,,,,,,,...,0.0,0.0,0.000000e+00,0.000000e+00,-3.457310e-09,-5.284413e-10,0.0,0.0,0.000000e+00,0.000000e+00
2550,ZINC PHOSPHORODITHIOATE,,,,,,,,,,...,0.0,0.0,0.000000e+00,0.000000e+00,-3.861968e-09,-5.792952e-10,0.0,0.0,0.000000e+00,0.000000e+00
2555,ZIRCONIUM OXIDE,,,,,,,,,,...,0.0,0.0,0.000000e+00,0.000000e+00,7.210201e-10,1.081530e-10,0.0,0.0,0.000000e+00,0.000000e+00


In [8]:
## Exclude outliers
poss_ifa = ifa_production.merge(conv_factors, left_on='Conv_name', right_on='Product', how='left').drop(columns=['Conv_name', 'Product'])
cols = ['PRODUCT']
ifa_years = [str(i) for i in range(1978,2051)]
keep_rows = poss_ifa[cols+['ihs_match', 'ihs_cradle-to-out-gate CO2e_20a, mass allocation factor']].groupby(cols).apply(exclude_outliers)
filt_ifa = poss_ifa.iloc[list(keep_rows.index.get_level_values(1))].reset_index(drop=True)
ifa_conversion = filt_ifa[['COUNTRY/TERRITORY']+ifa_years+cols+[i+'_sigma' for i in ifa_years]+['ihs_match']]

In [9]:
facility_conversion = facility_production.merge(icis_ihs_matches, on=matching_on, how='left')
facility_conversion = pd.concat((facility_conversion, ifa_conversion))

In [10]:
# Print out Products with no IHS match
facility_conversion[facility_conversion['ihs_match'].isna()][['PRODUCT','ROUTE']].drop_duplicates()

Unnamed: 0,PRODUCT,ROUTE
3570,ACRYLIC FIBRES,0
4472,ALKYLBENZENE,BENZENE
4828,ASPHALT,ASPHALT
8780,BIOJET (SAF),RENEWABLE
8871,BIOLPG,RENEWABLE
12203,BUTYRALDEHYDE,OXO PROCESS
12205,BUTYRALDEHYDE,ACETALDEHYDE
14401,CRUDE,CRUDE
14414,CRUDE,CONDENSATE DISTILLATION
18084,DAA/MIBK/KETONES,0


In [11]:
### -> Misses products with no IHS match
facility_conversion_orig = facility_conversion.dropna(subset=['ihs_match']).merge(conv_factors, on=['ihs_match'], how='left')
facility_conversion.head()

Unnamed: 0,PRODUCT,COUNTRY/TERRITORY,STATE,COMPANY,SITE,#,ROUTE,TECHNOLOGY,LICENSOR,START_YR,...,2042_sigma,2043_sigma,2044_sigma,2045_sigma,2046_sigma,2047_sigma,2048_sigma,2049_sigma,2050_sigma,ihs_match
0,"1,4-BUTANEDIOL",BELGIUM,n.a.,BASF,FELUY,1.0,MALEIC ANHYDRIDE,HYDRATION,KVAERNER PROCESS TECH,1997.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"1,4-BUTANEDIOL FROM MALEIC ANHYDRIDE (DIMETHYL..."
1,"1,4-BUTANEDIOL",CHINA,JIANGSU,BLUESTAR NEW CHEM. MAT.,NANJING,1.0,MALEIC ANHYDRIDE,HYDRATION,KVAERNER PROCESS TECH,2009.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"1,4-BUTANEDIOL FROM MALEIC ANHYDRIDE (DIMETHYL..."
2,"1,4-BUTANEDIOL",CHINA,JIANGSU,BLUESTAR NEW CHEM. MAT.,NANJING,2.0,MALEIC ANHYDRIDE,HYDRATION,n.a.,2012.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"1,4-BUTANEDIOL FROM MALEIC ANHYDRIDE (DIMETHYL..."
3,"1,4-BUTANEDIOL",CHINA,LIAONING,CHANGLIAN CHEMICAL (PANJIN),PANJIN,1.0,PROPYLENE OXIDE,VIA ALLYL ALCOHOL,n.a.,2014.0,...,5.226989,5.364788,5.494559,5.622993,5.751427,5.879861,6.008295,6.136729,6.265162,"1,4-BUTANEDIOL FROM PROPYLENE OXIDE (ALLYL ALC..."
4,"1,4-BUTANEDIOL",CHINA,JIANGSU,CHANGZHOU CHEMICAL PLANT,CHANGZHOU,1.0,MALEIC ANHYDRIDE,HYDRATION,SINO,1998.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"1,4-BUTANEDIOL FROM MALEIC ANHYDRIDE (DIMETHYL..."


## Calculate emissions by combining production with conversion factors

In [13]:
# Take average of CM and EI conversion factors

emission_val_cols_all = [['CO2e_20a', 'CO2e_100a', 'Carbon dioxide'], ['Carbon monoxide', 'Chloroform', 'Dinitrogen monoxide'], ['Ethane', 'Methane', 'Nitric oxide'], ['Nitrogen fluoride', 'Perfluoropentane'], ['Sulfur hexafluoride', 'Other']]

for i, emission_val_cols in enumerate(emission_val_cols_all):
    emission_val_cols_sigma = [col + '_sigma' for col in emission_val_cols]
    facility_conversion = facility_conversion_orig.copy()

    #for i, emission_val_cols in enumerate(emission_val_cols_all):
    emission_val_cols_sigma = [col + '_sigma' for col in emission_val_cols]
    facility_conversion = facility_conversion_orig.copy()

    for column, col_sigma in zip(emission_val_cols, emission_val_cols_sigma):
        facility_conversion['combined_' + column] = np.nanmean([facility_conversion['ei_' + column + '_conv_factor'], facility_conversion['cm_' + column + '_conv_factor']], axis=0)
        facility_conversion['combined_' + col_sigma] = np.nanmean([facility_conversion['ei_' + column + '_conv_factor_sigma'], facility_conversion['cm_' + column + '_conv_factor_sigma']], axis=0)

    facility_conversion = facility_conversion[facility_conversion.columns[['ei' not in col and 'cm' not in col for col in facility_conversion.columns]]]

    facility_conversion.columns = [i.replace(', mass allocation ','_').replace('_factor','') for i in facility_conversion.columns]

    facility_conversion.rename(columns={'ihs_match':'PROCESS'}, inplace=True)

    facility_conversion.columns = [i.replace(',  allocation ','_').replace('_factor','') for i in facility_conversion.columns]

    # Calculate facility emissions for
    dbs = ['combined_', 'ihs_cradle-to-out-gate ', 'Feedstock ', 'Organic chemicals ', 'Primary chemicals ', 'Other intermediates ', 'Indirect Utilities ', 'Direct Utilities ', 'Direct Process ', 'Electricity ', 'Raw Material ']
    names = ['EI & CM', 'IHS CtOG', 'Feedstock', 'Organic chemicals', 'Primary chemicals', 'Other intermediates', 'Indirect Utilities', 'Direct Utilities', 'Direct Process', 'Electricity', 'Raw Material']

    # Create base dataframe to use
    years = [str(i) for i in range(1978, 2051)]
    years_sigma = [year+'_sigma' for year in years]
    base_columns = ['PRODUCT', 'COUNTRY/TERRITORY', 'STATE', 'COMPANY', 'SITE', '#',
           'ROUTE', 'TECHNOLOGY', 'LICENSOR', 'START_YR', 'COMPLEX', 'LATITUDE', 'LONGITUDE', 'PROCESS'] + years + years_sigma
    base_df = facility_conversion[base_columns]

    facility_emissions = pd.DataFrame()
    for db, name in tqdm(zip(dbs, names)):
        for gas in tqdm(emission_val_cols):
            df = base_df.copy()
            df[years] = df[years].multiply(facility_conversion[db+gas], axis='index')
            ## Incorrect error propagation here
            df[years_sigma] = df[years_sigma].multiply(facility_conversion[db+gas+'_sigma'], axis='index')
            df['Gas'] = gas
            df['Type'] = name
            facility_emissions = pd.concat((facility_emissions, df), axis = 0)

    facility_emissions.to_parquet(output_path+'icisFacilityEmissions_allIhsProcesses_w_uncertainties_dedoubled'+str(i+1)+'.parquet')

  facility_conversion['combined_' + column] = np.nanmean([facility_conversion['ei_' + column + '_conv_factor'], facility_conversion['cm_' + column + '_conv_factor']], axis=0)
  facility_conversion['combined_' + col_sigma] = np.nanmean([facility_conversion['ei_' + column + '_conv_factor_sigma'], facility_conversion['cm_' + column + '_conv_factor_sigma']], axis=0)
0it [00:00, ?it/s]
  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [00:00<00:00,  2.02it/s][A
 67%|██████▋   | 2/3 [00:00<00:00,  2.09it/s][A
100%|██████████| 3/3 [00:01<00:00,  2.02it/s][A
1it [00:01,  1.49s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [00:00<00:01,  1.72it/s][A
 67%|██████▋   | 2/3 [00:01<00:00,  1.66it/s][A
100%|██████████| 3/3 [00:01<00:00,  1.56it/s][A
2it [00:03,  1.75s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [00:00<00:01,  1.08it/s][A
 67%|██████▋   | 2/3 [00:01<00:00,  1.11it/s][A
100%|██████████| 3/3 [00:02<00:00,  1.12it/s][A
3it [0

In [37]:
# facility_emissions = pd.read_parquet(output_path+'icisFacilityEmissions_allIhsProcesses_w_uncertainties_fert2050'+'0'+'.parquet')
# years = [str(i) for i in range(1978, 2051)]
# years_sigma = [year+'_sigma' for year in years]
#
# facility_emissions[facility_emissions.columns[:13]] = facility_emissions[facility_emissions.columns[:13]].fillna('n.a.')
# aggregated_emissions = facility_emissions.groupby(list(facility_emissions.columns[:13])+['Gas','Type']).mean().reset_index()
#
# stdevs = facility_emissions[list(facility_emissions.columns[:13])+['Gas','Type']+years].groupby(list(facility_emissions.columns[:13])+['Gas','Type']).agg(np.std)
#
# # Keep largest uncertainty between technologies and others
# aggregated_emissions[years_sigma] = np.maximum(stdevs.fillna(0).values, aggregated_emissions.fillna(0)[years_sigma].values)

In [2]:
endings = [str(i) for i in range(1,6)]
years = [str(i) for i in range(1978, 2051)]
years_sigma = [year+'_sigma' for year in years]

mean_aggregated = pd.DataFrame()

for end in tqdm(endings):
    facility_emissions = pd.read_parquet(output_path+'icisFacilityEmissions_allIhsProcesses_w_uncertainties_dedoubled'+end+'.parquet')
    print('0')

    facility_emissions[facility_emissions.columns[:13]] = facility_emissions[facility_emissions.columns[:13]].fillna('n.a.')
    # Take mean of possible emissions given different possible technologies for each facility
    aggregated_emissions = facility_emissions.groupby(list(facility_emissions.columns[:13])+['Gas','Type']).mean()
    print('1')

    ## Get technology uncertainty by taking stdev
    stdevs = facility_emissions[list(facility_emissions.columns[:13])+['Gas','Type']+years].groupby(list(facility_emissions.columns[:13])+['Gas','Type']).agg(np.std)
    print('5')

    # Keep largest uncertainty between technologies and others
    aggregated_emissions[years_sigma] = np.maximum(stdevs.fillna(0).values, aggregated_emissions.fillna(0)[years_sigma].values)

    mean_aggregated = pd.concat((mean_aggregated, aggregated_emissions))

  0%|          | 0/5 [00:00<?, ?it/s]

0
1
5


 20%|██        | 1/5 [02:13<08:53, 133.27s/it]

0
1
5


 40%|████      | 2/5 [04:06<06:05, 121.68s/it]

0
1
5


 60%|██████    | 3/5 [06:07<04:02, 121.08s/it]

0
1
5


 80%|████████  | 4/5 [07:14<01:39, 99.80s/it] 

0
1
5


100%|██████████| 5/5 [08:23<00:00, 100.72s/it]


In [7]:
mean_aggregated[mean_aggregated.columns[:15]] = mean_aggregated[mean_aggregated.columns[:15]].astype(np.float64)

In [9]:
mean_aggregated.head()

Unnamed: 0,PRODUCT,COUNTRY/TERRITORY,STATE,COMPANY,SITE,#,ROUTE,TECHNOLOGY,LICENSOR,START_YR,...,2041_sigma,2042_sigma,2043_sigma,2044_sigma,2045_sigma,2046_sigma,2047_sigma,2048_sigma,2049_sigma,2050_sigma
0,"1,4-BUTANEDIOL",BELGIUM,n.a.,BASF,FELUY,1.0,MALEIC ANHYDRIDE,HYDRATION,KVAERNER PROCESS TECH,1997.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"1,4-BUTANEDIOL",BELGIUM,n.a.,BASF,FELUY,1.0,MALEIC ANHYDRIDE,HYDRATION,KVAERNER PROCESS TECH,1997.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"1,4-BUTANEDIOL",BELGIUM,n.a.,BASF,FELUY,1.0,MALEIC ANHYDRIDE,HYDRATION,KVAERNER PROCESS TECH,1997.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"1,4-BUTANEDIOL",BELGIUM,n.a.,BASF,FELUY,1.0,MALEIC ANHYDRIDE,HYDRATION,KVAERNER PROCESS TECH,1997.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"1,4-BUTANEDIOL",BELGIUM,n.a.,BASF,FELUY,1.0,MALEIC ANHYDRIDE,HYDRATION,KVAERNER PROCESS TECH,1997.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# mean_aggregated = mean_aggregated.reset_index()
# mean_aggregated.drop(columns=['index'], inplace=True)
mean_aggregated[mean_aggregated.columns[:15]] = mean_aggregated[mean_aggregated.columns[:15]].astype(str)

mean_aggregated.to_parquet(output_path+'icisFacilityEmissions_ihsMean_w_uncertainties_allgases_dedoubled.parquet')

In [11]:
del facility_emissions
del mean_aggregated

## Weighted average for Ethylene production

In [12]:
# Read in individual facilities
facility_production = pd.read_csv(production_file, index_col=0)
eth_prod = facility_production[facility_production['PRODUCT']=='ETHYLENE'].reset_index(drop=True)
conv_factors = pd.read_csv(conversion_factor_file)
eth_conv = conv_factors[conv_factors['Product']=='ETHYLENE'].reset_index(drop=True)

# Ethylene feedstocks
feedstocks = pd.read_csv(input_path+'extracted/icisEthyleneFeedstocks_1978-2050.csv', index_col=0, header=[0,1])
feedstock_types = pd.read_csv(input_path+'extra_inputs/feedstock_type.csv')

In [13]:
# Get emissions for each feedstock
years = list(map(str, list(range(1978, 2051))))

eth_prod = facility_production[facility_production['PRODUCT']=='ETHYLENE']
feedstock_matches = feedstocks.merge(eth_prod, how='left', left_on=list(feedstocks.columns[:6]),
                                     right_on=['COUNTRY/TERRITORY','STATE','COMPANY','SITE', '#', 'START_YR'])

capacity_cols = [i for i in feedstock_matches.columns if 'CAPACITY' in str(i)]

for col, year in zip(capacity_cols, years):
    feedstock_matches[col] = feedstock_matches[year]

feedstock_matches.drop(columns=list(facility_production.columns), inplace=True)
feedstock_matches.columns = pd.MultiIndex.from_tuples((feedstock_matches.columns))

  return merge(


In [14]:
del feedstocks

In [15]:
feedstock_vals = feedstock_matches.copy()
for year in years:
    df = feedstock_vals[year]
    df['CAPACITY'] = df['CAPACITY'].apply(lambda x: re.sub("[^0-9.]", "0", str(x))).astype(float)
    df[df.columns[1:]] = df[df.columns[1:]].multiply(df['CAPACITY']/100, axis='index')
    feedstock_vals[year] = df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['CAPACITY'] = df['CAPACITY'].apply(lambda x: re.sub("[^0-9.]", "0", str(x))).astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [16]:
def exclude_outliers(group, col='ihs_cradle-to-out-gate CO2e_20a, mass allocation factor'):
    #print('l'+str(len(group)))
    if len(group) > 3:  # only exclude outliers if the group has more than 3 rows
        mean = np.mean(group[col])
        std = np.std(group[col])
        max_distance = std  # maximum distance from the mean to be considered an outlier
        distances = np.abs(group[col] - mean)  # calculate distances of each value to the mean
        filtered_group = group[distances <= max_distance]  # keep only values within the maximum distance
        #print('f'+str(len(filtered_group)))
        if len(filtered_group) < 3:  # if less than 3 rows remain, take the 3 closest to the mean
            group['dist'] = np.abs(group[col] - mean)
            closest_rows = group.nsmallest(3, 'dist', keep='all')
            print(closest_rows)
            return closest_rows.drop(columns=['dist'])
        else:
            return filtered_group
    else:
        return group

In [17]:
feedstock_emissions = eth_conv.merge(feedstock_types, on='ihs_match', how='left')

# Take mean of possible emissions given different possible technologuies for each facility
aggregated_emissions = feedstock_emissions.groupby(['Feedstock']).mean()

#col = 'ihs_cradle-to-out-gate CO2e_100a,  allocation factor'
keep_match_locs = feedstock_emissions.groupby('Feedstock').apply(exclude_outliers).drop(columns=['Feedstock']).reset_index()['level_1']
keep_matches = eth_conv.loc[keep_match_locs]
keep_rows = feedstock_emissions['ihs_match'].isin(keep_matches['ihs_match'])
feedstock_emissions = feedstock_emissions[keep_rows]

filt_agg = feedstock_emissions.groupby(['Feedstock']).mean()

## Get technology uncertainty by taking stdev
stdevs = feedstock_emissions[['Feedstock']+[i for i in feedstock_emissions.columns if 'ihs' in i and 'sigma' not in i]].groupby(['Feedstock']).agg(np.std)
#
# # Keep largest uncertainty between technologies and others
years_sigma = [i for i in feedstock_emissions.columns if 'ihs' in i and 'sigma' in i]
filt_agg[years_sigma] = np.abs((stdevs.fillna(0).values-filt_agg.fillna(0)[years_sigma].values)/2)+np.minimum(stdevs.fillna(0).values, filt_agg.fillna(0)[years_sigma].values)#np.maximum(stdevs.fillna(0).values, filt_agg.fillna(0)[years_sigma].values)

In [19]:
# filt_agg.to_csv('C:/Users\lukec\PycharmProjects\petrochemical-data\data\processed/ethylene_conversion_factors.csv')

In [18]:
filt_agg = pd.read_csv('C:/Users\lukec\PycharmProjects\petrochemical-data\data\processed/ethylene_conversion_factors.csv', index_col=0)

In [19]:
from tqdm import tqdm
# Apply emissions to each facility
blank = feedstock_vals[feedstock_vals.columns[:7]]
blank.columns = list(blank.columns.droplevel(1))
conversions = filt_agg.columns[['allocation' in name for name in filt_agg.columns]]

for conversion in tqdm(conversions):
    fs_ems = filt_agg[conversion]
    each_conv = pd.DataFrame()
    for year in years:
        df = feedstock_vals[year]
        for fs in df.columns[1:]:
            df[fs] = df[fs]*fs_ems.loc[fs]
        yearly = blank.copy()
        yearly['Year'] = year
        yearly[conversion] = np.sum(df[df.columns[1:]].values, axis=1)
        each_conv = pd.concat((each_conv,yearly), axis=0)
    conv_emissions = pd.concat((blank, each_conv.pivot(columns=['Year'], values=conversion)), axis=1)
    conv_emissions['conversion'] = conversion
    if conversion != conversions[0]:
        ethylene_ems = pd.concat((ethylene_ems, conv_emissions), axis=0)#.merge(each_conv, on=list(each_conv.columns[:8]), how='left')
    else: ethylene_ems = conv_emissions.copy()

ethylene_ems.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[fs] = df[fs]*fs_ems.loc[fs]
100%|██████████| 208/208 [01:12<00:00,  2.86it/s]


Unnamed: 0,COUNTRY/TERRITORY,STATE,COMPANY,SITE,#,START_YR,START_MO,1978,1979,1980,...,2042,2043,2044,2045,2046,2047,2048,2049,2050,conversion
0,ARGENTINA,n.a.,DOW CHEMICAL,BAHIA BLANCA,2.0,2001.0,na,,,,...,720.883957,720.883957,720.883957,720.883957,720.883957,720.883957,720.883957,720.883957,720.883957,"ihs_cradle-to-out-gate CO2e_20a, allocation f..."
1,ARGENTINA,n.a.,PAMPA ENERGIA,P.TO SAN MARTIN,1.0,0.0,na,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"ihs_cradle-to-out-gate CO2e_20a, allocation f..."
2,ARGENTINA,n.a.,PAMPA ENERGIA,SAN LORENZO/AR,2.0,1978.0,na,0.0,0.0,0.0,...,,,,,,,,,,"ihs_cradle-to-out-gate CO2e_20a, allocation f..."
3,AUSTRALIA,n.a.,QENOS,ALTONA,1.0,1971.0,na,91.545799,95.135831,98.725862,...,,,,,,,,,,"ihs_cradle-to-out-gate CO2e_20a, allocation f..."
4,AUSTRALIA,n.a.,QENOS,ALTONA,2.0,1961.0,na,406.906499,422.863617,438.820735,...,,,,,,,,,,"ihs_cradle-to-out-gate CO2e_20a, allocation f..."


In [20]:
# Convert output to facility_emissions format
ethylene_conv = ethylene_ems.copy()
ethylene_conv['conversion'] = [i.replace(',  allocation ','_').replace('_factor','') for i in ethylene_conv['conversion']]
#ethylene_conv.columns = [i.replace(',  allocation ','_').replace('_factor','') for i in ethylene_conv.columns]

dbs = ['ihs_cradle-to-out-gate ', 'Feedstock ', 'Organic chemicals ', 'Primary chemicals ', 'Other intermediates ', 'Direct Utilities ', 'Indirect Utilities ', 'Direct Process ', 'Electricity ', 'Raw Material ']
names = ['IHS CtOG', 'Feedstock', 'Organic chemicals', 'Primary chemicals', 'Other intermediates', 'Direct Utilities', 'Indirect Utilities', 'Direct Process', 'Electricity', 'Raw Material']
# dbs = ['ihs_cradle-to-out-gate ', 'Raw Material ', 'Indirect Utilities ', 'Direct Utilities ', 'Direct Process ', 'Electricity ']
# names = ['IHS CtOG', 'Raw Material', 'Indirect Utilities', 'Direct Utilities', 'Direct Process', 'Electricity']
emission_val_cols = ['CO2e_20a', 'CO2e_100a', 'Carbon dioxide', 'Carbon monoxide', 'Chloroform', 'Dinitrogen monoxide', 'Ethane', 'Methane', 'Nitric oxide', 'Nitrogen fluoride', 'Perfluoropentane', 'Sulfur hexafluoride', 'Other']

base_cols = list(ethylene_conv.columns[:7])

ethylene_vals = pd.DataFrame()
ethylene_sigmas = pd.DataFrame()
#
for db, name in zip(dbs, names):
    for gas in emission_val_cols:
        df = ethylene_conv[ethylene_conv['conversion']==db+gas]
        df['Gas'] = gas
        df['Type'] = name
        ethylene_vals = pd.concat((ethylene_vals, df), axis = 0)

        df_sigma = ethylene_conv[ethylene_conv['conversion']==db+gas+'_sigma']
        df_sigma['Gas'] = gas
        df_sigma['Type'] = name
        ethylene_sigmas = pd.concat((ethylene_sigmas, df_sigma), axis = 0)

ethylene_weighted = ethylene_vals.merge(ethylene_sigmas, on=base_cols+['Gas', 'Type'], how='left', suffixes=('','_sigma')).reset_index()
#
# #ethylene_weighted.columns.name = None
ethylene_weighted = ethylene_weighted.fillna(0).drop(columns=['conversion', 'conversion_sigma', 'index'])

ethylene_weighted[['COUNTRY/TERRITORY', 'STATE', 'COMPANY', 'SITE', '#', 'START_YR', 'Type', 'Gas']] = ethylene_weighted[['COUNTRY/TERRITORY', 'STATE', 'COMPANY', 'SITE', '#', 'START_YR', 'Type', 'Gas']].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Gas'] = gas
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Type'] = name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sigma['Gas'] = gas
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the do

In [21]:
# Update ethylene values in facility emissions database
facility_emissions = pd.read_parquet(output_path+'icisFacilityEmissions_ihsMean_w_uncertainties_allgases_dedoubled.parquet')

eth_ems = facility_emissions[facility_emissions['PRODUCT']=='ETHYLENE']

emissions_merged = eth_ems.merge(ethylene_weighted, on=['COUNTRY/TERRITORY', 'STATE', 'COMPANY', 'SITE', '#', 'START_YR', 'Type', 'Gas'], how='left', suffixes=('_old',''))

years = [str(i) for i in range(1978, 2051)]
years_sigma = [year+'_sigma' for year in years]

for year, uncert in zip(years, years_sigma):
    emissions_merged[year] = emissions_merged[year].fillna(emissions_merged[year+'_old'])
    emissions_merged[uncert] = emissions_merged[uncert].fillna(emissions_merged[uncert+'_old'])

eth_emissions_update = emissions_merged.drop(columns=list(emissions_merged.columns[['_old' in i for i in emissions_merged.columns]]) + ['START_MO'])

In [27]:
del facility_emissions

In [23]:
full_update = pd.concat((facility_emissions[facility_emissions['PRODUCT']!='ETHYLENE'], eth_emissions_update), axis=0)

In [24]:
del ethylene_ems

In [25]:
full_update[years_sigma] = full_update[years_sigma].astype(float)

In [28]:
full_update.sort_values(list(full_update.columns[:15])).to_parquet(output_path+'icisFacilityEmissions_ihsWeighted_w_uncertainties_allgases_dedoubled.parquet')

In [35]:
def filter_df(df, cols, filters):
    for col, filt in zip(cols, filters):
        df = df[[i in filt for i in df[col]]]
    return df

filtered = filter_df(full_update, ['Gas'], [['CO2e_100a']])

In [37]:
filtered = filter_df(filtered, ['Type'], [['Direct Process', 'Direct Utilities', 'Indirect Utilities', 'Feedstock', 'Organic chemicals', 'Primary chemicals', 'Other intermediates']])

In [40]:
filtered['2020'].sum()

4072440.6086431607

In [43]:
filter_df(filtered, ['PRODUCT'], [['AMMONIA']])['2020'].sum()

276876.9117265616

In [36]:
## Utility functions
def uncertainty_propagation(calc:str, x:float, dx:float, y:float=1, dy:float=0, z:float=1, propagation_type:str='simple') -> float:
    if calc == 'mult':
        xdiv = np.divide(dx, x, out=np.zeros_like(dx), where=x!=0)
        ydiv = np.divide(dy, y, out=np.zeros_like(dy), where=y!=0)
        if propagation_type == 'simple':
            return (xdiv + ydiv)*z
        elif propagation_type == 'stdev':
            return np.sqrt(pow(xdiv,2) + pow(ydiv,2))*z
        else: Exception('Specified propagation_type not recognised.')

    elif calc == 'add':
        if propagation_type == 'simple':
            return abs(dx)+abs(dy)
        elif propagation_type == 'stdev':
            return np.sqrt(pow(dx,2) + pow(dy,2))
        else: Exception('Specified propagation_type not recognised.')
    else: Exception('Please specify calc of propagation')#%%

# All possible facility emissions given different processes for making same product

In [5]:
facility_match = facility_conversion[list(facility_conversion.columns[:list(facility_conversion.columns).index('PROCESS')+2])]

In [6]:
# sort_col = 'ihs_cradle-to-out-gate CO2e_20a,  allocation factor'
# min_conv_factors = conv_factors.sort_values(['Product', sort_col]).groupby('Product').head(1).reset_index(drop=True)

In [9]:
facility_min_type = facility_match.drop(columns='PROCESS').merge(conv_factors, on='Product', how='left').drop_duplicates()

In [10]:
emission_val_cols = ['CO2e_20a', 'CO2e_100a']#, 'Carbon dioxide', 'Carbon monoxide', 'Chloroform', 'Dinitrogen monoxide', 'Ethane', 'Methane', 'Nitric oxide', 'Nitrogen fluoride', 'Perfluoropentane', 'Sulfur hexafluoride', 'Other']
emission_val_cols_sigma = [col + '_sigma' for col in emission_val_cols]

for column, col_sigma in zip(emission_val_cols, emission_val_cols_sigma):
    facility_min_type['combined_' + column] = np.nanmean([facility_min_type['ei_' + column + '_cradle-to-gate'], facility_min_type['cm_' + column + '_cradle-to-gate']], axis=0)
    facility_min_type['combined_' + col_sigma] = np.nanmean([facility_min_type['ei_' + column + '_cradle-to-gate_sigma'], facility_min_type['cm_' + column + '_cradle-to-gate_sigma']], axis=0)

facility_min_type = facility_min_type[facility_min_type.columns[['ei' not in col and 'cm' not in col for col in facility_min_type.columns]]]

facility_min_type.columns = [i.replace(',  allocation factor','').replace(',  allocation sigma','_sigma') for i in facility_min_type.columns]

facility_min_type.rename(columns={'ihs_match':'PROCESS'}, inplace=True)

  facility_min_type['combined_' + column] = np.nanmean([facility_min_type['ei_' + column + '_cradle-to-gate'], facility_min_type['cm_' + column + '_cradle-to-gate']], axis=0)
  facility_min_type['combined_' + col_sigma] = np.nanmean([facility_min_type['ei_' + column + '_cradle-to-gate_sigma'], facility_min_type['cm_' + column + '_cradle-to-gate_sigma']], axis=0)


In [11]:
# Calculate facility emissions for
dbs = ['combined_', 'Raw Material ', 'Indirect Utilities ', 'Direct Utilities ', 'Direct Process ', 'Electricity ']
names = ['EI & CM', 'Raw Material', 'Indirect Utilities', 'Direct Utilities', 'Direct Process', 'Electricity']

# Create base dataframe to use
years = [str(i) for i in range(1978, 2051)]
years_sigma = [year+'_sigma' for year in years]
base_columns = ['PRODUCT', 'COUNTRY/TERRITORY', 'STATE', 'COMPANY', 'SITE', '#',
       'ROUTE', 'TECHNOLOGY', 'LICENSOR', 'START_YR', 'COMPLEX', 'LATITUDE', 'LONGITUDE', 'PROCESS'] + years + years_sigma
base_df = facility_min_type[base_columns]

facility_mins = pd.DataFrame()
for db, name in tqdm(zip(dbs, names)):
    for gas in tqdm(emission_val_cols):
        df = base_df.copy()
        df[years] = df[years].multiply(facility_min_type[db+gas], axis='index')
        ## Incorrect error propagation here
        df[years_sigma] = df[years_sigma].multiply(facility_min_type[db+gas+'_sigma'], axis='index')
        df['Gas'] = gas
        df['Type'] = name
        facility_mins = pd.concat((facility_mins, df), axis = 0)

0it [00:00, ?it/s]
  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.03s/it][A
100%|██████████| 2/2 [00:01<00:00,  1.04it/s][A
1it [00:01,  1.93s/it]
  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:00<00:00,  1.02it/s][A
100%|██████████| 2/2 [00:02<00:00,  1.05s/it][A
2it [00:04,  2.03s/it]
  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.52s/it][A
100%|██████████| 2/2 [00:03<00:00,  1.62s/it][A
3it [00:07,  2.58s/it]
  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.68s/it][A
100%|██████████| 2/2 [00:03<00:00,  1.63s/it][A
4it [00:10,  2.85s/it]
  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.76s/it][A
100%|██████████| 2/2 [00:03<00:00,  1.82s/it][A
5it [00:14,  3.13s/it]
  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:03<00:03,  3.26s/it][A
100%|██████████| 2/2 [00:06<00:00,  3.08s/it][A
6it [00:20,  3.39s/it]


In [12]:
# Save as parquet file for large size
facility_mins.to_parquet(output_path+'icisFacilityEmissions_ihsAllPossible_w_uncertainties.parquet')