## Obtain process impacts by combining process outputs with impact factors

Input - XML files (.spold files are XML) downloaded from OpenLCA
        impact factor XML files (one per GWP time horizon), also downloaded from OpenLCA

Output - CSV file of the impact of each gas for every process in the XML files

In [4]:
# Import packages
import os
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
pd.options.mode.chained_assignment = None

# Input and output locations, all expressed relative to the shared data folder
data_path = "../data/"
lcaDataPath = f"{data_path}EcoSpold01"  # directory holding the per-process xml files
impactMethods_20 = f"{data_path}extra_inputs/EI_3_8_IPCC2013_GWP20a.xml"
impactMethods_100 = f"{data_path}extra_inputs/EI_3_8_IPCC2013_GWP100a.xml"
output_path = f"{data_path}extracted/"

In [5]:
# Define functions
def to_listlist(inlist: list):
    """Return ``inlist`` as a list of lists, wrapping a flat list in an outer list.

    Inputs:
    inlist - either a flat list or a list whose first element is a list
    Outputs:
    inlist itself (same object) when its first element is a list, otherwise
    a new one-element list containing inlist. An empty list counts as flat
    and is returned as [[]].
    """
    # Only the first element is inspected. An empty list has no first element,
    # so it is wrapped like any other flat list instead of raising IndexError.
    if len(inlist) > 0 and isinstance(inlist[0], list):
        return inlist
    return [inlist]

def read_xml_attributes(filepath:str,branches:list,attributes:list,df=False):
    """Read attribute values from selected elements of an xml file into a dataframe.

    Inputs:
    filepath - path to xml file
    branches - list of successive branch choices (fragments of tag names),
        or a list of such lists to follow several paths in the same file
    attributes - attributes to read at the last branch of each path, nested
        the same way as branches; a falsy entry (e.g. '') reads the element
        text instead of an attribute
    df - optional dataframe the new rows are appended to; when left as False
        (or None) an empty dataframe with one column per attribute is used
    Outputs:
    df - pandas dataframe with one row added for each element matching the
        last branch of the last path. Each row also carries the values
        collected on earlier paths. If nothing matches, df is returned as is.
    Requirements: pandas as pd, numpy as np, xml.etree.ElementTree as ET
    """
    branches, attributes = to_listlist(branches), to_listlist(attributes)
    if df is False or df is None:
        df = pd.DataFrame([],columns=[name for group in attributes for name in group])
    tree_loc = ET.parse(filepath).getroot()
    attr_values = dict()
    new_rows = []
    last_pathnum = len(branches) - 1
    # NOTE(review): tree_loc is intentionally NOT reset to the root between
    # paths, matching the previous behaviour. A later path therefore starts
    # from wherever the previous one stopped, and its leading branches are
    # simply skipped when no child tag contains them.
    for pathnum, path in enumerate(branches):
        last_depth = len(path) - 1
        for depth, branch in enumerate(path):
            # The last branch is identified by position rather than object
            # identity, so a tag name repeated within a path is handled.
            for element in tree_loc:
                # Substring match: ElementTree tags may carry a '{namespace}' prefix.
                if branch not in element.tag:
                    continue
                if depth != last_depth:
                    # Intermediate branch: descend into the first match only.
                    tree_loc = element
                    break
                # Last branch: read the requested values from every matching element.
                attr_values.update(zip(attributes[pathnum],[element.attrib[i] if i else element.text for i in attributes[pathnum]]))
                if pathnum == last_pathnum:
                    new_rows.append(pd.DataFrame(np.array([[i] for i in attr_values.values()]).transpose(),columns=list(attr_values.keys())))
    # Append all rows in a single concat instead of once per matched element.
    if new_rows:
        df = pd.concat([df] + new_rows,ignore_index=True)
    return df

In [6]:
# Fetch the impact factors for both time horizons and save the combined table
impacts_20 = read_xml_attributes(
    impactMethods_20,
    ['dataset','flowData','exchange'],
    ['name','category','subCategory','meanValue'],
).rename(columns={'meanValue':'CO2e_20a'})
impacts_100 = read_xml_attributes(
    impactMethods_100,
    ['dataset','flowData','exchange'],
    ['name','category','subCategory','meanValue'],
).rename(columns={'meanValue':'CO2e_100a'})

# Join on every column except the last one (the renamed factor column)
impacts = impacts_20.merge(impacts_100, on=list(impacts_20.columns[:-1]))
impacts = impacts.sort_values('name')
impacts.to_csv(output_path+'EI_3_8_IPCC2013_CO2e.csv')

In [None]:
# Calculate emissions from LCA data inventories
# Reload the impact factor table from the csv written to output_path
impacts = pd.read_csv(output_path+'EI_3_8_IPCC2013_CO2e.csv', index_col=0)
impacts = impacts.reset_index(drop=True)

# Metadata read from each xml file: one branch path per group of attributes
directory = lcaDataPath
path = [
    ['dataset','meta','process','reference'],
    ['dataset','meta','process','geo'],
]
attributes = [
    ['name','generalComment'],
    ['location'],
]

# Exchange data read from each xml file
output_attributes = ['name', 'category', 'subCategory', 'meanValue']
output_branches = ['dataset', 'flowData', 'exchange']

# Result columns: the two CO2e totals followed by one column per impact-factor name
co2e_cols = ['CO2e_20a', 'CO2e_100a']
impact_columns = list(
    impacts[['name']+co2e_cols]
    .drop_duplicates()
    .sort_values('name')['name']
    .values
)

# Settings shared by calc_xml_emissions: files are handled batch_size at a time,
# with one parquet file written per batch, so batches can be run in parallel
file_list = os.listdir(directory)
attributes = to_listlist(attributes)
batch_size = 100

def calc_xml_emissions(num):
    """Compute emissions for one batch of xml files and save them as a parquet file.

    Inputs:
    num - index into file_list of the first file of the batch; the batch
        covers file_list[num:num+batch_size]
    Outputs:
    None. The batch table is written to
    data_path/process_emissions/process_emissions_<num>-<num+batch_size>.parquet
    Requirements: module-level file_list, batch_size, directory, path,
    attributes, output_branches, output_attributes, impacts, co2e_cols,
    impact_columns and data_path, plus read_xml_attributes.
    """
    batch_end = num + batch_size
    metadata_columns = [name for group in attributes for name in group]
    df_new = pd.DataFrame([], columns=metadata_columns + co2e_cols + impact_columns)
    df_all = df_new.copy()
    numeric_cols = ['meanValue'] + co2e_cols
    for file in file_list[num:batch_end]:
        xml_file = os.path.join(directory, file)
        # Process metadata for this file, laid out with the full set of result columns.
        # NOTE(review): if nothing matches, read_xml_attributes may hand back df_new
        # itself, which the assignments below would then modify - confirm.
        df = read_xml_attributes(xml_file, path, attributes, df_new)
        # Exchanges of this file, joined to the impact factors on their shared columns
        outputs = read_xml_attributes(xml_file, output_branches, output_attributes)
        outputs = outputs.merge(impacts)
        outputs[numeric_cols] = outputs[numeric_cols].apply(pd.to_numeric)
        # Total CO2e for each time horizon: sum of amount times factor
        co2e_totals = []
        for impact_col in co2e_cols:
            co2e_totals.append(sum(outputs['meanValue']*outputs[impact_col]))
        df[co2e_cols] = co2e_totals
        if len(outputs) > 0:
            # Summed amount per exchange name, stored in the column of that name
            summary = outputs[['name','meanValue']].groupby('name').sum()
            df[list(summary.index)] = list(summary['meanValue'].values)
        df_all = pd.concat((df_all, df))
    df_all.to_parquet(f"{data_path}process_emissions/process_emissions_{num}-{batch_end}.parquet")

# Run every batch. The range is derived from the number of files instead of a
# fixed upper bound, so no file is left out when the directory grows and no
# surplus empty batches are written. max(..., 1) still writes one (empty)
# batch when the directory contains no files.
for batch_start in range(0, max(len(file_list), 1), batch_size):
    calc_xml_emissions(batch_start)

In [13]:
# Combine the per-batch parquet files into a single table.
# Only *.parquet entries are read: the csv written below lands in the same
# folder and would otherwise be passed to pd.read_parquet when this cell is
# re-run. The file names are sorted so the read order is reproducible.
emissions_dir = data_path+'process_emissions/'
cm_emissions = pd.concat(
    pd.read_parquet(emissions_dir+file)
    for file in sorted(os.listdir(emissions_dir))
    if file.endswith('.parquet')
)

cm_emissions.sort_values(['name','location']).to_csv(emissions_dir+'cm_emissions.csv', index=False)

# ei_emissions = pd.concat(pd.read_parquet(data_path+'process_emissions/'+file) for file in os.listdir(data_path+'process_emissions/'))
# ei_emissions.sort_values(['name','location']).to_csv('C:/Users\lukec\OneDrive - University of Cambridge\PhD\Data\EcoInvent\process_emissions\ei_emissions.csv', index=False)

In [18]:
## Add uncertainties
# Every column of ei after the first three gets a companion '<column>_sigma'
# column holding a fixed fraction of its value; cm gets the same set of
# columns, so it must contain every one of those ei columns.
input_path = ''
uncertainty_factor = 0.1

ei = pd.read_csv(input_path+'ei_emissions_IPCC2013.csv')
cm = pd.read_csv(input_path+'cm_emissions_IPCC2013.csv')

# Take the column list before the loop: the loop itself adds columns to ei
value_columns = list(ei.columns[3:])
for frame in (ei, cm):
    for col in value_columns:
        frame[col+'_sigma'] = frame[col]*uncertainty_factor

ei.to_csv(input_path+'ei_emissions_IPCC2013_uncertainties.csv')
cm.to_csv(input_path+'cm_emissions_IPCC2013_uncertainties.csv')