# Gridded EPA Methane Inventory
## Extension - GHGI 2022

***
#### Authors: 
Erin E. McDuffie
#### Date Last Updated: 
see Step 0
#### Notebook Purpose: 
This Notebook extends and reports annual gridded (0.1°x0.1°) methane emission fluxes (molec./cm2/s) from Industry (Ferroalloy & Petrochemical production) sources for the years 2012-2020, using updated inventory values from the 2022 National GHGI.  
#### Summary & Notes:
EPA annual national methane emissions are read in for the 2022 GHGI (either from the GHGI workbooks or public data). National emissions are then scaled down to CONUS emissions using the relative fraction of CONUS/total emissions from the v2 data (for each year, held constant after 2018). Remaining CONUS data are then allocated to proxy groups using the relevant proxy mapping files and allocated to the grid using the relative mass of emissions in each grid cell from each group from version 2 (for each year, held constant after 2018). Annual emission fluxes (molec./cm2/s) for 2012-2020 are then written to final netCDFs in the ‘/code/Final_Gridded_Data/Extension/v2_ext_final’ folder.
***

-------
## Step 0. Set-Up Notebook Modules, Functions, and Local Parameters and Constants
-------

In [None]:
import os
import time

# Timestamp of the last save of this notebook file, formatted for display
notebook_path = './2B8_2C2_Industry_extension.ipynb'
modtime = os.path.getmtime(notebook_path)
modificationTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(modtime))

# Confirm working directory (relative input/output paths below depend on it)
working_dir = os.getcwd()
print("This file was last modified on: ", modificationTime)
print('')
print("The directory we are working in is {}".format(working_dir))

In [None]:
## Include plots within notebook
%matplotlib inline

In [None]:
# Import base modules
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import datetime
from copy import copy

# Import additional modules
# Load plotting package Basemap 
from mpl_toolkits.basemap import Basemap

# Load netCDF (for manipulating netCDF file types)
from netCDF4 import Dataset

# Set up ticker
import matplotlib.ticker as ticker

#add path for the global function module (file)
import sys
module_path = os.path.abspath(os.path.join('../Global_Functions/'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Load Tabula (for reading tables from PDFs)
import tabula as tb   
    
# Load user-defined global functions (modules)
import data_load_functions as data_load_fn
import data_functions as data_fn
import data_IO_functions as data_IO_fn
import data_plot_functions as data_plot_fn

In [None]:
#INPUT Files
# Assign global file names (each name is taken by position from the list
# returned by load_global_file_names)
global_filenames = data_load_fn.load_global_file_names()
State_ANSI_inputfile = global_filenames[0]
County_ANSI_inputfile = global_filenames[1]
pop_map_inputfile = global_filenames[2]
Grid_area01_inputfile = global_filenames[3]
Grid_area001_inputfile = global_filenames[4]
Grid_state001_ansi_inputfile = global_filenames[5]
Grid_county001_ansi_inputfile = global_filenames[6]
# First 20 characters of the first global file path, printed as a visual check
globalinputlocation = global_filenames[0][0:20]
print(globalinputlocation)

# Specify names of inputs files used in this notebook
# 2022 GHGI national emissions workbook (ferroalloys and petrochemicals)
EPA_inputfile = '../Global_InputData/GHGI/Ch2_Industry/IPPU CH4 emissions from ferroalloys and petrochemicals 1990-2020.xlsx'

# GHGI source -> emission group -> proxy mapping workbook
Ind_Mapping_inputfile = './InputData/Industry_ProxyMapping.xlsx'

# Gridded emissions by group, read in Step 2
grid_emi_inputfile = '../Final_Gridded_Data/Extension/v2_input_data/Ind_Petro_Ferro_Grid_Emi2.nc'

#OUTPUT FILES
gridded_outputfile = '../Final_Gridded_Data/Extension/v2_ext_final/EXT_EPA_v2_2B8_2C2_Industry.nc'
# Source categories corrected to 2B8 and 2C2 so this description agrees with
# the output file name and with the petrochemical/ferroalloy descriptions below
netCDF_description = '2020 Extension of the Gridded EPA Inventory (v2)- Industry - IPCC Source Category 2B8 and 2C2'
gridded_petro_outputfile = '../Final_Gridded_Data/Extension/v2_ext_final/EXT_EPA_v2_2B8_Industry_Petrochemical.nc'
netCDF_petro_description = 'EXTENSION to the Gridded EPA Inventory - Industry Emissions - IPCC Source Category 2B8 - Petrochemical'
gridded_ferro_outputfile = '../Final_Gridded_Data/Extension/v2_ext_final/EXT_EPA_v2_2C2_Industry_Ferroalloy.nc'
netCDF_ferro_description = 'EXTENSION to the Gridded EPA Inventory - Industry Emissions - IPCC Source Category 2C2 - Ferroalloy'

# Plot titles
title_str = "EPA methane emissions from industry"
title_petro_str = "EPA methane emissions from petrochemical industry"
title_ferro_str = "EPA methane emissions from ferroalloy production"
title_diff_str = "Emissions from industry total difference: 2020-2012"
title_petro_diff_str = "Emissions from petrochemical difference: 2020-2012"
title_ferro_diff_str = "Emissions from ferroalloy total difference: 2020-2012"


In [None]:
# Define local variables
start_year = 2012  #First year in emission timeseries
end_year = 2018    #Last year in emission timeseries
ext_year = 2020    #last year in extended dataset
# Index of end_year along the year axis; derived from the settings above so it
# stays correct if start_year or end_year is changed
end_year_idx = end_year - start_year
year_range = [*range(start_year, ext_year+1,1)] #List of emission years
year_range_str=[str(i) for i in year_range]
num_years = len(year_range)

# Define constants
Avogadro   = 6.02214129 * 10**(23)  #molecules/mol
Molarch4   = 16.04                  #g/mol
Res01      = 0.1                    # degrees
Res_01     = 0.01
tg_scale   = 0.001                  #Tg scale number [New file allows for the exclusion of the territories] 

# Continental US Lat/Lon Limits (for netCDF files)
Lon_left = -130       #deg
Lon_right = -60       #deg
Lat_low  = 20         #deg
Lat_up  = 55          #deg
loc_dimensions = [Lat_low, Lat_up, Lon_left, Lon_right]

# Row/column bounds of the CONUS window, computed from the limits above at Res01 spacing
ilat_start = int((90+Lat_low)/Res01) #1100:1450 (continental US range)
ilat_end = int((90+Lat_up)/Res01)
ilon_start = abs(int((-180-Lon_left)/Res01)) #500:1200 (continental US range)
ilon_end = abs(int((-180-Lon_right)/Res01))

# Number of days in each month
month_day_leap  = [  31,  29,  31,  30,  31,  30,  31,  31,  30,  31,  30,  31]
month_day_nonleap = [  31,  28,  31,  30,  31,  30,  31,  31,  30,  31,  30,  31]

# Month arrays
month_range_str = ['January','February','March','April','May','June','July','August','September','October','November','December']
num_months = len(month_range_str)

# Load the grid-cell area maps and their coordinate vectors
area_map, lat001, lon001 = data_load_fn.load_area_map_001(Grid_area001_inputfile)
area_map01, Lat01, Lon01 = data_load_fn.load_area_map_01(Grid_area01_inputfile)[0:3]
#Select relevant Continental 0.1 x0.1 domain
Lat_01 = Lat01[ilat_start:ilat_end]
Lon_01 = Lon01[ilon_start:ilon_end]
# Cell areas on the 0.1 degree CONUS grid (used as the flux denominator in Step 4.2)
area_matrix_01 = data_fn.regrid001_to_01(area_map, Lat_01, Lon_01)
area_matrix_01 *= 10000  #convert from m2 to cm2


In [None]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;

In [None]:
# Record the wall-clock start of the run; `it` (seconds since the epoch) is
# subtracted from the finish time in the last cell to report the elapsed time
ct = datetime.datetime.now()
print("current time:", ct)
it = ct.timestamp()

## Step 1. Read in Gridding Groups

In [None]:
#load GHGI Mapping Groups
# With skiprows=1 and header=0 the column names come from the second sheet row,
# so a single read returns the same table the earlier two-pass read produced.
ghgi_ind_map = pd.read_excel(Ind_Mapping_inputfile, sheet_name = "GHGI Map - Ind", usecols = "A:B", skiprows = 1, header = 0)
#drop rows with no data
ghgi_ind_map = ghgi_ind_map[ghgi_ind_map['GHGI_Emi_Group'] != 'na']
ghgi_ind_map = ghgi_ind_map[ghgi_ind_map['GHGI_Emi_Group'].notna()]
# Remove literal parentheses from the source names. regex=False is passed
# explicitly because the default for `regex` differs between pandas versions;
# the names are later joined into a regular expression in Step 3.2, where an
# unescaped parenthesis would be read as a group.
for paren in ("(", ")"):
    ghgi_ind_map['GHGI_Source'] = ghgi_ind_map['GHGI_Source'].str.replace(paren, "", regex=False)
ghgi_ind_map.reset_index(inplace=True, drop=True)
display(ghgi_ind_map)

#load emission group - proxy map
proxy_ind_map = pd.read_excel(Ind_Mapping_inputfile, sheet_name = "Proxy Map - Ind", usecols = "A:D", skiprows = 1, header = 0)
display((proxy_ind_map))

-----------
## Step 2. Read in v2 Grid Group Emissions
----------

In [None]:
#These data will be assigned to 'Proxy_'+ghgi_emi_name (because original proxy mapping is not 1:1 with GHGI group)
#All proxy data will be in 0.1x0.1xyear dimensions
#assign 2018 values to years 2019 and 2020

# Position of end_year on the year axis. Computed here instead of reading
# end_year_idx, because Step 3.2 rebinds that name to a DataFrame column
# position and re-running this cell afterwards would then copy the wrong year.
last_v2_idx = year_range.index(end_year)
# For every output year, the input year index to copy from
# (years after end_year reuse the end_year slice)
source_year_idx = [min(iyear, last_v2_idx) for iyear in range(num_years)]

Emissions_nongrid = np.zeros([num_years])

# The context manager closes the input file even if a variable is missing
with Dataset(grid_emi_inputfile, 'r', format='NETCDF4') as nc_in:
    for igroup in np.arange(0,len(proxy_ind_map)):
        group_name = proxy_ind_map.loc[igroup,'GHGI_Emi_Group']
        temp = nc_in['Ext_'+group_name][:,:,:]
        proxy_grid = np.zeros([len(Lat_01),len(Lon_01),num_years])
        for iyear in np.arange(0,num_years):
            proxy_grid[:,:,iyear] = temp[:,:,source_year_idx[iyear]]
        # Later cells look these arrays up by name through vars()
        vars()['Proxy_'+group_name] = proxy_grid

    for iyear in np.arange(0,num_years):
        Emissions_nongrid[iyear] = nc_in['Emissions_nongrid'][source_year_idx[iyear]]

CONUS_frac = np.zeros([num_years])

for iyear in np.arange(0, num_years):
    # Reset for every year: the previous version kept adding to one running
    # total, so each year after the first was divided by a multi-year sum
    sum_emi = 0
    for igroup in np.arange(0,len(proxy_ind_map)):
        sum_emi += np.sum( vars()['Proxy_'+proxy_ind_map.loc[igroup,'GHGI_Emi_Group']][:,:,iyear])
    # NOTE(review): this is the non-gridded total divided by the gridded total,
    # and Step 4.1 applies it as (1-CONUS_frac), i.e. as the share that is NOT
    # gridded. Confirm that the name and the denominator are what is intended.
    CONUS_frac[iyear] = Emissions_nongrid[iyear]/sum_emi

print(CONUS_frac)

-----------
## Step 3. Read in and Format 2022 US EPA GHGI Emissions
----------

In [None]:
# Read Petrochemical and Ferroalloy GHGI emissions (1990-2020), in kt

# Column labels are taken from the header row found after skipping 11 sheet rows
names = pd.read_excel(EPA_inputfile, skiprows=11,usecols='B:AH')
colnames = names.columns.values
# Inventory years before the gridded time series starts; dropped from both tables
pre_start_years = [*range(1990, start_year,1)]

#Petrochemicals
# nrows=1 limits the read to a single data row (`rows` is not a read_excel parameter)
EPA_petro_emissions = pd.read_excel(EPA_inputfile, skiprows = 14, nrows=1,names = colnames,usecols='B:AH')
EPA_petro_emissions = EPA_petro_emissions.drop(columns = pre_start_years)
EPA_petro_emissions= EPA_petro_emissions.drop(columns = ['Unnamed: 2'])
EPA_petro_emissions['Source'] = 'Total Petrochemicals'

#Ferroalloy
# NOTE(review): this read uses the same skiprows value as the petrochemical read
# above, so both tables are built from the same worksheet row. Confirm which row
# of the workbook holds the ferroalloy total and adjust skiprows if needed.
EPA_ferro_emissions = pd.read_excel(EPA_inputfile, skiprows = 14, nrows=1,names=colnames,usecols='B:AH')
EPA_ferro_emissions= EPA_ferro_emissions.drop(columns = ['Unnamed: 2'])
EPA_ferro_emissions = EPA_ferro_emissions.drop(columns = pre_start_years)
EPA_ferro_emissions['Source'] = 'Total Ferroalloy'

EPA_Industry = pd.concat([EPA_petro_emissions,EPA_ferro_emissions])
display(EPA_Industry)

# Per-year industry total (first row of each table, every column after the first);
# used as the reference value in the QA/QC checks that follow
Total_EPA_Industry_Emissions = EPA_ferro_emissions.iloc[0,1:]+EPA_petro_emissions.iloc[0,1:]

#### 3.2. Split Emissions into Gridding Groups

In [None]:
# Final Emissions in Units of kt
# Use mapping proxy and source files to split the GHGI emissions

DEBUG =1

# Column position of the first emission year in EPA_Industry. The global
# end_year_idx is deliberately left untouched here: Step 2 uses it as a
# year-axis index, and nothing in this cell needs an end-column position.
start_year_idx = EPA_Industry.columns.get_loc(start_year)

ghgi_ind_groups = ghgi_ind_map['GHGI_Emi_Group'].unique()

#loop through all groups, finding the GHGI sources in that group and summing emissions for each year
for igroup in np.arange(0,len(ghgi_ind_groups)):
    group_name = ghgi_ind_groups[igroup]
    source_temp = ghgi_ind_map.loc[ghgi_ind_map['GHGI_Emi_Group'] == group_name, 'GHGI_Source']
    # One regular expression that matches any source name belonging to the group
    pattern_temp  = '|'.join(source_temp)
    ##DEBUG## display(pattern_temp)
    emi_temp = EPA_Industry[EPA_Industry['Source'].str.contains(pattern_temp)]
    ##DEBUG## display(emi_temp)
    year_values = emi_temp.iloc[:,start_year_idx:]
    # Later cells look the group totals up by name through vars()
    vars()[group_name] = np.zeros([num_years])
    # Empty-string cells count as zero; sum over the matching source rows
    vars()[group_name][:] = np.where(year_values =='',[0],year_values).sum(axis=0)#/float(1000) #convert Mg to kt

#Check against total summary emissions 
print('QA/QC #1: Check Processing Emission Sum against GHGI Summary Emissions')
for iyear in np.arange(0,num_years): 
    sum_emi = 0
    for igroup in np.arange(0,len(ghgi_ind_groups)):
        sum_emi += vars()[ghgi_ind_groups[igroup]][iyear]

    summary_emi = Total_EPA_Industry_Emissions[year_range[iyear]]  
    #Check 1 - make sure that the sums from all the groups equal the totals reported
    # Relative difference, expressed as a fraction of the mean of the two values
    diff1 = abs(sum_emi - summary_emi)/((sum_emi + summary_emi)/2)
    if DEBUG ==1:
        print(summary_emi)
        print(sum_emi)
    if diff1 < 0.0001:
        print('Year ', year_range[iyear],': PASS, difference < 0.01%')
    else:
        # diff1 is a fraction, so scale by 100 to match the '%' label
        print('Year ', year_range[iyear],': FAIL (check Production & summary tabs): ', diff1*100,'%') 

--------------
## Step 4. Grid Data
-------------

#### Step 4.1 Allocate emissions to the CONUS region (0.1x0.1)

In [None]:
# Spread each group's national total over the 0.1x0.1 grid in proportion to that
# group's 'Proxy_' array; the part that is not gridded is accumulated separately.

DEBUG =1

#Define emission arrays
grid_shape = [len(Lat_01),len(Lon_01),num_years]
Emissions_array_01 = np.zeros(grid_shape)
Emissions_Ferro = np.zeros(grid_shape)
Emissions_Petro = np.zeros(grid_shape)
Emissions_nongrid = np.zeros([num_years])

print('**QA/QC Check: Sum of national gridded emissions vs. GHGI national emissions')

# For each group and year, distribute the national emissions with the proxy
# named in the Proxy_Mapping file
for igroup in range(len(proxy_ind_map)):
    group_name = proxy_ind_map.loc[igroup,'GHGI_Emi_Group']
    proxy_temp = vars()['Proxy_'+group_name][:,:,:]
    group_national = vars()[group_name]

    # Register the per-group output arrays under their dynamic names
    group_flux = np.zeros(grid_shape)
    vars()['Flux_'+group_name] = group_flux
    vars()['Flux_'+group_name+'_nongrid'] = np.zeros([num_years])

    is_ferro = 'Ferro' in group_name
    is_petro = 'Petro' in group_name

    for iyear in range(num_years):
        year_proxy = proxy_temp[:,:,iyear]
        # Share of the group's proxy total that falls in each grid cell
        proxy_frac = data_fn.safe_div(year_proxy, np.sum(year_proxy))
        # Portion of the national total that is placed on the grid
        ghgi_temp = group_national[iyear] * (1-CONUS_frac[iyear])
        group_flux[:,:,iyear] += ghgi_temp * proxy_frac

        gridded_year = group_flux[:,:,iyear]
        if is_ferro:
            Emissions_Ferro[:,:,iyear] += gridded_year
        if is_petro:
            Emissions_Petro[:,:,iyear] += gridded_year
        Emissions_array_01[:,:,iyear] += gridded_year
        # Whatever was not gridded is carried in the non-gridded total
        Emissions_nongrid[iyear] += group_national[iyear] - ghgi_temp

# Gridded plus non-gridded emissions should reproduce the GHGI total for every year
for iyear in range(num_years):
    gridded_total = np.sum(Emissions_Petro[:,:,iyear]) +np.sum(Emissions_Ferro[:,:,iyear])
    calc_emi = gridded_total + np.sum(Emissions_nongrid[iyear])
    summary_emi = Total_EPA_Industry_Emissions[year_range[iyear]]
    emi_diff = abs(summary_emi-calc_emi)/((summary_emi+calc_emi)/2)
    if DEBUG==1:
        print(calc_emi)
        print(summary_emi)
    if abs(emi_diff) < 0.0001:
        print('Year '+ year_range_str[iyear]+': Difference < 0.01%: PASS')
    else: 
        print('Year '+ year_range_str[iyear]+': Difference > 0.01%: FAIL, diff: '+str(emi_diff))

ct = datetime.datetime.now() 
print("current time:", ct)

#### 4.2 Calculate Gridded Emission Fluxes (molec./cm2/s) (0.1x0.1)

In [None]:
# Convert gridded emissions to emission fluxes
# conversion: kt emissions to molec/cm2/s flux
import calendar

DEBUG=1

#Initialize arrays
check_sum_annual = np.zeros([num_years])
Flux_array_01_annual = np.zeros([len(Lat_01),len(Lon_01),num_years])
Flux_array_01_ferro_annual = np.zeros([len(Lat_01),len(Lon_01),num_years])
Flux_array_01_petro_annual = np.zeros([len(Lat_01),len(Lon_01),num_years])
for igroup in np.arange(0,len(proxy_ind_map)):
    vars()['Flux_'+proxy_ind_map.loc[igroup,'GHGI_Emi_Group']+'_annual'] = np.zeros([len(Lat_01),len(Lon_01),num_years])


#Calculate fluxes
for iyear in np.arange(0,num_years):
    # calendar.isleap handles every year in year_range; the earlier hard-coded
    # 2012/2016 test treated 2020 as a 365-day year
    if calendar.isleap(year_range[iyear]):
        month_days = month_day_leap
    else:
        month_days = month_day_nonleap
    year_days = np.sum(month_days)

    # calculate fluxes for annual data  (=kt * grams/kt *molec/mol *mol/g *s^-1 * cm^-2)
    conversion_factor_annual = 10**9 * Avogadro / float(Molarch4 * year_days * 24 * 60 *60) / area_matrix_01

    # NOTE: the per-group 'Flux_' arrays are converted in place, so running this
    # cell twice without re-running Step 4.1 applies the factor twice.
    for igroup in np.arange(0,len(proxy_ind_map)):
        group_name = proxy_ind_map.loc[igroup,'GHGI_Emi_Group']
        vars()['Flux_'+group_name][:,:,iyear] *= conversion_factor_annual
        vars()['Flux_'+group_name+'_annual'][:,:,iyear] = vars()['Flux_'+group_name][:,:,iyear]

    # These totals do not depend on the group, so they are converted once per year
    Flux_array_01_annual[:,:,iyear] = Emissions_array_01[:,:,iyear]*conversion_factor_annual
    Flux_array_01_ferro_annual[:,:,iyear] = Emissions_Ferro[:,:,iyear]*conversion_factor_annual
    Flux_array_01_petro_annual[:,:,iyear] = Emissions_Petro[:,:,iyear]*conversion_factor_annual

    #convert back to emissions to check at end
    check_sum_annual[iyear] = np.sum(Flux_array_01_ferro_annual[:,:,iyear]/conversion_factor_annual) +\
                                np.sum(Flux_array_01_petro_annual[:,:,iyear]/conversion_factor_annual)

print(' ')
print('QA/QC #2: Check final gridded fluxes against GHGI')  
# for the sum, check the converted annual emissions (convert back from flux) plus all the non-gridded emissions
for iyear in np.arange(0,num_years):
    calc_emi = check_sum_annual[iyear] + Emissions_nongrid[iyear]
    summary_emi = Total_EPA_Industry_Emissions[year_range[iyear]]
    if DEBUG==1:
        print(calc_emi)
        print(summary_emi)
    diff = abs(summary_emi-calc_emi)/((summary_emi+calc_emi)/2)
    if diff < 0.0001:
        print('Year ', year_range[iyear], ': PASS, difference < 0.01%')
    else:
        print('Year ', year_range[iyear], ': FAIL -- Difference = ', diff*100,'%')

# Names used by the netCDF-writing and plotting cells
Flux_Emissions_Total_annual = Flux_array_01_annual
Flux_Emissions_Petro_annual = Flux_array_01_petro_annual
Flux_Emissions_Ferro_annual = Flux_array_01_ferro_annual

-------------
## Step 5. Write netCDF
------------

In [None]:
# Write the total, petrochemical and ferroalloy annual fluxes to their netCDF files
# (output path, file description, flux array) for each file
netcdf_outputs = [
    (gridded_outputfile, netCDF_description, Flux_Emissions_Total_annual),        #Total
    (gridded_petro_outputfile, netCDF_petro_description, Flux_Emissions_Petro_annual),  #Petro
    (gridded_ferro_outputfile, netCDF_ferro_description, Flux_Emissions_Ferro_annual),  #Ferro
]

# Initialize netCDF files
for out_path, out_description, _ in netcdf_outputs:
    data_IO_fn.initialize_netCDF(out_path, out_description, 0, year_range, loc_dimensions, Lat_01, Lon_01)

# Write the Data to netCDF
for out_path, _, out_flux in netcdf_outputs:
    # The context manager closes the file even if the assignment fails
    with Dataset(out_path, 'r+', format='NETCDF4') as nc_out:
        nc_out.variables['emi_ch4'][:,:,:] = out_flux
    #Confirm file location
    # abspath resolves the relative '../' path against the working directory;
    # joining os.getcwd() and the relative path as plain strings gave a
    # location that does not exist
    print('** SUCCESS **')
    print("Gridded industry fluxes written to file: {}" .format(os.path.abspath(out_path)))
    print('')

----------
## Step 6. Plot Gridded Data
---------

#### Step 6.1. Plot Annual Emission Fluxes

In [None]:
# Map the total industry flux for every year in the time series.
# These three settings are also read by the per-group plotting cell.
scale_max = 2
save_flag = 0
save_outfile = ''
data_plot_fn.plot_annual_emission_flux_map(
    Flux_Emissions_Total_annual,
    Lat_01,
    Lon_01,
    year_range,
    title_str,
    scale_max,
    save_flag,
    save_outfile,
)

In [None]:
# Draw one annual flux map per proxy group, using the group name as the title
for igroup in range(len(proxy_ind_map)):
    group_name = proxy_ind_map.loc[igroup,'GHGI_Emi_Group']
    temp_plot = vars()['Flux_'+group_name]
    data_plot_fn.plot_annual_emission_flux_map(
        temp_plot, Lat_01, Lon_01, year_range, group_name,
        scale_max, save_flag, save_outfile,
    )

#### Step 6.2 Plot Difference between first and last inventory year

In [None]:
# Map of the change in total industry flux over the time series
# (see title_diff_str for the years being compared)
data_plot_fn.plot_diff_emission_flux_map(
    Flux_Emissions_Total_annual,
    Lat_01,
    Lon_01,
    year_range,
    title_diff_str,
    save_flag,
    save_outfile,
)

In [None]:
# Report the total run time, measured from the start timestamp `it`
# recorded near the top of the notebook
ct = datetime.datetime.now()
ft = ct.timestamp()
seconds_per_hour = 60*60
time_elapsed = (ft-it)/seconds_per_hour
print('Time to run: '+str(time_elapsed)+' hours')
print('** EXTENSION_GEPA_2B8_2C2_Industry: COMPLETE **')