In [56]:
import pandas as pd
import numpy as np
import glob
import os

### Import Data

In [57]:
# Spreadsheets to load: every "Plan*L*.xls" file in the working directory.
paths = glob.glob('./Plan*L*.xls')

# Ten columns arrive with '-' as their header ('-.1', '-.2', ... for the
# repeats); give them descriptive names.
header_names = {
    '-': 'municipal_code',
    '-.1': 'municipality_name',
    '-.2': 'UF',
    '-.3': 'code1',
    '-.4': 'code2',
    '-.5': 'provedor_name',
    '-.6': 'sigla',
    '-.7': 'scope',
    '-.8': 'jur_nature',
    '-.9': 'service_type',
}

# Columns of interest for the emissions calculation.
columns_of_interest = ['municipal_code', 'municipality_name', 'UF', 'service_type',
                       'GE12b', 'GE005', 'ES005', 'ES006', 'ES014', 'ES015']

# Read each workbook once and collect the trimmed frames in a list. A single
# concat at the end avoids re-copying the accumulated rows on every iteration
# and avoids concatenating onto an empty placeholder frame.
frames = []
for path in paths:
    tmp = pd.read_excel(path, skiprows=11)  # the first 11 rows are skipped
    tmp = tmp.rename(columns=header_names)
    tmp = tmp[columns_of_interest]
    frames.append(tmp)

# With no matching files the result is an empty DataFrame, as before.
df1 = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [58]:
folder_path = './others/'

# Ten columns arrive with '-' as their header ('-.1', '-.2', ... for the
# repeats); give them descriptive names.
column_renames = {
    '-': 'municipal_code',
    '-.1': 'municipality_name',
    '-.2': 'UF',
    '-.3': 'code1',
    '-.4': 'code2',
    '-.5': 'provedor_name',
    '-.6': 'sigla',
    '-.7': 'scope',
    '-.8': 'jur_nature',
    '-.9': 'service_type',
}

# Value found in the 'municipal_code' column of the row where the totals
# section starts; that row and everything after it is discarded.
totals_marker = 'TOTALIZAÇÃO DAS INFORMAÇÕES DESAGREGADAS (Desagre):  '

dataframes = []

# sorted() makes the load order (and therefore the row order of df2)
# reproducible; os.listdir returns entries in arbitrary order.
for file in sorted(os.listdir(folder_path)):
    if not file.endswith('.xls'):
        continue
    file_path = os.path.join(folder_path, file)

    df = pd.read_excel(file_path, skiprows=11).rename(columns=column_renames)

    # Row positions holding the totals marker. A file without the marker is
    # kept whole instead of failing with IndexError on an empty match list,
    # and positions (not index labels) are what iloc expects.
    marker_positions = np.flatnonzero((df['municipal_code'] == totals_marker).to_numpy())
    if marker_positions.size > 0:
        df = df.iloc[:marker_positions[0]]

    dataframes.append(df)

if not dataframes:
    raise ValueError(f"No .xls files found in {folder_path!r}; nothing to concatenate")

# Combine all processed DataFrames into one
df2 = pd.concat(dataframes, ignore_index=True)

# Keep only the columns of interest
df2 = df2[['municipal_code', 'municipality_name', 'UF', 'service_type', 'GE12b', 'GE005', 'ES005', 'ES006', 'ES014', 'ES015']]

In [59]:
# Stack both loaded sources row-wise and renumber the index from 0
df_f = pd.concat((df1, df2), axis=0, ignore_index=True)

In [60]:
# New labels, assigned by position: the first four keep their names, the
# remaining six replace the coded headers selected when the files were loaded.
column_names = [
    'municipal_code',
    'municipality_name',
    'UF',
    'service_type',
    'total_resident_population',
    'number_municipalities',
    'collected',
    'treated',
    'imported',
    'exported',
]
df_f.columns = column_names

In [61]:
# Keep only the rows whose service type is not water-only ('Água') and whose
# provider serves exactly one municipality.
# The explicit .copy() makes df_f an independent frame, so the columns that are
# assigned to it later do not trigger pandas' SettingWithCopyWarning.
is_water_only = df_f['service_type'].isin(['Água'])
serves_one_municipality = df_f['number_municipalities'] == 1
df_f = df_f[~is_water_only & serves_one_municipality].copy()

### Formulas

In [62]:
def TOW(P, BOD, I):
    """
    Return the product P * BOD * I.

    Units required:
    - P: cap
    - BOD: kg BOD / cap / yr
    - I: unitless

    With those units the result is in kg BOD / yr.

    example: TOW(520600, 18.25, 1.25) -> 11876187.5
    """
    organic_load = P * BOD
    return organic_load * I

In [63]:
def EFj(Bo, MCFj):
    """
    Return the emission factor Bo * MCFj.

    Units required:
    - Bo: kg CH4 / kg BOD
    - MCFj: unitless

    With those units the result is in kg CH4 / kg BOD.

    example: EFj(0.6, 0.3)
    """
    emission_factor = Bo * MCFj
    return emission_factor

In [64]:
def CH4_emissions(Ui, Tij, EFj, TOW, S, R):
    """
    Return ((Ui * Tij * EFj) * (TOW - S)) - R.

    Units required:
    - Ui: fraction
    - Tij: fraction
    - EFj: kg CH4 / kg BOD
    - TOW: kg BOD / yr
    - S: subtracted from TOW, so it must be in TOW's unit (kg BOD / yr)
    - R: kg CH4 / yr (subtracted from the result)

    Note: the parameters EFj and TOW shadow the module-level functions of the
    same name inside this body; here they are plain numbers.

    example: CH4_emissions(0.16, 0.1, 0.18, 11876187.5, 0, 0)
    """
    weighted_factor = Ui * Tij * EFj
    remaining_load = TOW - S
    return weighted_factor * remaining_load - R

In [65]:
def EF_N2O(protein, Fnpr, F_non_con, F_ind_com):
    """
    Emission factor for N2O based on default values.
    Source: IPCC 2006

    Returns protein * Fnpr * F_non_con * F_ind_com.

    Units required:
        - protein: [kg protein / person / yr]
        - Fnpr: [kg N / kg protein]
        - F_non_con: unitless
        - F_ind_com: unitless
    """
    nitrogen_per_person = protein * Fnpr
    adjusted_for_non_consumed = nitrogen_per_person * F_non_con
    return adjusted_for_non_consumed * F_ind_com

### CH4 Calculations

In [66]:
# Work on an independent copy for CH4 so df_f stays untouched for the N2O part
df_f1 = df_f.copy(deep=True)

In [67]:
# Inputs passed to TOW() as BOD and I
bod = 18.25
i = 1.25

# TOW for every row, computed from that row's resident population
df_f1['TOW'] = df_f1['total_resident_population'].map(
    lambda population: TOW(population, bod, i)
)

In [68]:
# Inputs passed to EFj() as Bo and MCFj
bo = 0.6
mcfj = 0.3

# The same emission factor (and its unit label) is written to every row
ch4_emission_factor = EFj(Bo=bo, MCFj=mcfj)
df_f1['emissionfactor_value'] = ch4_emission_factor
df_f1['emissionfactor_units'] = 'kg/kg BOD'

In [69]:
# Ui and Tij per income group -- valid only for sewer systems
income_group_dic = {
    'rural': dict(Ui=0.16, Tij=0.10),
    'urban_high_income': dict(Ui=0.25, Tij=0.80),
    'urban_low_income': dict(Ui=0.59, Tij=0.40),
}

In [70]:
# Income group keys, in the order they are processed
income_groups = [
    'rural',
    'urban_high_income',
    'urban_low_income',
]

In [71]:
# Add one column per income group holding that group's Ui value
# (the same constant on every row)
for group in income_groups:
    group_parameters = income_group_dic[group]
    df_f1[group] = group_parameters['Ui']

In [72]:
# Go from one column per income group to one row per (municipality, income
# group): the group name lands in 'income_group' and its value in 'Ui'.
# Columns not listed below are dropped by the melt.
ch4_id_columns = [
    'municipal_code', 'municipality_name', 'UF', 'total_resident_population',
    'collected', 'treated', 'imported', 'exported',
    'TOW', 'emissionfactor_value', 'emissionfactor_units',
]
income_group_columns = ['rural', 'urban_high_income', 'urban_low_income']

df_f1 = pd.melt(
    df_f1,
    id_vars=ch4_id_columns,
    value_vars=income_group_columns,
    var_name='income_group',
    value_name='Ui',
)

In [73]:
def _tij_for_group(group_name):
    """Look up the Tij value configured for one income group."""
    return income_group_dic[group_name]['Tij']


# Tij of each row, taken from the row's income group
df_f1['Tij'] = df_f1['income_group'].map(_tij_for_group)

In [74]:
# Assumptions: both subtracted terms of CH4_emissions are taken as zero
S = 0
R = 0


def _row_ch4_emissions(row):
    """Apply CH4_emissions to the Ui, Tij, emission factor and TOW of one row."""
    return CH4_emissions(
        row['Ui'],
        row['Tij'],
        row['emissionfactor_value'],
        row['TOW'],
        S,
        R,
    )


# Calculate CH4 emissions row by row
df_f1['emissions_value_tmp'] = df_f1.apply(_row_ch4_emissions, axis=1)

In [75]:
# Display the current column labels of df_f1
df_f1.columns

Index(['municipal_code', 'municipality_name', 'UF',
       'total_resident_population', 'collected', 'treated', 'imported',
       'exported', 'TOW', 'emissionfactor_value', 'emissionfactor_units',
       'income_group', 'Ui', 'Tij', 'emissions_value_tmp'],
      dtype='object')

In [76]:
# Collapse the per-income-group rows back to one row per municipality by
# summing the emissions of the income groups.
ch4_group_keys = [
    'municipal_code', 'municipality_name', 'UF', 'total_resident_population',
    'collected', 'treated', 'imported', 'exported',
    'TOW', 'emissionfactor_value', 'emissionfactor_units',
]

# dropna=False (pandas >= 1.1): by default groupby discards every row that has
# NaN in ANY key, so a municipality with e.g. a missing 'treated' value would
# silently lose its CH4 rows while its N2O rows are kept. Rows whose emissions
# cannot be computed are still removed later, when NaN/zero emissions are dropped.
df_f1 = (
    df_f1
    .groupby(ch4_group_keys, as_index=False, dropna=False)[['emissions_value_tmp']]
    .sum()
)

In [77]:
def _ch4_metadata(row):
    """Build the metadata dict of one row, embedding that row's TOW value."""
    return {
        "activity_subcategory_type2": 'treatment_type',
        "activity_subcategory_typename2": 'sewer',
        "activity_subcategory_type3": 'TOW',
        "activity_subcategory_typename3": row['TOW'],
    }


# Store one metadata dict per row in a 'metadata' column
df_f1["metadata"] = df_f1.apply(_ch4_metadata, axis=1)

In [78]:
# TOW now lives inside the metadata column; remove the standalone column
df_f1 = df_f1.drop(columns=['TOW'])

In [79]:
# Label every row of the CH4 frame with its gas
df_f1["gas_name"] = "CH4"

### N2O calculations

In [80]:
# Default inputs for EF_N2O
protein, Fnpr = 27.8, 0.16          # [kg protein / person / yr], [kg N / kg protein]
F_non_con, F_ind_com = 1.4, 1.25    # countries with garbage disposals; centralized systems

In [81]:
# emission factor calculation (same value on every row)
df_f['emissionfactor_value'] = EF_N2O(protein, Fnpr, F_non_con, F_ind_com)

# emissions value calculation:
#   population * per-person factor * effluent factor * 44/28
# 44/28 is the N2O / N2 molecular-weight ratio that converts kg N2O-N into
# kg N2O. The previous literal `44.28` was a typo for this ratio and inflated
# the N2O emissions by a factor of about 28.
effluent_factor = 0.01            # presumably kg N2O-N / kg N -- TODO confirm the source of this value
n2o_n_to_n2o = 44 / 28
df_f['emissions_value_tmp'] = (
    df_f['total_resident_population']
    * df_f['emissionfactor_value']
    * effluent_factor
    * n2o_n_to_n2o
)

# assign the gas name and emission factor units
df_f['gas_name'] = 'N2O'
df_f['emissionfactor_units'] = 'kg/person'

In [82]:
def _n2o_metadata(row):
    """Build the metadata dict of one row (identical for every row)."""
    return {
        "activity_subcategory_type2": 'treatment_type',
        "activity_subcategory_typename2": 'sewer',
    }


# Store one metadata dict per row in a 'metadata' column
df_f["metadata"] = df_f.apply(_n2o_metadata, axis=1)

In [83]:
# These two columns were only needed for the row filter; the CH4 frame does
# not carry them either
df_f = df_f.drop(columns=['service_type', 'number_municipalities'])

### Assigning scopes

In [84]:
# Final frame: the N2O rows (df_f) followed by the CH4 rows (df_f1)
df_final = pd.concat((df_f, df_f1), axis=0, ignore_index=True)

In [85]:
# calculate the fraction of each scope, relative to the collected amount.
# A collected amount of 0 would divide by zero and produce +/-inf, which
# survives the later NaN/zero filtering and would be written to the output
# file; treat 0 as missing so both fractions become NaN for those rows.
collected_nonzero = df_final['collected'].replace(0, np.nan)
df_final['fraction_scope1'] = (
    df_final['collected'] - df_final['exported'] - df_final['imported']
) / collected_nonzero
df_final['fraction_scope3'] = df_final['exported'] / collected_nonzero

In [86]:
# Split the emissions between the two reference numbers using the matching
# scope fraction: III.4.1 <- fraction_scope1, III.4.2 <- fraction_scope3
scope_columns = (
    ('III.4.1', 'fraction_scope1'),
    ('III.4.2', 'fraction_scope3'),
)
for refno, fraction_column in scope_columns:
    df_final[refno] = df_final['emissions_value_tmp'] * df_final[fraction_column]

In [87]:
# Display the current column labels of df_final
df_final.columns

Index(['municipal_code', 'municipality_name', 'UF',
       'total_resident_population', 'collected', 'treated', 'imported',
       'exported', 'emissionfactor_value', 'emissions_value_tmp', 'gas_name',
       'emissionfactor_units', 'metadata', 'fraction_scope1',
       'fraction_scope3', 'III.4.1', 'III.4.2'],
      dtype='object')

In [88]:
# Go from one column per reference number to one row per reference number:
# the column name lands in 'GPC_refno' and its value in 'emissions_value'.
# Columns not listed below are dropped by the melt.
final_id_columns = [
    'municipal_code', 'municipality_name', 'UF', 'total_resident_population',
    'emissionfactor_value', 'gas_name', 'emissionfactor_units', 'metadata',
]

df_final = pd.melt(
    df_final,
    id_vars=final_id_columns,
    value_vars=['III.4.1', 'III.4.2'],
    var_name='GPC_refno',
    value_name='emissions_value',
)

In [89]:
# emissions units
df_final['emissions_units'] = 'kg'

# Keep only the rows with a usable emission value: not NaN and not zero.
# Both conditions are applied in one mask and the result is copied, so the
# frame is independent of its parent; an in-place dropna on a filtered slice
# (and later column assignments to it) would raise SettingWithCopyWarning.
emissions = df_final['emissions_value']
df_final = df_final[emissions.notna() & (emissions != 0)].copy()

In [90]:
# Expose the resident population as the activity value and the municipality
# name as the actor name.
# The previous mapping used the key 'population_by_income_group', a column that
# does not exist in df_final, so the rename was silently skipped and no
# 'activity_value' column was ever produced.
df_final = df_final.rename(columns={
    'total_resident_population': 'activity_value',
    'municipality_name': 'actor_name',
})

In [91]:
# Constant activity descriptors, written to every row
activity_fields = {
    'activity_units': 'person',
    'activity_name': 'treatment-type-centralized-aerobic-treatment-plan-well-managed',
}
for column_label, constant_value in activity_fields.items():
    df_final[column_label] = constant_value

In [92]:
# Write the result next to the notebook, without the index column
output_csv = 'emissions_waste.csv'
df_final.to_csv(output_csv, index=False)