In [1]:
import pandas as pd
import numpy as np
import glob
import os

In [2]:
# Load every workbook matching ./Plan*L*.xls and stack them into df1.
# skiprows=11 drops the first 11 spreadsheet rows; the columns that come back
# named '-', '-.1', ... '-.9' are given descriptive names below.
# The paths are sorted so the row order of df1 is the same on every run.
paths = sorted(glob.glob('./Plan*L*.xls'))

plan_frames = []

for path in paths:
    tmp = pd.read_excel(path, skiprows=11)

    tmp = tmp.rename(columns={'-': 'municipal_code',
                              '-.1': 'municipality_name',
                              '-.2': 'UF',
                              '-.3': 'code1',
                              '-.4': 'code2',
                              '-.5': 'provedor_name',
                              '-.6': 'sigla',
                              '-.7': 'scope',
                              '-.8': 'jur_nature',
                              '-.9': 'service_type'})

    plan_frames.append(tmp)

# Concatenate once instead of growing df1 inside the loop (which re-copies all
# previously loaded rows on every iteration). When no file matched, df1 is an
# empty frame, exactly as before.
df1 = pd.concat(plan_frames, ignore_index=True) if plan_frames else pd.DataFrame()

In [3]:
# Load every .xls workbook found in ./others/ and stack them into df2.
# These sheets contain a totalisation marker row in the 'municipal_code'
# column; that row and everything after it is cut off.
folder_path = './others/'

dataframes = []

# sorted() makes the row order of df2 independent of the directory listing order
for file in sorted(os.listdir(folder_path)):
    if not file.endswith('.xls'):
        continue

    file_path = os.path.join(folder_path, file)

    df = pd.read_excel(file_path, skiprows=11)

    df = df.rename(columns={'-': 'municipal_code',
                            '-.1': 'municipality_name',
                            '-.2': 'UF',
                            '-.3': 'code1',
                            '-.4': 'code2',
                            '-.5': 'provedor_name',
                            '-.6': 'sigla',
                            '-.7': 'scope',
                            '-.8': 'jur_nature',
                            '-.9': 'service_type'})

    # Positional locations of the marker row(s). A sheet without the marker
    # yields an empty array and is kept whole instead of raising IndexError.
    is_totals_row = (
        df['municipal_code'] == 'TOTALIZAÇÃO DAS INFORMAÇÕES DESAGREGADAS (Desagre):  '
    ).to_numpy()
    totals_positions = np.flatnonzero(is_totals_row)

    if totals_positions.size > 0:
        df = df.iloc[:totals_positions[0]]

    dataframes.append(df)

# Combine all processed DataFrames into one (empty frame when nothing was loaded)
df2 = pd.concat(dataframes, ignore_index=True) if dataframes else pd.DataFrame()

In [4]:
# Stack df1 on top of df2 row-wise and renumber the index from 0
df_f = pd.concat(objs=[df1, df2], axis=0, ignore_index=True)

In [5]:
# Persist the combined, unfiltered table (index column omitted)
df_f.to_csv(path_or_buf='wastewater_raw.csv', index=False)

In [6]:
# keep only the columns of interest
df_f = df_f[[
    'municipality_name',
    'service_type',
    'GE12b',
    'GE005',
    'ES005',
    'ES006',
    'ES014',
    'ES015',
]]

In [8]:
# Descriptive labels, assigned by position to the columns currently in df_f
column_names = [
    'municipality_name',
    'service_type',
    'total_resident_population',
    'number_municipalities',
    'collected',
    'treated',
    'imported',
    'exported',
]
df_f = df_f.set_axis(column_names, axis='columns')

In [9]:
# Select only the rows related to the sewer system information (service_type
# other than 'Água') where exactly one municipality is attended by the service.
# .copy() detaches the result from the unfiltered frame, so the columns added
# to df_f in later cells are written to an independent DataFrame rather than
# to a slice (which triggers SettingWithCopyWarning).
df_f = df_f.loc[
    ~df_f['service_type'].isin(['Água']) & (df_f['number_municipalities'] == 1)
].copy()

### Formulas

In [7]:
def TOW(P, BOD, I):
    """
    Return the product P * BOD * I.

    Expected units:
    - P: cap
    - BOD: kg BOD / cap / yr
    - I: unitless

    Example:
        TOW(520600, 18.25, 1.25)
    """
    load = P * BOD
    return load * I

In [8]:
def EFj(Bo, MCFj):
    """
    Return the product Bo * MCFj.

    Expected units:
    - Bo: kg CH4 / kg BOD
    - MCFj: unitless

    Example:
        EFj(0.6, 0.3)
    """
    emission_factor = Bo * MCFj
    return emission_factor

In [9]:
def CH4_emissions(income_group_dic, EF_dic, df, TOW_column, S, R):
    """
    Evaluate the CH4 formula row by row for a DataFrame holding TOW values.

    Formula:
        (sum over i,j of (U_i * T_ij * EF_j)) * (TOW - S) - R

    Parameters:
        income_group_dic (dict): Nested dictionary; each value holds a 'U*T'
            mapping of treatment type -> U*T share expressed as a percentage.
        EF_dic (dict): EF_j value per treatment type. Treatment types missing
            from this dictionary contribute an emission factor of 0.
        df (pd.DataFrame): DataFrame containing the TOW values.
        TOW_column (str): Name of the TOW column in ``df``.
        S (float): Scalar S, subtracted from TOW.
        R (float): Scalar R, subtracted from the product.

    Returns:
        pd.Series: One result per row of ``df``.

    Side effects:
        Writes the summed factor into ``df['emissionfactor_value']`` in place
        (the same scalar on every row).
    """
    weighted_ef = 0

    # Accumulate (U*T share as a fraction) x EF_j over every income group
    # and every treatment type listed for it.
    for group in income_group_dic.values():
        shares = group['U*T']
        for treatment in shares:
            weighted_ef += (shares[treatment] / 100) * EF_dic.get(treatment, 0)

    df['emissionfactor_value'] = weighted_ef

    # factor * (TOW - S) - R, evaluated column-wise
    net_load = df[TOW_column] - S
    return df['emissionfactor_value'] * net_load - R

In [10]:
def N_effluent(population, protein, Fnpr, F_non_con, F_ind_com):
    """
    Total annual amount of nitrogen in the wastewater effluent, kg N/yr.
    Source: IPCC 2006

    Units required:
        - population: [person]
        - protein: [kg protein / person / yr]
        - Fnpr: [kg N / kg protein]
        - F_non_con: unitless
        - F_ind_com: unitless
    """
    nitrogen = population * protein * Fnpr
    return nitrogen * F_non_con * F_ind_com

In [11]:
# U*T shares (percentages) per income group, one value per treatment type.
# Each row of the table below lists the shares in the order
# None, Sewer, Septic tank, Latrine, Other.
income_group_dic = {
    group: {
        'U*T': dict(zip(('None', 'Sewer', 'Septic tank', 'Latrine', 'Other'), shares))
    }
    for group, shares in (
        ('high',  (0,    20,   0, 5,    0)),
        ('low',   (11.8, 23.6, 0, 23.6, 0)),
        ('rural', (7,    2,    0, 7,    0)),
    )
}

In [12]:
# EF_j per treatment type; keys match the 'U*T' keys of income_group_dic.
# NOTE(review): units are presumably kg CH4 / kg BOD — confirm against the source.
EF_dic = dict([
    ('None', 0.057),
    ('Sewer', 0.0756),
    ('Septic tank', 0),
    ('Latrine', 0.15),
    ('Other', 0),
])

### CH4 Calculations

In [13]:
# Independent working copy for the CH4 branch; df_f itself is reused for N2O
df_f1 = df_f.copy(deep=True)

In [14]:
# Inputs to TOW(); see its docstring for the expected units.
# NOTE(review): 18.25 presumably equals 50 g BOD/person/day * 365 days — confirm.
bod = 18.25
i = 1.25

# TOW() is a plain product, so it is evaluated on the whole column at once
df_f1['TOW'] = TOW(df_f1['total_resident_population'], bod, i)

In [15]:
# Scalars and column name fed into CH4_emissions()
S = 0
R = 0
TOW_column = 'TOW'

# Per-row CH4 result; the call also writes 'emissionfactor_value' into df_f1
df_f1['emissions_value_tmp'] = CH4_emissions(
    income_group_dic=income_group_dic,
    EF_dic=EF_dic,
    df=df_f1,
    TOW_column=TOW_column,
    S=S,
    R=R,
)

In [16]:
# Attach one metadata dict per row, recording that row's TOW value
df_f1["metadata"] = df_f1.apply(
    lambda record: dict(
        activity_subcategory_type1='treatment_type',
        activity_subcategory_typename1='all',
        activity_subcategory_type2='TOW',
        activity_subcategory_typename2=record['TOW'],
    ),
    axis=1,
)

In [17]:
# Helper columns are no longer needed once the emissions and metadata exist
df_f1 = df_f1.drop(columns=['TOW', 'service_type', 'number_municipalities'])

In [18]:
# Every row of the CH4 branch carries the same gas label
df_f1 = df_f1.assign(gas_name='CH4')

In [19]:
# Units label for the CH4 emission factor column
df_f1 = df_f1.assign(emissionfactor_units='kg / kg BOD')

### N2O calculations

In [20]:
# Inputs to N_effluent()

# [kg protein / person / yr]
protein = 33.58
# [kg N / kg protein]
Fnpr = 0.16

# centralized systems
F_ind_com = 1.25
# for countries with garbage disposals
F_non_con = 1.4

In [21]:
# Nitrogen in the effluent for every row (input to the N2O emissions below).
# N_effluent() is a plain product, so it is evaluated on the whole population
# column at once. Do not route this through Series.apply(..., axis=1):
# Series.apply forwards unknown keyword arguments to the callable, so `axis`
# would reach a one-argument function and raise TypeError.
df_f['N_effluent'] = N_effluent(
    df_f['total_resident_population'], protein, Fnpr, F_non_con, F_ind_com
)

In [22]:
# Constant N2O emission factor and its units label, identical on every row
df_f = df_f.assign(
    emissionfactor_value=0.005,
    emissionfactor_units='kg N2O-N / kg N',
)

In [23]:
# N2O emissions: N_effluent times the emission factor; the 44/28 ratio turns
# the factor's N2O-N basis into N2O mass
n2o_per_kg_n = df_f['emissionfactor_value'] * 44 / 28
df_f['emissions_value_tmp'] = df_f['N_effluent'] * n2o_per_kg_n

In [24]:
# Every row of the N2O branch carries the same gas label
df_f = df_f.assign(gas_name='N2O')

In [25]:
# Attach one metadata dict per row, recording that row's N_effluent value
df_f["metadata"] = df_f.apply(
    lambda record: dict(
        activity_subcategory_type1='treatment_type',
        activity_subcategory_typename1='all',
        activity_subcategory_type2='N_effluent',
        activity_subcategory_typename2=record['N_effluent'],
    ),
    axis=1,
)

In [26]:
# Helper columns are no longer needed once the emissions and metadata exist
df_f = df_f.drop(columns=['N_effluent', 'service_type', 'number_municipalities'])

### Assigning scopes

In [27]:
# Final table: N2O rows (df_f) followed by CH4 rows (df_f1), index renumbered
df_final = pd.concat(objs=[df_f, df_f1], axis=0, ignore_index=True)

In [28]:
# calculate the fraction of each scope, relative to the collected volume:
#   scope 1 = (collected - exported - imported) / collected
#   scope 3 = exported / collected
# A collected volume of 0 is treated as missing so that the division yields
# NaN instead of +/-inf; NaN rows can then be removed with dropna, whereas inf
# would pass both a "!= 0" filter and dropna and end up in the output.
collected_volume = df_final['collected'].replace(0, np.nan)
df_final['fraction_scope1'] = (
    df_final['collected'] - df_final['exported'] - df_final['imported']
) / collected_volume
df_final['fraction_scope3'] = df_final['exported'] / collected_volume

In [29]:
# Split the total emissions between the two reference numbers using the
# scope fractions: III.4.1 <- scope 1 share, III.4.2 <- scope 3 share
df_final = df_final.assign(**{
    'III.4.1': df_final['emissions_value_tmp'] * df_final['fraction_scope1'],
    'III.4.2': df_final['emissions_value_tmp'] * df_final['fraction_scope3'],
})

In [30]:
# Reshape to long format: one row per (record, GPC reference number), with the
# reference number in 'GPC_refno' and its emissions in 'emissions_value'
id_columns = [
    'municipality_name',
    'total_resident_population',
    'emissionfactor_value',
    'gas_name',
    'emissionfactor_units',
    'metadata',
]
scope_columns = ['III.4.1', 'III.4.2']

df_final = pd.melt(
    df_final,
    id_vars=id_columns,
    value_vars=scope_columns,
    var_name='GPC_refno',
    value_name='emissions_value',
)

In [31]:
# emissions units
df_final['emissions_units'] = 'kg'

# Drop the rows with zero or missing (NaN) emissions in a single step.
# The filtered result is copied so that df_final is an independent frame:
# later cells add and rename columns on it, which would otherwise operate on a
# slice of the unfiltered table (SettingWithCopyWarning) after an in-place dropna.
has_emissions = df_final['emissions_value'].notna() & (df_final['emissions_value'] != 0)
df_final = df_final.loc[has_emissions].copy()

In [32]:
# The resident population becomes the activity value and the municipality
# name becomes the actor name
df_final = df_final.rename(
    columns={
        'total_resident_population': 'activity_value',
        'municipality_name': 'actor_name',
    }
)

In [33]:
# Constant activity units and activity name, identical on every row
df_final = df_final.assign(
    activity_units='person',
    activity_name='treatment-type-centralized-aerobic-treatment-plan-well-managed',
)

In [34]:
# Write the final long-format emissions table (index column omitted)
df_final.to_csv(path_or_buf='emissions_waste.csv', index=False)