# Merging the GRASS outputs, then joining the CEP attributes onto the merged output
Execution time: approximately 1 minute

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Both locations come from the environment so no paths are hard-coded in the
# notebook; a missing variable raises KeyError right here.
#   GRASS_OUTPUT_DIRECTORY  -> directory holding the GRASS .csv outputs
#   CEP_ATTRIBUTES_CSV_PATH -> csv file with the CEP attributes
output_directory, cep_attributes_path = (
    os.environ['GRASS_OUTPUT_DIRECTORY'],
    os.environ['CEP_ATTRIBUTES_CSV_PATH'],
)

In [3]:
class MergingDataframes():
    """
    Merge the csv outputs of the GRASS script into a single dataframe and
    attach the CEP attributes to it.

    Typical use: call aggregate_grass_outputs() first, then
    merge_cep_attributes(). The current result is kept in ``self.df``
    (None until aggregate_grass_outputs() has run).
    """
    def __init__(self, output_directory, cep_attributes_path):
        """
        :param output_directory: directory containing the GRASS csv outputs
        :param cep_attributes_path: path of the CEP attributes csv (stored only;
            the methods of this class do not read it)
        """
        self.df = None
        self.output_directory = output_directory
        self.cep_attributes_path = cep_attributes_path

    def aggregate_grass_outputs(self):
        """
        Aggregate the csv outputs of the GRASS script into ``self.df``.

        Every non-empty ``*.csv`` file in ``self.output_directory`` is read as
        header-less rows of (transition_band, cep_id, area). Areas are summed per
        (cep_id, transition_band) and pivoted so that the result has one row per
        cep_id and one ``transition_<i>`` column per band; combinations that
        never occur are filled with 0.

        :raises ValueError: if the directory contains no non-empty csv file, or
            if the number of distinct transition bands is not 11
        """
        columns = ['transition_band', 'cep_id', 'area']
        dataframes = []
        # step through all csv files in the instance's output directory
        # (not a module-level global, so the class works outside the notebook)
        for csv_file in os.listdir(self.output_directory):
            if not csv_file.endswith('.csv'):
                continue
            csv_path = os.path.join(self.output_directory, csv_file)
            # zero-byte files cannot be parsed by read_csv, so skip them
            if os.path.getsize(csv_path) == 0:
                continue
            dataframes.append(pd.read_csv(csv_path, names=columns))

        if not dataframes:
            raise ValueError(
                f'No non-empty csv files found in {self.output_directory!r}'
            )

        # stack all rows, then sum the areas per (cep_id, transition_band)
        df = pd.concat(dataframes)
        df = df.groupby(['cep_id', 'transition_band']).sum()
        # pivot the transition bands into columns; missing combinations become 0
        df = df.unstack().fillna(0)
        # drop the outer 'area' level so only the band level remains
        df.columns = df.columns.droplevel()
        # NOTE(review): columns are relabelled positionally, which assumes exactly
        # 11 bands whose sorted order matches 0..10 - confirm against the GRASS script
        df.columns = [f'transition_{i}' for i in range(11)]
        # make sure the cep_id index is integer typed
        df.index = df.index.astype(int)
        self.df = df

    def merge_cep_attributes(self, cep_df, to_csv=False, output_path='final_output.csv'):
        """
        Merge the aggregated outputs of the GRASS script with the CEP attributes
        and store the result in ``self.df``.

        :param cep_df: dataframe containing the CEP attributes, indexed by the
            CEP id; must provide the columns 'pa', 'eco' and 'country'
        :param to_csv: if True, save the final dataframe to a csv file
        :param output_path: destination of the csv file when ``to_csv`` is True
        """
        # match the cep_id of the aggregated data against the index of cep_df
        df = self.df.join(cep_df, on='cep_id')
        # convert the id fields back to nullable integers; values that cannot be
        # parsed as numbers become <NA> instead of raising
        for id_column in ('pa', 'eco', 'country'):
            df[id_column] = pd.to_numeric(df[id_column], errors='coerce').astype('Int64')
        self.df = df

        if to_csv:
            self.df.to_csv(output_path)
        

    # def find_tile_with_id(self, cep_id, output_directory):
    #     # step through all csv files in the output directory
    #     for csv_file in os.listdir(output_directory):
    #         if csv_file.endswith('.csv'):
    #             # if csv file empty, skip
    #             if os.path.getsize(os.path.join(output_directory, csv_file)) == 0:
    #                 continue
    #             columns = ['transition_band','cep_id', 'area']
    #             df = pd.read_csv(os.path.join(output_directory, csv_file), names=columns)

In [4]:
# Load the CEP attribute table, using its 'cid' column as the row index,
# and preview the first five rows.
cep_df = pd.read_csv(filepath_or_buffer=cep_attributes_path, index_col='cid')
cep_df.head(n=5)

Unnamed: 0_level_0,country,country_name,iso3,eco,eco_name,is_marine,pa,pa_name,is_protected
cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,,,80412,Central European mixed forests,False,0,,False
2,1,Area Beyond National Jurisdiction,ABNJ,1,Antarctic,True,0,,False
3,1,Area Beyond National Jurisdiction,ABNJ,1,Antarctic,True,478191,South Orkney Islands Southern Shelf Marine Pro...,True
4,1,Area Beyond National Jurisdiction,ABNJ,1,Antarctic,True,555547601,South Georgia and South Sandwich Islands Marin...,True
5,1,Area Beyond National Jurisdiction,ABNJ,1,Antarctic,True,555624810,Ross Sea Region Marine Protected Area,True


In [5]:
# Show every row whose country_name is missing (NaN / null)
has_no_country_name = cep_df['country_name'].isna()
cep_df[has_no_country_name]  # {cid: 1}

Unnamed: 0_level_0,country,country_name,iso3,eco,eco_name,is_marine,pa,pa_name,is_protected
cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,,,80412,Central European mixed forests,False,0,,False


In [6]:
# Check that cep_df holds every id from 1 to 463710 (inclusive) and list the gaps
expected_ids = range(1, 463711)
missing_ids = set(expected_ids).difference(cep_df.index)
missing_ids  # {cid: 295147}

{295147}

In [7]:
# Update cid 1, the only row found above with a null country_name,
# with country (code): 171, country name: Lithuania, iso3: LTU.
# NOTE(review): the source of these three values is not shown in this notebook - confirm.
cep_df.loc[1, 'country'] = 171
cep_df.loc[1, 'country_name'] = 'Lithuania'
cep_df.loc[1, 'iso3'] = 'LTU'

In [8]:
# Manually insert the missing record for id 295147 by copying the attributes of
# id 339212 under the new id (the row is added to cep_df under label 295147).
# NOTE(review): why 339212 is the right template row is not shown here - confirm.
cep_df.loc[295147] = cep_df.loc[339212].copy()

In [13]:
# Build the merger and collapse all GRASS csv outputs into merger.df
merger = MergingDataframes(
    output_directory=output_directory,
    cep_attributes_path=cep_attributes_path,
)
merger.aggregate_grass_outputs()
# Attach the (patched) CEP attributes and write the result to csv
merger.merge_cep_attributes(cep_df=cep_df, to_csv=True)