# Process ADB MRIO Excel files

This notebook converts the ADB MRIO Excel files into a machine-readable format and saves them as space-efficient Parquet files. 

For version control purposes, it is important not to make any edits whatsoever to the raw Excel files, even to fix typos or to change file names. Any changes must be made programmatically.

In [2]:
import pandas as pd
import os
import re

## Helper functions

In [5]:
def process_table(df):
    """Turn a table read with a two-level header into a flat, labelled frame.

    In order: the final row and the two leading columns are discarded, each
    pair of header levels is joined with an underscore, the last column is
    renamed to 'ToT', the first two remaining columns are merged into a
    single label column named '', rows labelled 'r60' are removed, and cells
    holding a single space are replaced with zero.

    Args:
        df: DataFrame whose column labels are pairs (two header levels).

    Returns:
        The processed DataFrame, with the label column '' first.
    """

    # Trim the trailing row, then the two leading columns
    table = df.drop(df.index[-1]).iloc[:, 2:]

    # Join each (upper, lower) header pair into a single name
    table.columns = [f'{upper}_{lower}' for upper, lower in table.columns]

    # The final column becomes ToT
    table = table.rename(columns={table.columns[-1]: 'ToT'})

    # One label per row: 'first_second', or only the second value when the
    # first is missing or is 'ToT'
    labels = []
    for first, second in zip(table.iloc[:, 0], table.iloc[:, 1]):
        if pd.isna(first) or first == 'ToT':
            labels.append(second)
        else:
            labels.append(f"{first}_{second}")

    # Add the labels as column '' and discard the two columns they came from
    table.insert(2, '', labels)
    table = table.iloc[:, 2:]

    # Drop intermediates totals (rows labelled r60)
    is_r60 = table[''] == 'r60'
    table = table.drop(table.loc[is_r60].index)

    # Blank (single-space) cells become zero
    return table.replace(' ', 0)

## ADB MRIO 72 economies

In [136]:
foldername = 'ADB MRIO, 72 economies as of Dec 2022'

# List the entries of the raw folder, leaving out dot-prefixed (hidden) ones
filelist = [
    entry
    for entry in os.listdir('../data/raw/' + foldername)
    if not entry.startswith('.')
]
filelist

['ADB-MRIO-2019_Dec2022.xlsx',
 'ADB-MRIO-2018_Dec2022.xlsx',
 'ADB-MRIO-2021_Dec2022-1 (1).xlsx',
 'ADB-MRIO-2020_Dec2022.xlsx',
 'ADB-MRIO-2017_Dec2022-2.xlsx']

In [139]:
# Convert every listed 72-economy workbook to a parquet file named by year
for file in filelist:

    # The first 5 rows are skipped; the next two rows form a two-level header.
    # NOTE(review): this reads from '../raw/...' while the file list for this
    # section is built from '../data/raw/...' -- confirm which root is correct.
    mrio = pd.read_excel(
        '../raw/ADB MRIO, 72 economies as of Dec 2022/' + file,
        skiprows=5,
        header=[0, 1]
    )

    # process_table returns a new frame, so its result must be kept;
    # otherwise the raw, unprocessed table is what gets exported.
    mrio = process_table(mrio)

    # Export as parquet, named after the first 4-digit run in the file name
    year = re.search('[0-9]{4}', file).group()
    mrio.to_parquet(
        '../final/ADB-MRIO/ADB-MRIO72-' + year + '.parquet',
        index=False)

## ADB MRIO 62 economies

In [140]:
foldername = 'ADB MRIO, 62 economies'

# List the entries of the raw folder, leaving out dot-prefixed (hidden) ones
filelist = [
    entry
    for entry in os.listdir('../data/raw/' + foldername)
    if not entry.startswith('.')
]
filelist

['ADB-MRIO-2008_Mar2022.xlsx',
 'ADB-MRIO-2009_Mar2022.xlsx',
 'ADB-MRIO-2011_Mar2022.xlsx',
 'ADB-MRIO62-2018_Dec2022.xlsx',
 'ADB-MRIO-2016_Mar2022.xlsx',
 'ADB-MRIO62-2019_Dec2022.xlsx',
 'ADB-MRIO-2010_Mar2022.xlsx',
 'ADB-MRIO-2012_Mar2022.xlsx',
 'ADB-MRIO-2015_Mar2022.xlsx',
 'ADB-MRIO-2014_Mar2022.xlsx',
 'ADB-MRIO-2013_Mar2022.xlsx',
 'ADB-MRIO62-2017_Dec2022.xlsx',
 'ADB-MRIO-2007.xlsx',
 'ADB-MRIO62-2021_Dec2022.xlsx',
 'ADB-MRIO62-2020_Dec2022.xlsx',
 'ADB-MRIO-2000_Mar2022-3.xlsx']

In [165]:
# Convert every listed 62-economy workbook to a parquet file named by year
for file in filelist:

    # The first 5 rows are skipped; the next two rows form a two-level header.
    # NOTE(review): this reads from '../raw/...' while the file list for this
    # section is built from '../data/raw/...' -- confirm which root is correct.
    mrio = pd.read_excel(
        '../raw/ADB MRIO, 62 economies/' + file,
        skiprows=5,
        header=[0, 1]
    )

    # process_table returns a new frame, so its result must be kept;
    # otherwise the raw, unprocessed table is what gets exported.
    mrio = process_table(mrio)

    # Export as parquet, named after the first 4-digit run in the file name
    year = re.search('[0-9]{4}', file).group()
    mrio.to_parquet(
        '../final/ADB-MRIO/ADB-MRIO-' + year + '.parquet',
        index=False)

## ADB MRIO 72 – Jun 2023

In [8]:
# List the entries of the June 2023 raw folder, skipping dot-prefixed (hidden)
# entries so that they are never handed to the Excel reader
filelist = [
    file
    for file in os.listdir('../data/raw/9 MRIO 2020-2022 for upload (ao Jun 2023)')
    if not file.startswith('.')
]
filelist

['ADB-MRIO-2021_June2023.xlsx',
 'ADB-MRIO-2022_June2023.xlsx',
 'ADB-MRIO-2020_June2023.xlsx']

In [12]:
# Convert every listed June 2023 workbook to a parquet file named by year
for file in filelist:

    # Read the sheet (5 rows skipped, two header rows) and clean it in one go
    mrio = process_table(
        pd.read_excel(
            '../data/raw/9 MRIO 2020-2022 for upload (ao Jun 2023)/' + file,
            skiprows=5,
            header=[0, 1],
        )
    )

    # The first 4-digit run in the file name is used as the year
    year = re.search('[0-9]{4}', file).group()

    # Write the cleaned table without its index
    mrio.to_parquet(
        '../data/final/ADB-MRIO72-checking/ADB-MRIO72-' + year + '.parquet',
        index=False,
    )