## Loading and cleaning

In [254]:
import pandas as pd
import numpy as np
import glob
import re

# Raise the column limit that `DataFrame.info()` uses when deciding whether to
# print per-column details, so wide frames (up to 200 columns) are listed in full.
pd.set_option('display.max_info_columns', 200)

In [255]:
# Reading the iso codes.
# The path is handed to pandas directly (instead of a bare ``open`` handle) so
# the file is closed again and is decoded as UTF-8 rather than with the
# platform's locale encoding.
iso_codes = pd.read_csv('../repository/DUMP/ISO CODE.csv', encoding='utf-8')

# ``glob`` returns paths in arbitrary order; sorting makes positional access to
# ``dfs`` (e.g. ``dfs[1:]``) pick the same files on every machine.
files = sorted(glob.glob("../repository/RAW/BTI*"))


def _to_number(column):
    """Convert a text column that uses ',' as decimal separator to numbers.

    Returns an ``int`` Series when every value is a whole number, otherwise a
    ``float`` Series. Fractional values are never truncated and columns that
    contain missing values stay ``float``.

    Raises:
        AttributeError: ``column`` has no ``.str`` accessor (not text).
        ValueError: some value cannot be parsed as a number.
    """
    as_float = column.str.replace(',', '.', regex=False).astype(float)
    if as_float.notna().all() and (as_float % 1 == 0).all():
        return as_float.astype(int)
    return as_float


trend_ptrn = re.compile('Trend.*')  # compiled once, reused for every file
dfs = []
for path in files:
    # The first four-digit run in the path is used as the year of the file.
    year_match = re.search(r"(\d{4})", path)
    if year_match is None:
        raise ValueError(f"No four-digit year found in file name: {path!r}")
    year = year_match.group(0)

    _df = pd.read_csv(path, sep=";", index_col=False, decimal=',', encoding="utf-8")

    # The header of the first column spans several lines: every line after the
    # first is a "key | value" pair, used below to translate the Region column.
    regions = {
        k.strip(): v.strip()
        for k, v in [el.split("|") for el in _df.columns[0].split("\n")[1:]]
    }
    _df = _df.rename(str.strip, axis="columns")
    _df = _df.dropna(axis=1, how="all")
    # Cast to str so the (string) keys of ``regions`` match the cell values.
    _df["Region"] = _df["Region"].astype(str)
    _df["Region"] = _df["Region"].replace(regions)
    _df = _df.rename({_df.columns[0]: "Country"}, axis="columns")
    _df["Year"] = year
    _df = _df.replace(['-', '?'], np.nan)
    _df = _df.convert_dtypes()
    _df = _df[[*_df.columns[:2], "Year", *_df.columns[2:]]] # Reorder Year column
    _df = _df.loc[:,~_df.columns.duplicated()].copy() # type: ignore # Remove Year duplicates

    to_drop = []
    for pos, col in enumerate(_df.columns):
        if trend_ptrn.match(col):
            # NOTE(review): the column *after* each 'Trend...' header is the
            # one dropped, not the header's own column; this offset is kept
            # from the original code - confirm against the raw files.
            to_drop.append(pos + 1)
        try:
            _df[col] = _to_number(_df[col])
        except (ValueError, AttributeError):
            # Non-text columns and text that is not numeric are left as they are.
            continue
    _df = _df.drop([_df.columns[pos] for pos in to_drop], axis=1) # Remove trending arrows
    dfs.append(_df)

In [256]:
# Pattern for label prefixes such as 'Q1 | Stateness' or
# 'Q1.1 | Monopoly on the use of force': a leading word character, then
# anything up to the last '|' that is followed by a whitespace character.
ptrn = re.compile(r'^\w+.*\|\s')


def rm_prefix(x):
    """Return the label ``x`` with its prefix removed and outer whitespace trimmed."""
    return ptrn.sub('', x).strip()


# The cleaned header of the second frame becomes the header of every frame
# from the second one onwards; the first frame in ``dfs`` is left out.
cols = dfs[1].rename(columns=rm_prefix).columns
for frame in dfs[1:]:
    frame.columns = cols
df = pd.concat(dfs[1:])


In [257]:
def same_col_name(c):
    """Return a regex match when label ``c`` looks like 'Status Index.1' or
    'Governance Index.1' (whitespace, a word, a dot and a digit), else ``None``.
    """
    return re.search(r"\s\w+\.\d", c)


ptrn = re.compile(r"\.\d$")  # The '.1' at the end of the column name

# Each matching column and the two columns right after it are renamed to
# '<base> Score', '<base> Category' and '<base> Status'.
# The renaming is done by *position* in one pass over the original labels, so
# columns that merely share a label are not renamed by accident and the frame
# is rebuilt only once.
new_cols = list(df.columns)
for i, col in enumerate(df.columns):
    if not same_col_name(col):
        continue
    if i + 2 >= len(new_cols):
        raise IndexError(
            f"Column {col!r} is not followed by the two columns that should "
            "receive the ' Category' and ' Status' names"
        )
    new_cols[i] = ptrn.sub(" Score", col)
    new_cols[i + 1] = ptrn.sub(" Category", col)
    new_cols[i + 2] = ptrn.sub(" Status", col)
df = df.set_axis(new_cols, axis="columns")

## Splitting

In [258]:
# Display the ISO country-code table read from 'ISO CODE.csv' earlier in the notebook.
iso_codes

Unnamed: 0,name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,,142.0,34.0,
1,Ã…land Islands,AX,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,,150.0,154.0,
2,Albania,AL,ALB,8,ISO 3166-2:AL,Europe,Southern Europe,,150.0,39.0,
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ,Africa,Northern Africa,,2.0,15.0,
4,American Samoa,AS,ASM,16,ISO 3166-2:AS,Oceania,Polynesia,,9.0,61.0,
...,...,...,...,...,...,...,...,...,...,...,...
244,Wallis and Futuna,WF,WLF,876,ISO 3166-2:WF,Oceania,Polynesia,,9.0,61.0,
245,Western Sahara,EH,ESH,732,ISO 3166-2:EH,Africa,Northern Africa,,2.0,15.0,
246,Yemen,YE,YEM,887,ISO 3166-2:YE,Asia,Western Asia,,142.0,145.0,
247,Zambia,ZM,ZMB,894,ISO 3166-2:ZM,Africa,Sub-Saharan Africa,Eastern Africa,2.0,202.0,14.0


In [259]:
# df.to_csv('../repository/DUMP/Trasnformation Index (2008-2022).csv', sep=';', index=False, encoding='utf-8')