## Loading and cleaning

In [140]:
import pandas as pd
import numpy as np
import glob
import re

# Let DataFrame.info() list up to 200 columns before it falls back to the
# truncated summary.
pd.options.display.max_info_columns = 200

In [141]:
def load_bti_file(path):
    """Read one raw BTI CSV export and return it as a cleaned DataFrame.

    The edition year is the first 4-digit run found in the file name (parent
    directories are ignored). The header of the first column is expected to
    carry a multi-line legend: a title line followed by one
    "<code> | <region name>" entry per line. That legend is used to translate
    the codes stored in the "Region" column into region names.

    Parameters
    ----------
    path : str
        Path of a ';'-separated, UTF-8 encoded file using ',' as decimal mark.

    Returns
    -------
    pandas.DataFrame
        Frame whose first columns are "Country", "Region" and "Year", followed
        by the remaining non-empty columns of the file. String columns that
        hold only numbers are converted to float, and further to int when
        every value is a whole number.

    Raises
    ------
    ValueError
        If the file name contains no 4-digit year.
    """
    # Anchor the search to the last path component so digits in a directory
    # name can never be mistaken for the year.
    year_match = re.search(r"(\d{4})[^\\/]*$", path)
    if year_match is None:
        raise ValueError(f"No 4-digit year found in file name: {path!r}")
    year = year_match.group(1)

    # read_csv opens the file itself; no separate open() handle is needed.
    frame = pd.read_csv(path, sep=";", index_col=False, decimal=",", encoding="utf-8")

    # Legend entries live in the header of the first column, after its title line.
    regions = {}
    for line in frame.columns[0].split("\n")[1:]:
        if "|" not in line:
            continue  # blank or malformed legend line
        code, label = line.split("|", 1)
        regions[code.strip()] = label.strip()

    frame = frame.rename(str.strip, axis="columns")
    frame = frame.dropna(axis=1, how="all")
    frame["Region"] = frame["Region"].astype(str)
    frame["Region"] = frame["Region"].replace(regions)
    frame = frame.rename({frame.columns[0]: "Country"}, axis="columns")
    frame["Year"] = year
    frame = frame.replace(['-', '?'], np.nan)  # placeholders for missing values
    frame = frame.convert_dtypes()
    frame = frame[[*frame.columns[:2], "Year", *frame.columns[2:]]]  # Reorder Year column
    frame = frame.loc[:, ~frame.columns.duplicated()].copy()  # type: ignore # Remove Year duplicates

    to_drop = []
    trend_pattern = re.compile('Trend.*')
    for pos, col in enumerate(frame.columns):
        if trend_pattern.match(col):
            # NOTE(review): this marks the column that FOLLOWS each 'Trend...'
            # column (presumably the one holding the arrow glyphs) - confirm
            # against the raw files. Kept exactly as in the original code.
            to_drop.append(pos + 1)
        try:
            as_float = frame[col].str.replace(',', '.').astype(float)
        except (ValueError, AttributeError):
            # Not a string column, or it holds non-numeric text: leave as is.
            continue
        frame[col] = as_float
        # Cast to int only when nothing is lost: a plain astype(int) would
        # silently truncate values such as 1.5 to 1.
        if np.isfinite(as_float).all() and as_float.mod(1).eq(0).all():
            frame[col] = as_float.astype(int)

    frame = frame.drop([frame.columns[pos] for pos in to_drop], axis=1)  # Remove trending arrows
    return frame


# glob returns paths in arbitrary, OS-dependent order; sort them so that the
# positions inside `dfs` (used by the following cells) are reproducible.
files = sorted(glob.glob("../repository/RAW/BTI*"))
dfs = [load_bti_file(f) for f in files]

In [142]:
# Pattern for the question prefix of a column name, e.g. the 'Q1 | ' in
# 'Q1 | Stateness' or the 'Q1.1 | ' in 'Q1.1 | Monopoly on the use of force'.
ptrn = re.compile(r'^\w+.*\|\s')


def rm_prefix(x):
    """Return column name `x` without its leading '<code> | ' prefix, stripped."""
    return ptrn.sub('', x).strip()


# The cleaned labels of the second frame serve as the template for all frames.
# NOTE(review): dfs[0] takes part neither in the relabelling nor in the
# concatenation - confirm that leaving it out is intended.
cols = dfs[1].rename(columns=rm_prefix).columns
for year_df in dfs[1:]:
    year_df.columns = cols  # positional relabel; frames must have equally many columns
df = pd.concat(dfs[1:])


In [143]:
# Print the per-column summary (non-null count and dtype) of the concatenated frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 959 entries, 0 to 136
Data columns (total 117 columns):
 #    Column                                      Non-Null Count  Dtype  
---   ------                                      --------------  -----  
 0    Country                                     959 non-null    string 
 1    Region                                      959 non-null    string 
 2    Year                                        959 non-null    int64  
 3    Ranking Status Index                        917 non-null    Float64
 4    Status Index                                917 non-null    Float64
 5    Ranking Democracy Status                    917 non-null    Float64
 6    Democracy Status                            917 non-null    Float64
 7    Stateness                                   917 non-null    Float64
 8    Monopoly on the use of force                917 non-null    Float64
 9    State identity                              917 non-null    Float64
 10   No in

In [144]:
def same_col_name(c):
    """Detect columns like 'Status Index.1' or 'Governance Index.1'.

    Returns the regex match (truthy) when `c` contains a whitespace, a word
    and a '.<digit>' sequence, otherwise None.
    """
    # NOTE(review): this search is not anchored to the end of the name, while
    # `ptrn` below is - a name with '.<digit>' in the middle would be detected
    # but left unchanged by the substitution.
    return re.search(r"\s\w+\.\d", c)


ptrn = re.compile(r"\.\d$")  # The '.1' at the end of the column name

# enumerate() walks the column index as it was before any renaming, whereas the
# neighbour lookups below go through the current (already renamed) `df`.
for pos, col in enumerate(df.columns):
    if not same_col_name(col):
        continue
    score_col = df[col].name
    category_col = df[df.columns[pos + 1]].name
    status_col = df[df.columns[pos + 2]].name
    # The matched column and its two right-hand neighbours all take the
    # matched name as base, with ' Score', ' Category' and ' Status'
    # replacing the '.<digit>' suffix.
    df = df.rename(
        columns={
            score_col: ptrn.sub(" Score", score_col),
            category_col: ptrn.sub(" Category", score_col),
            status_col: ptrn.sub(" Status", score_col),
        }
    )

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 959 entries, 0 to 136
Data columns (total 117 columns):
 #    Column                                      Non-Null Count  Dtype  
---   ------                                      --------------  -----  
 0    Country                                     959 non-null    string 
 1    Region                                      959 non-null    string 
 2    Year                                        959 non-null    int64  
 3    Ranking Status Index                        917 non-null    Float64
 4    Status Index                                917 non-null    Float64
 5    Ranking Democracy Status                    917 non-null    Float64
 6    Democracy Status                            917 non-null    Float64
 7    Stateness                                   917 non-null    Float64
 8    Monopoly on the use of force                917 non-null    Float64
 9    State identity                              917 non-null    Float64
 10   No in

## Splitting

In [145]:
# df.to_csv('../repository/DUMP/Trasnformation Index (2008-2022).csv', sep=';', index=False, encoding='utf-8')