## Loading and cleaning

In [6]:
import sys
# Put the sibling ``repository`` directory at the front of the module search
# path so that modules stored there take precedence when imported.
# NOTE(review): presumably this is where the project-local ``database`` module
# lives -- confirm. The path is relative, so it depends on the notebook's
# working directory.
sys.path.insert(0, '../repository')

In [7]:
import database
from functools import reduce
import pandas as pd
import numpy as np
import re
import os
import glob

In [8]:
# Build the project's DatabaseConnection for the 'world-governance-index'
# database and keep only its ``connection`` attribute; that handle is what the
# export cell hands to ``DataFrame.to_sql`` as ``con``.
# NOTE(review): the concrete type of ``connection`` (SQLAlchemy engine or
# connection?) and who is responsible for closing it are not visible here --
# confirm against the ``database`` module.
conn = database.DatabaseConnection(db_name='world-governance-index').connection

In [9]:
# Load the ISO country-code reference table.  The file is opened in a ``with``
# block so the handle is closed as soon as pandas has finished reading it
# (the previous bare ``open`` left it open for the life of the kernel).
# NOTE(review): the file is decoded with the platform default encoding, as
# before -- confirm whether an explicit encoding (e.g. UTF-8) is needed.
with open('../repository/DUMP/ISO CODE.csv', 'r') as f:
    iso_codes = pd.read_csv(f)

# Append a 0-based surrogate key as the last column; the WGI parsing cell
# selects it positionally (``iloc[:, -1]``) and renames it to ``country_id``.
iso_codes['id'] = range(len(iso_codes))

In [10]:
# Parse every raw WGI export into one tidy table per file, keyed by file name.
# NOTE(review): the file layout described in the comments below is inferred
# from how the frame is sliced, not from the files themselves -- confirm
# against one of the raw exports.
files = glob.glob("../repository/RAW/WGI*")
dfs = {}
# Compiled once for all files; written as a raw string so that ``\d`` is a
# regex class rather than an invalid string escape.
ptrn = re.compile(r"\d{4}")


def _year_of(label):
    """Return the first four-digit run found in a column label, or None."""
    found = ptrn.search(label)
    return found.group() if found else None


for f in files:
    name = os.path.basename(f)
    _df = pd.read_csv(f, sep=";", decimal=",")
    # The first data row carries the real column names, so country rows start
    # at position 1; the second column is the one used as the country code.
    # This does not depend on the year group, so it is extracted once per file.
    country_codes = _df.iloc[1:, 1]
    join_dfs = []
    # Group the original columns by the year embedded in their label; labels
    # without a year map to None and are left out by groupby.
    for k, d in _df.T.groupby(_year_of):  # type: ignore
        d = d.T
        # Promote the first row to the header and keep the remaining rows.
        header = d.iloc[0]
        d = d.iloc[1:]
        d.columns = header
        # ``assign`` returns a new frame, which avoids writing into a slice;
        # the country codes are aligned to the rows by index.
        d = d.assign(Year=k, country_code=country_codes)
        join_dfs.append(d)
    # Stack all yearly slices in a single concat instead of pairwise.
    _df = pd.concat(join_dfs)
    # Attach the surrogate country id from the ISO table (its last, first and
    # third columns: id, name, alpha-3).  The default inner join drops rows
    # whose country code has no alpha-3 match.
    _df = _df.merge(
        iso_codes.iloc[:, [-1, 0, 2]], left_on="country_code", right_on="alpha-3"
    )
    _df = _df.drop(axis=1, columns=["country_code", "name", "alpha-3"])
    _df = _df.rename(columns={"id": "country_id"})
    # Move the key columns to the front; the selection repeats them, so the
    # later duplicates are dropped right after.
    _df = _df[["country_id", "Year", *_df.columns]]
    _df = _df.loc[:, ~_df.columns.duplicated()].copy()
    # Normalise column names to lower snake case.
    _df = _df.rename(str.lower, axis="columns")
    _df.columns = _df.columns.str.replace(" ", "_")
    _df["year"] = _df["year"].astype("int64")
    # Rank values are strings with a decimal comma at this point.
    _df["rank"] = _df["rank"].str.replace(",", ".").astype("float32")
    dfs.update({name: _df})

In [11]:
# File-name patterns that decide the category label of an indicator.
sg = [
    re.compile(r"Political.*Stability"),
    re.compile(r"Control.*Corruption"),
    re.compile(r"Government.*Effectiveness"),
]

# Write every cleaned indicator table to the database, one table per file.
for name, df in dfs.items():
    # The category is only consumed by the disabled CSV export below.
    # NOTE(review): files matching ``sg`` are labelled 'Freedom' and all others
    # 'State Governance' -- confirm this is not inverted.
    ctgry = 'Freedom' if any(p.search(name) for p in sg) else 'State Governance'
    # df.to_csv(f"../repository/DUMP/WGI/{name.replace('.CSV', '').strip()} - {ctgry}.csv", sep=";", index=False, decimal='.')

    # Table name: the text after the first '-' in the file name, without the
    # '.CSV' suffix and surrounding whitespace.
    table_name = name.split('-')[1].replace('.CSV', '').strip()
    # An existing table of the same name is replaced; the frame index is
    # stored in a column called 'id'.
    df.to_sql(table_name, con=conn, index_label='id', if_exists='replace')