## Loading and cleaning

In [12]:
import sys
# Put ../repository at the front of the module search path.
# NOTE(review): presumably this is where the `database` module imported in the
# next cell lives — confirm.
sys.path.insert(0, '../repository')

In [13]:
import database
import pandas as pd
import numpy as np
import re

In [14]:
# Connection handle for the 'economic-freedom' database, taken from the
# `.connection` attribute of the project's DatabaseConnection wrapper.
# It is passed as `con=` to DataFrame.to_sql in the final cell.
conn = database.DatabaseConnection(db_name='economic-freedom').connection

In [15]:
# Read the raw Economic Freedom export (';'-separated, ',' as decimal mark).
# The context manager closes the file handle as soon as parsing finishes;
# a bare open() would keep it open for the lifetime of the kernel.
with open('../repository/RAW/FI - Economic Freedom.csv', mode='r', encoding='utf-8') as f:
    df = pd.read_csv(f, sep=';', index_col=False, decimal=',')

# Drop every row that has at least one missing value, then let pandas pick
# NA-aware dtypes for the remaining columns.
df = df.dropna(axis=0).convert_dtypes()

In [16]:
# Leading code on a column label: one digit, one or more word characters,
# then a single whitespace character.
ptrn = re.compile(r"^\d\w+\s")


def rm_prefix(label):
    """Return `label` without its leading code, trimmed of surrounding whitespace."""
    return ptrn.sub("", label).strip()


df = df.rename(columns=rm_prefix)

# Discard, in turn, the columns whose cleaned label still starts with a digit
# and the columns whose label starts with "data".
for leading in (r"\d.*", r"data.*"):
    unwanted = [
        hit.group(0)
        for hit in (re.match(leading, label) for label in df.columns)
        if hit
    ]
    df = df.drop(columns=unwanted)

## Splitting

In [17]:
# Load the ISO country-code lookup table. The context manager closes the
# file handle once parsing is done instead of leaving it open.
with open('../repository/DUMP/ISO CODE.csv', 'r') as f:
    iso_codes = pd.read_csv(f)

# Surrogate key: the 0-based row position of each entry in the lookup file.
# read_csv without index_col yields a default 0..n-1 index, so assigning the
# range directly is equivalent to wrapping it in a Series first.
iso_codes['id'] = range(len(iso_codes))

In [18]:
# Attach the numeric country id by matching on the 3-letter ISO code.
# The lookup columns are selected by name rather than by position, so this
# cell does not depend on the column order of the ISO CSV; these are the
# same columns the drop/rename steps below refer to by name.
# merge() defaults to an inner join: rows whose "ISO Code 3" has no matching
# "alpha-3" entry are silently left out of the result.
df = df.merge(iso_codes[['id', 'name', 'alpha-3']], left_on="ISO Code 3", right_on="alpha-3")

# Drop the textual country descriptors and the join helper columns.
df = df.drop(columns=['ISO Code 2', 'ISO Code 3', 'Countries', 'World Bank Region', 'name', 'alpha-3'])
df = df.rename(columns={'id': 'country_id'})

# Move Year and country_id to the front: select them first followed by every
# column, then keep only the first occurrence of each repeated label.
df = df[['Year', 'country_id', *df.columns]]
df = df.loc[:, ~df.columns.duplicated()].copy()
df

Unnamed: 0,Year,country_id,Economic Freedom Summary Index,Rank,Quartile,Government consumption,Transfers and subsidies,Government investment,Top marginal income tax rate,Top marginal income and payroll tax rate,...,Bureacracy costs,Impartial Public Administration,Tax compliance,Business regulations,Market openness,Business Permits,Distorton of the business environment,Freedom to enter markets and compete,Area 5 Rank,"World Bank Current Income Classification, 1990-present"
0,2021,2,7.6,31,1,7.76,7.13,7.55,9,7,...,6.22,5.79,3.59,5.31,6.81,5.62,6.25,6.23,51,UM
1,2020,2,7.59,32,1,8.03,7.05,7.55,9,7,...,6.0,6.0,3.59,5.31,6.81,5.62,6.25,6.23,73,UM
2,2019,2,7.71,35,1,8.16,7.33,7.55,9,7,...,6.67,6.24,3.59,5.54,6.81,5.62,6.25,6.23,56,UM
3,2018,2,7.69,37,1,8.16,6.74,7.63,9,7,...,6.67,6.24,3.59,5.54,6.81,5.62,6.25,6.23,52,UM
4,2017,2,7.69,38,1,8.08,6.66,8.0,9,7,...,6.67,6.01,3.59,5.63,6.81,6.03,6.25,6.36,46,UM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
528,2019,248,5.09,157,4,9.21,8.5,4.19,5,5,...,2.67,1.64,4.89,2.9,2.08,7.99,0.0,3.36,153,LM
529,2018,248,5.36,149,4,8.38,8.5,2.82,4,4,...,2.67,2.72,4.89,3.17,2.08,7.99,0.0,3.36,146,L
530,2017,248,5.0,153,4,5.46,9.33,0.0,4,4,...,2.67,1.97,4.89,2.86,1.95,7.49,0.0,3.15,141,L
531,2017,126,6.01,127,4,8.67,8.64,1.23,9,7,...,3.33,5.7,4.22,4.47,2.95,9.43,0.0,4.13,151,L


In [19]:
# Column labels that are routed into the `freedom` frame (together with
# country_id and Year) in the next cell.
gov_col = [
    "Government consumption",
    "Transfers and subsidies",
    "Government investment",
    "Judicial independence",
    "Impartial courts",
    "Military interference",
    "Legal integrity",
    "Police and crime",
    "Impartial Public Administration",
    "Tax compliance",
]

# Every label of df that is not listed in gov_col; because Year and
# country_id are not in gov_col, they remain part of this selection.
# sort=False asks pandas not to sort the resulting labels.
sg_col = df.columns.difference(gov_col, sort=False)

In [20]:
def _snake_case(label):
    """Lower-case a column label and turn each space into an underscore."""
    return label.lower().replace(" ", "_")


# `freedom` holds the identifiers plus the gov_col columns; `sg` holds the
# sg_col columns. Both get lower-case, underscore-separated labels.
freedom = df[["country_id", "Year", *gov_col]].rename(columns=_snake_case)
sg = df[sg_col].rename(columns=_snake_case)

In [21]:
# Replace the decimal comma with a dot and convert the column to float.
# Casting to str first keeps the cell re-runnable: once the column is already
# float, calling .replace on each element would raise AttributeError, whereas
# the str round-trip leaves already-converted values unchanged.
# regex=False makes the comma a literal character rather than a pattern.
sg['cost_of_worker_dismissal'] = (
    sg['cost_of_worker_dismissal']
    .astype(str)
    .str.replace(',', '.', regex=False)
    .astype(float)
)

In [22]:
# Alternative CSV export, currently disabled:
# freedom.to_csv('../repository/DUMP/Economic Freedom/EF - Freedom.csv', sep=';', index=False, decimal='.')
# sg.to_csv('../repository/DUMP/Economic Freedom/EF - State Governance.csv', sep=';', index=False, decimal='.')

# Write both frames to the database. if_exists='replace' drops an existing
# table of the same name before inserting; index_label='id' stores the
# DataFrame index in a column named 'id'.
# NOTE(review): the table names are crossed relative to the variable names
# (`sg` -> 'freedom', `freedom` -> 'state_governance'), and opposite to the
# file names in the disabled CSV export above. Confirm which mapping is intended.
sg.to_sql('freedom', con=conn, if_exists='replace', index_label='id')
freedom.to_sql('state_governance', con=conn, if_exists='replace', index_label='id')

533