In [1]:
import os
from pathlib import Path
import warnings
import polars as pl
import matplotlib.pyplot as plt

from blend import BLEND
from blend.utils import clean

In [2]:
# Suppress all warnings globally for the rest of this kernel session.
warnings.filterwarnings(action='ignore')

In [3]:
# Root folder of the UN data set, relative to the current working directory.
data_path = Path("data") / "undata"

# Sub-folders referenced by the rest of the notebook.
tables_path = data_path / "tables"
queries_path = data_path / "queries"
data_lake_path = data_path / "data-lake"

# Display where the data root resolves to and whether it exists.
data_path.absolute(), data_path.exists()

(PosixPath('/home/nanni/projects/bdm/lab/blend-duckdb/data/undata'), True)

In [4]:
# Load the UNSD M49 reference table from the data root and preview the first rows.
um49 = pl.read_csv(data_path / 'UNSD_M49.csv')
um49.head()

Global Code,Global Name,Region Code,Region Name,Sub-region Code,Sub-region Name,Intermediate Region Code,Intermediate Region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code,Least Developed Countries (LDC),Land Locked Developing Countries (LLDC),Small Island Developing States (SIDS)
i64,str,i64,str,i64,str,i64,str,str,i64,str,str,str,str,str
1,"""World""",2,"""Africa""",15,"""Northern Africa""",,,"""Algeria""",12,"""DZ""","""DZA""",,,
1,"""World""",2,"""Africa""",15,"""Northern Africa""",,,"""Egypt""",818,"""EG""","""EGY""",,,
1,"""World""",2,"""Africa""",15,"""Northern Africa""",,,"""Libya""",434,"""LY""","""LBY""",,,
1,"""World""",2,"""Africa""",15,"""Northern Africa""",,,"""Morocco""",504,"""MA""","""MAR""",,,
1,"""World""",2,"""Africa""",15,"""Northern Africa""",,,"""Sudan""",729,"""SD""","""SDN""","""x""",,


In [5]:
import pycountry
from functools import lru_cache


@lru_cache(maxsize=1000)
def country_to_region(country: str, region: str):
    """Look up a column of the `um49` table for a free-text country name.

    The name is resolved to an ISO alpha-3 code with pycountry's fuzzy search
    (the first hit is used); that code is then matched against the
    'ISO-alpha3 Code' column of the module-level `um49` frame.

    Results, including failed lookups, are memoised by `lru_cache`, so each
    distinct (country, region) pair is resolved only once.

    Args:
        country: Country name to resolve.
        region: Name of the `um49` column whose value is returned.

    Returns:
        The value of `region` in the first matching `um49` row, or None when
        the country cannot be resolved or has no matching row.
    """
    try:
        code = pycountry.countries.search_fuzzy(country)[0].alpha_3
        return um49.filter(pl.col('ISO-alpha3 Code') == code)[region][0]
    except Exception:
        # Best-effort lookup: any failure maps to None. `Exception` (not a bare
        # `except`) lets KeyboardInterrupt/SystemExit propagate instead of
        # being swallowed and cached as a None result.
        return None

In [6]:
def get_country_column(columns):
    """Return the name of the country column found in `columns`.

    'Reference Area' takes precedence over 'Country or Area'. Returns None
    when neither name is present.
    """
    preferred = ('Reference Area', 'Country or Area')
    return next((name for name in preferred if name in columns), None)
    
def get_year_column(columns):
    """Return the name of the year column found in `columns`.

    Names are tried in the order 'Time Period', 'Year', 'Year(s)'; the first
    one present wins. Returns None when none of them is present.
    """
    for name in ('Time Period', 'Year', 'Year(s)'):
        if name in columns:
            return name
    return None

In [8]:
import shutil

# Rebuild the data lake from scratch. The existence check keeps a first run
# (directory not created yet) from failing with FileNotFoundError.
if data_lake_path.exists():
    shutil.rmtree(data_lake_path)
data_lake_path.mkdir(parents=True, exist_ok=True)

for table in os.listdir(tables_path):
    df = pl.read_csv(tables_path.joinpath(table))

    # Tables that already carry a 'Decade' column are copied through unchanged.
    if "Decade" not in df.columns:
        country_column = get_country_column(df.columns)
        year_column = get_year_column(df.columns)
        if country_column is None or year_column is None:
            # Fail with the offending table name instead of an opaque error
            # from `pl.col(None)` further down.
            raise ValueError(
                f"{table}: expected a country and a year column, got columns {df.columns}"
            )

        # Add the region columns, each placed right after the country column.
        for r in ['Region Name', 'Sub-region Name']:
            df = df.with_columns(
                pl.col(country_column)
                # `r=r` binds the current loop value so the lambda does not
                # depend on late-binding closure semantics.
                .map_elements(lambda c, r=r: country_to_region(c, r), return_dtype=pl.String)
                .alias(r)
            )
            # The new column is appended at the end; move it next to the country column.
            position = df.get_column_index(country_column) + 1
            region_values = df.get_column(r)
            df = df.drop(r).insert_column(position, region_values)

        # Add the decade (year floored to a multiple of 10) right after the year column.
        df = df.with_columns(((pl.col(year_column) // 10) * 10).alias('Decade'))
        position = df.get_column_index(year_column) + 1
        decade_values = df.get_column('Decade')
        df = df.drop('Decade').insert_column(position, decade_values)

        # drop null rows only considering these columns
        df = df.drop_nulls([year_column, 'Decade', 'Sub-region Name', 'Region Name'])

    # Disabled alternative, kept for reference:
    # # save a version of the dataset per decade
    # for decade, group in df.group_by('Decade'):
    #     decade = decade[0]
    #     if decade == 0:
    #         continue
    #     group_name = table.removesuffix('.csv') + f'-{decade}.csv'
    #     group.write_csv(data_lake_path.joinpath(group_name))

    # save the new CSV version
    df.write_csv(data_lake_path.joinpath(table))