In [2]:
# Required standard libraries
import pandas as pd
import json
import urllib
import requests
import os
import re
import numpy as np
import bs4 as bs
import selenium
import html5lib
from selenium import webdriver

# Extractors (cluster specific)
from extract.unesco_extractor import extract_unesco_api_data
from extract.ilo_extractor import extract_ilo_api_data
from extract.sdg_extractor import extract_sdg_api_data
from extract.who_extractor import extract_who_api_data
from extract.un_treaty_extractor import extract_un_treaties_data
from extract.ilo_normlex_extractor import extract_ilo_normlex_data

from extract import save_raw_data

# Cleansers (cluster specific)
from cleanse.unesco_cleanser import cleanse_unesco_api_data
from cleanse.ilo_cleanser import cleanse_ilo_api_data
from cleanse.sdg_cleanser import cleanse_sdg_api_data
from cleanse.who_cleanser import cleanse_who_api_num_data
from cleanse.un_treaty_cleanser import cleanse_un_treaty_data
from cleanse.wpac_cleanser import cleanse_wpac_data

# from cleanse.save_cleansed_data import save_cleansed_data 

# Normalizer (generalised across all clusters)
from normalize import scaler
# from normalize import save_normalized_data

In [3]:
# Folder layout used by the rest of the notebook: input files are read from
# data_in, exports are written underneath data_out.
from pathlib import Path

cwd = Path('.')

data_in = cwd.joinpath('data_in')
data_sources_raw = cwd.joinpath('data_out')

# Create the export folder (plus any missing parents); a no-op if it already exists.
data_sources_raw.mkdir(parents=True, exist_ok=True)


In [4]:
# Master list holding all the different spelling variations of country names.
# keep_default_na=False stops pandas from turning its default NA markers into NaN.
country_full_list = (
    pd.read_excel(data_in / 'all_countrynames_list.xlsx', keep_default_na=False)
    .drop_duplicates()
)

# Reduce the master list to one row per ISO2 code (the first occurrence is kept).
country_iso_list = country_full_list.drop_duplicates(subset='CountryIso2')

# CRBA country list: the countries that should appear in the final CRBA
# indicator list. The first two (header-less) columns are read as ISO3 code and
# name, then the ISO2 code is looked up from country_iso_list.
country_crba_list = (
    pd.read_excel(
        data_in / 'crba_country_list.xlsx',
        header=None,
        usecols=[0, 1],
        names=['COUNTRY_ISO_3', 'COUNTRY_NAME'],
    )
    .merge(
        right=country_iso_list,
        how='left',
        left_on='COUNTRY_ISO_3',
        right_on='CountryIso3',
        # Raise instead of silently duplicating rows if the ISO3 key is not unique on both sides.
        validate='one_to_one',
    )
    .loc[:, ['COUNTRY_ISO_3', 'COUNTRY_NAME', 'CountryIso2']]
    .rename(columns={'CountryIso2': "COUNTRY_ISO_2"})
)

country_crba_list

Unnamed: 0,COUNTRY_ISO_3,COUNTRY_NAME,COUNTRY_ISO_2
0,AFG,Afghanistan,AF
1,ALB,Albania,AL
2,AND,Andorra,AD
3,DZA,Algeria,DZ
4,AGO,Angola,AO
...,...,...,...
190,VEN,Venezuela,VE
191,VNM,Vietnam,VN
192,YEM,Yemen,YE
193,ZMB,Zambia,ZM


In [5]:
# Load the "Source" sheet of the CRBA indicator dictionary.
# keep_default_na=False: pandas' default NA markers are not converted to NaN.
crba_data_dictionary_sources = pd.read_excel(
    data_in / 'indicator_dictionary_CRBA.xlsx', sheet_name="Source", keep_default_na=False
)

# Load the "Snapshot" sheet of the same workbook with identical NA handling.
crba_data_dictionary_snapshot = pd.read_excel(
    data_in / 'indicator_dictionary_CRBA.xlsx', sheet_name="Snapshot", keep_default_na=False
)



In [6]:
# Extract the raw S_55 data (UNESCO UIS SDMX endpoint, CSV format, periods 2005-2018)
from extract import CSVExtractor

# NOTE(review): the request URL below carries a subscription key in plain text.
# Credentials should not live in a notebook; consider reading the key from an
# environment variable and rotating this one. Left unchanged here so the
# request itself stays byte-identical.
s55_raw = CSVExtractor.extract(
    'https://api.uis.unesco.org/sdmx/data/UNESCO,SDG4,2.0/ROFST.PT.L2+L2_3+L3._T._T+F+M.SCH_AGE_GROUP._T.INST_T._Z._T._Z._Z._Z._T._T._Z._Z._Z.?startPeriod=2005&endPeriod=2018&format=csv-sdmx&locale=en&subscription-key=460ab272abdd43c892bb59c218c22c09'
)

# The setup cell visible in this notebook only creates data_sources_raw itself,
# not its "data_raw" subfolder. DataFrame.to_csv does not create missing
# directories, so make sure the target folder exists before writing.
s55_raw_path = data_sources_raw / "data_raw" / "S_55_raw.csv"
s55_raw_path.parent.mkdir(parents=True, exist_ok=True)

s55_raw.to_csv(s55_raw_path)


The following columns are present in the datasets, and this is the number of unique values they have. 
The column Dataflow has 1 unique values.
The column STAT_UNIT has 1 unique values.
The column UNIT_MEASURE has 1 unique values.
The column EDU_LEVEL has 3 unique values.
The column EDU_CAT has 1 unique values.
The column SEX has 3 unique values.
The column AGE has 1 unique values.
The column GRADE has 1 unique values.
The column SECTOR_EDU has 1 unique values.
The column EDU_ATTAIN has 1 unique values.
The column SUBJECT has 1 unique values.
The column WEALTH_QUINTILE has 1 unique values.
The column INFRASTR has 1 unique values.
The column LOCATION has 1 unique values.
The column EDU_TYPE has 1 unique values.
The column SE_BKGRD has 1 unique values.
The column SOURCE_FUND has 1 unique values.
The column FUND_FLOW has 1 unique values.
The column IMM_STATUS has 1 unique values.
The column REF_AREA has 326 unique values.
The column TIME_PERIOD has 14 unique values.
The column OBS_VALUE h

In [7]:
from cleanse import Cleanser

# Cleanse the raw S_55 extract. The raw data's country codes are in 'REF_AREA'
# and are matched against the 'COUNTRY_ISO_2' column of the CRBA country list;
# 'OBS_VALUE', 'TIME_PERIOD' and 'OBS_STATUS' are passed as non-dimension columns.
# NOTE(review): Cleanser.cleanse is defined outside this notebook; it is assumed
# to return the cleansed DataFrame - confirm against the cleanse package.
cleansed = Cleanser.cleanse(
    raw_data = s55_raw,
    raw_data_iso_2_col = 'REF_AREA',
    country_df = country_crba_list,
    country_df_iso2_col = 'COUNTRY_ISO_2',
    non_dim_cols = ['OBS_VALUE', 'TIME_PERIOD', 'OBS_STATUS']
)

# Persist the cleansed result. The original cell wrote s55_raw here, so the
# "cleansed" file was just another copy of the raw extract.
cleansed.to_csv(data_sources_raw / "S_55_cleansed.csv")
