# Integration of UN Statistical Yearbook data into country profiles

In [1]:
import pandas as pd
import os 

# `__file__` is not defined inside a notebook, so the notebook folder is taken
# to be the symlink-resolved current working directory.
dir_path = os.path.realpath(os.getcwd())
print(dir_path)

# Relative folder holding the UNSYB JSON exports read by the cells below.
data_dir = '../UNSYB/data/'


C:\Users\L.GonzalezMorales\Documents\GitHub\FIS4SDGs\unsd\notebooks


## 1. Integrate all country/area details into a single table

### 1.1 Countries

In [2]:
# Load the country/area list and rename its generic `id`/`name` keys to the
# `refArea*` names that later cells use as join key and label.
countries_df = (
    pd.read_json(data_dir + 'countries.json', orient='columns', encoding='UTF-8')
      .rename(columns={'id': 'refAreaId', 'name': 'refAreaName'})
)
countries_df

Unnamed: 0,bookNameEng,bookNameFre,refAreaId,isM49,refAreaName
0,Extra-EU-28,Extra-UE-28,-999,N,Extra-EU-28
1,Non Petroleum Exports of Asia Middle East,Exp. non pétrolières de Moyen-Orient d'Asie,-909,N,Non Petroleum Exports of Asia Middle East
2,Regional programmes and other,Programmes régionaux et autres,-793,N,Regional programmes
3,World exc. intra-EU27,Monde excl. intra-UE27,-777,N,World excluding intra-EU27 trade
4,Other,Autres,-579,N,"Global/ interregional, programme support, mana..."
5,Interregional,Interrégional,-378,N,Interregional
6,[use code 593],[use code 593],-198,N,Development Assistance Committee (DAC)
7,Total,Total,-190,Y,Total countries/areas
8,Asia and the Pacific,Asie et le Pacifique,-147,Y,Asia and the Pacific
9,Total,Total,0,Y,Total


In [3]:
# Show the column names of the countries table after the rename above.
countries_df.columns

Index(['bookNameEng', 'bookNameFre', 'refAreaId', 'isM49', 'refAreaName'], dtype='object')

### 1.2 Country attributes

Each country/area can have a number of attributes.  Those attributes are collected in a table called `countryattributevalues`, and each can have:
- symbol
- textEng
- textFre

For each attribute, the following script adds up to three columns to the countries table ([AttributeCode], [AttributeTextEN], [AttributeTextFR]).


**a) Display the attributes list**

In [4]:
# Load the catalogue of country attributes and rename `id`/`name` so they do
# not clash with the identically named keys of the other tables.
country_attributes_df = (
    pd.read_json(data_dir + 'countryattributes.json', orient='columns', encoding='UTF-8')
      .rename(columns={'id': 'attributeId', 'name': 'attributeName'})
)
country_attributes_df

Unnamed: 0,attributeId,attributeName
0,1,Capital City
1,6,Code active status
2,18,Code class type
3,7,Code public status
4,5,Code type
5,9,ISO-2 code
6,8,ISO-3 code
7,2,Last Election Date
8,12,Major trading partner 1 (% of exports)
9,13,Major trading partner 1 (% of imports)


**b) Create a list of dictionaries with Attribute ID and Label**

In [5]:
# Short label for each attribute ID; the labels become column-name prefixes
# and CSV file names in the consolidation cell further down.
# NOTE(review): 'importParnter3' is misspelled, but the column re-ordering
# cell selects 'importParnter3_*' by name, so the spelling is kept here.
ATTRIBUTE_LABELS = {
    0: 'nationalCurrency',
    1: 'capitalCity',
    2: 'lastElectionDate',
    3: 'm49',
    4: 'UNMembershipDate',
    5: 'codeType',
    6: 'codeActiveStatus',
    7: 'codePublicStatus',
    8: 'ISO3CD',
    9: 'ISO2CD',
    10: 'tradeSystem',
    11: 'tourismArrivalsSeriesType',
    12: 'exportPartner1',
    13: 'importPartner1',
    14: 'exportPartner2',
    15: 'importPartner2',
    16: 'exportPartner3',
    17: 'importParnter3',
    18: 'codeClassType',
}

# Build one record per attribute: {'attributeId', 'attributeName', 'label'}.
# Columns are addressed by name rather than by position, so the result does
# not depend on the order in which the JSON keys were loaded.
attributes = []

for attribute_id, attribute_name in zip(country_attributes_df['attributeId'],
                                        country_attributes_df['attributeName']):

    record = {'attributeId': attribute_id,
              'attributeName': attribute_name}

    # As before, an ID without a known label simply gets no 'label' key.
    if attribute_id in ATTRIBUTE_LABELS:
        record['label'] = ATTRIBUTE_LABELS[attribute_id]

    attributes.append(record)

attributes


[{'attributeId': 1, 'attributeName': 'Capital City', 'label': 'capitalCity'},
 {'attributeId': 6,
  'attributeName': 'Code active status',
  'label': 'codeActiveStatus'},
 {'attributeId': 18,
  'attributeName': 'Code class type',
  'label': 'codeClassType'},
 {'attributeId': 7,
  'attributeName': 'Code public status',
  'label': 'codePublicStatus'},
 {'attributeId': 5, 'attributeName': 'Code type', 'label': 'codeType'},
 {'attributeId': 9, 'attributeName': 'ISO-2 code', 'label': 'ISO2CD'},
 {'attributeId': 8, 'attributeName': 'ISO-3 code', 'label': 'ISO3CD'},
 {'attributeId': 2,
  'attributeName': 'Last Election Date',
  'label': 'lastElectionDate'},
 {'attributeId': 12,
  'attributeName': 'Major trading partner 1 (% of exports)',
  'label': 'exportPartner1'},
 {'attributeId': 13,
  'attributeName': 'Major trading partner 1 (% of imports)',
  'label': 'importPartner1'},
 {'attributeId': 14,
  'attributeName': 'Major trading partner 2 (% of exports)',
  'label': 'exportPartner2'},
 {'at

**c) Create individual tables for each attribute AND consolidate in a single table**

In [6]:
# Load the attribute values with their original column names for inspection;
# the next cell re-reads the same file and renames the columns.
country_attribute_values_df = pd.read_json(data_dir + 'countryattributevalues.json' , orient='columns', encoding='UTF-8')
country_attribute_values_df

Unnamed: 0,countryAttributeId,countryId,dateValue,id,symbol,textEng,textFre
0,0,4,,1,AFN,Afghani (AFN),afghani (AFN)
1,0,8,,2,ALL,Lek (ALL),lek (ALL)
2,0,12,,3,DZD,Algerian Dinar (DZD),dinar algérien (DZD)
3,0,16,,4,USD,US Dollar (USD),dollar des É.-U. (USD)
4,0,20,,5,EUR,Euro (EUR),euro (EUR)
5,0,22,,6,EUR,Euro (EUR),euro (EUR)
6,0,23,,7,EUR,Euro (EUR),euro (EUR)
7,0,24,,8,AOA,Kwanza (AOA),kwanza (AOA)
8,0,28,,9,XCD,E. Caribbean Dollar (XCD),dollar des Caraïb. (XCD)
9,0,31,,10,AZN,Azerbaijan manat (AZN),manat azerbaïdjanais (AZN)


In [7]:
# Reload the attribute values and rename their columns to the notebook-wide names.
country_attribute_values_df = (
    pd.read_json(data_dir + 'countryattributevalues.json', orient='columns', encoding='UTF-8')
      .rename(columns={'countryAttributeId': 'attributeId',
                       'countryId': 'refAreaId',
                       'id': 'countryAttributeValueId',
                       'symbol': 'attributeSymbol',
                       'textEng': 'attributeNameEN',
                       'textFre': 'attributeNameFR'})
)

# Start from a copy so that `countries_df` itself is left untouched.
countries_x = countries_df.copy()

for attribute in attributes:

    attributeId = attribute['attributeId']
    label = attribute['label']

    code_col = label + '_Code'
    desc_en_col = label + '_DescEN'
    desc_fr_col = label + '_DescFR'

    # Values of this one attribute, with columns prefixed by the attribute label.
    attribute_values = (
        country_attribute_values_df
        .loc[country_attribute_values_df['attributeId'] == attributeId,
             ['refAreaId', 'attributeSymbol', 'attributeNameEN', 'attributeNameFR']]
        .rename(columns={'attributeSymbol': code_col,
                         'attributeNameEN': desc_en_col,
                         'attributeNameFR': desc_fr_col})
    )

    # Every attribute is exported to its own CSV file, merged or not.
    attribute_values.to_csv(path_or_buf=data_dir + label + '.csv',
                            index=False, encoding='UTF-8')

    # Choose which columns, if any, are attached to the country table.
    if attributeId in (0, 1, 3, 10, 11, 12, 13, 14, 15, 16, 17):
        merge_cols = ['refAreaId', code_col, desc_en_col, desc_fr_col]
    elif attributeId in (5, 6, 7, 18):
        merge_cols = ['refAreaId', desc_en_col]   # English description only
    elif attributeId == 8:
        merge_cols = ['refAreaId', code_col]      # code only
    else:
        merge_cols = None                         # exported to CSV, not merged

    if merge_cols is not None:
        countries_x = pd.merge(countries_x,
                               attribute_values[merge_cols],
                               how='left', on=['refAreaId'])


In [8]:
# Preview the extended country table. A bare expression renders as a rich
# table, and `.head()` keeps the output short instead of dumping every row.
countries_x.head(20)

                                   bookNameEng  \
0                                  Extra-EU-28   
1    Non Petroleum Exports of Asia Middle East   
2                Regional programmes and other   
3                        World exc. intra-EU27   
4                                       Other    
5                                Interregional   
6                               [use code 593]   
7                                        Total   
8                         Asia and the Pacific   
9                                        Total   
10               Total, all countries or areas   
11                                      Africa   
12                               North America   
13                                 Afghanistan   
14                               South America   
15                               Asia [former]   
16                             Europe [former]   
17                                     Albania   
18                                     Oceania   


Rename columns:

In [9]:
# Shorter names for the status/type descriptions; the `m49` attribute columns
# are renamed to `parentRegion*`.
renamed_columns = {
    'codeActiveStatus_DescEN': 'activeStatus',
    'codeClassType_DescEN': 'classType',
    'codePublicStatus_DescEN': 'publicStatus',
    'codeType_DescEN': 'type',
    'm49_Code': 'parentRegionId',
    'm49_DescEN': 'parentRegion_DescEN',
    'm49_DescFR': 'parentRegion_DescFR',
}
countries_x = countries_x.rename(columns=renamed_columns)

In [10]:
# Show the column names of the extended country table after the rename above.
countries_x.columns

Index(['bookNameEng', 'bookNameFre', 'refAreaId', 'isM49', 'refAreaName',
       'capitalCity_Code', 'capitalCity_DescEN', 'capitalCity_DescFR',
       'activeStatus', 'classType', 'publicStatus', 'type', 'ISO3CD_Code',
       'exportPartner1_Code', 'exportPartner1_DescEN', 'exportPartner1_DescFR',
       'importPartner1_Code', 'importPartner1_DescEN', 'importPartner1_DescFR',
       'exportPartner2_Code', 'exportPartner2_DescEN', 'exportPartner2_DescFR',
       'importPartner2_Code', 'importPartner2_DescEN', 'importPartner2_DescFR',
       'exportPartner3_Code', 'exportPartner3_DescEN', 'exportPartner3_DescFR',
       'importParnter3_Code', 'importParnter3_DescEN', 'importParnter3_DescFR',
       'nationalCurrency_Code', 'nationalCurrency_DescEN',
       'nationalCurrency_DescFR', 'parentRegionId', 'parentRegion_DescEN',
       'parentRegion_DescFR', 'tradeSystem_Code', 'tradeSystem_DescEN',
       'tradeSystem_DescFR', 'tourismArrivalsSeriesType_Code',
       'tourismArrivalsSeriesTy

Re-order columns:

In [11]:
# Final column order, assembled from thematic groups.
identity_cols = [
    'type', 'parentRegionId', 'parentRegion_DescEN', 'parentRegion_DescFR',
    'refAreaId', 'isM49', 'ISO3CD_Code',
    'refAreaName', 'bookNameEng', 'bookNameFre',
]
profile_cols = [
    'capitalCity_Code', 'capitalCity_DescEN', 'capitalCity_DescFR',
    'nationalCurrency_Code', 'nationalCurrency_DescEN', 'nationalCurrency_DescFR',
    'activeStatus', 'classType', 'publicStatus',
]
# 'importParnter3_*' is spelled the way the label was defined earlier.
trade_partner_cols = [
    'exportPartner1_Code', 'exportPartner1_DescEN', 'exportPartner1_DescFR',
    'importPartner1_Code', 'importPartner1_DescEN', 'importPartner1_DescFR',
    'exportPartner2_Code', 'exportPartner2_DescEN', 'exportPartner2_DescFR',
    'importPartner2_Code', 'importPartner2_DescEN', 'importPartner2_DescFR',
    'exportPartner3_Code', 'exportPartner3_DescEN', 'exportPartner3_DescFR',
    'importParnter3_Code', 'importParnter3_DescEN', 'importParnter3_DescFR',
]
system_cols = [
    'tradeSystem_Code', 'tradeSystem_DescEN', 'tradeSystem_DescFR',
    'tourismArrivalsSeriesType_Code', 'tourismArrivalsSeriesType_DescEN',
    'tourismArrivalsSeriesType_DescFR',
]

countries_x = countries_x[identity_cols + profile_cols + trade_partner_cols + system_cols]

Save extended country table to a csv file:

In [12]:
# Write the extended country table, without the index column, into the data folder.
countries_x.to_csv(path_or_buf=data_dir + 'countries_x.csv',
                   encoding='UTF-8',
                   index=False)


# 2. Consolidate data table

## 2.1 Explore data tables

**a) data table**

In [13]:
# Load the observations and rename the keys to the notebook-wide names.
data_df = (
    pd.read_json(data_dir + 'data.json', orient='columns', encoding='UTF-8')
      .rename(columns={'countryId': 'refAreaId',
                       'id': 'dataId',
                       'sybValue': 'value'})
)

# Keep a CSV copy of the renamed reference table.
data_df.to_csv(data_dir + 'ref_data_df.csv', encoding='UTF-8', index=False)

data_df

Unnamed: 0,refAreaId,dataId,seriesId,value,year
0,8,10775856,254,121.0000,1993
1,8,10775857,254,198.0000,1994
2,8,10775858,254,309.0000,1995
3,8,10775859,254,213.0000,1996
4,8,10775860,254,123.0000,1997
5,8,10775861,254,61.0000,1998
6,8,10775862,254,29.0000,1999
7,8,10775863,254,72.0000,2000
8,8,10775864,254,69.0000,2001
9,8,10775865,254,64.0000,2002


**b) series table**

In [14]:
# Load the series catalogue and rename its generic `id`/`name`/`code` keys.
series_df = (
    pd.read_json(data_dir + 'series.json', orient='columns', encoding='UTF-8')
      .rename(columns={'id': 'seriesId',
                       'name': 'seriesName',
                       'code': 'seriesCode'})
)

# Keep a CSV copy of the renamed reference table.
series_df.to_csv(data_dir + 'ref_series_df.csv', encoding='UTF-8', index=False)

series_df

Unnamed: 0,seriesCode,countryAttributeId,seriesId,seriesName,sourceId,tableId
0,Water-%U,,2,"Improved drinking water sources, urban (Propor...",42,84
1,Water-%R,,3,"Improved drinking water sources, rural (Propor...",42,84
2,Water-%T,,4,"Improved drinking water sources, total (Propor...",42,84
3,Sanit-%U,,5,"Improved sanitation facilities, urban (Proport...",42,84
4,Sanit-%R,,6,"Improved sanitation facilities, rural (Proport...",42,84
5,Sanit-%T,,7,"Improved sanitation facilities, total (Proport...",42,84
6,Agri_ind,,8,Agricultural production (Index Base: 2004-2006...,11,32
7,vertebr,,10,Threatened Species: Vertebrates (number),41,5
8,mjtradexp1,12.0,11,Major trading partner 1 (% of exports),95,14
9,mjtradexp2,14.0,12,Major trading partner 2 (% of exports),95,14


**c) topics table**

In [15]:
# Load the topics and rename the keys (EN/FR suffixes replace Eng/Fre).
topics_df = (
    pd.read_json(data_dir + 'topics.json', orient='columns', encoding='UTF-8')
      .rename(columns={'id': 'topicId',
                       'nameEng': 'topicNameEN',
                       'nameFre': 'topicNameFR'})
)

# Keep a CSV copy of the renamed reference table.
topics_df.to_csv(data_dir + 'ref_topics_df.csv', encoding='UTF-8', index=False)

topics_df

Unnamed: 0,topicId,topicNameEN,topicNameFR
0,2,Population and migration,Population et migration
1,3,Education,Éducation
2,4,Gender,La situation des femmes
3,5,Communication,Communications
4,6,National accounts,Comptes nationaux
5,7,Finance,Finances
6,8,Labour market,Marché du travail
7,9,Price and production indices,Indices des prix et de la production
8,11,Crime,Criminalité
9,13,Environment,Environnement


**d) sources table**

In [16]:
# Load the data sources and rename the keys to the notebook-wide names.
sources_df = pd.read_json(data_dir + 'sources.json', orient='columns', encoding='UTF-8')
sources_df = sources_df.rename(columns={'id': 'sourceId',
                                        'code': 'sourceCode',
                                        'nameEng': 'sourceNameEN',
                                        'nameFre': 'sourceNameFR'})

# Fix: this cell previously re-saved `topics_df` to 'ref_topics_df.csv'
# (copy-paste slip), so the sources table was never written. Save the sources
# table under the same `ref_<table>_df.csv` naming used by the other cells.
sources_df.to_csv(data_dir + 'ref_sources_df.csv', index=False, encoding='UTF-8')

sources_df



**e) tables table**

In [17]:
# Load the yearbook tables and prefix their generic keys with `table`.
tables_df = (
    pd.read_json(data_dir + 'tables.json', orient='columns', encoding='UTF-8')
      .rename(columns={'id': 'tableId',
                       'code': 'tableCode',
                       'name': 'tableName',
                       'note': 'tableNote'})
)
tables_df

Unnamed: 0,tableCode,tableId,tableName,tableNote,tableStatusId,tbBkCode,tbBkPrint,topicId
0,education,1,"Enrolment in primary, secondary and tertiary e...",,1,T07,Y,3
1,threatened,5,Threatened species,,1,T25,Y,13
2,rdexpend,6,Gross domestic expenditure on research and dev...,13 Jun 2018 (Ian)\r\nRegional data from SDG 9....,2,T30,Y,14
3,lifecbmort,8,Population growth and indicators of fertility ...,,3,X03,Y,2
4,majtradepa,14,Major trading partners,,1,T22,Y,15
5,newvaluead,18,Gross Value added by kind of economic activity,,2,T14,Y,6
6,gdp,19,Gross domestic product and gross domestic prod...,,1,T13,Y,6
7,industindx,23,Index of Industrial Production,,3,X25,Y,9
8,agindx,32,Agricultural production indices,,1,T20,Y,9
9,employment,49,Employment by economic activity,,1,T18,Y,8


**f) dataxfootnotes table**

In [18]:
# Load the data-to-footnote links; only the country key needs renaming.
dataxfootnotes_df = (
    pd.read_json(data_dir + 'dataxfootnotes.json', orient='columns', encoding='UTF-8')
      .rename(columns={'countryId': 'refAreaId'})
)
dataxfootnotes_df

Unnamed: 0,refAreaId,footnoteId,id,seriesId,year
0,50,58596,7874820,2,1990
1,50,58596,7874826,2,2000
2,156,56965,8381537,2,1990
3,156,56965,8381538,2,2000
4,156,56965,8381539,2,2005
5,156,56965,8381540,2,2010
6,156,56965,8381541,2,2011
7,156,56965,8381542,2,2012
8,156,56965,8381543,2,2013
9,156,56965,8381544,2,2014


**g) footnotes**

In [19]:
# Load the footnote texts and prefix their generic keys with `footnote`.
footnotes_df = (
    pd.read_json(data_dir + 'footnotes.json', orient='columns', encoding='UTF-8')
      .rename(columns={'code': 'footnoteCode',
                       'id': 'footnoteId',
                       'textEng': 'footnoteTextEN',
                       'textFre': 'footnoteTextFR'})
)
footnotes_df

Unnamed: 0,footnoteCode,footnoteId,footnoteTextEN,footnoteTextFR
0,bellux,1,Data refer to Belgium and Luxembourg.,Les données se rapportent à Belgique et Luxemb...
1,changestructure,3,Change in structure beginning this year.,Changement de structure à partir de cette année.
2,500905,5,General education includes public and aided ed...,
3,chcoverage,16,Change in data coverage beginning this year.,Changement dans la couverture des données à pa...
4,500930,17,Not including evening schools.,
5,chinastat2,20,"For statistical purposes, the data for China d...","Pour la présentation des statistiques, les don..."
6,dayschools,21,Day schools only.,
7,500942,22,Not including Turkish schools.,
8,incdebtf9092,25,Including debt forgiveness of non-ODA claims i...,
9,568,27,Data on vocational refer to public education o...,


## 2.2 Consolidate data table

**a) Join `data` and `countries_x`**

In [20]:
# Attach the main country descriptors to every observation. The right join
# keeps all rows of `data_df`, including those whose refAreaId has no match
# in `countries_x`.
country_cols = ['type', 'parentRegionId', 'parentRegion_DescEN', 'refAreaId', 'refAreaName']
data_x = countries_x[country_cols].merge(data_df, how='right', on='refAreaId')
data_x


Unnamed: 0,type,parentRegionId,parentRegion_DescEN,refAreaId,refAreaName,dataId,seriesId,value,year
0,,,,-147,Asia and the Pacific,20396084,1586,50.87,2000
1,,,,-147,Asia and the Pacific,20396085,1586,50.08,2001
2,,,,-147,Asia and the Pacific,20396086,1586,49.23,2002
3,,,,-147,Asia and the Pacific,20396087,1586,48.03,2003
4,,,,-147,Asia and the Pacific,20396088,1586,47.07,2004
5,,,,-147,Asia and the Pacific,20396089,1586,45.91,2005
6,,,,-147,Asia and the Pacific,20396090,1586,44.04,2006
7,,,,-147,Asia and the Pacific,20396091,1586,42.58,2007
8,,,,-147,Asia and the Pacific,20396092,1586,41.50,2008
9,,,,-147,Asia and the Pacific,20396093,1586,40.06,2009


**Join `data_x` and `series`**

In [21]:
# Add the series descriptors; the right join keeps every row of `data_x`.
data_x = series_df.merge(data_x, how='right', on='seriesId')
data_x

Unnamed: 0,seriesCode,countryAttributeId,seriesId,seriesName,sourceId,tableId,type,parentRegionId,parentRegion_DescEN,refAreaId,refAreaName,dataId,value,year
0,Water-%U,,2,"Improved drinking water sources, urban (Propor...",42,84,Region,,,1,World,18687780,95.1000,1990
1,Water-%U,,2,"Improved drinking water sources, urban (Propor...",42,84,Region,,,1,World,18687781,95.5000,2000
2,Water-%U,,2,"Improved drinking water sources, urban (Propor...",42,84,Region,,,1,World,18687782,95.8000,2005
3,Water-%U,,2,"Improved drinking water sources, urban (Propor...",42,84,Region,,,1,World,18687783,96.1000,2010
4,Water-%U,,2,"Improved drinking water sources, urban (Propor...",42,84,Region,,,1,World,18687784,96.2000,2011
5,Water-%U,,2,"Improved drinking water sources, urban (Propor...",42,84,Region,,,1,World,18687785,96.2000,2012
6,Water-%U,,2,"Improved drinking water sources, urban (Propor...",42,84,Region,,,1,World,18687786,96.3000,2013
7,Water-%U,,2,"Improved drinking water sources, urban (Propor...",42,84,Region,,,1,World,18687787,96.4000,2014
8,Water-%U,,2,"Improved drinking water sources, urban (Propor...",42,84,Region,,,1,World,18687788,96.4000,2015
9,Water-%U,,2,"Improved drinking water sources, urban (Propor...",42,84,Area,34,Southern Asia,4,Afghanistan,18687829,52.2362,2000


**Join `data_x` and `tables`**

In [22]:
# Add the table descriptors; the right join keeps every row of `data_x`.
data_x = tables_df.merge(data_x, how='right', on='tableId')
data_x

Unnamed: 0,tableCode,tableId,tableName,tableNote,tableStatusId,tbBkCode,tbBkPrint,topicId,seriesCode,countryAttributeId,...,seriesName,sourceId,type,parentRegionId,parentRegion_DescEN,refAreaId,refAreaName,dataId,value,year
0,education,1,"Enrolment in primary, secondary and tertiary e...",,1,T07,Y,3,tot-prim,,...,Students enrolled in primary education (thousa...,24,Region,,,1,World,20194865,657127.2509,2000
1,education,1,"Enrolment in primary, secondary and tertiary e...",,1,T07,Y,3,tot-prim,,...,Students enrolled in primary education (thousa...,24,Region,,,1,World,20194866,654368.0718,2001
2,education,1,"Enrolment in primary, secondary and tertiary e...",,1,T07,Y,3,tot-prim,,...,Students enrolled in primary education (thousa...,24,Region,,,1,World,20194867,657912.1719,2002
3,education,1,"Enrolment in primary, secondary and tertiary e...",,1,T07,Y,3,tot-prim,,...,Students enrolled in primary education (thousa...,24,Region,,,1,World,20194868,669135.0731,2003
4,education,1,"Enrolment in primary, secondary and tertiary e...",,1,T07,Y,3,tot-prim,,...,Students enrolled in primary education (thousa...,24,Region,,,1,World,20194869,683551.0547,2004
5,education,1,"Enrolment in primary, secondary and tertiary e...",,1,T07,Y,3,tot-prim,,...,Students enrolled in primary education (thousa...,24,Region,,,1,World,20194870,678989.7351,2005
6,education,1,"Enrolment in primary, secondary and tertiary e...",,1,T07,Y,3,tot-prim,,...,Students enrolled in primary education (thousa...,24,Region,,,1,World,20194871,681383.8437,2006
7,education,1,"Enrolment in primary, secondary and tertiary e...",,1,T07,Y,3,tot-prim,,...,Students enrolled in primary education (thousa...,24,Region,,,1,World,20194872,689538.0084,2007
8,education,1,"Enrolment in primary, secondary and tertiary e...",,1,T07,Y,3,tot-prim,,...,Students enrolled in primary education (thousa...,24,Region,,,1,World,20194873,693630.8046,2008
9,education,1,"Enrolment in primary, secondary and tertiary e...",,1,T07,Y,3,tot-prim,,...,Students enrolled in primary education (thousa...,24,Region,,,1,World,20194874,694828.8556,2009


**Join `data_x` and `topics`**

In [23]:
# Add the topic names; the right join keeps every row of `data_x`.
data_x = topics_df.merge(data_x, how='right', on='topicId')
data_x

Unnamed: 0,topicId,topicNameEN,topicNameFR,tableCode,tableId,tableName,tableNote,tableStatusId,tbBkCode,tbBkPrint,...,seriesName,sourceId,type,parentRegionId,parentRegion_DescEN,refAreaId,refAreaName,dataId,value,year
0,2,Population and migration,Population et migration,lifecbmort,8,Population growth and indicators of fertility ...,,3,X03,Y,...,Population annual rate of increase (percent),73,Region,,,1,World,19032061,1.185,2015
1,2,Population and migration,Population et migration,lifecbmort,8,Population growth and indicators of fertility ...,,3,X03,Y,...,Population annual rate of increase (percent),73,Region,,,1,World,19683519,1.782,1985
2,2,Population and migration,Population et migration,lifecbmort,8,Population growth and indicators of fertility ...,,3,X03,Y,...,Population annual rate of increase (percent),73,Region,,,1,World,19683520,1.793,1990
3,2,Population and migration,Population et migration,lifecbmort,8,Population growth and indicators of fertility ...,,3,X03,Y,...,Population annual rate of increase (percent),73,Region,,,1,World,19683521,1.519,1995
4,2,Population and migration,Population et migration,lifecbmort,8,Population growth and indicators of fertility ...,,3,X03,Y,...,Population annual rate of increase (percent),73,Region,,,1,World,19683522,1.324,2000
5,2,Population and migration,Population et migration,lifecbmort,8,Population growth and indicators of fertility ...,,3,X03,Y,...,Population annual rate of increase (percent),73,Region,,,1,World,19683523,1.253,2005
6,2,Population and migration,Population et migration,lifecbmort,8,Population growth and indicators of fertility ...,,3,X03,Y,...,Population annual rate of increase (percent),73,Region,,,1,World,19683524,1.233,2010
7,2,Population and migration,Population et migration,lifecbmort,8,Population growth and indicators of fertility ...,,3,X03,Y,...,Population annual rate of increase (percent),73,Region,,,1,World,19707619,1.087,2020
8,2,Population and migration,Population et migration,lifecbmort,8,Population growth and indicators of fertility ...,,3,X03,Y,...,Population annual rate of increase (percent),73,Region,1,World,2,Africa,19032062,2.587,2015
9,2,Population and migration,Population et migration,lifecbmort,8,Population growth and indicators of fertility ...,,3,X03,Y,...,Population annual rate of increase (percent),73,Region,1,World,2,Africa,19683525,2.824,1985


**Join `data_x` and `sources`**

In [24]:
# Add the source descriptors; here `data_x` is on the left, so a left join
# keeps all of its rows.
data_x = data_x.merge(sources_df, how='left', on='sourceId')
data_x

Unnamed: 0,topicId,topicNameEN,topicNameFR,tableCode,tableId,tableName,tableNote,tableStatusId,tbBkCode,tbBkPrint,...,parentRegionId,parentRegion_DescEN,refAreaId,refAreaName,dataId,value,year,sourceCode,sourceNameEN,sourceNameFR
0,2,Population and migration,Population et migration,lifecbmort,8,Population growth and indicators of fertility ...,,3,X03,Y,...,,,1,World,19032061,1.185,2015,UNPD_WPP,"United Nations Population Division, New York, ...","Organisation des Nations Unies, Division de la..."
1,2,Population and migration,Population et migration,lifecbmort,8,Population growth and indicators of fertility ...,,3,X03,Y,...,,,1,World,19683519,1.782,1985,UNPD_WPP,"United Nations Population Division, New York, ...","Organisation des Nations Unies, Division de la..."
2,2,Population and migration,Population et migration,lifecbmort,8,Population growth and indicators of fertility ...,,3,X03,Y,...,,,1,World,19683520,1.793,1990,UNPD_WPP,"United Nations Population Division, New York, ...","Organisation des Nations Unies, Division de la..."
3,2,Population and migration,Population et migration,lifecbmort,8,Population growth and indicators of fertility ...,,3,X03,Y,...,,,1,World,19683521,1.519,1995,UNPD_WPP,"United Nations Population Division, New York, ...","Organisation des Nations Unies, Division de la..."
4,2,Population and migration,Population et migration,lifecbmort,8,Population growth and indicators of fertility ...,,3,X03,Y,...,,,1,World,19683522,1.324,2000,UNPD_WPP,"United Nations Population Division, New York, ...","Organisation des Nations Unies, Division de la..."
5,2,Population and migration,Population et migration,lifecbmort,8,Population growth and indicators of fertility ...,,3,X03,Y,...,,,1,World,19683523,1.253,2005,UNPD_WPP,"United Nations Population Division, New York, ...","Organisation des Nations Unies, Division de la..."
6,2,Population and migration,Population et migration,lifecbmort,8,Population growth and indicators of fertility ...,,3,X03,Y,...,,,1,World,19683524,1.233,2010,UNPD_WPP,"United Nations Population Division, New York, ...","Organisation des Nations Unies, Division de la..."
7,2,Population and migration,Population et migration,lifecbmort,8,Population growth and indicators of fertility ...,,3,X03,Y,...,,,1,World,19707619,1.087,2020,UNPD_WPP,"United Nations Population Division, New York, ...","Organisation des Nations Unies, Division de la..."
8,2,Population and migration,Population et migration,lifecbmort,8,Population growth and indicators of fertility ...,,3,X03,Y,...,1,World,2,Africa,19032062,2.587,2015,UNPD_WPP,"United Nations Population Division, New York, ...","Organisation des Nations Unies, Division de la..."
9,2,Population and migration,Population et migration,lifecbmort,8,Population growth and indicators of fertility ...,,3,X03,Y,...,1,World,2,Africa,19683525,2.824,1985,UNPD_WPP,"United Nations Population Division, New York, ...","Organisation des Nations Unies, Division de la..."


Partition by Table

In [25]:
# Write one CSV file ('t_<tableCode>.csv') per yearbook table.
# `tableId` and `tableCode` are read by column name: the previous positional
# `iloc[i, 1]` / `iloc[i, 0]` lookups depended on the column order of
# `tables_df` and on its index being 0..n-1.
for tableId, tableCode in zip(tables_df['tableId'], tables_df['tableCode']):
    x = data_x.loc[data_x['tableId'] == tableId]
    # Replace each line break, carriage return and tab with one space, so a
    # record never spans several CSV lines.
    x = x.replace(r'[\n\r\t]', ' ', regex=True)
    x.to_csv(data_dir + 't_' + tableCode + '.csv', index=False, encoding='UTF-8')


Save full data table as csv. (Remove line breaks and tab character from the dataframe)

In [None]:
# Replace each line break, carriage return and tab in text cells with one
# space, then write the consolidated table to a single CSV file.
data_x = data_x.replace(r'[\n\r\t]', ' ', regex=True)

data_x.to_csv(path_or_buf=data_dir + 'data_x.csv', encoding='UTF-8', index=False)
