In [1]:
import pandas as pd
import numpy as np



### Get all the pillar names from the Excel workbook

In [2]:
names = pd.read_excel('../../UNDP Digital Assessment Data Framework Filename Matching V7.xlsx')

In [3]:
col_names = ['Indicator','check', 'Data Source','Index','Filename']

In [4]:
names = names[col_names]

In [5]:
names.head()

Unnamed: 0,Indicator,check,Data Source,Index,Filename
0,Countries,,United Nations,False,Countries
1,"Database of Global Administrative Areas (GADM,...",,GADM maps and data,False,
2,High Resolution Population Density Maps + Demo...,,Facebook,False,
3,population density vs openstreetmap object den...,,Kontur,False,
4,Population Density,Infrastructure,World Bank,False,population_density


In [6]:
# For each pillar (the `check` column), count the non-null Filename and
# Indicator entries.
data_stats = names.groupby('check')[['Filename', 'Indicator']].count()

In [7]:
data_stats

Unnamed: 0_level_0,Filename,Indicator
check,Unnamed: 1_level_1,Unnamed: 2_level_1
Business,20,25
Foundations,8,12
Government,10,15
Infrastructure,39,48
People,35,47
Regulation,6,7
Strategy,1,1


### Foundations

In [8]:
# Rows tagged with the 'Foundations' pillar that also carry a Filename.
# Disabled extra filter: &(names.Index==False)
is_foundations = names['check'] == 'Foundations'
has_filename = names['Filename'].notna()
bnames = names[is_foundations & has_filename]

In [9]:
bnames.head(25)

Unnamed: 0,Indicator,check,Data Source,Index,Filename
148,Digital payments penetration,Foundations,Portulans Institute,True,digital_payments_penetration
149,% of population with digital finance account -...,Foundations,World Bank,False,population_digital_financial_services
150,% of population with digital finance account -...,Foundations,World Bank,False,population_digital_financial_services
154,% of population with ID,Foundations,World Bank,False,id4d_nid
155,% of services that can be accessed,Foundations,World Bank,False,id4d_services
156,can ID be used for transactions,Foundations,World Bank,False,id4d_services
157,Is personal data siloed,Foundations,World Bank,False,Egov_strategy
158,Open data index,Foundations,Open Knowledge Foundation,True,open_data_idx


In [10]:
# get list of names for all indicators
indicators = bnames.Indicator.unique()

In [11]:
# get all file names
bfiles = bnames.Filename.unique()

In [12]:
bfiles

array(['digital_payments_penetration',
       'population_digital_financial_services', 'id4d_nid',
       'id4d_services', 'Egov_strategy', 'open_data_idx'], dtype=object)

In [13]:
# ls digital-readiness-assessment-main/processed/

In [14]:
## ict_goods and services not in processed data

In [15]:
# formula for converting scale
def convert_rank(old_value, old_min=1, old_max=7, new_min=1, new_max=6 ):
    """Linearly map a value from one scale onto another.

    ``old_min`` maps to ``new_min`` and ``old_max`` maps to ``new_max``;
    values in between are interpolated linearly. Values outside the source
    range are extrapolated, not clipped.

    Parameters
    ----------
    old_value : number
        Value expressed on the source scale.
    old_min, old_max : number
        Bounds of the source scale (defaults 1 and 7).
    new_min, new_max : number
        Bounds of the target scale (defaults 1 and 6).

    Returns
    -------
    number
        ``old_value`` re-expressed on the target scale.

    Raises
    ------
    ValueError
        If ``old_min`` equals ``old_max``: a zero-width source scale cannot
        be rescaled.
    """
    old_range = old_max - old_min
    # Guard the division below: a zero-width range would raise a bare
    # ZeroDivisionError for Python numbers, or quietly yield NaN/inf for
    # numpy scalars.
    if old_range == 0:
        raise ValueError(
            "convert_rank: old_min and old_max are equal ({!r}); "
            "cannot rescale a zero-width range".format(old_min)
        )
    new_range = new_max - new_min
    return (((old_value - old_min) * new_range) / old_range) + new_min

### 1. Digital payments penetration

In [16]:
indicators[0]

'Digital payments penetration'

In [17]:
# Pick the first Foundations indicator, resolve its processed file name from
# the matching table, and load that CSV.
indicator = indicators[0]
print(indicator)
file_lookup = bnames.loc[bnames['Indicator'] == indicator, 'Filename']
bf = file_lookup.values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Digital payments penetration
digital_payments_penetration


In [18]:
df.head()

Unnamed: 0,RANK,COUNTRY/ECONOMY,VALUE,SCORE
0,1.0,Norway,0.85,100.0
1,2.0,Denmark,0.83,97.24
2,3.0,Finland,0.8,93.95
3,4.0,Sweden,0.8,93.08
4,5.0,Netherlands,0.76,89.01


In [19]:
# score looks like the one to use
df.describe()

Unnamed: 0,RANK,VALUE,SCORE
count,122.0,122.0,122.0
mean,61.5,0.315738,36.076967
std,35.362409,0.209012,24.867406
min,1.0,0.01,0.0
25%,31.25,0.15,16.235
50%,61.5,0.27,30.87
75%,91.75,0.44,51.11
max,122.0,0.85,100.0


In [20]:
# df.Indicator.unique()

In [21]:
# Standardise the frame: country column name plus the shared indicator columns.
df = df.rename(columns={'COUNTRY/ECONOMY': 'Country Name'})
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['SCORE']

# Observed bounds of SCORE; they define the source scale for the rescaling.
min_rank, max_rank = df['data_col'].min(), df['data_col'].max()

# Min-max rescale SCORE from its observed range onto the 1-6 scale
# (convert_rank's default new_min / new_max).
df['new_rank_score'] = df['data_col'].apply(convert_rank, old_min=min_rank, old_max=max_rank)



In [22]:
df

Unnamed: 0,RANK,Country Name,VALUE,SCORE,higher_is_better,Indicator,data_col,new_rank_score
0,1.0,Norway,0.85,100.00,True,Digital payments penetration,100.00,6.0000
1,2.0,Denmark,0.83,97.24,True,Digital payments penetration,97.24,5.8620
2,3.0,Finland,0.80,93.95,True,Digital payments penetration,93.95,5.6975
3,4.0,Sweden,0.80,93.08,True,Digital payments penetration,93.08,5.6540
4,5.0,Netherlands,0.76,89.01,True,Digital payments penetration,89.01,5.4505
...,...,...,...,...,...,...,...,...
129,,Jamaica,,,True,Digital payments penetration,,
130,,Madagascar,,,True,Digital payments penetration,,
131,,Oman,,,True,Digital payments penetration,,
132,,Qatar,,,True,Digital payments penetration,,


In [23]:

# output scores to csv
df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/foundations_{}_scores.csv'.format(bf), index=False)

In [24]:
df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(15)


Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better
0,Norway,Digital payments penetration,100.0,6.0,True
1,Denmark,Digital payments penetration,97.24,5.862,True
2,Finland,Digital payments penetration,93.95,5.6975,True
3,Sweden,Digital payments penetration,93.08,5.654,True
4,Netherlands,Digital payments penetration,89.01,5.4505,True
5,New Zealand,Digital payments penetration,86.73,5.3365,True
6,United States,Digital payments penetration,84.4,5.22,True
7,Estonia,Digital payments penetration,82.82,5.141,True
8,"Korea, Rep.",Digital payments penetration,82.64,5.132,True
9,Canada,Digital payments penetration,81.53,5.0765,True


## 2. % of population with digital finance account - registered


In [25]:
# Second Foundations indicator: resolve its processed file name and load the CSV.
indicator = indicators[1]
print(indicator)
file_lookup = bnames.loc[bnames['Indicator'] == indicator, 'Filename']
bf = file_lookup.values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population with digital finance account - registered
population_digital_financial_services


In [26]:
# no DATE associated with the data

In [27]:
df.columns.tolist()

['Unnamed: 0',
 'Account (% age 15+)',
 'Account, male (% age 15+)',
 'Account, in labor force (% age 15+) ',
 'Account, out of labor force (% age 15+) ',
 'Account, female (% age 15+)',
 'Account, young adults (% ages 15-24)',
 'Account, older adults (% ages 25+)',
 'Account, primary education or less (% ages 15+) ',
 'Account, secondary education or more (% ages 15+) ',
 'Account, income, poorest 40% (% ages 15+)',
 'Account, income, richest 60% (% ages 15+) ',
 'Account, rural (% age 15+) ',
 'Financial institution account (% age 15+) ',
 'Financial institution account,male(% age 15+) ',
 'Financial institution account, in labor force(% age 15+) ',
 'Financial institution account, out of labor force (% age 15+) ',
 'Financial institution account,female(% age 15+) ',
 'Financial institution account,young adults(% age 15-24) ',
 'Financial institution account, older adults(% age 25+) ',
 'Financial institution account, primary education or less(% age 15+) ',
 'Financial institution ac

In [28]:
df[['data_country',
 'data_year','Unnamed: 0']].drop_duplicates()

Unnamed: 0.1,data_country,data_year,Unnamed: 0
0,Low income,Low income,Low income
3,Lower middle income,Lower middle income,Lower middle income
5,Upper middle income,Upper middle income,Upper middle income
8,,,
11,High income,High income,High income


In [29]:
df.head(16)

Unnamed: 0.1,Unnamed: 0,Account (% age 15+),"Account, male (% age 15+)","Account, in labor force (% age 15+)","Account, out of labor force (% age 15+)","Account, female (% age 15+)","Account, young adults (% ages 15-24)","Account, older adults (% ages 25+)","Account, primary education or less (% ages 15+)","Account, secondary education or more (% ages 15+)",...,"Mobile money account, female (% age 15+)","Mobile money account, young adults (% age 15-24)","Mobile money account, older adults (% age 25+)","Mobile money account, primary education or less (% age 15+)","Mobile money account, secondary education or less (% age 15+)","Mobile money account, income, poorest 40% (% age 15+)","Mobile money account, income, richest 60% (% age 15+)","Mobile money account, rural (% age 15+)",data_country,data_year
0,Low income,9%,15%,15%,2%,3%,6%,11%,5%,30%,...,,,,,,,,,Low income,Low income
1,Low income,10%,16%,15%,4%,4%,7%,12%,5%,23%,...,0%,0%,0%,0%,0%,0%,1%,0%,Low income,Low income
2,Low income,15%,23%,25%,4%,7%,10%,18%,9%,31%,...,1%,0%,1%,0%,2%,0%,1%,1%,Low income,Low income
3,Lower middle income,39%,39%,46%,31%,39%,30%,45%,35%,44%,...,,,,,,,,,Lower middle income,Lower middle income
4,Lower middle income,29%,36%,36%,12%,22%,15%,38%,14%,70%,...,,,,,,,,,Lower middle income,Lower middle income
5,Upper middle income,28%,34%,36%,15%,23%,26%,29%,15%,36%,...,,,,,,,,,Upper middle income,Upper middle income
6,Upper middle income,38%,43%,44%,29%,34%,30%,40%,24%,56%,...,,,,,,,,,Upper middle income,Upper middle income
7,Upper middle income,40%,42%,53%,27%,38%,32%,43%,27%,56%,...,2%,6%,1%,1%,4%,0%,4%,2%,Upper middle income,Upper middle income
8,,22%,30%,33%,11%,14%,15%,26%,14%,31%,...,,,,,,,,,,
9,,30%,38%,42%,18%,22%,21%,34%,21%,39%,...,,,,,,,,,,


In [30]:
# going to use the  score column since this is already an Index

In [31]:
# # convert rank into 1-6 - in order to get most recent data 
# df['data_col'] = df['Score'] 

# # going to assume index is between 1-100 but not 100% sure
# min_rank = 1 #df['data_col'].min()
# max_rank = 100 #df['data_col'].max()

# # transform 1-100 rank into 1-6
# df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# # need to invert score since higher rank is not better 
# # df['new_rank_score'] = (6-df['new_rank_score'])+1

In [32]:
# # prepare output
# df.rename(columns={'Country':'Country Name'}, inplace=True)
# df['Indicator'] = indicator
# df['higher_is_better'] = True

# df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(15)


# # output scores to csv
# df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/{}_scores.csv'.format(bf), index=False)

## 3. % of population with digital finance account - active (90 days)

In [33]:
# Third Foundations indicator: resolve its processed file name and load the CSV.
# The matching table maps this indicator to the same file as the previous one.
indicator = indicators[2]
print(indicator)
file_lookup = bnames.loc[bnames['Indicator'] == indicator, 'Filename']
bf = file_lookup.values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population with digital finance account - active (90 days)
population_digital_financial_services


In [34]:
df.head()

Unnamed: 0.1,Unnamed: 0,Account (% age 15+),"Account, male (% age 15+)","Account, in labor force (% age 15+)","Account, out of labor force (% age 15+)","Account, female (% age 15+)","Account, young adults (% ages 15-24)","Account, older adults (% ages 25+)","Account, primary education or less (% ages 15+)","Account, secondary education or more (% ages 15+)",...,"Mobile money account, female (% age 15+)","Mobile money account, young adults (% age 15-24)","Mobile money account, older adults (% age 25+)","Mobile money account, primary education or less (% age 15+)","Mobile money account, secondary education or less (% age 15+)","Mobile money account, income, poorest 40% (% age 15+)","Mobile money account, income, richest 60% (% age 15+)","Mobile money account, rural (% age 15+)",data_country,data_year
0,Low income,9%,15%,15%,2%,3%,6%,11%,5%,30%,...,,,,,,,,,Low income,Low income
1,Low income,10%,16%,15%,4%,4%,7%,12%,5%,23%,...,0%,0%,0%,0%,0%,0%,1%,0%,Low income,Low income
2,Low income,15%,23%,25%,4%,7%,10%,18%,9%,31%,...,1%,0%,1%,0%,2%,0%,1%,1%,Low income,Low income
3,Lower middle income,39%,39%,46%,31%,39%,30%,45%,35%,44%,...,,,,,,,,,Lower middle income,Lower middle income
4,Lower middle income,29%,36%,36%,12%,22%,15%,38%,14%,70%,...,,,,,,,,,Lower middle income,Lower middle income


No country or date data. Also not clear which column to use.

In [35]:
# dcol = 'Percentage of total trade in services'
# indicol = 'Category Label'
# cname = 'Economy Label'

# # filter most recent year and imports
# df = df[(df.Year==2019)&(df['Flow Label']=='Imports')]

# # create the standard columns
# df['higher_is_better'] = True
# df['Indicator'] =  df[indicol]+' '+ df['Flow Label']
# df['Country Name'] = df[cname]
# # df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
# df['data_col'] = df[dcol]


# # convert 1-100 %  into 1-6
# df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=0,old_max=100))

# df.sort_values(by='new_rank_score', ascending=False)

# # prepare output
# df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# # output scores
# df.to_csv('../indicator_scores/{}_scores.csv'.format(bf+'_imports'), index=False)

## 4. % of population with ID


In [76]:
# Fourth Foundations indicator: resolve its processed file name and load the CSV.
indicator = indicators[3]
print(indicator)
file_lookup = bnames.loc[bnames['Indicator'] == indicator, 'Filename']
bf = file_lookup.values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

# The usable column labels sit in the first data row: promote that row to the
# header, then drop it from the body.
header_row = df.iloc[0]
df.columns = header_row
df = df.iloc[1:]
df.head(20)

% of population with ID
id4d_nid


Unnamed: 0,id,NaN,Region,Mandatory NID Age,Type,RPB (Registered Population Above Cut-off Age),RPB Male,RPB Rural,RPB 0-30,NaN.1,NaN.2
1,1,,SAS,0,Voter,20845988,13549892.0,,,,
2,2,,ECS,16,Direct,4455772,2267673.0,,1863436.0,,
3,3,,MEA,18,Voter,23251503,,,,,
4,4,,ECS,-,Voter,24512,,,,,
5,5,,SSF,10,Voter,9317294,,,,,
6,6,,LCN,-,Voter,51258,,,,,
7,7,,LCN,0,Voter,33454411,,,,,
8,8,,ECS,16,Voter,2588590,,,,,
9,9,,EAS,-,Voter,16117860,7847109.0,,3031006.0,,
10,10,,ECS,0,Voter,6400993,3093348.0,,,,


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198 entries, 1 to 198
Data columns (total 11 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   id                                             198 non-null    object 
 1   nan                                            0 non-null      float64
 2   Region                                         198 non-null    object 
 3   Mandatory NID Age                              198 non-null    object 
 4   Type                                           198 non-null    object 
 5   RPB (Registered Population Above Cut-off Age)  198 non-null    object 
 6   RPB Male                                       57 non-null     object 
 7   RPB Rural                                      14 non-null     object 
 8   RPB 0-30                                       37 non-null     object 
 9   nan                                            0 non-n

In [38]:
# dcol = 'Percentage of total merchandise trade'
# indicol = 'IctProductCategory Label'
# cname = 'Economy Label'

# # filter most recent year and exports
# df = df[(df.Year==2019)&(df['Flow Label']=='Exports')]

# # create standard columns
# df['higher_is_better'] = True
# df['Indicator'] =  df[indicol]+' '+ df['Flow Label']
# df['Country Name'] = df[cname]
# # df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
# df['data_col'] = df[dcol]


# # convert 1-100 %  into 1-6
# df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=0,old_max=100))

# df.sort_values(by='new_rank_score', ascending=False)

# # prepare output
# df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# # output scores
# df.to_csv('../indicator_scores/{}_scores.csv'.format(bf+'_exports'), index=False)

## 5. % of services that can be accessed

In [77]:
# Fifth Foundations indicator: resolve its processed file name and load the CSV.
indicator = indicators[4]
print(indicator)
file_lookup = bnames.loc[bnames['Indicator'] == indicator, 'Filename']
bf = file_lookup.values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of services that can be accessed
id4d_services


In [78]:
# The usable column labels sit in the first data row: promote that row to the
# header, then drop it from the body.
# Re-running this cell without reloading would promote yet another data row.
header_row = df.iloc[0]
df.columns = header_row
df = df.iloc[1:]
df.head(20)

Unnamed: 0,#,Economic Environment (0-30 points),Country Code,Legal System Type,Key Act / Bill,Key Act / Bill.1,Status,Status.1,Press Freedom Status,Economic Environment (0-30 points).1,Economic Environment (0-30 points).2
1,1,16,AFG,civil + customary + religious,"Law for Registration of Population Records, Ar...",,NF,,PF,16,16
2,2,17,ALB,civil,"Law on ID Cards No. 8952, as amended, Arts. 1-3",Act on the Protection of Personal Data,PF,,PF,17,17
3,3,19,DZA,civil + religious,Décret No. 67/126 du 21/07/1967 Portant Instit...,,NF,,NF,19,19
4,4,8,AND,civil,-,Law on the protection of personal data,F,,F,8,8
5,5,21,AGO,civil,"Law No. 4/09 of June 30, Arts. 19 and 20",Lei da Protecção de Dados Pessoais,NF,PF,NF,21,21
6,6,11,ATG,common,"Representation of the People Act, Registration...",Data Protection Act,F,,PF,11,11
7,7,15,ARG,civil,Ley No.17.671 de Identificación Registro y Cla...,"Personal Data Protection Law Number 25,326 (th...",F,F,PF,15,15
8,8,20,ARM,civil,"Law on Identification Cards, Art. 4",The Republic of Armenia Law HO-422-N of Octob...,PF,PF,NF,20,20
9,9,7,AUS,common,-,The Federal Privacy Act 1988 (Cth) (Privacy Ac...,F,F,F,7,7
10,10,6,AUT,civil,Security Police Act § 35a,EU Data Protection Directive 95/46/EC with the...,F,,F,6,6


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203 entries, 1 to 203
Data columns (total 11 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   #                                   198 non-null    object
 1   Economic Environment (0-30 points)  201 non-null    object
 2   Country Code                        200 non-null    object
 3   Legal System Type                   202 non-null    object
 4   Key Act / Bill                      202 non-null    object
 5   Key Act / Bill                      202 non-null    object
 6   Status                              193 non-null    object
 7   Status                              63 non-null     object
 8   Press Freedom Status                193 non-null    object
 9   Economic Environment (0-30 points)  193 non-null    object
 10  Economic Environment (0-30 points)  193 non-null    object
dtypes: object(11)
memory usage: 17.6+ KB


In [79]:
df.head()

Unnamed: 0,#,Economic Environment (0-30 points),Country Code,Legal System Type,Key Act / Bill,Key Act / Bill.1,Status,Status.1,Press Freedom Status,Economic Environment (0-30 points).1,Economic Environment (0-30 points).2
1,1,16,AFG,civil + customary + religious,"Law for Registration of Population Records, Ar...",,NF,,PF,16,16
2,2,17,ALB,civil,"Law on ID Cards No. 8952, as amended, Arts. 1-3",Act on the Protection of Personal Data,PF,,PF,17,17
3,3,19,DZA,civil + religious,Décret No. 67/126 du 21/07/1967 Portant Instit...,,NF,,NF,19,19
4,4,8,AND,civil,-,Law on the protection of personal data,F,,F,8,8
5,5,21,AGO,civil,"Law No. 4/09 of June 30, Arts. 19 and 20",Lei da Protecção de Dados Pessoais,NF,PF,NF,21,21


In [43]:
# dcol = 'Percentage of total merchandise trade'
# indicol = 'IctProductCategory Label'
# cname = 'Economy Label'

# # filter most recent year and exports
# df = df[(df.Year==2019)&(df['Flow Label']=='Imports')]

# # create standard columns
# df['higher_is_better'] = True
# df['Indicator'] =  df[indicol]+' '+ df['Flow Label']
# df['Country Name'] = df[cname]
# # df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
# df['data_col'] = df[dcol]


# # convert 1-100 %  into 1-6
# df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=0,old_max=100))

# df.sort_values(by='new_rank_score', ascending=False)

# # prepare output
# df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# # output scores
# df.to_csv('../indicator_scores/{}_scores.csv'.format(bf+'_imports'), index=False)

## 6. can ID be used for transactions



In [80]:
# Sixth Foundations indicator: resolve its processed file name and load the CSV.
indicator = indicators[5]
print(indicator)
file_lookup = bnames.loc[bnames['Indicator'] == indicator, 'Filename']
bf = file_lookup.values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

can ID be used for transactions
id4d_services


In [45]:
df.head()

Unnamed: 0,Legal Enablers to ID,Unnamed: 1,Country Characteristics,Legal System Type,Foundational Civil Identification Law,Data Protection Laws & Privacy Bills,Freedom in the World (2018),Freedom on the Net (2017),Freedom in the Press (2017),data_country,data_year
0,#,Economic Environment (0-30 points),Country Code,Legal System Type,Key Act / Bill,Key Act / Bill,Status,Status,Press Freedom Status,Economic Environment (0-30 points),Economic Environment (0-30 points)
1,1,16,AFG,civil + customary + religious,"Law for Registration of Population Records, Ar...",,NF,,PF,16,16
2,2,17,ALB,civil,"Law on ID Cards No. 8952, as amended, Arts. 1-3",Act on the Protection of Personal Data,PF,,PF,17,17
3,3,19,DZA,civil + religious,Décret No. 67/126 du 21/07/1967 Portant Instit...,,NF,,NF,19,19
4,4,8,AND,civil,-,Law on the protection of personal data,F,,F,8,8


In [46]:
# min_rank = 1
# max_rank = df['Country Name'].nunique()

# # transform 1-24 rank into 1-6
# df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# # need to invert score since higher rank is not better 
# # df['new_rank_score'] = (6-df['new_rank_score'])+1

In [47]:
# # prepare output
# df.sort_values(by='new_rank_score', ascending=False)

# # prepare output
# df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# # output scores
# df.to_csv('../indicator_scores/{}_scores.csv'.format(bf), index=False)

## 7. Is personal data siloed


In [82]:
# Seventh Foundations indicator: resolve its processed file name and load the CSV.
indicator = indicators[6]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

# Keep only the leading rows; the remaining rows do not contain any useful information.
# NOTE(review): this slice keeps 205 rows (positions 0-204), whereas the note that
# accompanied it spoke of 206 rows — confirm which count is intended.
df = df.iloc[0:205,:]

# Drop the superfluous rows (no value in the '#' column). Take a copy so the
# column assignment below writes to an independent frame instead of a slice,
# which would trigger pandas' SettingWithCopyWarning.
df = df[df['#'].notna()].copy()

# DPL must be numeric. pd.to_numeric(errors='coerce') turns unparseable entries
# into NaN, whereas astype(float, errors='ignore') hands back the column
# unconverted as soon as a single value fails to parse.
df['DPL'] = pd.to_numeric(df['DPL'], errors='coerce').astype(float)


Is personal data siloed
Egov_strategy


In [86]:
df.head(104)

Unnamed: 0,#,Flag,Code,Cnum,Economy,Level,Population,GNI,GNIPC,e-Government,...,NGTI-1,NGTI-2,NGTI-3,NGTI-4,GTI-1,GTI-2,GTI-3,GTI-4,data_country,data_year
0,1,,AFG,4.0,Afghanistan,LIC,38928,20726,540,https://mcit.gov.af/node/6938,...,0.69,0.52,0.31,0.62,-0.09,-0.34,0.02,-0.12,,
1,2,,ALB,8.0,Albania,UMIC,2878,14949,5240,https://e-albania.al/,...,0.81,0.78,0.60,0.77,0.29,0.62,0.74,0.26,,
2,3,,DZA,12.0,Algeria,LMIC,43851,170722,3970,https://www.mpttn.gov.dz/ar/content/%D8%A7%D9%...,...,0.73,0.46,0.02,0.60,0.06,-0.56,-0.68,-0.15,,
3,4,,ADO,20.0,Andorra,HIC,77,3154,40886,http://www.govern.ad,...,0.63,0.60,0.05,0.13,-0.28,-0.06,-0.62,-1.32,,
4,5,,AGO,24.0,Angola,LMIC,32866,97005,3050,http://www.governo.gov.ao,...,0.68,0.69,0.21,0.61,-0.12,0.27,-0.23,-0.13,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,98,,LSO,426.0,Lesotho,LMIC,2142,2901,1360,http://www.gov.ls,...,,,,,,,,,,
104,99,,LBR,430.0,Liberia,LIC,5058,2852,580,http://www.emansion.gov.lr,...,0.69,0.42,0.13,0.50,-0.08,-0.69,-0.42,-0.39,,
105,100,,LBY,434.0,Libya,UMIC,6871,51757,7640,http://www.pm.gov.ly,...,0.26,0.31,0.01,0.50,-1.49,-1.09,-0.71,-0.42,,
106,101,,LIE,438.0,Liechtenstein,HIC,38,4160,116430,http://regierung.li,...,0.30,0.48,0.07,0.87,-1.35,-0.49,-0.57,0.51,,


In [50]:
# Shared indicator columns. These two are added before the positional lookup
# below, so they count towards the column positions that .iloc sees.
df['higher_is_better'] = True
df['Indicator'] = indicator
# NOTE(review): the data column is selected by position (index 205), not by
# name — confirm which column that is in the loaded file.
df['data_col'] = df.iloc[:, 205]
df['Country Name'] = df['Economy']
df['Year'] = 2020

# Observed bounds of the selected column; they define the source scale.
min_rank, max_rank = df['data_col'].min(), df['data_col'].max()

# Min-max rescale from the observed range onto the 1-6 scale
# (convert_rank's default new_min / new_max).
df['new_rank_score'] = df['data_col'].apply(convert_rank, old_min=min_rank, old_max=max_rank)

In [81]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(107)

KeyError: "None of [Index(['Country Name', 'Year', 'Indicator', 'data_col', 'new_rank_score',\n       'higher_is_better'],\n      dtype='object')] are in the [columns]"

## 8. Open data index

In [52]:
bnames

Unnamed: 0,Indicator,check,Data Source,Index,Filename
148,Digital payments penetration,Foundations,Portulans Institute,True,digital_payments_penetration
149,% of population with digital finance account -...,Foundations,World Bank,False,population_digital_financial_services
150,% of population with digital finance account -...,Foundations,World Bank,False,population_digital_financial_services
154,% of population with ID,Foundations,World Bank,False,id4d_nid
155,% of services that can be accessed,Foundations,World Bank,False,id4d_services
156,can ID be used for transactions,Foundations,World Bank,False,id4d_services
157,Is personal data siloed,Foundations,World Bank,False,Egov_strategy
158,Open data index,Foundations,Open Knowledge Foundation,True,open_data_idx


In [53]:
# Eighth Foundations indicator: resolve its processed file name and load the CSV.
indicator = indicators[7]
print(indicator)
file_lookup = bnames.loc[bnames['Indicator'] == indicator, 'Filename']
bf = file_lookup.values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Open data index
open_data_idx


In [54]:
df.head(15)

Unnamed: 0,id,site,name,slug,region,continent,rank,score
0,tw,global,Taiwan,taiwan,,,1,90
1,au,global,Australia,australia,,,2,79
2,gb,global,Great Britain,united_kingdom,,,2,79
3,fr,global,France,france,,,4,70
4,fi,global,Finland,finland,,,5,69
5,ca,global,Canada,canada,,,5,69
6,no,global,Norway,norway,,,5,69
7,br,global,Brazil,brazil,,,8,68
8,nz,global,New Zealand,new_zealand,,,8,68
9,nir,global,Northern Ireland,northern_ireland,,,10,67


In [55]:
# Shared indicator columns, taken from the index's `score` and `name` fields.
# NOTE(review): no cell in this section writes a foundations_*_scores.csv file,
# so these scores are not picked up by the aggregation step further down —
# confirm whether an export is missing.
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['score']
df['Country Name'] = df['name']
df['Year'] = 2016

# Observed bounds of the score; they define the source scale.
min_rank, max_rank = df['data_col'].min(), df['data_col'].max()

# Min-max rescale the score from its observed range onto the 1-6 scale
# (convert_rank's default new_min / new_max).
df['new_rank_score'] = df['data_col'].apply(convert_rank, old_min=min_rank, old_max=max_rank)

In [56]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Taiwan,2016,Open data index,90,6.0,True
1,Australia,2016,Open data index,79,5.382022,True
2,Great Britain,2016,Open data index,79,5.382022,True
3,France,2016,Open data index,70,4.876404,True
4,Finland,2016,Open data index,69,4.820225,True
5,Canada,2016,Open data index,69,4.820225,True
6,Norway,2016,Open data index,69,4.820225,True
7,Brazil,2016,Open data index,68,4.764045,True
8,New Zealand,2016,Open data index,68,4.764045,True
9,Northern Ireland,2016,Open data index,67,4.707865,True


### Score Aggregation

In [57]:
import os


In [58]:
# List the per-indicator score files belonging to the Foundations pillar.
# os.listdir returns entries in arbitrary order, so sort them to make the
# concatenation order reproducible, and keep only CSV files so that stray
# entries sharing the prefix are not handed to pd.read_csv.
scores = sorted(
    s for s in os.listdir('../indicator_scores/')
    if s.startswith('foundations') and s.endswith('.csv')
)

In [59]:
scores

['foundations_digital_payments_penetration_scores.csv']

In [60]:
# Read every listed score file and stack the rows into one table.
score_frames = []
for score_file in scores:
    score_frames.append(pd.read_csv('../indicator_scores/{}'.format(score_file)))
df = pd.concat(score_frames)

In [61]:
df

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better
0,Norway,Digital payments penetration,100.00,6.0000,True
1,Denmark,Digital payments penetration,97.24,5.8620,True
2,Finland,Digital payments penetration,93.95,5.6975,True
3,Sweden,Digital payments penetration,93.08,5.6540,True
4,Netherlands,Digital payments penetration,89.01,5.4505,True
...,...,...,...,...,...
129,Jamaica,Digital payments penetration,,,True
130,Madagascar,Digital payments penetration,,,True
131,Oman,Digital payments penetration,,,True
132,Qatar,Digital payments penetration,,,True


In [62]:
# Data cleaning: missing scores become 0, rows are ordered by country name,
# and the index is rebuilt from 0.
# NOTE(review): 0 lies below the 1-6 range produced by convert_rank's defaults,
# so filled rows pull down the per-country mean computed later — confirm this
# penalty is intended.
df['new_rank_score'] = df['new_rank_score'].fillna(0)
df = df.sort_values(by=['Country Name'], ascending=True).reset_index(drop=True)

In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country Name      134 non-null    object 
 1   Indicator         134 non-null    object 
 2   data_col          122 non-null    float64
 3   new_rank_score    134 non-null    float64
 4   higher_is_better  134 non-null    bool   
dtypes: bool(1), float64(2), object(2)
memory usage: 4.4+ KB


In [64]:
df.head(15)

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better
0,Albania,Digital payments penetration,8.62,1.431,True
1,Algeria,Digital payments penetration,4.16,1.208,True
2,Angola,Digital payments penetration,,0.0,True
3,Argentina,Digital payments penetration,22.93,2.1465,True
4,Armenia,Digital payments penetration,17.52,1.876,True
5,Australia,Digital payments penetration,79.37,4.9685,True
6,Austria,Digital payments penetration,59.89,3.9945,True
7,Azerbaijan,Digital payments penetration,6.93,1.3465,True
8,Bahrain,Digital payments penetration,40.0,3.0,True
9,Bangladesh,Digital payments penetration,14.08,1.704,True


In [65]:
df.describe()

Unnamed: 0,data_col,new_rank_score
count,122.0,134.0
mean,36.076967,2.552757
std,24.867406,1.432577
min,0.0,0.0
25%,16.235,1.627375
50%,30.87,2.40025
75%,51.11,3.507125
max,100.0,6.0


In [66]:
# checking country names
sorted(df['Country Name'].unique().tolist())

['Albania',
 'Algeria',
 'Angola',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahrain',
 'Bangladesh',
 'Belarus',
 'Belgium',
 'Benin',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Congo, Dem. Rep.',
 'Costa Rica',
 'Croatia',
 'Cyprus',
 'Czech Republic',
 "Côte d'Ivoire",
 'Denmark',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Finland',
 'France',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Guatemala',
 'Guinea',
 'Honduras',
 'Hong Kong (China)',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran, Islamic Rep.',
 'Ireland',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Korea, Rep.',
 'Kuwait',
 'Kyrgyzstan',
 'Lao PDR',
 'Latvia',
 'Lebanon',
 'Lesotho',
 'Lithuania',
 'Luxembourg',
 'Madagascar

In [67]:
# remove trailing whitespaces from country name
df['Country Name'] = df['Country Name'].str.strip()


In [68]:
# checking country names
sorted(df['Country Name'].unique().tolist())

['Albania',
 'Algeria',
 'Angola',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahrain',
 'Bangladesh',
 'Belarus',
 'Belgium',
 'Benin',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Congo, Dem. Rep.',
 'Costa Rica',
 'Croatia',
 'Cyprus',
 'Czech Republic',
 "Côte d'Ivoire",
 'Denmark',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Finland',
 'France',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Guatemala',
 'Guinea',
 'Honduras',
 'Hong Kong (China)',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran, Islamic Rep.',
 'Ireland',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Korea, Rep.',
 'Kuwait',
 'Kyrgyzstan',
 'Lao PDR',
 'Latvia',
 'Lebanon',
 'Lesotho',
 'Lithuania',
 'Luxembourg',
 'Madagascar

In [69]:
# Per country: the mean rescaled score, and the number of rows with a non-null
# raw value (data_col).
per_country = df.groupby(['Country Name'])
agg_df = per_country.agg({'new_rank_score': 'mean', 'data_col': 'count'})

In [70]:
agg_df.columns = ['agg_score', 'count_source' ]

In [71]:
# Largest number of non-null sources recorded for any single country.
# Read the maximum directly rather than building a full describe() summary
# table just to pick one statistic out of it.
max_number_sources = agg_df['count_source'].max()

In [72]:
agg_df['agg_score_wt'] = agg_df['agg_score']*(agg_df['count_source']/max_number_sources)

In [73]:
agg_df.sort_values(by='agg_score', ascending=False, inplace=True)

In [74]:
agg_df.head(25)

Unnamed: 0_level_0,agg_score,count_source,agg_score_wt
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Norway,6.0,1,6.0
Denmark,5.862,1,5.862
Finland,5.6975,1,5.6975
Sweden,5.654,1,5.654
Netherlands,5.4505,1,5.4505
New Zealand,5.3365,1,5.3365
United States,5.22,1,5.22
Estonia,5.141,1,5.141
"Korea, Rep.",5.132,1,5.132
Canada,5.0765,1,5.0765


In [75]:
agg_df.to_csv('../pillar_scores/foundation_scores_v0.csv')