In [2]:
import pandas as pd
import numpy as np



### Get all the pillar names from the excel

In [3]:
ls ../..

README.md
UNDP Digital Assessment Data Framework Filename Matching V7.xlsx
[34mdashboard[m[m/
[34mdata[m[m/
data_manifest.csv
data_manifest_instructions.md
process_manifest.js
process_raw_data.js
[34mprocessed[m[m/
[34mscore[m[m/
[34msources[m[m/
undp-diagram.png


In [4]:
names = pd.read_excel('../../UNDP Digital Assessment Data Framework Filename Matching V7.xlsx')

In [5]:
col_names = ['Indicator','check', 'Data Source','Index','Filename']

In [6]:
names = names[col_names]

In [7]:
names.head()

Unnamed: 0,Indicator,check,Data Source,Index,Filename
0,Countries,,United Nations,False,list_of_countries
1,"Database of Global Administrative Areas (GADM,...",,GADM maps and data,False,
2,High Resolution Population Density Maps + Demo...,,Facebook,False,
3,population density vs openstreetmap object den...,,Kontur,False,
4,Population Density,Infrastructure,World Bank,False,population_density


In [8]:
# get all the files per pillar
data_stats = names.groupby('check').agg({'Filename':'count','Indicator':'count'})

In [9]:
data_stats

Unnamed: 0_level_0,Filename,Indicator
check,Unnamed: 1_level_1,Unnamed: 2_level_1
Business,20,25
Foundations,9,12
Government,12,15
Infrastructure,46,48
People,38,46
Regulation,6,7
Strategy,1,1


### Foundations

In [11]:
bnames = names[(names.check=='Government')&(~names.Filename.isna())]#&(names.Index==False)]

In [12]:
bnames.head(25)

Unnamed: 0,Indicator,check,Data Source,Index,Filename
53,Online-Service-Index (OSI),Government,DESA,True,e_government_index
54,E-Participation index,Government,DESA,True,e_government_index
55,Use of public services online (% of services o...,Government,Boston Consulting Group/SalesForce,False,digital_public_service_use
56,What is the % change of government digitizing ...,Government,World Bank,True,Egov_strategy
57,Security incidents (# of relevant issues),Government,SPECOPS/CSIS,False,cyber_attacks
58,ICT investment as a percentage of GDP,Government,OECD,False,ICT_Investment
59,R&D spending (% of GDP),Government,World Bank,False,RD_Percentage_GDP
60,Evidence of digital strategies in/across Minst...,Government,World Bank,False,Egov_strategy
62,Evidence of focus on vulnerable groups,Government,World Bank,False,Egov_strategy
65,% of digital skills certifications / training ...,Government,Coursera,False,digital_skill_level


In [13]:
# get list of names for all indicators
indicators = bnames.Indicator.unique()

In [14]:
# get all file names
bfiles = bnames.Filename.unique()

In [15]:
bfiles

array(['e_government_index', 'digital_public_service_use',
       'Egov_strategy', 'cyber_attacks', 'ICT_Investment',
       'RD_Percentage_GDP', 'digital_skill_level',
       'gov_encrypted_traffic', '%_countries_gov_encrypted_webtraffic'],
      dtype=object)

In [16]:
# formula for converting scale
def convert_rank(old_value, old_min=1, old_max=7, new_min=1, new_max=6 ):
    """ Convert old scale values scale into new scale values"""
    old_range = old_max - old_min
    new_range = new_max - new_min
    new_value = (((old_value-old_min)*new_range)/old_range)+new_min
    return new_value

### 1. Digital payments penetration

In [17]:
indicators[0]

'Online-Service-Index (OSI)'

In [18]:
# load data
indicator = indicators[0]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Online-Service-Index (OSI)
e_government_index


In [19]:
df.head()

Unnamed: 0,Survey Year,Country Name,E-Government Rank,E-Government Index,E-Participation Index,Online Service Index,Human Capital Index,Telecommunication Infrastructure Index
0,2020,Iraq,143,0.436,0.3095,0.3353,0.4358,0.537
1,2020,Ireland,27,0.8433,0.8571,0.7706,0.9494,0.81
2,2020,Israel,30,0.8361,0.7143,0.7471,0.8924,0.8689
3,2020,Italy,37,0.8231,0.8214,0.8294,0.8466,0.7932
4,2020,Jamaica,114,0.5392,0.369,0.3882,0.7142,0.5151


In [22]:
# all data from 2020
df['Survey Year'].value_counts()

2020    193
Name: Survey Year, dtype: int64

In [20]:
# score looks like the one to use
df.describe()

Unnamed: 0,Survey Year,E-Government Rank,E-Government Index,E-Participation Index,Online Service Index,Human Capital Index,Telecommunication Infrastructure Index
count,193.0,193.0,193.0,193.0,193.0,193.0,193.0
mean,2020.0,97.0,0.598767,0.567723,0.561961,0.687992,0.546354
std,0.0,55.858452,0.214869,0.259592,0.249874,0.19444,0.259358
min,2020.0,1.0,0.0875,0.0,0.0,0.0,0.0
25%,2020.0,49.0,0.432,0.3571,0.3529,0.5599,0.3496
50%,2020.0,97.0,0.6129,0.5714,0.5765,0.7395,0.5669
75%,2020.0,145.0,0.7798,0.7976,0.7647,0.8414,0.7723
max,2020.0,193.0,0.9758,1.0,1.0,1.0,1.0


In [43]:
# df.Indicator.unique()

In [23]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Online Service Index'] 


min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))



In [25]:

# output scores to csv
df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/government_{}_scores.csv'.format(bf), index=False)

In [26]:
df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(15)


Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better
0,Iraq,Online-Service-Index (OSI),0.3353,2.6765,True
1,Ireland,Online-Service-Index (OSI),0.7706,4.853,True
2,Israel,Online-Service-Index (OSI),0.7471,4.7355,True
3,Italy,Online-Service-Index (OSI),0.8294,5.147,True
4,Jamaica,Online-Service-Index (OSI),0.3882,2.941,True
5,Japan,Online-Service-Index (OSI),0.9059,5.5295,True
6,Jordan,Online-Service-Index (OSI),0.3588,2.794,True
7,Kazakhstan,Online-Service-Index (OSI),0.9235,5.6175,True
8,Kenya,Online-Service-Index (OSI),0.6765,4.3825,True
9,Kiribati,Online-Service-Index (OSI),0.4941,3.4705,True


## 2. E-Participation index


In [27]:
indicator = indicators[1]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

E-Participation index
e_government_index


In [29]:
df.head()

Unnamed: 0,Survey Year,Country Name,E-Government Rank,E-Government Index,E-Participation Index,Online Service Index,Human Capital Index,Telecommunication Infrastructure Index
0,2020,Iraq,143,0.436,0.3095,0.3353,0.4358,0.537
1,2020,Ireland,27,0.8433,0.8571,0.7706,0.9494,0.81
2,2020,Israel,30,0.8361,0.7143,0.7471,0.8924,0.8689
3,2020,Italy,37,0.8231,0.8214,0.8294,0.8466,0.7932
4,2020,Jamaica,114,0.5392,0.369,0.3882,0.7142,0.5151


In [30]:
df['Survey Year'].unique()

array([2020])

In [34]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['E-Participation Index'] 


min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))


# output scores to csv
df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/government_{}_scores.csv'.format(bf), index=False)

## 3. Use of public services online (% of services online, penetration, frequency of use)


In [65]:
indicator = indicators[2]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Use of public services online (% of services online, penetration, frequency of use)
digital_public_service_use


In [66]:
# drop first row
df = df.iloc[1: , :]

In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 1 to 32
Data columns (total 2 columns):
 #   Column                                                       Non-Null Count  Dtype 
---  ------                                                       --------------  ----- 
 0   Net perception that needs are being met by digital services  32 non-null     object
 1   Unnamed: 1                                                   32 non-null     object
dtypes: object(2)
memory usage: 644.0+ bytes


In [68]:
df.head()

Unnamed: 0,Net perception that needs are being met by digital services,Unnamed: 1
1,Estonia,67%
2,UAE,61%
3,Saudi Arabia,59%
4,Singapore,54%
5,China,53%


In [69]:
# clean data to remove % and convert to float
df['Unnamed: 1'] = df['Unnamed: 1'].str.replace('%','')

df['Unnamed: 1'] = df['Unnamed: 1'].astype(float)

In [70]:
# create the standard columns
df['higher_is_better'] = True
df['Indicator'] =  indicator
df['Country Name'] = df['Net perception that needs are being met by digital services']
# df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df['Unnamed: 1']

In [71]:
# convert 1-100 %  into 1-6
min_rank = df['data_col'].min()
max_rank = df['data_col'].max()
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=min_rank,old_max=max_rank))

df.sort_values(by='new_rank_score', ascending=False)

# prepare output
df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# output scores
df.to_csv('../indicator_scores/government_{}_scores.csv'.format(bf), index=False)

In [72]:
df.describe()

Unnamed: 0,data_col,new_rank_score
count,32.0,32.0
mean,33.8125,4.092672
std,20.85501,1.198564
min,-20.0,1.0
25%,21.0,3.356322
50%,39.0,4.390805
75%,49.5,4.994253
max,67.0,6.0


## 4. What is the % change of government digitizing public services?



In [73]:
indicator = indicators[3]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

What is the % change of government digitizing public services?
Egov_strategy


In [79]:
df.head()

Unnamed: 0,198,DG Systems & Services,Unnamed: 2,Country Data,Digital Government,UN e-Gov Dev Index,DG Institution,DG Strategy,Whole of Government,GovTech,...,Comparison of weights,OECD DG 2019,GTMI > Correlation of indicators (without standardization),GTI > Factor analysis (without standardization),GTMI Raw Scores >,GTMI Standardized Z-Scores>,GTC > Correlation analysis with Standardized Scores,GTF > Factor analysis with Standardized Scores,data_country,data_year
0,Group,#,GTI-4,Level,e-Government,eGov'20,DG Org URL,DG St,WoG,GT,...,GTE,Score,Code,Code,Code,Code,Code,Code,GTI-4,GTI-4
1,C,1,-0.12,LIC,https://mcit.gov.af/node/6938,169,http://mcit.gov.af/,2,0,0,...,0.442,-,AFG,AFG,AFG,AFG,AFG,AFG,-0.12,-0.12
2,A,2,0.26,UMIC,https://e-albania.al/,59,http://akshi.gov.al/,3,1,1,...,0.748,-,ALB,ALB,ALB,ALB,ALB,ALB,0.26,0.26
3,C,3,-0.15,LMIC,https://www.mpttn.gov.dz/ar/content/%D8%A7%D9%...,120,https://www.mpttn.gov.dz/,2,0,0,...,0.281,-,DZA,DZA,DZA,DZA,DZA,DZA,-0.15,-0.15
4,C,4,-1.32,HIC,http://www.govern.ad,80,http://www.govern.ad/,3,0,0,...,0.336,-,ADO,ADO,ADO,ADO,ADO,ADO,-1.32,-1.32


In [81]:
df['GTMI > Correlation of indicators (without standardization)'].unique()

array(['Code', 'AFG', 'ALB', 'DZA', 'ADO', 'AGO', 'ATG', 'ARG', 'ARM',
       'AUS', 'AUT', 'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL',
       'BLZ', 'BEN', 'BTN', 'BOL', 'BIH', 'BWA', 'BRA', 'BRN', 'BGR',
       'BFA', 'BDI', 'KHM', 'CMR', 'CAN', 'CPV', 'CAF', 'TCD', 'CHL',
       'CHN', 'COL', 'COM', 'COG', 'ZAR', 'CRI', 'CIV', 'HRV', 'CUB',
       'CYP', 'CZE', 'DNK', 'DJI', 'DMA', 'DOM', 'ECU', 'EGY', 'SLV',
       'GNQ', 'ERI', 'EST', 'ETH', 'FJI', 'FIN', 'FRA', 'GAB', 'GMB',
       'GEO', 'DEU', 'GHA', 'GRC', 'GRD', 'GTM', 'GIN', 'GNB', 'GUY',
       'HTI', 'HND', 'HKG', 'HUN', 'ISL', 'IND', 'IDN', 'IRN', 'IRQ',
       'IRL', 'ISR', 'ITA', 'JAM', 'JPN', 'JOR', 'KAZ', 'KEN', 'KIR',
       'PRK', 'KOR', 'KSV', 'KWT', 'KGZ', 'LAO', 'LVA', 'LBN', 'LSO',
       'LBR', 'LBY', 'LIE', 'LTU', 'LUX', 'MAC', 'MKD', 'MDG', 'MWI',
       'MYS', 'MDV', 'MLI', 'MLT', 'MHL', 'MRT', 'MUS', 'MEX', 'FSM',
       'MDA', 'MCO', 'MNG', 'MNE', 'MAR', 'MOZ', 'MMR', 'NAM', 'NRU',
       'NPL', 'NLD'

In [78]:
df[['GovTech Index','data_country','data_year']].head()

Unnamed: 0,GovTech Index,data_country,data_year
0,GT0,GTI-4,GTI-4
1,0.46,-0.12,-0.12
2,0.76,0.26,0.26
3,0.36,-0.15,-0.15
4,0.39,-1.32,-1.32


In [76]:
df.columns.tolist()

['198',
 'DG Systems & Services',
 'Unnamed: 2',
 'Country Data',
 'Digital Government',
 'UN e-Gov Dev Index',
 'DG Institution',
 'DG Strategy',
 'Whole of Government',
 'GovTech',
 'GovTech Products and Services',
 'Finance Ministry web site, Public Finance data, and FMIS links',
 'Financial Management Information System',
 'Treasury Single Account',
 'Tax Administration',
 'Customs Administration',
 'e-Filing',
 'e-Payment',
 'Digital Signature',
 'Public Employment',
 'HRMIS',
 'Payroll System',
 'e-Procurement',
 'Public Debt',
 'Public Investment Management Systems',
 'Aid Mgmt Systems',
 'Other Categories',
 'Data Governance Maturity',
 'Data Governance Institution, Policy & Regulations',
 'Government Cloud, Enterprise Architecture and Interoperability Platform',
 ' National CERT or CSIRT',
 'National website for citizen participation',
 'National website for citizen feedback / GRM',
 'Government responsiveness',
 'Legal System Type',
 'Data Protection & Privacy Laws',
 'Data P

In [51]:
df[['data_country', 'data_year']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   data_country  0 non-null      float64
 1   data_year     0 non-null      float64
dtypes: float64(2)
memory usage: 3.2 KB


In [52]:
# dcol = 'Percentage of total trade in services'
# indicol = 'Category Label'
# cname = 'Economy Label'

# # filter most recent year
# df = df[(df.Year==2019)&(df['Flow Label']=='Exports')]

# # create standard columns
# df['higher_is_better'] = True
# df['Indicator'] =  df[indicol]+' '+ df['Flow Label']
# df['Country Name'] = df[cname]
# # df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
# df['data_col'] = df[dcol]


# # convert 1-100 %  into 1-6
# df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=0,old_max=100))

# df.sort_values(by='new_rank_score', ascending=False)

# # prepare output
# df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# # output scores
# df.to_csv('../indicator_scores/{}_scores.csv'.format(bf+'_exports'), index=False)

## 5. Security incidents (# of relevant issues)



In [82]:
indicator = indicators[4]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Security incidents (# of relevant issues)
cyber_attacks


In [55]:
df['Country Characteristics'].unique()

array(['Region', 'SAS', 'ECS', 'MEA', 'SSF', 'LCN', 'EAS', 'NAC'],
      dtype=object)

In [89]:
df.head()

Unnamed: 0,Country,Number of Significant Cyberattacks (2006-2020),data_rank
0,United States,156,20.0
1,United Kingdom,47,19.0
2,India,23,18.0
3,Germany,21,17.0
4,South Korea,18,16.0


In [85]:
# create a rank from the number of attacks fields
df['data_rank'] = df['Number of Significant Cyberattacks (2006-2020)'].rank(method='max')

In [88]:
df.data_rank.max()

20.0

In [90]:
# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['data_rank'] 


min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 1-20 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# need to invert since higher rank is not better
df['new_rank_score'] = (6-df['new_rank_score'])+1

# output scores to csv
df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/government_{}_scores.csv'.format(bf), index=False)

In [91]:
df

Unnamed: 0,Country Name,Number of Significant Cyberattacks (2006-2020),data_rank,higher_is_better,Indicator,data_col,new_rank_score
0,United States,156,20.0,True,Security incidents (# of relevant issues),20.0,1.0
1,United Kingdom,47,19.0,True,Security incidents (# of relevant issues),19.0,1.263158
2,India,23,18.0,True,Security incidents (# of relevant issues),18.0,1.526316
3,Germany,21,17.0,True,Security incidents (# of relevant issues),17.0,1.789474
4,South Korea,18,16.0,True,Security incidents (# of relevant issues),16.0,2.052632
5,Australia,16,15.0,True,Security incidents (# of relevant issues),15.0,2.315789
6,Ukraine,16,15.0,True,Security incidents (# of relevant issues),15.0,2.315789
7,China,15,13.0,True,Security incidents (# of relevant issues),13.0,2.842105
8,Iran,15,13.0,True,Security incidents (# of relevant issues),13.0,2.842105
9,Saudi Arabia,15,13.0,True,Security incidents (# of relevant issues),13.0,2.842105


## 6. ICT investment as a percentage of GDP


In [102]:
indicator = indicators[5]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

ICT investment as a percentage of GDP
ICT_Investment


In [103]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450 entries, 0 to 449
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   LOCATION    450 non-null    object 
 1   INDICATOR   450 non-null    object 
 2   SUBJECT     450 non-null    object 
 3   MEASURE     450 non-null    object 
 4   FREQUENCY   450 non-null    object 
 5   TIME        450 non-null    int64  
 6   Value       450 non-null    float64
 7   Flag Codes  0 non-null      float64
dtypes: float64(2), int64(1), object(5)
memory usage: 28.2+ KB


In [104]:
df.head()

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,AUS,ICTINVST,TOT,PC,A,1985,12.465454,
1,AUS,ICTINVST,TOT,PC,A,1986,13.939533,
2,AUS,ICTINVST,TOT,PC,A,1987,14.142429,
3,AUS,ICTINVST,TOT,PC,A,1988,14.289993,
4,AUS,ICTINVST,TOT,PC,A,1989,15.348707,


In [105]:
df.SUBJECT.unique()

array(['TOT'], dtype=object)

In [106]:
df.TIME.max()

2010

In [107]:
df.Value.describe()

count    450.000000
mean      16.074272
std        5.636379
min        2.798417
25%       12.410442
50%       14.840726
75%       19.419433
max       32.601105
Name: Value, dtype: float64

In [108]:
dcol = 'Value'
indicol = indicator
cname = 'LOCATION'

# filter most recent year
df = df[(df.TIME==2010)]

# create standard columns
df['higher_is_better'] = True
df['Indicator'] =  indicator
df['Country Code'] = df[cname]
# # df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df[dcol]

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 1-20 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# df.sort_values(by='new_rank_score', ascending=False)

# # prepare output
df = df[['Country Code', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# # output scores
df.to_csv('../indicator_scores/government{}_scores.csv'.format(bf), index=False)

In [109]:
df

Unnamed: 0,Country Code,Indicator,data_col,new_rank_score,higher_is_better
72,CAN,ICT investment as a percentage of GDP,17.018367,2.47101,True
121,FIN,ICT investment as a percentage of GDP,15.519828,2.121241,True
172,DEU,ICT investment as a percentage of GDP,12.690394,1.460831,True
198,IRL,ICT investment as a percentage of GDP,12.412805,1.39604,True
224,ITA,ICT investment as a percentage of GDP,11.026056,1.072363,True
274,KOR,ICT investment as a percentage of GDP,10.716026,1.0,True
323,NZL,ICT investment as a percentage of GDP,21.238183,3.455945,True
349,ESP,ICT investment as a percentage of GDP,13.763415,1.711282,True
400,CHE,ICT investment as a percentage of GDP,18.506691,2.818396,True
449,USA,ICT investment as a percentage of GDP,32.137836,6.0,True


## 7. R&D spending (% of GDP)




In [101]:
indicator = indicators[6]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

R&D spending (% of GDP)
RD_Percentage_GDP


FileNotFoundError: [Errno 2] No such file or directory: '../../processed/RD_Percentage_GDP.csv'

In [63]:
df.head()

Unnamed: 0,Legal Enablers to ID,Unnamed: 1,Country Characteristics,Legal System Type,Foundational Civil Identification Law,Data Protection Laws & Privacy Bills,Freedom in the World (2018),Freedom on the Net (2017),Freedom in the Press (2017),data_country,data_year
0,#,Economic Environment (0-30 points),Country Code,Legal System Type,Key Act / Bill,Key Act / Bill,Status,Status,Press Freedom Status,Economic Environment (0-30 points),Economic Environment (0-30 points)
1,1,16,AFG,civil + customary + religious,"Law for Registration of Population Records, Ar...",,NF,,PF,16,16
2,2,17,ALB,civil,"Law on ID Cards No. 8952, as amended, Arts. 1-3",Act on the Protection of Personal Data,PF,,PF,17,17
3,3,19,DZA,civil + religious,Décret No. 67/126 du 21/07/1967 Portant Instit...,,NF,,NF,19,19
4,4,8,AND,civil,-,Law on the protection of personal data,F,,F,8,8


In [66]:
# min_rank = 1
# max_rank = df['Country Name'].nunique()

# # transform 1-24 rank into 1-6
# df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# # need to invert score since higher rank is not better 
# # df['new_rank_score'] = (6-df['new_rank_score'])+1

In [67]:
# # prepare output
# df.sort_values(by='new_rank_score', ascending=False)

# # prepare output
# df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# # output scores
# df.to_csv('../indicator_scores/{}_scores.csv'.format(bf), index=False)

## 8. Evidence of digital strategies in/across Minstries



In [110]:
indicator = indicators[7]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Evidence of digital strategies in/across Minstries
Egov_strategy


In [111]:
df.head()

Unnamed: 0,198,DG Systems & Services,Unnamed: 2,Country Data,Digital Government,UN e-Gov Dev Index,DG Institution,DG Strategy,Whole of Government,GovTech,...,Comparison of weights,OECD DG 2019,GTMI > Correlation of indicators (without standardization),GTI > Factor analysis (without standardization),GTMI Raw Scores >,GTMI Standardized Z-Scores>,GTC > Correlation analysis with Standardized Scores,GTF > Factor analysis with Standardized Scores,data_country,data_year
0,Group,#,GTI-4,Level,e-Government,eGov'20,DG Org URL,DG St,WoG,GT,...,GTE,Score,Code,Code,Code,Code,Code,Code,GTI-4,GTI-4
1,C,1,-0.12,LIC,https://mcit.gov.af/node/6938,169,http://mcit.gov.af/,2,0,0,...,0.442,-,AFG,AFG,AFG,AFG,AFG,AFG,-0.12,-0.12
2,A,2,0.26,UMIC,https://e-albania.al/,59,http://akshi.gov.al/,3,1,1,...,0.748,-,ALB,ALB,ALB,ALB,ALB,ALB,0.26,0.26
3,C,3,-0.15,LMIC,https://www.mpttn.gov.dz/ar/content/%D8%A7%D9%...,120,https://www.mpttn.gov.dz/,2,0,0,...,0.281,-,DZA,DZA,DZA,DZA,DZA,DZA,-0.15,-0.15
4,C,4,-1.32,HIC,http://www.govern.ad,80,http://www.govern.ad/,3,0,0,...,0.336,-,ADO,ADO,ADO,ADO,ADO,ADO,-1.32,-1.32


In [70]:
# # filter on relevant years
# df = df[(df['Time']==2017)&(df['Information and communication technologies']=='ICT-intensive')]

# # convert rank into 1-6 - in order to get most recent data 
# df['data_col'] = df['Value'] 

# # going to assume index is between 1-100 but not 100% sure
# min_rank = 0 #df['data_col'].min()
# max_rank = 100 #df['data_col'].max()

# # transform 1-147 rank into 1-6
# df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# # need to invert score since higher rank is not better 
# # df['new_rank_score'] = (6-df['new_rank_score'])+1

# # df_rank[['Country ISO3', 'Country Name','Indicator','data_col','new_rank_score']].head()

# df.rename(columns={'Country':'Country Name'}, inplace=True)
# df['Indicator'] = indicator
# df['higher_is_better'] = True

# df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(15)


# # output scores to csv
# df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/{}_scores.csv'.format(bf), index=False)

## 9. Evidence of focus on vulnerable groups


In [124]:
indicator = indicators[8]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Evidence of focus on vulnerable groups
Egov_strategy


In [125]:
df.head()

Unnamed: 0,198,DG Systems & Services,Unnamed: 2,Country Data,Digital Government,UN e-Gov Dev Index,DG Institution,DG Strategy,Whole of Government,GovTech,...,Comparison of weights,OECD DG 2019,GTMI > Correlation of indicators (without standardization),GTI > Factor analysis (without standardization),GTMI Raw Scores >,GTMI Standardized Z-Scores>,GTC > Correlation analysis with Standardized Scores,GTF > Factor analysis with Standardized Scores,data_country,data_year
0,Group,#,GTI-4,Level,e-Government,eGov'20,DG Org URL,DG St,WoG,GT,...,GTE,Score,Code,Code,Code,Code,Code,Code,GTI-4,GTI-4
1,C,1,-0.12,LIC,https://mcit.gov.af/node/6938,169,http://mcit.gov.af/,2,0,0,...,0.442,-,AFG,AFG,AFG,AFG,AFG,AFG,-0.12,-0.12
2,A,2,0.26,UMIC,https://e-albania.al/,59,http://akshi.gov.al/,3,1,1,...,0.748,-,ALB,ALB,ALB,ALB,ALB,ALB,0.26,0.26
3,C,3,-0.15,LMIC,https://www.mpttn.gov.dz/ar/content/%D8%A7%D9%...,120,https://www.mpttn.gov.dz/,2,0,0,...,0.281,-,DZA,DZA,DZA,DZA,DZA,DZA,-0.15,-0.15
4,C,4,-1.32,HIC,http://www.govern.ad,80,http://www.govern.ad/,3,0,0,...,0.336,-,ADO,ADO,ADO,ADO,ADO,ADO,-1.32,-1.32


## 10. % of digital skills certifications / training courses completed


In [127]:
indicator = indicators[9]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of digital skills certifications / training courses completed
digital_skill_level


In [128]:
df

Unnamed: 0,Global Rank,Region,Country and Region,Technology,Data Science
0,1,Europe,Switzerland,84%,96%
1,2,Europe,Luxembourg,62%,85%
2,3,Europe,Austria,88%,95%
3,4,Asia Pacific,Japan,100%,88%
4,5,Europe,Germany,89%,94%
...,...,...,...,...,...
103,104,Asia Pacific,Uzbekistan,6%,9%
104,105,Sub-Saharan Africa,Sierra Leone,2%,4%
105,106,Latin America and the Caribbean,Paraguay,7%,11%
106,107,Latin America and the Caribbean,Guyana,8%,2%


In [129]:
dcol = 'Global Rank'
indicol = indicator
cname = 'Country and Region'

# filter most recent year
# df = df[(df.TIME==2010)]

# create standard columns
df['higher_is_better'] = True
df['Indicator'] =  indicator
df['Country Name'] = df[cname]
# # df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df[dcol]

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 1-20 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# # need to invert score since higher rank is not better 
df['new_rank_score'] = (6-df['new_rank_score'])+1

df.sort_values(by='new_rank_score', ascending=False)

# # prepare output
df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# # output scores
df.to_csv('../indicator_scores/government{}_scores.csv'.format(bf), index=False)

In [130]:
df

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better
0,Switzerland,% of digital skills certifications / training ...,1,6.000000,True
1,Luxembourg,% of digital skills certifications / training ...,2,5.953271,True
2,Austria,% of digital skills certifications / training ...,3,5.906542,True
3,Japan,% of digital skills certifications / training ...,4,5.859813,True
4,Germany,% of digital skills certifications / training ...,5,5.813084,True
...,...,...,...,...,...
103,Uzbekistan,% of digital skills certifications / training ...,104,1.186916,True
104,Sierra Leone,% of digital skills certifications / training ...,105,1.140187,True
105,Paraguay,% of digital skills certifications / training ...,106,1.093458,True
106,Guyana,% of digital skills certifications / training ...,107,1.046729,True


## 11. % of government encrypted web traffic


In [117]:
indicator = indicators[10]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of government encrypted web traffic
gov_encrypted_traffic


  exec(code_obj, self.user_global_ns, self.user_ns)


In [121]:
df.columns

Index(['0', '1', '2', '3', '7', '29', '31', '53', '66', '82', '132', '164',
       '1390', '1421927414', '59.166.0.0', '149.171.126.6', 'udp', 'CON',
       '0.001055', 'dns', '500473.9375', '621800.9375', '0.017', '0.013',
       'Unnamed: 24'],
      dtype='object')

In [118]:
df.head()

Unnamed: 0,0,1,2,3,7,29,31,53,66,82,...,149.171.126.6,udp,CON,0.001055,dns,500473.9375,621800.9375,0.017,0.013,Unnamed: 24
0,0,2,4,3,4,29,31,1024,132,76,...,149.171.126.9,udp,CON,0.036133,-,87676.08594,50480.17188,7.005,7.564333,
1,0,1,2,2,8,29,31,53,73,89,...,149.171.126.7,udp,CON,0.001119,dns,521894.5313,636282.375,0.017,0.013,
2,0,1,2,1,9,29,31,53,66,82,...,149.171.126.5,udp,CON,0.001209,dns,436724.5625,542597.1875,0.043,0.014,
3,0,1,2,1,9,29,31,53,73,89,...,149.171.126.0,udp,CON,0.001169,dns,499572.25,609067.5625,0.005,0.003,
4,0,2,4,3,4,29,31,111,142,78,...,149.171.126.9,udp,CON,0.078339,-,43503.23438,23896.14258,21.003,24.315,


In [120]:
bnames

Unnamed: 0,Indicator,check,Data Source,Index,Filename
53,Online-Service-Index (OSI),Government,DESA,True,e_government_index
54,E-Participation index,Government,DESA,True,e_government_index
55,Use of public services online (% of services o...,Government,Boston Consulting Group/SalesForce,False,digital_public_service_use
56,What is the % change of government digitizing ...,Government,World Bank,True,Egov_strategy
57,Security incidents (# of relevant issues),Government,SPECOPS/CSIS,False,cyber_attacks
58,ICT investment as a percentage of GDP,Government,OECD,False,ICT_Investment
59,R&D spending (% of GDP),Government,World Bank,False,RD_Percentage_GDP
60,Evidence of digital strategies in/across Minst...,Government,World Bank,False,Egov_strategy
62,Evidence of focus on vulnerable groups,Government,World Bank,False,Egov_strategy
65,% of digital skills certifications / training ...,Government,Coursera,False,digital_skill_level


## 12. % of country encrypted web traffic

In [119]:
indicator = indicators[11]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of country encrypted web traffic
%_countries_gov_encrypted_webtraffic


FileNotFoundError: [Errno 2] No such file or directory: '../../processed/%_countries_gov_encrypted_webtraffic.csv'

### Score Aggregating

In [131]:
import os


In [132]:
# get list of files in scores folder
scores = os.listdir('../indicator_scores/')
scores = [s for s in scores if s.startswith('government')]

In [133]:
scores

['government_cyber_attacks_scores.csv',
 'government_digital_public_service_use_scores.csv',
 'government_e_government_index_scores.csv',
 'governmentdigital_skill_level_scores.csv',
 'governmentICT_Investment_scores.csv']

In [134]:
# create a dataframe that concatenates all these file into one table
df = pd.concat([pd.read_csv('../indicator_scores/{}'.format(s)) for s in scores])    

In [135]:
df

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better,Country Code
0,United States,Security incidents (# of relevant issues),20.000000,1.000000,True,
1,United Kingdom,Security incidents (# of relevant issues),19.000000,1.263158,True,
2,India,Security incidents (# of relevant issues),18.000000,1.526316,True,
3,Germany,Security incidents (# of relevant issues),17.000000,1.789474,True,
4,South Korea,Security incidents (# of relevant issues),16.000000,2.052632,True,
...,...,...,...,...,...,...
5,,ICT investment as a percentage of GDP,10.716026,1.000000,True,KOR
6,,ICT investment as a percentage of GDP,21.238183,3.455945,True,NZL
7,,ICT investment as a percentage of GDP,13.763415,1.711282,True,ESP
8,,ICT investment as a percentage of GDP,18.506691,2.818396,True,CHE


In [136]:
# Data cleaning
df['new_rank_score'] = df['new_rank_score'].fillna(0)
df.sort_values(by=['Country Name'], ascending=True, inplace=True)
df.reset_index(drop=True, inplace=True)

In [137]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 363 entries, 0 to 362
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country Name      353 non-null    object 
 1   Indicator         363 non-null    object 
 2   data_col          363 non-null    float64
 3   new_rank_score    363 non-null    float64
 4   higher_is_better  363 non-null    bool   
 5   Country Code      10 non-null     object 
dtypes: bool(1), float64(2), object(3)
memory usage: 14.7+ KB


In [138]:
df.head(15)

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better,Country Code
0,Afghanistan,E-Participation index,0.4643,3.3215,True,
1,Albania,E-Participation index,0.8452,5.226,True,
2,Algeria,% of digital skills certifications / training ...,95.0,1.607477,True,
3,Algeria,E-Participation index,0.1548,1.774,True,
4,Andorra,E-Participation index,0.5119,3.5595,True,
5,Angola,E-Participation index,0.4524,3.262,True,
6,Antigua and Barbuda,E-Participation index,0.4881,3.4405,True,
7,Argentina,% of digital skills certifications / training ...,82.0,2.214953,True,
8,Argentina,Use of public services online (% of services o...,15.0,3.011494,True,
9,Argentina,E-Participation index,0.8571,5.2855,True,


In [139]:
df.describe()

Unnamed: 0,data_col,new_rank_score
count,363.0,363.0
mean,20.547108,3.696272
std,30.134795,1.386564
min,-20.0,1.0
25%,0.51785,2.621257
50%,0.9524,3.71028
75%,34.5,4.873752
max,108.0,6.0


In [141]:
# checking country names
# sorted(df['Country Name'].unique().tolist())

In [142]:
# remove trailing whitespaces from country name
df['Country Name'] = df['Country Name'].str.strip()


In [146]:
df.head()

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better,Country Code
0,Afghanistan,E-Participation index,0.4643,3.3215,True,
1,Albania,E-Participation index,0.8452,5.226,True,
2,Algeria,% of digital skills certifications / training ...,95.0,1.607477,True,
3,Algeria,E-Participation index,0.1548,1.774,True,
4,Andorra,E-Participation index,0.5119,3.5595,True,


In [144]:
# checking country names
sorted(df['Country Name'].unique().tolist())

TypeError: '<' not supported between instances of 'float' and 'str'

In [89]:
# average indicator scores per country
agg_df = df.groupby(['Country Name']).agg({'new_rank_score':'mean','data_col':'count'})

In [90]:
agg_df.columns = ['agg_score', 'count_source' ]

In [91]:
max_number_sources = agg_df.describe()['count_source']['max']

In [92]:
agg_df['agg_score_wt'] = agg_df['agg_score']*(agg_df['count_source']/max_number_sources)

In [93]:
agg_df.sort_values(by='agg_score', ascending=False, inplace=True)

In [94]:
agg_df.head(25)

Unnamed: 0_level_0,agg_score,count_source,agg_score_wt
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Norway,6.0,1,6.0
Denmark,5.862,1,5.862
Finland,5.6975,1,5.6975
Sweden,5.654,1,5.654
Netherlands,5.4505,1,5.4505
New Zealand,5.3365,1,5.3365
United States,5.22,1,5.22
Estonia,5.141,1,5.141
"Korea, Rep.",5.132,1,5.132
Canada,5.0765,1,5.0765


In [95]:
agg_df.to_csv('../pillar_scores/foundation_scores_v0.csv')