In [1]:
import pandas as pd
import numpy as np



### Get all the pillar names from the excel

In [2]:
names = pd.read_excel('../../UNDP Digital Assessment Data Framework Filename Matching V7.xlsx')

In [3]:
col_names = ['Indicator','check', 'Data Source','Index','Filename']

In [4]:
names = names[col_names]

In [5]:
names.head()

Unnamed: 0,Indicator,check,Data Source,Index,Filename
0,Countries,,United Nations,False,Countries
1,"Database of Global Administrative Areas (GADM,...",,GADM maps and data,False,
2,High Resolution Population Density Maps + Demo...,,Facebook,False,
3,population density vs openstreetmap object den...,,Kontur,False,
4,Population Density,Infrastructure,World Bank,False,population_density


In [6]:
# get all the files per pillar
data_stats = names.groupby('check').agg({'Filename':'count','Indicator':'count'})

In [7]:
data_stats

Unnamed: 0_level_0,Filename,Indicator
check,Unnamed: 1_level_1,Unnamed: 2_level_1
Business,20,25
Foundations,7,12
Government,9,15
Infrastructure,39,48
People,35,47
Regulation,6,7
Strategy,1,1


### Foundations

In [8]:
bnames = names[(names.check=='Government')&(~names.Filename.isna())]#&(names.Index==False)]

In [9]:
bnames.head(25)

Unnamed: 0,Indicator,check,Data Source,Index,Filename
53,Online-Service-Index (OSI),Government,DESA,True,e_government_index
54,E-Participation index,Government,DESA,True,e_government_index
55,Use of public services online (% of services o...,Government,Boston Consulting Group/SalesForce,False,digital_public_service_use
56,What is the % change of government digitizing ...,Government,World Bank,True,Egov_strategy
57,Security incidents (# of relevant issues),Government,SPECOPS/CSIS,False,cyber_attacks
58,ICT investment as a percentage of GDP,Government,OECD,False,ICT_Investment
60,Evidence of digital strategies in/across Minst...,Government,World Bank,False,Egov_strategy
62,Evidence of focus on vulnerable groups,Government,World Bank,False,Egov_strategy
65,% of digital skills certifications / training ...,Government,Coursera,False,digital_skill_level


In [10]:
# get list of names for all indicators
indicators = bnames.Indicator.unique()

In [11]:
# get all file names
bfiles = bnames.Filename.unique()

In [12]:
bfiles

array(['e_government_index', 'digital_public_service_use',
       'Egov_strategy', 'cyber_attacks', 'ICT_Investment',
       'digital_skill_level'], dtype=object)

In [13]:
# formula for converting scale
def convert_rank(old_value, old_min=1, old_max=7, new_min=1, new_max=6 ):
    """ Convert old scale values scale into new scale values"""
    old_range = old_max - old_min
    new_range = new_max - new_min
    new_value = (((old_value-old_min)*new_range)/old_range)+new_min
    return new_value

### 1. Online-Service-Index (OSI)

In [14]:
indicators[0]

'Online-Service-Index (OSI)'

In [15]:
# load data
indicator = indicators[0]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Online-Service-Index (OSI)
e_government_index


In [16]:
df.head()

Unnamed: 0,Survey Year,Country Name,E-Government Rank,E-Government Index,E-Participation Index,Online Service Index,Human Capital Index,Telecommunication Infrastructure Index
0,2020,Iraq,143,0.436,0.3095,0.3353,0.4358,0.537
1,2020,Ireland,27,0.8433,0.8571,0.7706,0.9494,0.81
2,2020,Israel,30,0.8361,0.7143,0.7471,0.8924,0.8689
3,2020,Italy,37,0.8231,0.8214,0.8294,0.8466,0.7932
4,2020,Jamaica,114,0.5392,0.369,0.3882,0.7142,0.5151


In [17]:
# all data from 2020
df['Survey Year'].value_counts()

2020    193
Name: Survey Year, dtype: int64

In [18]:
# score looks like the one to use
df.describe()

Unnamed: 0,Survey Year,E-Government Rank,E-Government Index,E-Participation Index,Online Service Index,Human Capital Index,Telecommunication Infrastructure Index
count,193.0,193.0,193.0,193.0,193.0,193.0,193.0
mean,2020.0,97.0,0.598767,0.567723,0.561961,0.687992,0.546354
std,0.0,55.858452,0.214869,0.259592,0.249874,0.19444,0.259358
min,2020.0,1.0,0.0875,0.0,0.0,0.0,0.0
25%,2020.0,49.0,0.432,0.3571,0.3529,0.5599,0.3496
50%,2020.0,97.0,0.6129,0.5714,0.5765,0.7395,0.5669
75%,2020.0,145.0,0.7798,0.7976,0.7647,0.8414,0.7723
max,2020.0,193.0,0.9758,1.0,1.0,1.0,1.0


In [19]:
# df.Indicator.unique()

In [20]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Online Service Index'] 


min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))



In [21]:

# output scores to csv
df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/government_{}_scores.csv'.format(indicator), index=False)

In [22]:
df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(15)


Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better
0,Iraq,Online-Service-Index (OSI),0.3353,2.6765,True
1,Ireland,Online-Service-Index (OSI),0.7706,4.853,True
2,Israel,Online-Service-Index (OSI),0.7471,4.7355,True
3,Italy,Online-Service-Index (OSI),0.8294,5.147,True
4,Jamaica,Online-Service-Index (OSI),0.3882,2.941,True
5,Japan,Online-Service-Index (OSI),0.9059,5.5295,True
6,Jordan,Online-Service-Index (OSI),0.3588,2.794,True
7,Kazakhstan,Online-Service-Index (OSI),0.9235,5.6175,True
8,Kenya,Online-Service-Index (OSI),0.6765,4.3825,True
9,Kiribati,Online-Service-Index (OSI),0.4941,3.4705,True


## 2. E-Participation index


In [23]:
indicator = indicators[1]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

E-Participation index
e_government_index


In [24]:
df.head()

Unnamed: 0,Survey Year,Country Name,E-Government Rank,E-Government Index,E-Participation Index,Online Service Index,Human Capital Index,Telecommunication Infrastructure Index
0,2020,Iraq,143,0.436,0.3095,0.3353,0.4358,0.537
1,2020,Ireland,27,0.8433,0.8571,0.7706,0.9494,0.81
2,2020,Israel,30,0.8361,0.7143,0.7471,0.8924,0.8689
3,2020,Italy,37,0.8231,0.8214,0.8294,0.8466,0.7932
4,2020,Jamaica,114,0.5392,0.369,0.3882,0.7142,0.5151


In [25]:
df['Survey Year'].unique()

array([2020], dtype=int64)

In [26]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df.iloc[:,4]


min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))


# output scores to csv
df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/government_{}_scores.csv'.format(indicator), index=False)

In [27]:
df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better
0,Iraq,E-Participation index,0.3095,2.5475,True
1,Ireland,E-Participation index,0.8571,5.2855,True
2,Israel,E-Participation index,0.7143,4.5715,True
3,Italy,E-Participation index,0.8214,5.107,True
4,Jamaica,E-Participation index,0.369,2.845,True
5,Japan,E-Participation index,0.9881,5.9405,True
6,Jordan,E-Participation index,0.3333,2.6665,True
7,Kazakhstan,E-Participation index,0.881,5.405,True
8,Kenya,E-Participation index,0.5952,3.976,True
9,Kiribati,E-Participation index,0.5595,3.7975,True


## 3. Use of public services online (% of services online, penetration, frequency of use)


In [28]:
indicator = indicators[2]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Use of public services online (% of services online, penetration, frequency of use)
digital_public_service_use


In [29]:
# drop first row
df = df.iloc[1: , :]

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 1 to 31
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Country             31 non-null     object
 1   Net Perception (%)  31 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 628.0+ bytes


In [31]:
df.head(15)

Unnamed: 0,Country,Net Perception (%)
1,UAE,61
2,Saudi Arabia,59
3,Singapore,54
4,China,53
5,New Zealand,52
6,Netherlands,51
7,Qatar,51
8,Canada,40
9,Denmark,48
10,India,45


In [32]:
# create the standard columns
df['higher_is_better'] = True
df['Indicator'] =  indicator
df['Country Name'] = df['Country']
# df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df['Net Perception (%)']

In [33]:
# convert 1-100 %  into 1-6
min_rank = df['data_col'].min()
max_rank = df['data_col'].max()
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=min_rank,old_max=max_rank))

df.sort_values(by='new_rank_score', ascending=False)

# prepare output
df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# output scores
df.to_csv('../indicator_scores/government_{}_scores.csv'.format(indicator), index=False)

In [34]:
df.head(15)

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better
1,UAE,Use of public services online (% of services o...,61,6.0,True
2,Saudi Arabia,Use of public services online (% of services o...,59,5.876543,True
3,Singapore,Use of public services online (% of services o...,54,5.567901,True
4,China,Use of public services online (% of services o...,53,5.506173,True
5,New Zealand,Use of public services online (% of services o...,52,5.444444,True
6,Netherlands,Use of public services online (% of services o...,51,5.382716,True
7,Qatar,Use of public services online (% of services o...,51,5.382716,True
8,Canada,Use of public services online (% of services o...,40,4.703704,True
9,Denmark,Use of public services online (% of services o...,48,5.197531,True
10,India,Use of public services online (% of services o...,45,5.012346,True


## 4. What is the % change of government digitizing public services?



In [35]:
indicator = indicators[3]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

What is the % change of government digitizing public services?
Egov_strategy


In [36]:
df.head()

Unnamed: 0,#,Flag,Code,Cnum,Economy,Level,Population,GNI,GNIPC,e-Government,...,NGTI-1,NGTI-2,NGTI-3,NGTI-4,GTI-1,GTI-2,GTI-3,GTI-4,data_country,data_year
0,1,,AFG,4.0,Afghanistan,LIC,38928,20726,540,https://mcit.gov.af/node/6938,...,0.69,0.52,0.31,0.62,-0.09,-0.34,0.02,-0.12,,
1,2,,ALB,8.0,Albania,UMIC,2878,14949,5240,https://e-albania.al/,...,0.81,0.78,0.6,0.77,0.29,0.62,0.74,0.26,,
2,3,,DZA,12.0,Algeria,LMIC,43851,170722,3970,https://www.mpttn.gov.dz/ar/content/%D8%A7%D9%...,...,0.73,0.46,0.02,0.6,0.06,-0.56,-0.68,-0.15,,
3,4,,ADO,20.0,Andorra,HIC,77,3154,40886,http://www.govern.ad,...,0.63,0.6,0.05,0.13,-0.28,-0.06,-0.62,-1.32,,
4,5,,AGO,24.0,Angola,LMIC,32866,97005,3050,http://www.governo.gov.ao,...,0.68,0.69,0.21,0.61,-0.12,0.27,-0.23,-0.13,,


In [37]:
df.columns.tolist()

['#',
 'Flag',
 'Code',
 'Cnum',
 'Economy',
 'Level',
 'Population',
 'GNI',
 'GNIPC',
 'e-Government',
 'eSrv',
 'e-Services',
 "eGov'20",
 "EGDI'20",
 "eOSI'20",
 "eTII'20",
 "eHCI'20",
 "ePart'20",
 "EGDI'18",
 "eOSI'18",
 "eOSI'16",
 "eOSI'14",
 'DG Org URL',
 'DG Entity',
 'DG Org',
 'DG St',
 'DG Strategy',
 'DG Yr',
 'WoG',
 'Whole of Gov URL',
 'GT',
 'GovTech Institution',
 'GovTech URL',
 'Type',
 'GT Org',
 'GT since',
 'GT focus',
 'Other GT Links',
 'e-Gov',
 'e-ID',
 'GSB',
 'e-Serv',
 'Innov',
 'Priv',
 'DTech',
 'OSS',
 'KSL',
 'MoF URL',
 'Org Name',
 'PF',
 'Budget Data URL',
 'MIS',
 'FMIS URL',
 'FMIS / TS',
 'FMIS Name',
 'Tpl',
 'Func',
 'Status',
 'Op Yr',
 'Scope',
 'ASW',
 'ASW Solution',
 'Arch',
 'Treasury website',
 'Tre Yr',
 'TSA website',
 'TSA Yr',
 'TSA Type',
 'TSA Scope',
 'Duration',
 'TSA Benefits',
 'TSA Savings',
 'Sav Yr',
 'Tax Adm URL',
 'Tax Yr',
 'TMIS Abbr',
 'TMIS Name',
 'TMIS',
 'TMIS Soln',
 'Tax System URL',
 'TMIS Yr',
 'TMIS Sta',
 '

In [38]:
# Must limit the database to the first 206 rows, the remaining rows do not contain any useful information
df = df.iloc[0:205,:]

# Drop the superfluous rows by dropping na
df = df[pd.to_numeric(df['#'], errors='coerce').notnull()]
df['#'] = df[df['#'].notna()]

# Must convert data in the DPL column into float
df['CGSI'] = df['CGSI'].replace('-',np.nan)
df['CGSI'] = df['CGSI'].astype(float, errors = 'ignore')

In [39]:
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['CGSI']
df['Country Name'] = df['Economy']
df['Year'] = 2020

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [40]:
df = df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']]
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Afghanistan,2020,What is the % change of government digitizing ...,0.69,4.520408,True
1,Albania,2020,What is the % change of government digitizing ...,0.79,5.030612,True
2,Algeria,2020,What is the % change of government digitizing ...,0.71,4.622449,True
3,Andorra,2020,What is the % change of government digitizing ...,0.58,3.959184,True
4,Angola,2020,What is the % change of government digitizing ...,0.67,4.418367,True
5,Antigua and Barbuda,2020,What is the % change of government digitizing ...,0.68,4.469388,True
6,Argentina,2020,What is the % change of government digitizing ...,0.81,5.132653,True
7,Armenia,2020,What is the % change of government digitizing ...,0.73,4.72449,True
8,Australia,2020,What is the % change of government digitizing ...,0.89,5.540816,True
9,Austria,2020,What is the % change of government digitizing ...,,,True


In [41]:
# output scores
df.to_csv('../indicator_scores/government_percent change of government digitalizing_scores.csv'.format(indicator), index=False)

## 5. Security incidents (# of relevant issues)



In [42]:
indicator = indicators[4]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Security incidents (# of relevant issues)
cyber_attacks


In [43]:
df.head()

Unnamed: 0,Country,Number of Significant Cyberattacks (2006-2020)
0,United States,156
1,United Kingdom,47
2,India,23
3,Germany,21
4,South Korea,18


In [44]:
# create a rank from the number of attacks fields
df['data_rank'] = df['Number of Significant Cyberattacks (2006-2020)'].rank(method='max')

In [45]:
df.data_rank.max()

20.0

In [46]:
# create standard columns
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['data_rank'] 


min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 1-20 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# need to invert since higher rank is not better
df['new_rank_score'] = (6-df['new_rank_score'])+1

# # prepare output
df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

df.head(15)

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better
0,United States,Security incidents (# of relevant issues),20.0,1.0,True
1,United Kingdom,Security incidents (# of relevant issues),19.0,1.263158,True
2,India,Security incidents (# of relevant issues),18.0,1.526316,True
3,Germany,Security incidents (# of relevant issues),17.0,1.789474,True
4,South Korea,Security incidents (# of relevant issues),16.0,2.052632,True
5,Australia,Security incidents (# of relevant issues),15.0,2.315789,True
6,Ukraine,Security incidents (# of relevant issues),15.0,2.315789,True
7,China,Security incidents (# of relevant issues),13.0,2.842105,True
8,Iran,Security incidents (# of relevant issues),13.0,2.842105,True
9,Saudi Arabia,Security incidents (# of relevant issues),13.0,2.842105,True


In [47]:
# # output scores
df.to_csv('../indicator_scores/government_{}_scores.csv'.format(indicator), index=False)

## 6. ICT investment as a percentage of GDP


In [48]:
indicator = indicators[5]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

ICT investment as a percentage of GDP
ICT_Investment


In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450 entries, 0 to 449
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   LOCATION    450 non-null    object 
 1   INDICATOR   450 non-null    object 
 2   SUBJECT     450 non-null    object 
 3   MEASURE     450 non-null    object 
 4   FREQUENCY   450 non-null    object 
 5   TIME        450 non-null    int64  
 6   Value       450 non-null    float64
 7   Flag Codes  0 non-null      float64
dtypes: float64(2), int64(1), object(5)
memory usage: 28.2+ KB


In [50]:
df.head()

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,AUS,ICTINVST,TOT,PC,A,1985,12.465454,
1,AUS,ICTINVST,TOT,PC,A,1986,13.939533,
2,AUS,ICTINVST,TOT,PC,A,1987,14.142429,
3,AUS,ICTINVST,TOT,PC,A,1988,14.289993,
4,AUS,ICTINVST,TOT,PC,A,1989,15.348707,


In [51]:
df.SUBJECT.unique()

array(['TOT'], dtype=object)

In [52]:
df.TIME.max()

2010

In [53]:
df.Value.describe()

count    450.000000
mean      16.074272
std        5.636379
min        2.798417
25%       12.410442
50%       14.840726
75%       19.419433
max       32.601105
Name: Value, dtype: float64

In [54]:
dcol = 'Value'
indicol = indicator
cname = 'LOCATION'

# filter most recent year
df = df[(df.TIME==2010)]

# create standard columns
df['higher_is_better'] = True
df['Indicator'] =  indicator
df['Country Code'] = df[cname]
# # df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df[dcol]

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 1-20 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# df.sort_values(by='new_rank_score', ascending=False)

# # prepare output
df = df[['Country Code', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# # output scores
df.to_csv('../indicator_scores/government_{}_scores.csv'.format(indicator), index=False)

In [55]:
df

Unnamed: 0,Country Code,Indicator,data_col,new_rank_score,higher_is_better
72,CAN,ICT investment as a percentage of GDP,17.018367,2.47101,True
121,FIN,ICT investment as a percentage of GDP,15.519828,2.121241,True
172,DEU,ICT investment as a percentage of GDP,12.690394,1.460831,True
198,IRL,ICT investment as a percentage of GDP,12.412805,1.39604,True
224,ITA,ICT investment as a percentage of GDP,11.026056,1.072363,True
274,KOR,ICT investment as a percentage of GDP,10.716026,1.0,True
323,NZL,ICT investment as a percentage of GDP,21.238183,3.455945,True
349,ESP,ICT investment as a percentage of GDP,13.763415,1.711282,True
400,CHE,ICT investment as a percentage of GDP,18.506691,2.818396,True
449,USA,ICT investment as a percentage of GDP,32.137836,6.0,True


## 7. Evidence of digital strategies in/across Ministries



In [56]:
indicator = indicators[6]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Evidence of digital strategies in/across Minstries
Egov_strategy


In [57]:
df.head()

Unnamed: 0,#,Flag,Code,Cnum,Economy,Level,Population,GNI,GNIPC,e-Government,...,NGTI-1,NGTI-2,NGTI-3,NGTI-4,GTI-1,GTI-2,GTI-3,GTI-4,data_country,data_year
0,1,,AFG,4.0,Afghanistan,LIC,38928,20726,540,https://mcit.gov.af/node/6938,...,0.69,0.52,0.31,0.62,-0.09,-0.34,0.02,-0.12,,
1,2,,ALB,8.0,Albania,UMIC,2878,14949,5240,https://e-albania.al/,...,0.81,0.78,0.6,0.77,0.29,0.62,0.74,0.26,,
2,3,,DZA,12.0,Algeria,LMIC,43851,170722,3970,https://www.mpttn.gov.dz/ar/content/%D8%A7%D9%...,...,0.73,0.46,0.02,0.6,0.06,-0.56,-0.68,-0.15,,
3,4,,ADO,20.0,Andorra,HIC,77,3154,40886,http://www.govern.ad,...,0.63,0.6,0.05,0.13,-0.28,-0.06,-0.62,-1.32,,
4,5,,AGO,24.0,Angola,LMIC,32866,97005,3050,http://www.governo.gov.ao,...,0.68,0.69,0.21,0.61,-0.12,0.27,-0.23,-0.13,,


In [58]:
# Must limit the database to the first 206 rows, the remaining rows do not contain any useful information
df = df.iloc[0:205,:]

# Drop the superfluous rows by dropping na
df = df[pd.to_numeric(df['#'], errors='coerce').notnull()]
df['#'] = df[df['#'].notna()]

# Must convert data in the DPL column into float
df['DG St'] = df['DG St'].replace('-',np.nan)
df['DG St'] = df['DG St'].astype(float, errors = 'ignore')

In [59]:
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['DG St']
df['Country Name'] = df['Economy']
df['Year'] = 2020

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

max_rank

3.0

In [60]:
# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [61]:
df = df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']]
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Afghanistan,2020,Evidence of digital strategies in/across Minst...,2.0,4.333333,True
1,Albania,2020,Evidence of digital strategies in/across Minst...,3.0,6.0,True
2,Algeria,2020,Evidence of digital strategies in/across Minst...,2.0,4.333333,True
3,Andorra,2020,Evidence of digital strategies in/across Minst...,3.0,6.0,True
4,Angola,2020,Evidence of digital strategies in/across Minst...,2.0,4.333333,True
5,Antigua and Barbuda,2020,Evidence of digital strategies in/across Minst...,1.0,2.666667,True
6,Argentina,2020,Evidence of digital strategies in/across Minst...,3.0,6.0,True
7,Armenia,2020,Evidence of digital strategies in/across Minst...,1.0,2.666667,True
8,Australia,2020,Evidence of digital strategies in/across Minst...,3.0,6.0,True
9,Austria,2020,Evidence of digital strategies in/across Minst...,3.0,6.0,True


In [62]:
# # output scores
df.to_csv('../indicator_scores/government_Evidence of digital strategies_scores.csv'.format(indicator), index=False)

## 8. Evidence of focus on vulnerable groups


In [63]:
indicator = indicators[7]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

df = df.iloc[1:,:]

Evidence of focus on vulnerable groups
Egov_strategy


In [64]:
df.head()

Unnamed: 0,#,Flag,Code,Cnum,Economy,Level,Population,GNI,GNIPC,e-Government,...,NGTI-1,NGTI-2,NGTI-3,NGTI-4,GTI-1,GTI-2,GTI-3,GTI-4,data_country,data_year
1,2,,ALB,8.0,Albania,UMIC,2878,14949,5240,https://e-albania.al/,...,0.81,0.78,0.6,0.77,0.29,0.62,0.74,0.26,,
2,3,,DZA,12.0,Algeria,LMIC,43851,170722,3970,https://www.mpttn.gov.dz/ar/content/%D8%A7%D9%...,...,0.73,0.46,0.02,0.6,0.06,-0.56,-0.68,-0.15,,
3,4,,ADO,20.0,Andorra,HIC,77,3154,40886,http://www.govern.ad,...,0.63,0.6,0.05,0.13,-0.28,-0.06,-0.62,-1.32,,
4,5,,AGO,24.0,Angola,LMIC,32866,97005,3050,http://www.governo.gov.ao,...,0.68,0.69,0.21,0.61,-0.12,0.27,-0.23,-0.13,,
5,6,,ATG,28.0,Antigua and Barbuda,HIC,98,1618,16660,http://www.ab.gov.ag,...,0.72,0.55,0.05,0.42,0.0,-0.23,-0.62,-0.59,,


In [65]:
# Must limit the database to the first 206 rows, the remaining rows do not contain any useful information
df = df.iloc[0:205,:]

# Drop the superfluous rows by dropping na
df = df[pd.to_numeric(df['#'], errors='coerce').notnull()]
df['#'] = df[df['#'].notna()]

# Must convert data in the DPL column into float
df['WoG'] = df['WoG'].replace('-',np.nan)
df['WoG'] = df['WoG'].astype(float, errors = 'ignore')

In [66]:
df['WoG'].describe

<bound method NDFrame.describe of 1      0.03
2     -0.03
3     -0.03
4      0.03
5     -0.03
       ... 
200    0.03
201   -0.03
202   -0.03
203   -0.03
204   -0.03
Name: WoG, Length: 197, dtype: float64>

In [67]:
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['WoG']
df['Country Name'] = df['Economy']
df['Year'] = 2020

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

max_rank

0.09

In [68]:
# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [69]:
df = df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']]
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
1,Albania,2020,Evidence of focus on vulnerable groups,0.03,3.5,True
2,Algeria,2020,Evidence of focus on vulnerable groups,-0.03,1.0,True
3,Andorra,2020,Evidence of focus on vulnerable groups,-0.03,1.0,True
4,Angola,2020,Evidence of focus on vulnerable groups,0.03,3.5,True
5,Antigua and Barbuda,2020,Evidence of focus on vulnerable groups,-0.03,1.0,True
6,Argentina,2020,Evidence of focus on vulnerable groups,0.03,3.5,True
7,Armenia,2020,Evidence of focus on vulnerable groups,-0.03,1.0,True
8,Australia,2020,Evidence of focus on vulnerable groups,0.09,6.0,True
9,Austria,2020,Evidence of focus on vulnerable groups,0.09,6.0,True
11,Azerbaijan,2020,Evidence of focus on vulnerable groups,-0.03,1.0,True


In [70]:
# # output scores
df.to_csv('../indicator_scores/government_{}_scores.csv'.format(indicator), index=False)

## 9. % of digital skills certifications / training courses completed


In [71]:
indicator = indicators[8]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of digital skills certifications / training courses completed
digital_skill_level


In [72]:
df

Unnamed: 0,Global Rank,Region,Country and Region,Year,Technology,Data Science
0,1,Europe,Switzerland,2020,84%,96%
1,2,Europe,Luxembourg,2020,62%,85%
2,3,Europe,Austria,2020,88%,95%
3,4,Asia Pacific,Japan,2020,100%,88%
4,5,Europe,Germany,2020,89%,94%
...,...,...,...,...,...,...
103,104,Asia Pacific,Uzbekistan,2020,6%,9%
104,105,Sub-Saharan Africa,Sierra Leone,2020,2%,4%
105,106,Latin America and the Caribbean,Paraguay,2020,7%,11%
106,107,Latin America and the Caribbean,Guyana,2020,8%,2%


In [73]:
dcol = 'Global Rank'
indicol = indicator
cname = 'Country and Region'

# filter most recent year
# df = df[(df.TIME==2010)]

# create standard columns
df['higher_is_better'] = True
df['Indicator'] =  indicator
df['Country Name'] = df[cname]
# # df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df[dcol]

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 1-20 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# # need to invert score since higher rank is not better 
df['new_rank_score'] = (6-df['new_rank_score'])+1

df.sort_values(by='new_rank_score', ascending=False)

# # prepare output
df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# # output scores
df.to_csv('../indicator_scores/government_ percentage digital skills certifications_scores.csv'.format(indicator), index=False)

In [74]:
df

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better
0,Switzerland,% of digital skills certifications / training ...,1,6.000000,True
1,Luxembourg,% of digital skills certifications / training ...,2,5.953271,True
2,Austria,% of digital skills certifications / training ...,3,5.906542,True
3,Japan,% of digital skills certifications / training ...,4,5.859813,True
4,Germany,% of digital skills certifications / training ...,5,5.813084,True
...,...,...,...,...,...
103,Uzbekistan,% of digital skills certifications / training ...,104,1.186916,True
104,Sierra Leone,% of digital skills certifications / training ...,105,1.140187,True
105,Paraguay,% of digital skills certifications / training ...,106,1.093458,True
106,Guyana,% of digital skills certifications / training ...,107,1.046729,True


### Score Aggregating

In [75]:
import os


In [76]:
# get list of files in scores folder
scores = os.listdir('../indicator_scores/')
scores = [s for s in scores if s.startswith('government')]

In [77]:
scores

['government_ percentage digital skills certifications_scores.csv',
 'government_E-Participation index_scores.csv',
 'government_Evidence of digital strategies_scores.csv',
 'government_Evidence of focus on vulnerable groups_scores.csv',
 'government_ICT investment as a percentage of GDP_scores.csv',
 'government_Online-Service-Index (OSI)_scores.csv',
 'government_percent change of government digitalizing_scores.csv',
 'government_Security incidents (# of relevant issues)_scores.csv',
 'government_Use of public services online (% of services online, penetration, frequency of use)_scores.csv']

In [78]:
# create a dataframe that concatenates all these file into one table
df = pd.concat([pd.read_csv('../indicator_scores/{}'.format(s)) for s in scores])    

In [79]:
df

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better,Year,Country Code
0,Switzerland,% of digital skills certifications / training ...,1.0,6.000000,True,,
1,Luxembourg,% of digital skills certifications / training ...,2.0,5.953271,True,,
2,Austria,% of digital skills certifications / training ...,3.0,5.906542,True,,
3,Japan,% of digital skills certifications / training ...,4.0,5.859813,True,,
4,Germany,% of digital skills certifications / training ...,5.0,5.813084,True,,
...,...,...,...,...,...,...,...
26,Germany,Use of public services online (% of services o...,17.0,3.283951,True,,
27,Argentina,Use of public services online (% of services o...,15.0,3.160494,True,,
28,Morocco,Use of public services online (% of services o...,-2.0,2.111111,True,,
29,Switzerland,Use of public services online (% of services o...,-18.0,1.123457,True,,


In [80]:
# Data cleaning
df['new_rank_score'] = df['new_rank_score'].fillna(0)
df.sort_values(by=['Country Name'], ascending=True, inplace=True)
df.reset_index(drop=True, inplace=True)

In [81]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1148 entries, 0 to 1147
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country Name      1138 non-null   object 
 1   Indicator         1148 non-null   object 
 2   data_col          1141 non-null   float64
 3   new_rank_score    1148 non-null   float64
 4   higher_is_better  1148 non-null   bool   
 5   Year              593 non-null    float64
 6   Country Code      10 non-null     object 
dtypes: bool(1), float64(3), object(3)
memory usage: 55.1+ KB


In [82]:
df.head(15)

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better,Year,Country Code
0,Afghanistan,E-Participation index,0.4643,3.3215,True,,
1,Afghanistan,Evidence of digital strategies in/across Minst...,2.0,4.333333,True,2020.0,
2,Afghanistan,Online-Service-Index (OSI),0.4118,3.059,True,,
3,Afghanistan,What is the % change of government digitizing ...,0.69,4.520408,True,2020.0,
4,Albania,Evidence of digital strategies in/across Minst...,3.0,6.0,True,2020.0,
5,Albania,What is the % change of government digitizing ...,0.79,5.030612,True,2020.0,
6,Albania,Evidence of focus on vulnerable groups,0.03,3.5,True,2020.0,
7,Albania,E-Participation index,0.8452,5.226,True,,
8,Albania,Online-Service-Index (OSI),0.8412,5.206,True,,
9,Algeria,Evidence of focus on vulnerable groups,-0.03,1.0,True,2020.0,


In [83]:
df.describe()

Unnamed: 0,data_col,new_rank_score,Year
count,1141.0,1148.0,593.0
mean,7.083916,3.793531,2020.0
std,19.214432,1.664582,0.0
min,-20.0,0.0,2020.0
25%,0.2619,2.607,2020.0
50%,0.72,4.088,2020.0
75%,2.0,5.183673,2020.0
max,108.0,6.0,2020.0


In [84]:
# checking country names
# sorted(df['Country Name'].unique().tolist())

In [85]:
# remove trailing whitespaces from country name
df['Country Name'] = df['Country Name'].str.strip()


In [86]:
df.head()

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better,Year,Country Code
0,Afghanistan,E-Participation index,0.4643,3.3215,True,,
1,Afghanistan,Evidence of digital strategies in/across Minst...,2.0,4.333333,True,2020.0,
2,Afghanistan,Online-Service-Index (OSI),0.4118,3.059,True,,
3,Afghanistan,What is the % change of government digitizing ...,0.69,4.520408,True,2020.0,
4,Albania,Evidence of digital strategies in/across Minst...,3.0,6.0,True,2020.0,


In [87]:
# checking country names
# sorted(df['Country Name'].unique().tolist())

In [88]:
# average indicator scores per country
agg_df = df.groupby(['Country Name']).agg({'new_rank_score':'mean','data_col':'count'})

In [89]:
agg_df.columns = ['agg_score', 'count_source' ]

In [90]:
max_number_sources = agg_df.describe()['count_source']['max']

In [91]:
agg_df['agg_score_wt'] = agg_df['agg_score']*(agg_df['count_source']/max_number_sources)

In [92]:
agg_df.sort_values(by='agg_score', ascending=False, inplace=True)

In [93]:
agg_df.head(25)

Unnamed: 0_level_0,agg_score,count_source,agg_score_wt
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
North Korea,6.0,1,0.75
UAE,6.0,1,0.75
"Korea, Rep.",5.965986,3,2.237245
United Kingdom of Great Britain and Northern Ireland,5.8375,2,1.459375
Finland,5.76656,6,4.32492
Denmark,5.741112,7,5.023473
Estonia,5.679981,6,4.259985
Netherlands,5.668561,7,4.959991
United States of America,5.552542,5,3.470339
Singapore,5.546862,7,4.853504


In [94]:
agg_df.to_csv('../pillar_scores/government_scores_v0.csv')