In [1]:
import pandas as pd
import numpy as np



### Get all the pillar names from the excel

In [4]:
ls ../..

README.md
UNDP Digital Assessment Data Framework Filename Matching V7.xlsx
[34mdashboard[m[m/
[34mdata[m[m/
data_manifest.csv
data_manifest_instructions.md
process_manifest.js
process_raw_data.js
[34mprocessed[m[m/
[34mscore[m[m/
[34msources[m[m/
undp-diagram.png


In [7]:
names = pd.read_excel('../../UNDP Digital Assessment Data Framework Filename Matching V7.xlsx')

In [8]:
col_names = ['Indicator','check', 'Data Source','Index','Filename']

In [9]:
names = names[col_names]

In [10]:
names.head()

Unnamed: 0,Indicator,check,Data Source,Index,Filename
0,Countries,,United Nations,False,list_of_countries
1,"Database of Global Administrative Areas (GADM,...",,GADM maps and data,False,
2,High Resolution Population Density Maps + Demo...,,Facebook,False,
3,population density vs openstreetmap object den...,,Kontur,False,
4,Population Density,Infrastructure,World Bank,False,population_density


In [11]:
# get all the files per pillar
data_stats = names.groupby('check').agg({'Filename':'count','Indicator':'count'})

In [12]:
data_stats

Unnamed: 0_level_0,Filename,Indicator
check,Unnamed: 1_level_1,Unnamed: 2_level_1
Business,20,25
Foundations,9,12
Government,12,15
Infrastructure,46,48
People,39,47
Regulation,6,7
Strategy,1,1


### Business

In [20]:
bnames = names[(names.check=='Business')&(~names.Filename.isna())]#&(names.Index==False)]

In [26]:
bnames.head(25)

Unnamed: 0,Indicator,check,Data Source,Index,Filename
75,UNCTAD Business-to-Consumer (B2C) E-commerce I...,Business,UNCTAD/World Bank,True,b2c_ecommerse_idx
76,"Networking Services (Spend, IT Forecast Data)",Business,Portulans Institute,True,network_readiness_index
77,ICT goods exports,Business,UNCTAD,False,ict_goods
78,ICT goods imports,Business,UNCTAD,False,ict_goods
79,ICT service exports,Business,UNCTAD,False,ict_services
80,ICT service imports,Business,UNCTAD,False,ict_services
81,"Cloud Services (Spend, IT Forecast Data)",Business,Statista/Business Software Alliance,True,cloud_services
82,ICT task-intensive jobs as a percentage of tot...,Business,OECD,False,ICT_proportion
85,Share of business with internet,Business,OECD,False,%_business_internet
86,Share of businesses with broadband,Business,World Bank,False,%_business_broadband


In [38]:
# get list of names for all indicators
indicators = bnames.Indicator.unique()

In [39]:
# get all file names
bfiles = bnames.Filename.unique()

In [25]:
bfiles

array(['b2c_ecommerse_idx', 'network_readiness_index', 'ict_goods',
       'ict_services', 'cloud_services', 'ICT_proportion',
       '%_business_internet', '%_business_broadband',
       'share_of_businesses_online_presence', 'prevalance_gig_economy',
       'size_digital_economy', 'venture_cap_avail',
       'legal_rights_strength', 'time_start_bus', 'ease_doing_bus',
       'ease_of_finding_skilled_employees', 'start_up_investment',
       'doing_bus_idx'], dtype=object)

In [16]:
# ls digital-readiness-assessment-main/processed/

In [None]:
##ict_goods and services not in process data

In [58]:
# formula for converting scale
def convert_rank(old_value, old_min=1, old_max=7, new_min=1, new_max=6 ):
    """ Convert old scale values scale into new scale values"""
    old_range = old_max - old_min
    new_range = new_max - new_min
    new_value = (((old_value-old_min)*new_range)/old_range)+new_min
    return new_value

### 1. 'UNCTAD Business-to-Consumer (B2C) E-commerce Index'

In [43]:
indicators[0]

'UNCTAD Business-to-Consumer (B2C) E-commerce Index'

In [344]:
# load data
indicator = indicators[0]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

UNCTAD Business-to-Consumer (B2C) E-commerce Index
b2c_ecommerse_idx


In [53]:
df.Indicator.unique()

array(['Country rank and value in the UNCTAD B2C E-commerce Index'],
      dtype=object)

In [345]:
df.head()

Unnamed: 0,2015,2016,2017,Country ISO3,Country Name,Indicator Id,Indicator,Subindicator Type
0,14.1,17.0,,AFG,Afghanistan,24717,Country rank and value in the UNCTAD B2C E-com...,Value
1,,130.0,132.0,AFG,Afghanistan,24718,Country rank and value in the UNCTAD B2C E-com...,Rank
2,21.1,29.0,,AGO,Angola,24717,Country rank and value in the UNCTAD B2C E-com...,Value
3,,113.0,113.0,AGO,Angola,24718,Country rank and value in the UNCTAD B2C E-com...,Rank
4,51.0,62.0,,ALB,Albania,24717,Country rank and value in the UNCTAD B2C E-com...,Value


In [68]:
# two sub indicators per country
df['Subindicator Type'].unique()

array(['Value', 'Rank'], dtype=object)

In [56]:
# list of columns with data
value_cols = ['2015','2016','2017']

In [71]:
# value doesn't have data for 2017
df[df['Subindicator Type']=='Value'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147 entries, 0 to 292
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   2015               137 non-null    float64
 1   2016               136 non-null    float64
 2   2017               0 non-null      float64
 3   Country ISO3       147 non-null    object 
 4   Country Name       147 non-null    object 
 5   Indicator Id       147 non-null    int64  
 6   Indicator          147 non-null    object 
 7   Subindicator Type  147 non-null    object 
 8   data_col           0 non-null      float64
 9   new_rank_score     0 non-null      float64
dtypes: float64(5), int64(1), object(4)
memory usage: 12.6+ KB


In [72]:
df[df['Subindicator Type']=='Value'].describe()

Unnamed: 0,2015,2016,2017,Indicator Id,data_col,new_rank_score
count,137.0,136.0,0.0,147.0,0.0,0.0
mean,47.151095,53.713382,,24717.0,,
std,22.978008,26.604495,,0.0,,
min,6.5,3.0,,24717.0,,
25%,28.7,32.0,,24717.0,,
50%,47.2,53.0,,24717.0,,
75%,65.2,78.25,,24717.0,,
max,89.7,96.5,,24717.0,,


In [73]:
# Rank does have 2017 data
df[df['Subindicator Type']=='Rank'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147 entries, 1 to 293
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   2015               0 non-null      float64
 1   2016               137 non-null    float64
 2   2017               136 non-null    float64
 3   Country ISO3       147 non-null    object 
 4   Country Name       147 non-null    object 
 5   Indicator Id       147 non-null    int64  
 6   Indicator          147 non-null    object 
 7   Subindicator Type  147 non-null    object 
 8   data_col           136 non-null    float64
 9   new_rank_score     136 non-null    float64
dtypes: float64(5), int64(1), object(4)
memory usage: 12.6+ KB


In [74]:
df[df['Subindicator Type']=='Rank'].describe()

Unnamed: 0,2015,2016,2017,Indicator Id,data_col,new_rank_score
count,0.0,137.0,136.0,147.0,136.0,136.0
mean,,69.0,73.066176,24718.0,73.066176,3.519796
std,,39.692569,41.872682,0.0,41.872682,1.46408
min,,1.0,1.0,24718.0,1.0,1.0
25%,,35.0,36.75,24718.0,36.75,2.25
50%,,69.0,72.5,24718.0,72.5,3.5
75%,,103.0,110.25,24718.0,110.25,4.81993
max,,137.0,144.0,24718.0,144.0,6.0


Going to use the rank column for now since it has more recent data

In [79]:
df_rank = df[df['Subindicator Type']=='Rank'].copy()

In [81]:
# convert rank into 1-6 - in order to get most recent data 
df_rank['data_col'] = df_rank['2017'] 


min_rank = df_rank['data_col'].min()
max_rank = df_rank['data_col'].max()

# transform 1-147 rank into 1-6
df_rank['new_rank_score'] = df_rank['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# need to invert score since higher rank is not better 
df_rank['new_rank_score'] = (6-df_rank['new_rank_score'])+1

In [89]:
# prep output

df_rank.rename(columns={'Country ISO3':'Country Code'}, inplace=True)

df_rank['higher_is_better'] = True


# output scores to csv
df_rank[['Country Code', 'Country Name','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/{}_scores.csv'.format(bf), index=False)

In [90]:
df_rank[['Country Code', 'Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(15)


Unnamed: 0,Country Code,Country Name,Indicator,data_col,new_rank_score,higher_is_better
1,AFG,Afghanistan,Country rank and value in the UNCTAD B2C E-com...,132.0,1.41958,True
3,AGO,Angola,Country rank and value in the UNCTAD B2C E-com...,113.0,2.083916,True
5,ALB,Albania,Country rank and value in the UNCTAD B2C E-com...,59.0,3.972028,True
7,ARE,United Arab Emirates,Country rank and value in the UNCTAD B2C E-com...,23.0,5.230769,True
9,ARG,Argentina,Country rank and value in the UNCTAD B2C E-com...,81.0,3.202797,True
11,ARM,Armenia,Country rank and value in the UNCTAD B2C E-com...,78.0,3.307692,True
13,AUS,Australia,Country rank and value in the UNCTAD B2C E-com...,14.0,5.545455,True
15,AUT,Austria,Country rank and value in the UNCTAD B2C E-com...,17.0,5.440559,True
17,AZE,Azerbaijan,Country rank and value in the UNCTAD B2C E-com...,68.0,3.657343,True
19,BDI,Burundi,Country rank and value in the UNCTAD B2C E-com...,140.0,1.13986,True


## 2. Networking Services (Spend, IT Forecast Data)


In [91]:
indicator = indicators[1]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Networking Services (Spend, IT Forecast Data)
network_readiness_index


In [108]:
# no DATE associated wiht the data

In [95]:
df.head(16)

Unnamed: 0,Rank,Country,Score,Income Group,Region
0,1,Sweden,82.75,High-income,Europe
1,2,Denmark,82.19,High-income,Europe
2,3,Singapore,81.39,High-income,Asia & Pacific
3,4,Netherlands,81.37,High-income,Europe
4,5,Switzerland,80.41,High-income,Europe
5,6,Finland,80.16,High-income,Europe
6,7,Norway,79.39,High-income,Europe
7,8,United States,78.91,High-income,The Americas
8,9,Germany,77.48,High-income,Europe
9,10,United Kingdom,76.27,High-income,Europe


In [97]:
# going to use the  score column since this is already an Index

In [102]:
# convert rank into 1-6 - in order to get most recent data 
df['data_col'] = df['Score'] 

# going to assume index is between 1-100 but not 100% sure
min_rank = 1 #df['data_col'].min()
max_rank = 100 #df['data_col'].max()

# transform 1-100 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# need to invert score since higher rank is not better 
# df['new_rank_score'] = (6-df['new_rank_score'])+1

In [107]:
# prepare output
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['Indicator'] = indicator
df['higher_is_better'] = True

df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(15)


# output scores to csv
df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/{}_scores.csv'.format(bf), index=False)

## 3. ICT Services Imports

In [110]:
indicator = indicators[2]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

ICT goods exports 
ict_goods


The *ICT goods* file is actually *ICT services* data. Label is incorrect.

In [117]:
df.head()

Unnamed: 0,Year,Economy,Economy Label,Partner,Partner Label,Flow,Flow Label,Category,Category Label,US dollars at current prices in millions,US dollars at current prices in millions Footnote,Growth rate (over previous period),Growth rate (over previous period) Footnote,Percentage of total world,Percentage of total world Footnote,Percentage of total trade in services,Percentage of total trade in services Footnote
0,2005,0,World,0,World,2,Exports,SICT,ICT services,173413.539,Estimated,,,100.0,Estimated,6.47676,Estimated
1,2005,8,Albania,0,World,1,Imports,SICT,ICT services,16.953,,,,,,1.22595,
2,2005,8,Albania,0,World,2,Exports,SICT,ICT services,73.406,,,,0.04233,Estimated,5.79526,
3,2005,12,Algeria,0,World,1,Imports,SICT,ICT services,154.0,,,,,,3.19303,
4,2005,12,Algeria,0,World,2,Exports,SICT,ICT services,94.0,,,,0.054206,Estimated,3.7495,


In [112]:
dcol = 'Percentage of total trade in services'
indicol = 'Category Label'
cname = 'Economy Label'

# filter most recent year and imports
df = df[(df.Year==2019)&(df['Flow Label']=='Imports')]

# create the standard columns
df['higher_is_better'] = True
df['Indicator'] =  df[indicol]+' '+ df['Flow Label']
df['Country Name'] = df[cname]
# df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df[dcol]


# convert 1-100 %  into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=0,old_max=100))

df.sort_values(by='new_rank_score', ascending=False)

# prepare output
df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# output scores
df.to_csv('../indicator_scores/{}_scores.csv'.format(bf+'_imports'), index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

S

In [114]:
df.head()

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better
6663,Afghanistan,ICT services Imports,2.02267,1.101133,True
6665,Albania,ICT services Imports,3.38082,1.169041,True
6667,Angola,ICT services Imports,1.80954,1.090477,True
6669,Azerbaijan,ICT services Imports,0.99593,1.049797,True
6671,Argentina,ICT services Imports,6.40331,1.320166,True


## 4. ICT Services Exports

In [115]:
indicator = indicators[3]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

ICT goods imports 
ict_goods


The *ICT goods* file is actually *ICT services* data. Label is incorrect.

In [119]:
dcol = 'Percentage of total trade in services'
indicol = 'Category Label'
cname = 'Economy Label'

# filter most recent year
df = df[(df.Year==2019)&(df['Flow Label']=='Exports')]

# create standard columns
df['higher_is_better'] = True
df['Indicator'] =  df[indicol]+' '+ df['Flow Label']
df['Country Name'] = df[cname]
# df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df[dcol]


# convert 1-100 %  into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=0,old_max=100))

df.sort_values(by='new_rank_score', ascending=False)

# prepare output
df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# output scores
df.to_csv('../indicator_scores/{}_scores.csv'.format(bf+'_exports'), index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try

## 5. ICT Goods Exports

In [126]:
indicator = indicators[4]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

ICT service exports 
ict_services


In [133]:
df[(df.Year==2019)&(df['Flow Label']=='Exports')].sort_values(by=dcol, ascending=False)

Unnamed: 0,Year,Economy,Economy Label,Partner,Partner Label,Flow,Flow Label,IctProductCategory,IctProductCategory Label,Percentage of total merchandise trade,Percentage of total merchandise trade Footnote
21734,2019,344,"China, Hong Kong SAR",0,World,2,Exports,ICT00,Total ICT goods,56.64507,
22051,2019,608,Philippines,0,World,2,Exports,ICT00,Total ICT goods,49.01819,
21233,2019,158,"China, Taiwan Province of",0,World,2,Exports,ICT00,Total ICT goods,46.39929,
22085,2019,704,Viet Nam,0,World,2,Exports,ICT00,Total ICT goods,35.01057,
21911,2019,458,Malaysia,0,World,2,Exports,ICT00,Total ICT goods,32.50135,
...,...,...,...,...,...,...,...,...,...,...,...
21513,2019,3107,CEMAC (Economic and Monetary Community of Cent...,0,World,2,Exports,ICT00,Total ICT goods,0.01465,
21240,2019,178,Congo,0,World,2,Exports,ICT00,Total ICT goods,0.01465,
21921,2019,496,Mongolia,0,World,2,Exports,ICT00,Total ICT goods,0.00808,
22039,2019,566,Nigeria,0,World,2,Exports,ICT00,Total ICT goods,0.00210,


In [134]:
dcol = 'Percentage of total merchandise trade'
indicol = 'IctProductCategory Label'
cname = 'Economy Label'

# filter most recent year and exports
df = df[(df.Year==2019)&(df['Flow Label']=='Exports')]

# create standard columns
df['higher_is_better'] = True
df['Indicator'] =  df[indicol]+' '+ df['Flow Label']
df['Country Name'] = df[cname]
# df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df[dcol]


# convert 1-100 %  into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=0,old_max=100))

df.sort_values(by='new_rank_score', ascending=False)

# prepare output
df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# output scores
df.to_csv('../indicator_scores/{}_scores.csv'.format(bf+'_exports'), index=False)

In [124]:
df.head()

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better
21135,World,Total ICT goods Exports,12.48824,1.624412,True
21139,Antigua and Barbuda,Total ICT goods Exports,0.69518,1.034759,True
21142,Azerbaijan,Total ICT goods Exports,0.02084,1.001042,True
21144,Argentina,Total ICT goods Exports,0.07246,1.003623,True
21147,Australia,Total ICT goods Exports,1.10187,1.055093,True


## 6. ICT Goods Imports

In [137]:
indicator = indicators[5]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

ICT service imports
ict_services


In [138]:
df.head()

Unnamed: 0,Year,Economy,Economy Label,Partner,Partner Label,Flow,Flow Label,IctProductCategory,IctProductCategory Label,Percentage of total merchandise trade,Percentage of total merchandise trade Footnote
0,2000,0,World,0,World,1,Imports,ICT00,Total ICT goods,16.05745,
1,2000,0,World,0,World,2,Exports,ICT00,Total ICT goods,16.0222,
2,2000,0,World,0,World,3,Re-exports,ICT00,Total ICT goods,29.61627,
3,2000,0,World,0,World,24,Re-imports,ICT00,Total ICT goods,31.78913,
4,2000,8,Albania,0,World,1,Imports,ICT00,Total ICT goods,3.33276,


In [139]:
dcol = 'Percentage of total merchandise trade'
indicol = 'IctProductCategory Label'
cname = 'Economy Label'

# filter most recent year and exports
df = df[(df.Year==2019)&(df['Flow Label']=='Imports')]

# create standard columns
df['higher_is_better'] = True
df['Indicator'] =  df[indicol]+' '+ df['Flow Label']
df['Country Name'] = df[cname]
# df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df[dcol]


# convert 1-100 %  into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=0,old_max=100))

df.sort_values(by='new_rank_score', ascending=False)

# prepare output
df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# output scores
df.to_csv('../indicator_scores/{}_scores.csv'.format(bf+'_imports'), index=False)

Question: do we want to use imports or exports?

## 7. Cloud Services (Spend, IT Forecast Data)


In [140]:
indicator = indicators[6]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Cloud Services (Spend, IT Forecast Data)
cloud_services


In [147]:
# remove nulls
df = df.dropna()

In [160]:
# prepare standard columns
df['data_col'] = df['Unnamed: 1'].astype(float)
df['Indicator'] = indicator
df['higher_is_better'] = True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [163]:
# create country name column
df['Country Name'] = df['Cloud computing policy environment by category - country ranking 2018']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [158]:
min_rank = 1
max_rank = df['Country Name'].nunique()

In [164]:
# transform 1-24 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# need to invert score since higher rank is not better 
# df['new_rank_score'] = (6-df['new_rank_score'])+1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [166]:
# prepare output
df.sort_values(by='new_rank_score', ascending=False)

# prepare output
df = df[['Country Name', 'Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# output scores
df.to_csv('../indicator_scores/{}_scores.csv'.format(bf), index=False)

## 8. ICT task-intensive jobs as a percentage of total employment

In [347]:
indicator = indicators[7]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

ICT task-intensive jobs as a percentage of total employment
ICT_proportion


In [348]:
df.head()

Unnamed: 0,Indicator,Country,Industry,Information and communication technologies,Sex,Measure,Time,Value,Flags
0,ICT specialists within and outside information...,Austria,Total economy,Specialist (ISCO-08: 133+215+251+252+351+352+742),Total,Share of jobs,2011,3.1764,
1,ICT specialists within and outside information...,Austria,Total economy,Specialist (ISCO-08: 133+215+251+252+351+352+742),Total,Share of jobs,2012,3.225967,
2,ICT specialists within and outside information...,Austria,Total economy,Specialist (ISCO-08: 133+215+251+252+351+352+742),Total,Share of jobs,2013,3.346251,
3,ICT specialists within and outside information...,Austria,Total economy,Specialist (ISCO-08: 133+215+251+252+351+352+742),Total,Share of jobs,2014,3.3191,
4,ICT specialists within and outside information...,Austria,Total economy,Specialist (ISCO-08: 133+215+251+252+351+352+742),Total,Share of jobs,2015,3.72934,


In [349]:
df[(df['Time']==2018)&(df['Information and communication technologies']=='ICT-intensive')].sort_values(by='Value', ascending=False)

Unnamed: 0,Indicator,Country,Industry,Information and communication technologies,Sex,Measure,Time,Value,Flags


In [178]:
# bnames

In [176]:
df['Information and communication technologies'].unique()

array(['Specialist (ISCO-08: 133+215+251+252+351+352+742)',
       'Other ICT-intensive (ISCO-08: 121+122,134+,211+,216+,231+,241+,242+243)',
       'Non-ICT (rest of ISCO-08 occupations)', 'ICT-intensive', 'Total'],
      dtype=object)

In [172]:
df.Sex.unique()

array(['Total'], dtype=object)

In [170]:
# convert to correct types
df['Value'] = df['Value'].astype(float)

In [171]:
df['Value'].describe()

count    985.000000
mean      42.353406
std       42.614469
min        0.890157
25%        5.526795
50%       12.887070
75%       91.318100
max      100.000000
Name: Value, dtype: float64

In [184]:
# filter on relevant years
df = df[(df['Time']==2017)&(df['Information and communication technologies']=='ICT-intensive')]

# convert rank into 1-6 - in order to get most recent data 
df['data_col'] = df['Value'] 

# going to assume index is between 1-100 but not 100% sure
min_rank = 0 #df['data_col'].min()
max_rank = 100 #df['data_col'].max()

# transform 1-147 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# need to invert score since higher rank is not better 
# df['new_rank_score'] = (6-df['new_rank_score'])+1

# df_rank[['Country ISO3', 'Country Name','Indicator','data_col','new_rank_score']].head()

df.rename(columns={'Country':'Country Name'}, inplace=True)
df['Indicator'] = indicator
df['higher_is_better'] = True

df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(15)


# output scores to csv
df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/{}_scores.csv'.format(bf), index=False)

## 9. Share of business with internet

In [350]:
indicator = indicators[8]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Share of business with internet
%_business_internet


FileNotFoundError: [Errno 2] No such file or directory: '../../processed/%_business_internet.csv'

## 10. Share of businesses with broadband

In [196]:
indicator = indicators[9]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Share of businesses with broadband
%_business_broadband


FileNotFoundError: [Errno 2] No such file or directory: '../../processed/%_business_broadband.csv'

No data found

## 11. Share of businesses with online presence

In [191]:
indicator = indicators[10]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Share of businesses with online presence
share_of_businesses_online_presence


In [194]:
df['Unnamed: 1'].unique()

array([nan])

 No data in the file

## 12. Share of businesses with online presence

In [200]:
indicator = indicators[11]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Size of gig economy (% of GDP)
prevalance_gig_economy


In [202]:
df.iloc[0,0]

'Average answer to the question: In your country, to what extent is the online gig economy prevalent?\n[1 = Not at all; 7 = To a great extent] | 2018–19'

In [204]:
df.head()

Unnamed: 0,Prevalence of gig economy,Unnamed: 1
0,Average answer to the question: In your countr...,
1,RANK,SCORE
2,1,100
3,2,94.63
4,3,93.8


### No country mapping

## 13. Size of digital economy (% of transactions)


In [210]:
indicator = indicators[12]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Size of digital economy (% of transactions)
size_digital_economy


In [211]:
df


Unnamed: 0,Proportion of medium and high-tech industry value added in total value added (%) | 2016,Unnamed: 1
0,1.0,100.00
1,2.0,82.59
2,3.0,81.42
3,4.0,78.58
4,5.0,76.36
...,...,...
129,,
130,,
131,,
132,,


No Country Data

## 14. Venture Capital Availability


In [214]:
indicator = indicators[13]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Venture Capital Availability
venture_cap_avail


In [215]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70157 entries, 0 to 70156
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   2017               54355 non-null  float64
 1   2018               55681 non-null  float64
 2   2019               55190 non-null  float64
 3   Country ISO3       70157 non-null  object 
 4   Country Name       70157 non-null  object 
 5   Indicator Id       70157 non-null  int64  
 6   Indicator          70157 non-null  object 
 7   Subindicator Type  70157 non-null  object 
 8   2007-2008          9112 non-null   float64
 9   2008-2009          9658 non-null   float64
 10  2009-2010          9658 non-null   float64
 11  2010-2011          10686 non-null  float64
 12  2011-2012          10917 non-null  float64
 13  2012-2013          11071 non-null  float64
 14  2013-2014          12270 non-null  float64
 15  2014-2015          11588 non-null  float64
 16  2015-2016          112

#### What fields should we use?

In [216]:
df.Indicator.unique()  
# relavent indicator is:        'Venture capital availability, 1-7 (best)', id 529

array(['GDP (PPP) as % of world total',
       'Domestic market size index, 1-7 (best)',
       'Foreign market size index, 1-7 (best)',
       'Exports as a percentage of GDP',
       'Availability of financial services, 1-7 (best)',
       'Affordability of financial services, 1-7 (best)',
       'Financing through local equity market, 1-7 (best)',
       'Ease of access to loans, 1-7 (best)',
       'Venture capital availability, 1-7 (best)',
       'Soundness of banks, 1-7 (best)',
       'Women in labor force, ratio to men',
       'Value chain breadth, 1-7 (best)',
       'Production process sophistication, 1-7 (best)',
       'Reliance on professional management, 1-7 (best)',
       'Country capacity to retain talent, 1-7 (best)',
       'Country capacity to attract talent, 1-7 (best)',
       'State of cluster development, 1-7 (best)',
       'Malaria cases/100,000 pop.', 'Tuberculosis cases/100,000 pop.',
       'Degree of customer orientation, 1-7 (best)',
       'Buyer sophi

In [217]:
df[df.Indicator=='Venture capital availability, 1-7 (best)']

# there are two indicator subtypes: Index and Rank
df[df.Indicator=='Venture capital availability, 1-7 (best)']['Subindicator Type'].unique()


array(['Index 1-7 (best)', 'Rank'], dtype=object)

In [238]:
df['data_col'] = df['2019']#.apply(lambda row: convert_rank(row))

In [239]:
# create two data frames for each subtype
df_rank = df[(df.Indicator=='Venture capital availability, 1-7 (best)')&(df['Subindicator Type']=='Rank')]
df_index = df[(df.Indicator=='Venture capital availability, 1-7 (best)')&(df['Subindicator Type']=='Index 1-7 (best)')]

In [219]:
df_index.head()

Unnamed: 0,2017,2018,2019,Country ISO3,Country Name,Indicator Id,Indicator,Subindicator Type,2007-2008,2008-2009,2009-2010,2010-2011,2011-2012,2012-2013,2013-2014,2014-2015,2015-2016,2016-2017,2017-2018
16,,,,AGO,Angola,529,"Venture capital availability, 1-7 (best)",Index 1-7 (best),,,,1.802891,1.494084,,2.117647,2.34817,,,
510,,,,ALB,Albania,529,"Venture capital availability, 1-7 (best)",Index 1-7 (best),2.665708,2.524466,2.321898,2.145235,1.956589,1.844924,1.859132,1.948528,1.898624,2.11548,2.482504
1011,,,,ARE,United Arab Emirates,529,"Venture capital availability, 1-7 (best)",Index 1-7 (best),4.544287,4.296279,3.881013,3.716301,3.973436,4.143612,4.123291,4.352251,4.352251,4.47605,4.574773
1509,,,,ARG,Argentina,529,"Venture capital availability, 1-7 (best)",Index 1-7 (best),2.839896,2.425063,2.131687,1.897883,1.909561,1.82237,1.745999,1.779793,2.009052,2.151421,2.243158
2014,,,,ARM,Armenia,529,"Venture capital availability, 1-7 (best)",Index 1-7 (best),2.123574,1.999757,1.921578,1.789272,2.136096,2.369044,2.425793,2.355755,2.537145,2.796361,2.78055


In [220]:
df_rank.head()

Unnamed: 0,2017,2018,2019,Country ISO3,Country Name,Indicator Id,Indicator,Subindicator Type,2007-2008,2008-2009,2009-2010,2010-2011,2011-2012,2012-2013,2013-2014,2014-2015,2015-2016,2016-2017,2017-2018
17,,140.0,141.0,AGO,Angola,530,"Venture capital availability, 1-7 (best)",Rank,,,,129.0,140.0,,119.0,98.0,,,
511,102.0,88.0,81.0,ALB,Albania,530,"Venture capital availability, 1-7 (best)",Rank,95.0,101.0,101.0,107.0,124.0,132.0,136.0,128.0,133.0,124.0,103.0
1012,17.0,9.0,4.0,ARE,United Arab Emirates,530,"Venture capital availability, 1-7 (best)",Rank,16.0,17.0,13.0,16.0,13.0,8.0,10.0,4.0,7.0,7.0,7.0
1510,116.0,104.0,118.0,ARG,Argentina,530,"Venture capital availability, 1-7 (best)",Rank,83.0,108.0,117.0,124.0,129.0,135.0,142.0,138.0,126.0,120.0,120.0
2015,74.0,72.0,56.0,ARM,Armenia,530,"Venture capital availability, 1-7 (best)",Rank,124.0,130.0,129.0,131.0,109.0,89.0,93.0,96.0,90.0,73.0,76.0


In [221]:
# 152 countries in data
df_rank['Country Name'].nunique()

152

In [222]:
# list of columns with data
value_cols = ['2007-2008', '2008-2009', '2009-2010',
       '2010-2011', '2011-2012', '2012-2013', '2013-2014', '2014-2015',
       '2015-2016', '2016-2017', '2017-2018', '2017', '2018', '2019']

In [223]:
# quick snapshot of dataframes
df_rank[value_cols].describe()

Unnamed: 0,2007-2008,2008-2009,2009-2010,2010-2011,2011-2012,2012-2013,2013-2014,2014-2015,2015-2016,2016-2017,2017-2018,2017,2018,2019
count,127.0,131.0,131.0,138.0,141.0,143.0,147.0,143.0,139.0,137.0,136.0,130.0,134.0,137.0
mean,65.259843,67.038168,67.053435,70.427536,71.900709,72.944056,74.945578,72.902098,70.920863,69.912409,69.375,67.638462,69.5,70.262774
std,37.6275,38.33745,38.215646,40.098763,41.003537,41.51697,42.669289,41.578928,40.397314,39.832272,39.594975,39.205087,40.282601,40.255849
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,33.5,34.5,34.5,36.25,37.0,37.5,38.5,37.5,36.5,36.0,35.75,34.25,35.25,36.0
50%,65.0,67.0,67.0,70.5,72.0,73.0,75.0,73.0,71.0,70.0,69.5,66.5,68.5,70.0
75%,97.5,99.5,99.5,104.75,107.0,108.5,111.5,108.5,105.5,104.0,103.25,101.75,102.75,104.0
max,130.0,134.0,133.0,139.0,142.0,144.0,148.0,144.0,140.0,138.0,137.0,135.0,140.0,141.0


In [224]:
# index values have no 2019 data
df_index[value_cols].describe()

Unnamed: 0,2007-2008,2008-2009,2009-2010,2010-2011,2011-2012,2012-2013,2013-2014,2014-2015,2015-2016,2016-2017,2017-2018,2017,2018,2019
count,127.0,131.0,131.0,138.0,141.0,143.0,147.0,143.0,139.0,137.0,136.0,0.0,0.0,0.0
mean,3.288489,3.164625,2.869158,2.659362,2.687949,2.692248,2.705923,2.754663,2.839938,2.939999,3.002545,,,
std,0.899434,0.82637,0.674535,0.680454,0.745284,0.713336,0.693771,0.712325,0.710434,0.740065,0.785252,,,
min,1.869061,1.500274,1.487119,1.488852,1.420704,1.494243,1.466607,1.474669,1.54505,1.643615,1.621595,,,
25%,2.631797,2.529897,2.361602,2.15608,2.147166,2.193814,2.18349,2.215631,2.328283,2.352083,2.480248,,,
50%,3.11007,3.003566,2.733178,2.498967,2.573486,2.513357,2.576551,2.679087,2.715083,2.833833,2.847194,,,
75%,3.918566,3.792939,3.36104,3.080125,3.015231,3.038222,3.153995,3.172582,3.243787,3.290093,3.435095,,,
max,5.278317,5.127621,4.610427,4.441749,5.389427,4.684038,4.572578,4.775331,5.079046,4.940222,5.244993,,,


Two datasets, 1 rank (1-140ish per year), 1 index (1-7).  
Index is normally prefered but doesn't have data for 2019.  
Will use rank for this reason. Transformation should be the approx the same.

In [296]:
df_rank.data_col.describe()

count    137.000000
mean      70.262774
std       40.255849
min        1.000000
25%       36.000000
50%       70.000000
75%      104.000000
max      141.000000
Name: data_col, dtype: float64

In [240]:
# transform 1-141 rank into 1-6
min_rank = df_rank.data_col.min()
max_rank = df_rank.data_col.max()
df_rank['new_rank_score'] = df_rank['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [241]:
# need to invert score since higher rank is not better 
df_rank['new_rank_score'] = (6-df_rank['new_rank_score'])+1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


###### Going to test how rank and index conversions compare.  

Unlike index scores, Rank scores don't necessarily have equal weightings as you go up. E.g. the difference betwee ranks 15-16 is not the same as ranks 17-18. But for our conversion we will have to treat them as equal, so I am keen to see how this affects results compared to the index scores.  
To do this I construct a simple test example of an older data column 2007-2008

In [231]:
df_rank['2007-2008'].describe()

count    127.000000
mean      65.259843
std       37.627500
min        1.000000
25%       33.500000
50%       65.000000
75%       97.500000
max      130.000000
Name: 2007-2008, dtype: float64

In [246]:
# convert rank into 1-6
df_rank['test0'] = df_rank['2007-2008'].apply(lambda row: convert_rank(row, 1, 130))
df_rank['test0'] = (6-df_rank['test0'])+1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [247]:
df_index['test0'] = df_index['2007-2008'].apply(lambda row: convert_rank(row, 1, 7))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [248]:
new_df = df_index[['test0','Country Name']].merge(df_rank[['test0','Country Name']], suffixes=['index','rank'],on='Country Name')

In [249]:
new_df.corr()

Unnamed: 0,test0index,test0rank
test0index,1.0,0.978224
test0rank,0.978224,1.0


The two are correlated enough to not dramatically impact results. But could be worth revising in the future when there is more recent index data.

#### Prepare Output

In [250]:
df_rank[['Country ISO3', 'Country Name','Indicator','data_col','new_rank_score']].head()

df_rank.rename(columns={'Country ISO3':'Country Code'}, inplace=True)

df_rank['higher_is_better'] = True

df_rank[['Country Code', 'Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(15)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,Country Code,Country Name,Indicator,data_col,new_rank_score,higher_is_better
17,AGO,Angola,"Venture capital availability, 1-7 (best)",141.0,1.0,True
511,ALB,Albania,"Venture capital availability, 1-7 (best)",81.0,3.142857,True
1012,ARE,United Arab Emirates,"Venture capital availability, 1-7 (best)",4.0,5.892857,True
1510,ARG,Argentina,"Venture capital availability, 1-7 (best)",118.0,1.821429,True
2015,ARM,Armenia,"Venture capital availability, 1-7 (best)",56.0,4.035714,True
2507,AUS,Australia,"Venture capital availability, 1-7 (best)",46.0,4.392857,True
3012,AUT,Austria,"Venture capital availability, 1-7 (best)",38.0,4.678571,True
3510,AZE,Azerbaijan,"Venture capital availability, 1-7 (best)",24.0,5.178571,True
4004,BDI,Burundi,"Venture capital availability, 1-7 (best)",78.0,3.25,True
4494,BEL,Belgium,"Venture capital availability, 1-7 (best)",31.0,4.928571,True


In [251]:

# output scores to csv
df_rank[['Country Code', 'Country Name','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/{}_scores.csv'.format(bf), index=False)

### 15. Legal Rights Strength

In [252]:
indicator = indicators[14]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Strength of Legal Rights
legal_rights_strength


In [253]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271 entries, 0 to 270
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Series Name    268 non-null    object
 1   Series Code    266 non-null    object
 2   Country Name   266 non-null    object
 3   Country Code   266 non-null    object
 4   1990 [YR1990]  266 non-null    object
 5   2000 [YR2000]  266 non-null    object
 6   2011 [YR2011]  266 non-null    object
 7   2012 [YR2012]  266 non-null    object
 8   2013 [YR2013]  266 non-null    object
 9   2014 [YR2014]  266 non-null    object
 10  2015 [YR2015]  266 non-null    object
 11  2016 [YR2016]  266 non-null    object
 12  2017 [YR2017]  266 non-null    object
 13  2018 [YR2018]  266 non-null    object
 14  2019 [YR2019]  266 non-null    object
 15  2020 [YR2020]  266 non-null    object
dtypes: object(16)
memory usage: 34.0+ KB


#### Find Relevant Columns

In [254]:
df['Series Name'].unique()

array(['Strength of legal rights index (0=weak to 12=strong)', nan,
       'Data from database: World Development Indicators',
       'Last Updated: 06/30/2021'], dtype=object)

In [255]:
# remove unwanted rows
df = df[~df['Series Code'].isna()]
df = df.replace('..', np.nan)

In [256]:
df.loc[0][0]

'Strength of legal rights index (0=weak to 12=strong)'

In [257]:
df.head()

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,1990 [YR1990],2000 [YR2000],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020]
0,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Afghanistan,AFG,,,,,9.0,9.0,9.0,9.0,9.0,10.0,10.0,
1,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Albania,ALB,,,,,7.0,6.0,6.0,6.0,8.0,8.0,8.0,
2,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Algeria,DZA,,,,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,
3,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,American Samoa,ASM,,,,,,,,,,,,
4,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Andorra,AND,,,,,,,,,,,,


In [258]:
df.columns

Index(['Series Name', 'Series Code', 'Country Name', 'Country Code',
       '1990 [YR1990]', '2000 [YR2000]', '2011 [YR2011]', '2012 [YR2012]',
       '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2016 [YR2016]',
       '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]', '2020 [YR2020]'],
      dtype='object')

In [260]:
# clean data
df = df.replace('..', np.nan)

df[['1990 [YR1990]', '2000 [YR2000]', '2011 [YR2011]', '2012 [YR2012]',
       '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2016 [YR2016]',
       '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]', '2020 [YR2020]']] = df[['1990 [YR1990]', '2000 [YR2000]', '2011 [YR2011]', '2012 [YR2012]',
       '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2016 [YR2016]',
       '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]', '2020 [YR2020]']].astype(float)

In [261]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266 entries, 0 to 265
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Series Name    266 non-null    object 
 1   Series Code    266 non-null    object 
 2   Country Name   266 non-null    object 
 3   Country Code   266 non-null    object 
 4   1990 [YR1990]  0 non-null      float64
 5   2000 [YR2000]  0 non-null      float64
 6   2011 [YR2011]  0 non-null      float64
 7   2012 [YR2012]  0 non-null      float64
 8   2013 [YR2013]  236 non-null    float64
 9   2014 [YR2014]  236 non-null    float64
 10  2015 [YR2015]  237 non-null    float64
 11  2016 [YR2016]  237 non-null    float64
 12  2017 [YR2017]  237 non-null    float64
 13  2018 [YR2018]  238 non-null    float64
 14  2019 [YR2019]  238 non-null    float64
 15  2020 [YR2020]  0 non-null      float64
dtypes: float64(12), object(4)
memory usage: 43.4+ KB


In [262]:
# keep only most recent value
df = df[['Series Name', 'Series Code', 'Country Name', 'Country Code','2019 [YR2019]']]

df['higher_is_better'] = True
df['Indicator'] = df['Series Name']
df['data_col'] = df['2019 [YR2019]']


#### Convert Scales

In [263]:
# convert 0-12 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=0,old_max=12))

In [264]:
df.head(16)

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2019 [YR2019],higher_is_better,Indicator,data_col,new_rank_score
0,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Afghanistan,AFG,10.0,True,Strength of legal rights index (0=weak to 12=s...,10.0,5.166667
1,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Albania,ALB,8.0,True,Strength of legal rights index (0=weak to 12=s...,8.0,4.333333
2,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Algeria,DZA,2.0,True,Strength of legal rights index (0=weak to 12=s...,2.0,1.833333
3,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,American Samoa,ASM,,True,Strength of legal rights index (0=weak to 12=s...,,
4,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Andorra,AND,,True,Strength of legal rights index (0=weak to 12=s...,,
5,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Angola,AGO,1.0,True,Strength of legal rights index (0=weak to 12=s...,1.0,1.416667
6,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Antigua and Barbuda,ATG,5.0,True,Strength of legal rights index (0=weak to 12=s...,5.0,3.083333
7,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Argentina,ARG,2.0,True,Strength of legal rights index (0=weak to 12=s...,2.0,1.833333
8,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Armenia,ARM,6.0,True,Strength of legal rights index (0=weak to 12=s...,6.0,3.5
9,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Aruba,ABW,,True,Strength of legal rights index (0=weak to 12=s...,,


In [95]:
df.columns

Index(['Series Name', 'Series Code', 'Country Name', 'Country Code',
       '2019 [YR2019]', 'higher_is_better', 'Indicator', 'data_col',
       'new_rank_score'],
      dtype='object')

#### Prepare Output

In [265]:
df = df[['Country Name', 'Country Code','Indicator', 'data_col', 'new_rank_score','higher_is_better']]
df.sort_values(by='new_rank_score', ascending=False).head(16)

Unnamed: 0,Country Name,Country Code,Indicator,data_col,new_rank_score,higher_is_better
12,Azerbaijan,AZE,Strength of legal rights index (0=weak to 12=s...,12.0,6.0,True
132,Montenegro,MNE,Strength of legal rights index (0=weak to 12=s...,12.0,6.0,True
141,New Zealand,NZL,Strength of legal rights index (0=weak to 12=s...,12.0,6.0,True
158,Puerto Rico,PRI,Strength of legal rights index (0=weak to 12=s...,12.0,6.0,True
28,Brunei Darussalam,BRN,Strength of legal rights index (0=weak to 12=s...,12.0,6.0,True
119,Malawi,MWI,Strength of legal rights index (0=weak to 12=s...,11.0,5.583333,True
105,Kosovo,XKX,Strength of legal rights index (0=weak to 12=s...,11.0,5.583333,True
101,Kenya,KEN,Strength of legal rights index (0=weak to 12=s...,11.0,5.583333,True
162,Rwanda,RWA,Strength of legal rights index (0=weak to 12=s...,11.0,5.583333,True
42,Colombia,COL,Strength of legal rights index (0=weak to 12=s...,11.0,5.583333,True


In [266]:
# output scores
df.to_csv('../indicator_scores/{}_scores.csv'.format(bf), index=False)

### 16. Time to start business


#### Load Data

In [268]:
indicator = indicators[15]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Time to Start a Business
time_start_bus


In [269]:
# remove unwanted rows
df = df[~df['Series Code'].isna()]
df = df.replace('..', np.nan)

In [270]:
df[['1990 [YR1990]', '2000 [YR2000]', '2011 [YR2011]', '2012 [YR2012]',
       '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2016 [YR2016]',
       '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]', '2020 [YR2020]']] = df[['1990 [YR1990]', '2000 [YR2000]', '2011 [YR2011]', '2012 [YR2012]',
       '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2016 [YR2016]',
       '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]', '2020 [YR2020]']].astype(float)

In [271]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266 entries, 0 to 265
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Series Name    266 non-null    object 
 1   Series Code    266 non-null    object 
 2   Country Name   266 non-null    object 
 3   Country Code   266 non-null    object 
 4   1990 [YR1990]  0 non-null      float64
 5   2000 [YR2000]  0 non-null      float64
 6   2011 [YR2011]  221 non-null    float64
 7   2012 [YR2012]  225 non-null    float64
 8   2013 [YR2013]  236 non-null    float64
 9   2014 [YR2014]  236 non-null    float64
 10  2015 [YR2015]  237 non-null    float64
 11  2016 [YR2016]  237 non-null    float64
 12  2017 [YR2017]  237 non-null    float64
 13  2018 [YR2018]  238 non-null    float64
 14  2019 [YR2019]  238 non-null    float64
 15  2020 [YR2020]  0 non-null      float64
dtypes: float64(12), object(4)
memory usage: 35.3+ KB


In [272]:
# keep only most recent value
df = df[['Series Name', 'Series Code', 'Country Name', 'Country Code','2019 [YR2019]']]

df['higher_is_better'] = True
df['Indicator'] = df['Series Name']
df['data_col'] = df['2019 [YR2019]']


In [273]:
df.head()

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2019 [YR2019],higher_is_better,Indicator,data_col
0,Time required to start a business (days),IC.REG.DURS,Afghanistan,AFG,8.5,True,Time required to start a business (days),8.5
1,Time required to start a business (days),IC.REG.DURS,Albania,ALB,4.5,True,Time required to start a business (days),4.5
2,Time required to start a business (days),IC.REG.DURS,Algeria,DZA,18.0,True,Time required to start a business (days),18.0
3,Time required to start a business (days),IC.REG.DURS,American Samoa,ASM,,True,Time required to start a business (days),
4,Time required to start a business (days),IC.REG.DURS,Andorra,AND,,True,Time required to start a business (days),


In [274]:
def map_days_to_scores(number):
    if number<=2:
        return 4
    if number <6 and number>2:
        return 3
    elif number >=6 and number <11:
        return 2
    elif number >=11:
        return 1

In [275]:
# map days to scores 
df['data_col'] = df['data_col'].apply(map_days_to_scores)

In [276]:
# convert 1-3 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=1,old_max=4))

In [277]:
df.head()

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2019 [YR2019],higher_is_better,Indicator,data_col,new_rank_score
0,Time required to start a business (days),IC.REG.DURS,Afghanistan,AFG,8.5,True,Time required to start a business (days),2.0,2.666667
1,Time required to start a business (days),IC.REG.DURS,Albania,ALB,4.5,True,Time required to start a business (days),3.0,4.333333
2,Time required to start a business (days),IC.REG.DURS,Algeria,DZA,18.0,True,Time required to start a business (days),1.0,1.0
3,Time required to start a business (days),IC.REG.DURS,American Samoa,ASM,,True,Time required to start a business (days),,
4,Time required to start a business (days),IC.REG.DURS,Andorra,AND,,True,Time required to start a business (days),,


In [278]:
df[df['2019 [YR2019]']<3]

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2019 [YR2019],higher_is_better,Indicator,data_col,new_rank_score
10,Time required to start a business (days),IC.REG.DURS,Australia,AUS,2.0,True,Time required to start a business (days),4.0,6.0
35,Time required to start a business (days),IC.REG.DURS,Canada,CAN,1.5,True,Time required to start a business (days),4.0,6.0
72,Time required to start a business (days),IC.REG.DURS,Georgia,GEO,1.0,True,Time required to start a business (days),4.0,6.0
86,Time required to start a business (days),IC.REG.DURS,"Hong Kong SAR, China",HKG,1.5,True,Time required to start a business (days),4.0,6.0
141,Time required to start a business (days),IC.REG.DURS,New Zealand,NZL,0.5,True,Time required to start a business (days),4.0,6.0
171,Time required to start a business (days),IC.REG.DURS,Singapore,SGP,1.5,True,Time required to start a business (days),4.0,6.0
194,Time required to start a business (days),IC.REG.DURS,Togo,TGO,2.5,True,Time required to start a business (days),3.0,4.333333
251,Time required to start a business (days),IC.REG.DURS,North America,NAC,2.85,True,Time required to start a business (days),3.0,4.333333


In [279]:
df['2019 [YR2019]'].describe()

count    238.000000
mean      19.771819
std       23.169794
min        0.500000
25%        8.075000
50%       14.000000
75%       22.991844
max      230.000000
Name: 2019 [YR2019], dtype: float64

In [280]:
df = df[['Country Name', 'Country Code','Indicator', 'data_col', 'new_rank_score','higher_is_better']]
df.sort_values(by='new_rank_score', ascending=False).head(16)

Unnamed: 0,Country Name,Country Code,Indicator,data_col,new_rank_score,higher_is_better
171,Singapore,SGP,Time required to start a business (days),4.0,6.0,True
141,New Zealand,NZL,Time required to start a business (days),4.0,6.0,True
10,Australia,AUS,Time required to start a business (days),4.0,6.0,True
86,"Hong Kong SAR, China",HKG,Time required to start a business (days),4.0,6.0,True
35,Canada,CAN,Time required to start a business (days),4.0,6.0,True
72,Georgia,GEO,Time required to start a business (days),4.0,6.0,True
1,Albania,ALB,Time required to start a business (days),3.0,4.333333,True
205,United Kingdom,GBR,Time required to start a business (days),3.0,4.333333,True
31,Burundi,BDI,Time required to start a business (days),3.0,4.333333,True
62,Estonia,EST,Time required to start a business (days),3.0,4.333333,True


#### Prepare Output

In [281]:
bf

'time_start_bus'

In [282]:
# output scores
df.to_csv('../indicator_scores/{}_scores.csv'.format(bf), index=False)

### 17. Ease doing business


#### Load Data

In [318]:
indicator = indicators[16]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Ease of Doing Business
ease_doing_bus


In [319]:
# remove unwanted rows
df = df.replace('..', np.nan)
df = df[~df['Series Code'].isna()]


In [320]:
df['Series Name'].unique()

array(['Ease of doing business index (1=most business-friendly regulations)'],
      dtype=object)

In [286]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266 entries, 0 to 265
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Series Name    266 non-null    object 
 1   Series Code    266 non-null    object 
 2   Country Name   266 non-null    object 
 3   Country Code   266 non-null    object 
 4   1990 [YR1990]  0 non-null      float64
 5   2000 [YR2000]  0 non-null      float64
 6   2011 [YR2011]  0 non-null      float64
 7   2012 [YR2012]  0 non-null      float64
 8   2013 [YR2013]  0 non-null      float64
 9   2014 [YR2014]  0 non-null      float64
 10  2015 [YR2015]  0 non-null      float64
 11  2016 [YR2016]  0 non-null      float64
 12  2017 [YR2017]  0 non-null      float64
 13  2018 [YR2018]  0 non-null      float64
 14  2019 [YR2019]  189 non-null    object 
 15  2020 [YR2020]  0 non-null      float64
dtypes: float64(11), object(5)
memory usage: 35.3+ KB


In [287]:
# keep only most recent value
df = df[['Series Name', 'Series Code', 'Country Name', 'Country Code','2019 [YR2019]']]

# df['higher_is_better'] = False
df['Indicator'] = df['Series Name']
df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df['2019 [YR2019]']
# df['data_norm'] = df['data_norm'] = (df.data_col - df.data_col.mean())/df.data_col.std()

In [290]:
rank_min = df.data_col.min()
rank_max = df.data_col.max()

In [292]:
rank_min, rank_max

(1.0, 190.0)

In [293]:
# convert 1-190 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=rank_min,old_max=rank_max))

In [294]:
# invert since to put as higher is  better
df['new_rank_score'] = df['new_rank_score'].apply(lambda row: (6-row)+1)

In [295]:
df.sort_values(by='new_rank_score', ascending=False).head(16)

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2019 [YR2019],Indicator,data_col,new_rank_score
141,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,New Zealand,NZL,1.0,Ease of doing business index (1=most business-...,1.0,6.0
171,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Singapore,SGP,2.0,Ease of doing business index (1=most business-...,2.0,5.973545
86,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,"Hong Kong SAR, China",HKG,3.0,Ease of doing business index (1=most business-...,3.0,5.94709
53,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Denmark,DNK,4.0,Ease of doing business index (1=most business-...,4.0,5.920635
104,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,"Korea, Rep.",KOR,5.0,Ease of doing business index (1=most business-...,5.0,5.89418
206,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,United States,USA,6.0,Ease of doing business index (1=most business-...,6.0,5.867725
72,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Georgia,GEO,7.0,Ease of doing business index (1=most business-...,7.0,5.84127
205,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,United Kingdom,GBR,8.0,Ease of doing business index (1=most business-...,8.0,5.814815
147,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Norway,NOR,9.0,Ease of doing business index (1=most business-...,9.0,5.78836
187,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Sweden,SWE,10.0,Ease of doing business index (1=most business-...,10.0,5.761905


In [296]:
df['higher_is_better'] = True


#### Prepare Output

In [297]:
df = df[['Country Name', 'Country Code','Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# output scores
df.to_csv('../indicator_scores/{}_scores.csv'.format(bf), index=False)

## 18. Ease of finding skilled employees

In [351]:
indicator = indicators[17]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Ease of finding skilled employees
ease_of_finding_skilled_employees


In [352]:
df.head()

Unnamed: 0,2017,2018,2019,Country,Unnamed: 4
0,3.88,4.03,3.89,Albania,
1,3.98,3.84,4.12,Algeria,
2,No data,2.08,2.76,Angola,
3,4.35,4.1,4.19,Argentina,
4,3.82,3.87,4.03,Armenia,


In [303]:
values = ['2017','2018','2019']

In [305]:
df = df.replace('No data', np.nan)

In [306]:
df[values] = df[values].astype(float)

In [313]:
df.head()

Unnamed: 0,2017,2018,2019,Country Name,Unnamed: 4,data_col,higher_is_better,Indicator,new_rank_score
0,3.88,4.03,3.89,Albania,,3.89,True,Ease of finding skilled employees,3.89
1,3.98,3.84,4.12,Algeria,,4.12,True,Ease of finding skilled employees,4.12
2,,2.08,2.76,Angola,,2.76,True,Ease of finding skilled employees,2.76
3,4.35,4.1,4.19,Argentina,,4.19,True,Ease of finding skilled employees,4.19
4,3.82,3.87,4.03,Armenia,,4.03,True,Ease of finding skilled employees,4.03


In [307]:
df[values].describe()

Unnamed: 0,2017,2018,2019
count,132.0,136.0,137.0
mean,4.183258,4.136176,4.196058
std,0.664458,0.659186,0.589124
min,2.72,2.08,2.76
25%,3.685,3.6375,3.84
50%,4.065,4.095,4.17
75%,4.6925,4.655,4.63
max,5.67,5.75,5.32


In [312]:
# create standard columns
df['data_col'] = df['2019']
df['new_rank_score'] = df['data_col']
df['higher_is_better'] = True
df['Indicator'] = indicator


df.rename(columns={'Country':'Country Name'}, inplace=True)


df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(15)


# output scores to csv
df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/{}_scores.csv'.format(bf), index=False)

## 19. Amount invested into startups yearly from private, public, blended sources (respectively)


In [314]:
indicator = indicators[18]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Amount invested into startups yearly from private, public, blended sources (respectively)
start_up_investment


FileNotFoundError: [Errno 2] No such file or directory: '../../processed/start_up_investment.csv'

No data

## 20. Doing Business Index


In [315]:
indicator = indicators[19]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Doing Business Index
doing_bus_idx


In [317]:
bnames

Unnamed: 0,Indicator,check,Data Source,Index,Filename
75,UNCTAD Business-to-Consumer (B2C) E-commerce I...,Business,UNCTAD/World Bank,True,b2c_ecommerse_idx
76,"Networking Services (Spend, IT Forecast Data)",Business,Portulans Institute,True,network_readiness_index
77,ICT goods exports,Business,UNCTAD,False,ict_goods
78,ICT goods imports,Business,UNCTAD,False,ict_goods
79,ICT service exports,Business,UNCTAD,False,ict_services
80,ICT service imports,Business,UNCTAD,False,ict_services
81,"Cloud Services (Spend, IT Forecast Data)",Business,Statista/Business Software Alliance,True,cloud_services
82,ICT task-intensive jobs as a percentage of tot...,Business,OECD,False,ICT_proportion
85,Share of business with internet,Business,OECD,False,%_business_internet
86,Share of businesses with broadband,Business,World Bank,False,%_business_broadband


In [316]:
df

Unnamed: 0,Country code,Economy,Region,Income group,DB Year,Ease of doing business rank (DB19),Ease of doing business score (DB17-19 methodology),Ease of doing business score (DB16 methodology),Ease of doing business score (DB15 methodology),Ease of doing business score (DB10-14 methodology),...,Recovery rate (cents on the dollar),Strength of insolvency framework index (0-16) (DB15-19 methodology),Commencement of proceedings index (0-3) (DB15-19 methodology),Management of debtor's assets index (0-6) (DB15-19 methodology),Reorganization proceedings index (0-3) (DB15-19 methodology),Creditor participation index (0-4) (DB15-19 methodology),Score-Recovery rate (cents on the dollar),Score-Strength of insolvency framework index (0-16) (DB15-19 methodology),data_country,data_year
0,AFG,Afghanistan,South Asia,Low income,2005,,,,,,...,0.0,,,,,,0.00,,,
1,AFG,Afghanistan,South Asia,Low income,2006,,,,,,...,0.0,,,,,,0.00,,,
2,AFG,Afghanistan,South Asia,Low income,2007,,,,,,...,0.0,,,,,,0.00,,,
3,AFG,Afghanistan,South Asia,Low income,2008,,,,,,...,25.1,,,,,,27.07,,,
4,AFG,Afghanistan,South Asia,Low income,2009,,,,,,...,24.3,,,,,,26.16,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3020,ZWE,Zimbabwe,Sub-Saharan Africa,Low income,2015,,,47.11,44.36,,...,13.8,5.0,3.0,2.0,0.0,0.0,14.81,31.25,,
3021,ZWE,Zimbabwe,Sub-Saharan Africa,Low income,2016,,47.74,47.94,,,...,16.1,5.0,3.0,2.0,0.0,0.0,17.38,31.25,,
3022,ZWE,Zimbabwe,Sub-Saharan Africa,Low income,2017,,47.73,,,,...,18.0,5.0,3.0,2.0,0.0,0.0,19.43,31.25,,
3023,ZWE,Zimbabwe,Sub-Saharan Africa,Low income,2018,,48.52,,,,...,19.7,5.0,3.0,2.0,0.0,0.0,21.17,31.25,,


### Score Aggregating

In [321]:
import os


In [353]:
# get list of files in scores folder
scores = os.listdir('../indicator_scores/')

In [354]:
scores

['b2c_ecommerse_idx_scores.csv',
 'cloud_services_scores.csv',
 'ease_doing_bus_scores.csv',
 'ease_of_finding_skilled_employees_scores.csv',
 'ict_goods_exports_scores.csv',
 'ict_goods_imports_scores.csv',
 'ICT_proportion_scores.csv',
 'ict_services_exports_scores.csv',
 'ict_services_imports_scores.csv',
 'legal_rights_strength_scores.csv',
 'network_readiness_index_scores.csv',
 'time_start_bus_scores.csv',
 'venture_cap_avail_scores.csv']

In [355]:
# create a dataframe that concatenates all these file into one table
df = pd.concat([pd.read_csv('../indicator_scores/{}'.format(s)) for s in scores])    

In [356]:
df

Unnamed: 0,Country Code,Country Name,Indicator,data_col,new_rank_score,higher_is_better
0,AFG,Afghanistan,Country rank and value in the UNCTAD B2C E-com...,132.0,1.419580,True
1,AGO,Angola,Country rank and value in the UNCTAD B2C E-com...,113.0,2.083916,True
2,ALB,Albania,Country rank and value in the UNCTAD B2C E-com...,59.0,3.972028,True
3,ARE,United Arab Emirates,Country rank and value in the UNCTAD B2C E-com...,23.0,5.230769,True
4,ARG,Argentina,Country rank and value in the UNCTAD B2C E-com...,81.0,3.202797,True
...,...,...,...,...,...,...
147,VNM,Vietnam,"Venture capital availability, 1-7 (best)",61.0,3.857143,True
148,YEM,"Yemen, Rep.","Venture capital availability, 1-7 (best)",125.0,1.571429,True
149,ZAF,South Africa,"Venture capital availability, 1-7 (best)",77.0,3.285714,True
150,ZMB,Zambia,"Venture capital availability, 1-7 (best)",139.0,1.071429,True


In [357]:
# Data cleaning
df['new_rank_score'] = df['new_rank_score'].fillna(0)
df.sort_values(by=['Country Name'], ascending=True, inplace=True)
df.reset_index(drop=True, inplace=True)

In [358]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2458 entries, 0 to 2457
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country Code      1097 non-null   object 
 1   Country Name      2458 non-null   object 
 2   Indicator         2458 non-null   object 
 3   data_col          2292 non-null   float64
 4   new_rank_score    2458 non-null   float64
 5   higher_is_better  2458 non-null   bool   
dtypes: bool(1), float64(2), object(3)
memory usage: 98.5+ KB


In [359]:
df

Unnamed: 0,Country Code,Country Name,Indicator,data_col,new_rank_score,higher_is_better
0,,"ACP (African, Caribbean and Pacific Group of S...",Total ICT goods Exports,0.43902,1.021951,True
1,,"ACP (African, Caribbean and Pacific Group of S...",ICT services Exports,3.49129,1.174565,True
2,,"ACP (African, Caribbean and Pacific Group of S...",Total ICT goods Imports,4.83498,1.241749,True
3,,ACP: Africa,ICT services Exports,4.92272,1.246136,True
4,,ACP: Africa,Total ICT goods Exports,0.41851,1.020925,True
...,...,...,...,...,...,...
2453,,Zimbabwe,Total ICT goods Exports,0.03435,1.001718,True
2454,,Zimbabwe,Ease of finding skilled employees,4.38000,4.380000,True
2455,ZWE,Zimbabwe,Strength of legal rights index (0=weak to 12=s...,6.00000,3.500000,True
2456,ZWE,Zimbabwe,"Venture capital availability, 1-7 (best)",130.00000,1.392857,True


In [360]:
df.head(15)

Unnamed: 0,Country Code,Country Name,Indicator,data_col,new_rank_score,higher_is_better
0,,"ACP (African, Caribbean and Pacific Group of S...",Total ICT goods Exports,0.43902,1.021951,True
1,,"ACP (African, Caribbean and Pacific Group of S...",ICT services Exports,3.49129,1.174565,True
2,,"ACP (African, Caribbean and Pacific Group of S...",Total ICT goods Imports,4.83498,1.241749,True
3,,ACP: Africa,ICT services Exports,4.92272,1.246136,True
4,,ACP: Africa,Total ICT goods Exports,0.41851,1.020925,True
5,,ACP: Africa,Total ICT goods Imports,5.01542,1.250771,True
6,,ACP: Caribbean,Total ICT goods Exports,0.31389,1.015694,True
7,,ACP: Caribbean,Total ICT goods Imports,2.40293,1.120146,True
8,,ACP: Caribbean,ICT services Exports,,0.0,True
9,,ACP: Central Africa,Total ICT goods Exports,0.0154,1.00077,True


In [361]:
df.describe()

Unnamed: 0,data_col,new_rank_score
count,2292.0,2458.0
mean,24.072451,2.225221
std,38.19965,1.498046
min,0.0,0.0
25%,3.0,1.139329
50%,6.68932,1.589146
75%,23.61134,3.379406
max,190.0,6.0


In [362]:
# checking country names
sorted(df['Country Name'].unique().tolist())

['ACP (African, Caribbean and Pacific Group of States)',
 'ACP: Africa',
 'ACP: Caribbean',
 'ACP: Central Africa',
 'ACP: Eastern Africa',
 'ACP: LDCs',
 'ACP: LLDCs',
 'ACP: Northern Africa',
 'ACP: Pacific',
 'ACP: Southern Africa',
 'ACP: Western Africa',
 'ACP: islands states',
 'ACS (Association of Caribbean States)',
 'ADB (African Development Bank), developing member countries',
 'ADB: Group A',
 'ADB: Group B',
 'ADB: Group C',
 "ALBA-TCP (Bolivarian Alliance for the Peoples of Our America - Peoples' Trade Treaty)",
 'AMU (Arab Maghreb Union)',
 'APEC (Asia-Pacific Economic Cooperation)',
 'APTA (Asia-Pacific Trade Agreement)',
 'ASEAN (Association of Southeast Asian Nations)',
 'ASEAN (Association of Southeast Asian Nations) plus China, Japan and Republic of Korea',
 'Afghanistan',
 'Africa',
 'Africa Eastern and Southern',
 'Africa Western and Central',
 'Africa excluding South Africa',
 'African Union',
 'Albania',
 'Albania ',
 'Algeria',
 'Algeria ',
 'America',
 'America

In [364]:
# remove trailing whitespaces from country name
df['Country Name'] = df['Country Name'].str.strip()


In [365]:
# checking country names
sorted(df['Country Name'].unique().tolist())

['ACP (African, Caribbean and Pacific Group of States)',
 'ACP: Africa',
 'ACP: Caribbean',
 'ACP: Central Africa',
 'ACP: Eastern Africa',
 'ACP: LDCs',
 'ACP: LLDCs',
 'ACP: Northern Africa',
 'ACP: Pacific',
 'ACP: Southern Africa',
 'ACP: Western Africa',
 'ACP: islands states',
 'ACS (Association of Caribbean States)',
 'ADB (African Development Bank), developing member countries',
 'ADB: Group A',
 'ADB: Group B',
 'ADB: Group C',
 "ALBA-TCP (Bolivarian Alliance for the Peoples of Our America - Peoples' Trade Treaty)",
 'AMU (Arab Maghreb Union)',
 'APEC (Asia-Pacific Economic Cooperation)',
 'APTA (Asia-Pacific Trade Agreement)',
 'ASEAN (Association of Southeast Asian Nations)',
 'ASEAN (Association of Southeast Asian Nations) plus China, Japan and Republic of Korea',
 'Afghanistan',
 'Africa',
 'Africa Eastern and Southern',
 'Africa Western and Central',
 'Africa excluding South Africa',
 'African Union',
 'Albania',
 'Algeria',
 'America',
 'American Samoa',
 'Andorra',
 'An

In [366]:
# average indicator scores per country
agg_df = df.groupby(['Country Name']).agg({'new_rank_score':'mean','data_col':'count'})

In [367]:
agg_df.columns = ['agg_score', 'count_source' ]

In [368]:
max_number_sources = agg_df.describe()['count_source']['max']

In [369]:
agg_df['agg_score_wt'] = agg_df['agg_score']*(agg_df['count_source']/max_number_sources)

In [370]:
agg_df.sort_values(by='agg_score', ascending=False, inplace=True)

In [371]:
agg_df.head(25)

Unnamed: 0_level_0,agg_score,count_source,agg_score_wt
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Taiwan, China",5.146429,2,0.791758
Korea,5.108696,1,0.392977
United States,4.861321,9,3.36553
Kosovo,4.811728,3,1.110399
Hong Kong (China),4.511111,1,0.347009
New Zealand,4.477888,9,3.100076
Switzerland,4.296162,8,2.643792
Singapore,4.209273,12,3.885483
Russia,4.195652,1,0.322742
Canada,4.061403,11,3.436572


In [373]:
agg_df.to_csv('../pillar_scores/business_scores_v0.csv')