In [1]:
import pandas as pd
import numpy as np

### Get all the pillar names from the excel

In [2]:
names = pd.read_excel('../../UNDP Digital Assessment Data Framework Filename Matching V7.xlsx')

In [3]:
col_names = ['Indicator','check', 'Data Source','Index','Filename']

In [4]:
names = names[col_names]

In [5]:
names.head()

Unnamed: 0,Indicator,check,Data Source,Index,Filename
0,Countries,,United Nations,False,Countries
1,"Database of Global Administrative Areas (GADM,...",,GADM maps and data,False,
2,High Resolution Population Density Maps + Demo...,,Facebook,False,
3,population density vs openstreetmap object den...,,Kontur,False,
4,Population Density,Infrastructure,World Bank,False,population_density


In [6]:
# get all the files per pillar
data_stats = names.groupby('check').agg({'Filename':'count','Indicator':'count'})

In [7]:
data_stats

Unnamed: 0_level_0,Filename,Indicator
check,Unnamed: 1_level_1,Unnamed: 2_level_1
Business,16,25
Foundations,8,13
Government,9,15
Infrastructure,39,48
People,34,47
Regulation,5,7
Strategy,1,1


### Business

In [8]:
bnames = names[(names.check=='Business')&(~names.Filename.isna())]#&(names.Index==False)]

In [9]:
bnames.head(25)

Unnamed: 0,Indicator,check,Data Source,Index,Filename
75,UNCTAD Business-to-Consumer (B2C) E-commerce I...,Business,UNCTAD/World Bank,True,b2c_ecommerse_idx
76,"Networking Services (Spend, IT Forecast Data)",Business,Portulans Institute,True,network_readiness_index
81,"Cloud Services (Spend, IT Forecast Data)",Business,Statista/Business Software Alliance,True,cloud_services
82,ICT task-intensive jobs as a percentage of tot...,Business,OECD,False,ICT_proportion
85,Share of business with internet,Business,OECD,False,business_internet
86,Share of businesses with broadband,Business,World Bank,False,business_broadband
87,Share of businesses with online presence,Business,Portulans Institute,False,share_of_businesses_online_presence
88,Size of gig economy (% of GDP),Business,Portulans Institute,False,prevalance_gig_economy
89,Size of digital economy (% of transactions),Business,Portulans Institute,False,size_digital_economy
90,Venture Capital Availability,Business,World Bank,True,venture_cap_avail


In [10]:
# get list of names for all indicators
indicators = bnames.Indicator.unique()

In [11]:
# get all file names
bfiles = bnames.Filename.unique()

In [12]:
bfiles

array(['b2c_ecommerse_idx', 'network_readiness_index', 'cloud_services',
       'ICT_proportion', 'business_internet', 'business_broadband',
       'share_of_businesses_online_presence', 'prevalance_gig_economy',
       'size_digital_economy', 'venture_cap_avail',
       'legal_rights_strength', 'time_start_bus', 'ease_doing_bus',
       'ease_of_finding_skilled_employees', 'start_up_investment',
       'doing_bus_idx'], dtype=object)

In [13]:
# ls digital-readiness-assessment-main/processed/

In [14]:
# formula for converting scale
def convert_rank(old_value, old_min=1, old_max=7, new_min=1, new_max=6 ):
    """ Convert old scale values scale into new scale values"""
    old_range = old_max - old_min
    new_range = new_max - new_min
    new_value = (((old_value-old_min)*new_range)/old_range)+new_min
    return new_value

### 1. 'UNCTAD Business-to-Consumer (B2C) E-commerce Index'

In [15]:
indicators[0]

'UNCTAD Business-to-Consumer (B2C) E-commerce Index'

In [16]:
# load data
indicator = indicators[0]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

UNCTAD Business-to-Consumer (B2C) E-commerce Index
b2c_ecommerse_idx


In [17]:
df.Indicator.unique()

array(['Country rank and value in the UNCTAD B2C E-commerce Index'],
      dtype=object)

In [18]:
df.head()

Unnamed: 0,2015,2016,2017,Country ISO3,Country Name,Indicator Id,Indicator,Subindicator Type
0,14.1,17.0,,AFG,Afghanistan,24717,Country rank and value in the UNCTAD B2C E-com...,Value
1,,130.0,132.0,AFG,Afghanistan,24718,Country rank and value in the UNCTAD B2C E-com...,Rank
2,21.1,29.0,,AGO,Angola,24717,Country rank and value in the UNCTAD B2C E-com...,Value
3,,113.0,113.0,AGO,Angola,24718,Country rank and value in the UNCTAD B2C E-com...,Rank
4,51.0,62.0,,ALB,Albania,24717,Country rank and value in the UNCTAD B2C E-com...,Value


In [19]:
# two sub indicators per country
df['Subindicator Type'].unique()

array(['Value', 'Rank'], dtype=object)

In [20]:
# list of columns with data
value_cols = ['2015','2016','2017']

In [21]:
# value doesn't have data for 2017
df[df['Subindicator Type']=='Value'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147 entries, 0 to 292
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   2015               137 non-null    float64
 1   2016               136 non-null    float64
 2   2017               0 non-null      float64
 3   Country ISO3       147 non-null    object 
 4   Country Name       147 non-null    object 
 5   Indicator Id       147 non-null    int64  
 6   Indicator          147 non-null    object 
 7   Subindicator Type  147 non-null    object 
dtypes: float64(3), int64(1), object(4)
memory usage: 10.3+ KB


In [22]:
df[df['Subindicator Type']=='Value'].describe()

Unnamed: 0,2015,2016,2017,Indicator Id
count,137.0,136.0,0.0,147.0
mean,47.151095,53.713382,,24717.0
std,22.978008,26.604495,,0.0
min,6.5,3.0,,24717.0
25%,28.7,32.0,,24717.0
50%,47.2,53.0,,24717.0
75%,65.2,78.25,,24717.0
max,89.7,96.5,,24717.0


In [23]:
# Rank does have 2017 data
df[df['Subindicator Type']=='Rank'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147 entries, 1 to 293
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   2015               0 non-null      float64
 1   2016               137 non-null    float64
 2   2017               136 non-null    float64
 3   Country ISO3       147 non-null    object 
 4   Country Name       147 non-null    object 
 5   Indicator Id       147 non-null    int64  
 6   Indicator          147 non-null    object 
 7   Subindicator Type  147 non-null    object 
dtypes: float64(3), int64(1), object(4)
memory usage: 10.3+ KB


In [24]:
df[df['Subindicator Type']=='Rank'].describe()

Unnamed: 0,2015,2016,2017,Indicator Id
count,0.0,137.0,136.0,147.0
mean,,69.0,73.066176,24718.0
std,,39.692569,41.872682,0.0
min,,1.0,1.0,24718.0
25%,,35.0,36.75,24718.0
50%,,69.0,72.5,24718.0
75%,,103.0,110.25,24718.0
max,,137.0,144.0,24718.0


Going to use the rank column for now since it has more recent data

In [25]:
df_rank = df[df['Subindicator Type']=='Rank'].copy()

In [26]:
# convert rank into 1-6 - in order to get most recent data 
df_rank['data_col'] = df_rank['2017'] 

min_rank = df_rank['data_col'].min()
max_rank = df_rank['data_col'].max()

# transform 1-147 rank into 1-6
df_rank['new_rank_score'] = df_rank['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# need to invert score since higher rank is not better 
df_rank['new_rank_score'] = (6-df_rank['new_rank_score'])+1

In [27]:
# prep output

df_rank.rename(columns={'Country ISO3':'Country Code'}, inplace=True)

df_rank['higher_is_better'] = True


# output scores to csv
df_rank[['Country Code', 'Country Name','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

In [28]:
df_rank[['Country Code', 'Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(15)


Unnamed: 0,Country Code,Country Name,Indicator,data_col,new_rank_score,higher_is_better
1,AFG,Afghanistan,Country rank and value in the UNCTAD B2C E-com...,132.0,1.41958,True
3,AGO,Angola,Country rank and value in the UNCTAD B2C E-com...,113.0,2.083916,True
5,ALB,Albania,Country rank and value in the UNCTAD B2C E-com...,59.0,3.972028,True
7,ARE,United Arab Emirates,Country rank and value in the UNCTAD B2C E-com...,23.0,5.230769,True
9,ARG,Argentina,Country rank and value in the UNCTAD B2C E-com...,81.0,3.202797,True
11,ARM,Armenia,Country rank and value in the UNCTAD B2C E-com...,78.0,3.307692,True
13,AUS,Australia,Country rank and value in the UNCTAD B2C E-com...,14.0,5.545455,True
15,AUT,Austria,Country rank and value in the UNCTAD B2C E-com...,17.0,5.440559,True
17,AZE,Azerbaijan,Country rank and value in the UNCTAD B2C E-com...,68.0,3.657343,True
19,BDI,Burundi,Country rank and value in the UNCTAD B2C E-com...,140.0,1.13986,True


## 2. Networking Services (Spend, IT Forecast Data)


In [29]:
indicator = indicators[1]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Networking Services (Spend, IT Forecast Data)
network_readiness_index


In [30]:
# no DATE associated wiht the data

In [31]:
df.head(16)

Unnamed: 0,Rank,Country,Score,Income Group,Region
0,1,Sweden,82.75,High-income,Europe
1,2,Denmark,82.19,High-income,Europe
2,3,Singapore,81.39,High-income,Asia & Pacific
3,4,Netherlands,81.37,High-income,Europe
4,5,Switzerland,80.41,High-income,Europe
5,6,Finland,80.16,High-income,Europe
6,7,Norway,79.39,High-income,Europe
7,8,United States,78.91,High-income,The Americas
8,9,Germany,77.48,High-income,Europe
9,10,United Kingdom,76.27,High-income,Europe


In [32]:
# going to use the  score column since this is already an Index

In [33]:
# convert rank into 1-6 - in order to get most recent data 
df['data_col'] = df['Score'] 

# going to assume index is between 1-100 but not 100% sure
min_rank = 1 #df['data_col'].min()
max_rank = 100 #df['data_col'].max()

# transform 1-100 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# need to invert score since higher rank is not better 
# df['new_rank_score'] = (6-df['new_rank_score'])+1

In [34]:
# prepare output
df.rename(columns={'Country':'Country Name'}, inplace=True)
df['Indicator'] = indicator
df['higher_is_better'] = True

df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(15)


# output scores to csv
df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/business {}_scores.csv'.format(indicator), index=False)

### 3. Cloud Services (Spend, IT Forecast Data)


In [35]:
indicator = indicators[2]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Cloud Services (Spend, IT Forecast Data)
cloud_services


In [36]:
# remove nulls
df = df.dropna()

In [37]:
# prepare standard columns
df['data_col'] = df['Unnamed: 1'].astype(float)
df['Country Name'] = df.iloc[:,0]
df['Indicator'] = indicator
df['higher_is_better'] = True
df['Year'] = 2018

In [38]:
min_rank = 1
max_rank = df['Country Name'].nunique()

In [39]:
# transform 1-24 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# need to invert score since higher rank is not better 
# df['new_rank_score'] = (6-df['new_rank_score'])+1

In [40]:
# prepare output
df.sort_values(by='new_rank_score', ascending=False)

# prepare output
df = df[['Country Name', 'Year','Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# output scores
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

In [41]:
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
2,Germany,2018,"Cloud Services (Spend, IT Forecast Data)",18.2,4.73913,True
3,Japan,2018,"Cloud Services (Spend, IT Forecast Data)",20.3,5.195652,True
4,United States,2018,"Cloud Services (Spend, IT Forecast Data)",18.0,4.695652,True
5,United Kingdom,2018,"Cloud Services (Spend, IT Forecast Data)",19.8,5.086957,True
6,Australia,2018,"Cloud Services (Spend, IT Forecast Data)",16.1,4.282609,True
7,Singapore,2018,"Cloud Services (Spend, IT Forecast Data)",20.7,5.282609,True
8,Canada,2018,"Cloud Services (Spend, IT Forecast Data)",17.0,4.478261,True
9,France,2018,"Cloud Services (Spend, IT Forecast Data)",17.3,4.543478,True
10,Italy,2018,"Cloud Services (Spend, IT Forecast Data)",15.0,4.043478,True
11,Spain,2018,"Cloud Services (Spend, IT Forecast Data)",16.6,4.391304,True


## 4. ICT task-intensive jobs as a percentage of total employment

In [42]:
indicator = indicators[3]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

ICT task-intensive jobs as a percentage of total employment
ICT_proportion


In [43]:
df.head()

Unnamed: 0,Indicator,Country,Industry,Information and communication technologies,Sex,Measure,Time,Value,Flags
0,ICT specialists within and outside information...,Austria,Total economy,Specialist (ISCO-08: 133+215+251+252+351+352+742),Total,Share of jobs,2011,3.1764,
1,ICT specialists within and outside information...,Austria,Total economy,Specialist (ISCO-08: 133+215+251+252+351+352+742),Total,Share of jobs,2012,3.225967,
2,ICT specialists within and outside information...,Austria,Total economy,Specialist (ISCO-08: 133+215+251+252+351+352+742),Total,Share of jobs,2013,3.346251,
3,ICT specialists within and outside information...,Austria,Total economy,Specialist (ISCO-08: 133+215+251+252+351+352+742),Total,Share of jobs,2014,3.3191,
4,ICT specialists within and outside information...,Austria,Total economy,Specialist (ISCO-08: 133+215+251+252+351+352+742),Total,Share of jobs,2015,3.72934,


In [44]:
df[(df['Time']==2018)&(df['Information and communication technologies']=='ICT-intensive')].sort_values(by='Value', ascending=False)

Unnamed: 0,Indicator,Country,Industry,Information and communication technologies,Sex,Measure,Time,Value,Flags


In [45]:
# bnames

In [46]:
df['Information and communication technologies'].unique()

array(['Specialist (ISCO-08: 133+215+251+252+351+352+742)',
       'Other ICT-intensive (ISCO-08: 121+122,134+,211+,216+,231+,241+,242+243)',
       'Non-ICT (rest of ISCO-08 occupations)', 'ICT-intensive', 'Total'],
      dtype=object)

In [47]:
df.Sex.unique()

array(['Total'], dtype=object)

In [48]:
# convert to correct types
df['Value'] = df['Value'].astype(float)

In [49]:
df['Value'].describe()

count    985.000000
mean      42.353406
std       42.614469
min        0.890157
25%        5.526795
50%       12.887070
75%       91.318100
max      100.000000
Name: Value, dtype: float64

In [50]:
# filter on relevant years
df = df[(df['Time']==2017)&(df['Information and communication technologies']=='ICT-intensive')]

# convert rank into 1-6 - in order to get most recent data 
df['data_col'] = df['Value'] 

# going to assume index is between 1-100 but not 100% sure
min_rank = 0 #df['data_col'].min()
max_rank = 100 #df['data_col'].max()

# transform 1-147 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# need to invert score since higher rank is not better 
# df['new_rank_score'] = (6-df['new_rank_score'])+1

# df_rank[['Country ISO3', 'Country Name','Indicator','data_col','new_rank_score']].head()

df.rename(columns={'Country':'Country Name'}, inplace=True)
df['Indicator'] = indicator
df['higher_is_better'] = True
df['Year'] = df['Time']

df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)


# output scores to csv
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

In [51]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)


Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
27,Austria,2017,ICT task-intensive jobs as a percentage of tot...,10.88416,1.544208,True
62,Belgium,2017,ICT task-intensive jobs as a percentage of tot...,14.44826,1.722413,True
97,Czech Republic,2017,ICT task-intensive jobs as a percentage of tot...,9.219953,1.460998,True
157,Estonia,2017,ICT task-intensive jobs as a percentage of tot...,15.76142,1.788071,True
217,Finland,2017,ICT task-intensive jobs as a percentage of tot...,15.22048,1.761024,True
252,France,2017,ICT task-intensive jobs as a percentage of tot...,12.00835,1.600418,True
287,Germany,2017,ICT task-intensive jobs as a percentage of tot...,10.4419,1.522095,True
322,Greece,2017,ICT task-intensive jobs as a percentage of tot...,6.675247,1.333762,True
357,Hungary,2017,ICT task-intensive jobs as a percentage of tot...,8.369766,1.418488,True
392,Iceland,2017,ICT task-intensive jobs as a percentage of tot...,14.27687,1.713843,True


## 5. Share of business with internet

In [52]:
indicator = indicators[4]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Share of business with internet
business_internet


In [53]:
df= df.replace('..',np.nan)

In [54]:
df.head(15)

Unnamed: 0,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,Country
0,52.47,54.8,60.17,61.52,67.16,67.81,69.33,76.25,75.58,75.77,76.73,75.62,77.37,79.38,80.37,,Australia
1,72.22,78.81,80.06,79.84,79.85,80.19,82.87,82.01,85.7,86.35,87.46,88.11,85.55,87.92,89.45,90.42,Austria
2,,,,,77.37,78.47,76.6,76.01,78.26,79.15,81.04,81.0,82.6,84.03,86.72,86.62,Belgium
3,64.8,67.5,69.7,,,,,79.8,77.5,,,,78.5,,81.8,,Canada
4,,,,47.87,51.74,54.03,59.41,63.27,66.55,67.0,66.47,67.43,67.17,67.81,,,Colombia
5,,70.08,71.12,73.99,72.66,73.63,77.44,79.67,79.86,82.63,82.57,82.15,82.9,82.79,83.31,83.32,Czech Republic
6,,,,,87.61,87.83,88.68,89.3,91.78,91.4,91.95,93.34,95.09,95.58,93.92,92.77,Denmark
7,52.65,57.86,61.87,65.73,67.53,70.04,72.63,74.97,75.74,77.56,79.73,77.93,78.09,78.36,81.18,79.79,Estonia
8,,,,,84.62,87.32,92.56,91.3,93.64,95.1,95.2,95.33,96.28,95.64,,95.92,Finland
9,,,,,54.05,57.71,60.05,64.48,65.3,63.59,66.82,68.5,66.53,69.41,71.54,70.35,France


In [55]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['2020'].astype(float)
df['Country Name'] = df['Country']
df['Year'] = 2020

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [56]:
df = df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']]
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Australia,2020,Share of business with internet,,,True
1,Austria,2020,Share of business with internet,90.42,5.347878,True
2,Belgium,2020,Share of business with internet,86.62,4.89732,True
3,Canada,2020,Share of business with internet,,,True
4,Colombia,2020,Share of business with internet,,,True
5,Czech Republic,2020,Share of business with internet,83.32,4.506047,True
6,Denmark,2020,Share of business with internet,92.77,5.626512,True
7,Estonia,2020,Share of business with internet,79.79,4.087503,True
8,Finland,2020,Share of business with internet,95.92,6.0,True
9,France,2020,Share of business with internet,70.35,2.968224,True


In [57]:
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

## 6. Share of businesses with broadband

In [58]:
indicator = indicators[5]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Share of businesses with broadband
business_broadband


In [59]:
df.head(15)

Unnamed: 0,2008,2009,2010,Country
0,76.94,76.01,82.06,Austria
1,79.33,77.31,86.52,Czech Republic
2,87.53,86.08,88.08,Estonia
3,,91.68,93.31,France
4,83.46,87.9,89.34,Germany
5,70.37,74.19,79.61,Hungary
6,,,95.43,Iceland
7,,76.11,86.84,Ireland
8,,82.92,84.12,Italy
9,,88.03,87.91,Luxembourg


In [60]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['2010']
df['Country Name'] = df['Country']
df['Year'] = 2010

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [61]:
df = df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']]
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Austria,2010,Share of businesses with broadband,82.06,3.436782,True
1,Czech Republic,2010,Share of businesses with broadband,86.52,4.291188,True
2,Estonia,2010,Share of businesses with broadband,88.08,4.590038,True
3,France,2010,Share of businesses with broadband,93.31,5.591954,True
4,Germany,2010,Share of businesses with broadband,89.34,4.831418,True
5,Hungary,2010,Share of businesses with broadband,79.61,2.967433,True
6,Iceland,2010,Share of businesses with broadband,95.43,5.998084,True
7,Ireland,2010,Share of businesses with broadband,86.84,4.35249,True
8,Italy,2010,Share of businesses with broadband,84.12,3.831418,True
9,Luxembourg,2010,Share of businesses with broadband,87.91,4.557471,True


In [62]:
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

## 7. Share of businesses with online presence

In [63]:
indicator = indicators[6]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))
df.head(15)

Share of businesses with online presence
share_of_businesses_online_presence


Unnamed: 0,RANK,COUNTRY/ECONOMY,VALUE,SCORE,Year
0,1.0,Finland,95.64,100.0,2018
1,2.0,Denmark,93.92,98.01,2018
2,3.0,Japan,92.4,96.24,2018
3,4.0,Netherlands,91.89,95.65,2018
4,5.0,Switzerland,91.74,95.48,2018
5,6.0,Sweden,89.65,93.05,2018
6,7.0,Austria,89.45,92.82,2018
7,8.0,Germany,88.21,91.38,2018
8,9.0,Belgium,86.72,89.65,2018
9,10.0,United Kingdom,83.88,86.35,2018


In [64]:
# create standard columns
df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['VALUE']


min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [65]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']]
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Finland,2018,Share of businesses with online presence,95.64,6.0,True
1,Denmark,2018,Share of businesses with online presence,93.92,5.900163,True
2,Japan,2018,Share of businesses with online presence,92.4,5.811934,True
3,Netherlands,2018,Share of businesses with online presence,91.89,5.782331,True
4,Switzerland,2018,Share of businesses with online presence,91.74,5.773624,True
5,Sweden,2018,Share of businesses with online presence,89.65,5.65231,True
6,Austria,2018,Share of businesses with online presence,89.45,5.640701,True
7,Germany,2018,Share of businesses with online presence,88.21,5.568725,True
8,Belgium,2018,Share of businesses with online presence,86.72,5.482238,True
9,United Kingdom,2018,Share of businesses with online presence,83.88,5.31739,True


In [66]:
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

## 8. Size of gig economy (% of GDP)

In [67]:
indicator = indicators[7]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Size of gig economy (% of GDP)
prevalance_gig_economy


In [68]:
df.head(15)

Unnamed: 0,RANK,COUNTRY/ECONOMY,VALUE,SCORE
0,1.0,United States,5.4,100.0
1,2.0,Netherlands,5.22,94.63
2,3.0,United Kingdom,5.19,93.8
3,4.0,Saudi Arabia,5.08,90.33
4,5.0,Malaysia,5.07,90.19
5,6.0,Egypt,5.05,89.46
6,7.0,Israel,5.02,88.42
7,8.0,Canada,4.94,86.07
8,9.0,Singapore,4.92,85.52
9,10.0,United Arab Emirates,4.87,83.82


In [69]:
# create standard columns
df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['VALUE']
df['Year'] = 2019

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [70]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']]
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,United States,2019,Size of gig economy (% of GDP),5.4,6.0,True
1,Netherlands,2019,Size of gig economy (% of GDP),5.22,5.72561,True
2,United Kingdom,2019,Size of gig economy (% of GDP),5.19,5.679878,True
3,Saudi Arabia,2019,Size of gig economy (% of GDP),5.08,5.512195,True
4,Malaysia,2019,Size of gig economy (% of GDP),5.07,5.496951,True
5,Egypt,2019,Size of gig economy (% of GDP),5.05,5.466463,True
6,Israel,2019,Size of gig economy (% of GDP),5.02,5.420732,True
7,Canada,2019,Size of gig economy (% of GDP),4.94,5.29878,True
8,Singapore,2019,Size of gig economy (% of GDP),4.92,5.268293,True
9,United Arab Emirates,2019,Size of gig economy (% of GDP),4.87,5.192073,True


In [71]:
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

## 9. Size of digital economy (% of transactions)


In [72]:
indicator = indicators[8]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Size of digital economy (% of transactions)
size_digital_economy


In [73]:
df


Unnamed: 0,Order,Country Name,Value,Score
0,1.0,Singapore,78.13,100.00
1,2.0,Switzerland,64.57,82.59
2,3.0,"Korea, Rep.",63.66,81.42
3,4.0,Germany,61.45,78.58
4,5.0,Hungary,59.72,76.36
...,...,...,...,...
129,,"Congo, Dem. Rep.",,
130,,Dominican Republic,,
131,,Guinea,,
132,,Lesotho,,


In [74]:
# create standard columns
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value'] 
df['Year'] = 2019

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [75]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']]
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Singapore,2019,Size of digital economy (% of transactions),78.13,6.0,True
1,Switzerland,2019,Size of digital economy (% of transactions),64.57,5.129318,True
2,"Korea, Rep.",2019,Size of digital economy (% of transactions),63.66,5.070887,True
3,Germany,2019,Size of digital economy (% of transactions),61.45,4.928984,True
4,Hungary,2019,Size of digital economy (% of transactions),59.72,4.817902,True
5,Japan,2019,Size of digital economy (% of transactions),56.21,4.592526,True
6,Ireland,2019,Size of digital economy (% of transactions),54.35,4.473096,True
7,Denmark,2019,Size of digital economy (% of transactions),54.22,4.464749,True
8,Qatar,2019,Size of digital economy (% of transactions),54.17,4.461538,True
9,Sweden,2019,Size of digital economy (% of transactions),53.01,4.387055,True


In [76]:
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

No Country Data

## 10. Venture Capital Availability


In [77]:
indicator = indicators[9]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Venture Capital Availability
venture_cap_avail


In [78]:
df.head(15)

Unnamed: 0,2017,2018,2019,Country ISO3,Country Name,Indicator Id,Indicator,Subindicator Type,2007-2008,2008-2009,2009-2010,2010-2011,2011-2012,2012-2013,2013-2014,2014-2015,2015-2016,2016-2017,2017-2018
0,,,,AGO,Angola,495,GDP (PPP) as % of world total,Value,,,,0.153,0.145,,0.154,,,,
1,,,,AGO,Angola,496,GDP (PPP) as % of world total,Rank,,,,62.0,63.0,,63.0,,,,
2,,,,AGO,Angola,509,"Domestic market size index, 1-7 (best)",Value,,,,3.428947,3.582865,,3.490087,3.502455,,,
3,,,,AGO,Angola,510,"Domestic market size index, 1-7 (best)",Rank,,,,72.0,64.0,,67.0,69.0,,,
4,,,,AGO,Angola,511,"Foreign market size index, 1-7 (best)",Value,,,,4.743048,4.589237,,4.871807,4.839048,,,
5,,,,AGO,Angola,512,"Foreign market size index, 1-7 (best)",Rank,,,,51.0,61.0,,55.0,56.0,,,
6,,,,AGO,Angola,515,Exports as a percentage of GDP,Value,,,,65.6,54.896223,,65.05077,62.317539,,,
7,,,,AGO,Angola,516,Exports as a percentage of GDP,Rank,,,,24.0,33.0,,30.0,32.0,,,
8,,,,AGO,Angola,521,"Availability of financial services, 1-7 (best)",Value,,,,3.266015,2.803951,,2.382353,2.349348,,,
9,,,,AGO,Angola,522,"Availability of financial services, 1-7 (best)",Rank,,,,129.0,138.0,,148.0,143.0,,,


#### What fields should we use?

In [79]:
df.Indicator.unique()  
# relavent indicator is:        'Venture capital availability, 1-7 (best)', id 529

array(['GDP (PPP) as % of world total',
       'Domestic market size index, 1-7 (best)',
       'Foreign market size index, 1-7 (best)',
       'Exports as a percentage of GDP',
       'Availability of financial services, 1-7 (best)',
       'Affordability of financial services, 1-7 (best)',
       'Financing through local equity market, 1-7 (best)',
       'Ease of access to loans, 1-7 (best)',
       'Venture capital availability, 1-7 (best)',
       'Soundness of banks, 1-7 (best)',
       'Women in labor force, ratio to men',
       'Value chain breadth, 1-7 (best)',
       'Production process sophistication, 1-7 (best)',
       'Reliance on professional management, 1-7 (best)',
       'Country capacity to retain talent, 1-7 (best)',
       'Country capacity to attract talent, 1-7 (best)',
       'State of cluster development, 1-7 (best)',
       'Malaria cases/100,000 pop.', 'Tuberculosis cases/100,000 pop.',
       'Degree of customer orientation, 1-7 (best)',
       'Buyer sophi

In [80]:
df[df.Indicator=='Venture capital availability, 1-7 (best)']

# there are two indicator subtypes: Index and Rank
df[df.Indicator=='Venture capital availability, 1-7 (best)']['Subindicator Type'].unique()


array(['Index 1-7 (best)', 'Rank'], dtype=object)

In [81]:
df['data_col'] = df['2019']#.apply(lambda row: convert_rank(row))

In [82]:
# create two data frames for each subtype
df_rank = df[(df.Indicator=='Venture capital availability, 1-7 (best)')&(df['Subindicator Type']=='Rank')]
df_index = df[(df.Indicator=='Venture capital availability, 1-7 (best)')&(df['Subindicator Type']=='Index 1-7 (best)')]

In [83]:
df_index.head()

Unnamed: 0,2017,2018,2019,Country ISO3,Country Name,Indicator Id,Indicator,Subindicator Type,2007-2008,2008-2009,2009-2010,2010-2011,2011-2012,2012-2013,2013-2014,2014-2015,2015-2016,2016-2017,2017-2018,data_col
16,,,,AGO,Angola,529,"Venture capital availability, 1-7 (best)",Index 1-7 (best),,,,1.802891,1.494084,,2.117647,2.34817,,,,
510,,,,ALB,Albania,529,"Venture capital availability, 1-7 (best)",Index 1-7 (best),2.665708,2.524466,2.321898,2.145235,1.956589,1.844924,1.859132,1.948528,1.898624,2.11548,2.482504,
1011,,,,ARE,United Arab Emirates,529,"Venture capital availability, 1-7 (best)",Index 1-7 (best),4.544287,4.296279,3.881013,3.716301,3.973436,4.143612,4.123291,4.352251,4.352251,4.47605,4.574773,
1509,,,,ARG,Argentina,529,"Venture capital availability, 1-7 (best)",Index 1-7 (best),2.839896,2.425063,2.131687,1.897883,1.909561,1.82237,1.745999,1.779793,2.009052,2.151421,2.243158,
2014,,,,ARM,Armenia,529,"Venture capital availability, 1-7 (best)",Index 1-7 (best),2.123574,1.999757,1.921578,1.789272,2.136096,2.369044,2.425793,2.355755,2.537145,2.796361,2.78055,


In [84]:
df_rank.head()

Unnamed: 0,2017,2018,2019,Country ISO3,Country Name,Indicator Id,Indicator,Subindicator Type,2007-2008,2008-2009,2009-2010,2010-2011,2011-2012,2012-2013,2013-2014,2014-2015,2015-2016,2016-2017,2017-2018,data_col
17,,140.0,141.0,AGO,Angola,530,"Venture capital availability, 1-7 (best)",Rank,,,,129.0,140.0,,119.0,98.0,,,,141.0
511,102.0,88.0,81.0,ALB,Albania,530,"Venture capital availability, 1-7 (best)",Rank,95.0,101.0,101.0,107.0,124.0,132.0,136.0,128.0,133.0,124.0,103.0,81.0
1012,17.0,9.0,4.0,ARE,United Arab Emirates,530,"Venture capital availability, 1-7 (best)",Rank,16.0,17.0,13.0,16.0,13.0,8.0,10.0,4.0,7.0,7.0,7.0,4.0
1510,116.0,104.0,118.0,ARG,Argentina,530,"Venture capital availability, 1-7 (best)",Rank,83.0,108.0,117.0,124.0,129.0,135.0,142.0,138.0,126.0,120.0,120.0,118.0
2015,74.0,72.0,56.0,ARM,Armenia,530,"Venture capital availability, 1-7 (best)",Rank,124.0,130.0,129.0,131.0,109.0,89.0,93.0,96.0,90.0,73.0,76.0,56.0


In [85]:
# 152 countries in data
df_rank['Country Name'].nunique()

152

In [86]:
# list of columns with data
value_cols = ['2007-2008', '2008-2009', '2009-2010',
       '2010-2011', '2011-2012', '2012-2013', '2013-2014', '2014-2015',
       '2015-2016', '2016-2017', '2017-2018', '2017', '2018', '2019']

In [87]:
# quick snapshot of dataframes
df_rank[value_cols].describe()

Unnamed: 0,2007-2008,2008-2009,2009-2010,2010-2011,2011-2012,2012-2013,2013-2014,2014-2015,2015-2016,2016-2017,2017-2018,2017,2018,2019
count,127.0,131.0,131.0,138.0,141.0,143.0,147.0,143.0,139.0,137.0,136.0,130.0,134.0,137.0
mean,65.259843,67.038168,67.053435,70.427536,71.900709,72.944056,74.945578,72.902098,70.920863,69.912409,69.375,67.638462,69.5,70.262774
std,37.6275,38.33745,38.215646,40.098763,41.003537,41.51697,42.669289,41.578928,40.397314,39.832272,39.594975,39.205087,40.282601,40.255849
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,33.5,34.5,34.5,36.25,37.0,37.5,38.5,37.5,36.5,36.0,35.75,34.25,35.25,36.0
50%,65.0,67.0,67.0,70.5,72.0,73.0,75.0,73.0,71.0,70.0,69.5,66.5,68.5,70.0
75%,97.5,99.5,99.5,104.75,107.0,108.5,111.5,108.5,105.5,104.0,103.25,101.75,102.75,104.0
max,130.0,134.0,133.0,139.0,142.0,144.0,148.0,144.0,140.0,138.0,137.0,135.0,140.0,141.0


In [88]:
# index values have no 2019 data
df_index[value_cols].describe()

Unnamed: 0,2007-2008,2008-2009,2009-2010,2010-2011,2011-2012,2012-2013,2013-2014,2014-2015,2015-2016,2016-2017,2017-2018,2017,2018,2019
count,127.0,131.0,131.0,138.0,141.0,143.0,147.0,143.0,139.0,137.0,136.0,0.0,0.0,0.0
mean,3.288489,3.164625,2.869158,2.659362,2.687949,2.692248,2.705923,2.754663,2.839938,2.939999,3.002545,,,
std,0.899434,0.82637,0.674535,0.680454,0.745284,0.713336,0.693771,0.712325,0.710434,0.740065,0.785252,,,
min,1.869061,1.500274,1.487119,1.488852,1.420704,1.494243,1.466607,1.474669,1.54505,1.643615,1.621595,,,
25%,2.631797,2.529897,2.361602,2.15608,2.147166,2.193814,2.18349,2.215631,2.328283,2.352083,2.480248,,,
50%,3.11007,3.003566,2.733178,2.498967,2.573486,2.513357,2.576551,2.679087,2.715083,2.833833,2.847194,,,
75%,3.918566,3.792939,3.36104,3.080125,3.015231,3.038222,3.153995,3.172582,3.243787,3.290093,3.435095,,,
max,5.278317,5.127621,4.610427,4.441749,5.389427,4.684038,4.572578,4.775331,5.079046,4.940222,5.244993,,,


Two datasets, 1 rank (1-140ish per year), 1 index (1-7).  
Index is normally prefered but doesn't have data for 2019.  
Will use rank for this reason. Transformation should be the approx the same.

In [89]:
df_rank.data_col.describe()

count    137.000000
mean      70.262774
std       40.255849
min        1.000000
25%       36.000000
50%       70.000000
75%      104.000000
max      141.000000
Name: data_col, dtype: float64

In [90]:
# transform 1-141 rank into 1-6
min_rank = df_rank.data_col.min()
max_rank = df_rank.data_col.max()
df_rank['new_rank_score'] = df_rank['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rank['new_rank_score'] = df_rank['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))


In [91]:
# need to invert score since higher rank is not better 
df_rank['new_rank_score'] = (6-df_rank['new_rank_score'])+1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rank['new_rank_score'] = (6-df_rank['new_rank_score'])+1


###### Going to test how rank and index conversions compare.  

Unlike index scores, Rank scores don't necessarily have equal weightings as you go up. E.g. the difference betwee ranks 15-16 is not the same as ranks 17-18. But for our conversion we will have to treat them as equal, so I am keen to see how this affects results compared to the index scores.  
To do this I construct a simple test example of an older data column 2007-2008

In [92]:
df_rank['2007-2008'].describe()

count    127.000000
mean      65.259843
std       37.627500
min        1.000000
25%       33.500000
50%       65.000000
75%       97.500000
max      130.000000
Name: 2007-2008, dtype: float64

In [93]:
# convert rank into 1-6
df_rank['test0'] = df_rank['2007-2008'].apply(lambda row: convert_rank(row, 1, 130))
df_rank['test0'] = (6-df_rank['test0'])+1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rank['test0'] = df_rank['2007-2008'].apply(lambda row: convert_rank(row, 1, 130))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rank['test0'] = (6-df_rank['test0'])+1


In [94]:
df_index['test0'] = df_index['2007-2008'].apply(lambda row: convert_rank(row, 1, 7))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_index['test0'] = df_index['2007-2008'].apply(lambda row: convert_rank(row, 1, 7))


In [95]:
new_df = df_index[['test0','Country Name']].merge(df_rank[['test0','Country Name']], suffixes=['index','rank'],on='Country Name')

In [96]:
new_df.corr()

Unnamed: 0,test0index,test0rank
test0index,1.0,0.978224
test0rank,0.978224,1.0


The two are correlated enough to not dramatically impact results. But could be worth revising in the future when there is more recent index data.

#### Prepare Output

In [97]:
df_rank[['Country ISO3', 'Country Name','Indicator','data_col','new_rank_score']].head()

df_rank.rename(columns={'Country ISO3':'Country Code'}, inplace=True)

df_rank['higher_is_better'] = True

df_rank[['Country Code', 'Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(15)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rank['higher_is_better'] = True


Unnamed: 0,Country Code,Country Name,Indicator,data_col,new_rank_score,higher_is_better
17,AGO,Angola,"Venture capital availability, 1-7 (best)",141.0,1.0,True
511,ALB,Albania,"Venture capital availability, 1-7 (best)",81.0,3.142857,True
1012,ARE,United Arab Emirates,"Venture capital availability, 1-7 (best)",4.0,5.892857,True
1510,ARG,Argentina,"Venture capital availability, 1-7 (best)",118.0,1.821429,True
2015,ARM,Armenia,"Venture capital availability, 1-7 (best)",56.0,4.035714,True
2507,AUS,Australia,"Venture capital availability, 1-7 (best)",46.0,4.392857,True
3012,AUT,Austria,"Venture capital availability, 1-7 (best)",38.0,4.678571,True
3510,AZE,Azerbaijan,"Venture capital availability, 1-7 (best)",24.0,5.178571,True
4004,BDI,Burundi,"Venture capital availability, 1-7 (best)",78.0,3.25,True
4494,BEL,Belgium,"Venture capital availability, 1-7 (best)",31.0,4.928571,True


In [98]:

# output scores to csv
df_rank[['Country Code', 'Country Name','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

## 11. Strength of Legal Rights 

In [99]:
indicator = indicators[10]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Strength of Legal Rights
legal_rights_strength


In [100]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271 entries, 0 to 270
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Series Name    268 non-null    object
 1   Series Code    266 non-null    object
 2   Country Name   266 non-null    object
 3   Country Code   266 non-null    object
 4   1990 [YR1990]  266 non-null    object
 5   2000 [YR2000]  266 non-null    object
 6   2011 [YR2011]  266 non-null    object
 7   2012 [YR2012]  266 non-null    object
 8   2013 [YR2013]  266 non-null    object
 9   2014 [YR2014]  266 non-null    object
 10  2015 [YR2015]  266 non-null    object
 11  2016 [YR2016]  266 non-null    object
 12  2017 [YR2017]  266 non-null    object
 13  2018 [YR2018]  266 non-null    object
 14  2019 [YR2019]  266 non-null    object
 15  2020 [YR2020]  266 non-null    object
dtypes: object(16)
memory usage: 34.0+ KB


#### Find Relevant Columns

In [101]:
df['Series Name'].unique()

array(['Strength of legal rights index (0=weak to 12=strong)', nan,
       'Data from database: World Development Indicators',
       'Last Updated: 06/30/2021'], dtype=object)

In [102]:
# remove unwanted rows
df = df[~df['Series Code'].isna()]
df = df.replace('..', np.nan)

In [103]:
df.loc[0][0]

'Strength of legal rights index (0=weak to 12=strong)'

In [104]:
df.head()

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,1990 [YR1990],2000 [YR2000],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020]
0,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Afghanistan,AFG,,,,,9.0,9.0,9.0,9.0,9.0,10.0,10.0,
1,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Albania,ALB,,,,,7.0,6.0,6.0,6.0,8.0,8.0,8.0,
2,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Algeria,DZA,,,,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,
3,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,American Samoa,ASM,,,,,,,,,,,,
4,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Andorra,AND,,,,,,,,,,,,


In [105]:
df.columns

Index(['Series Name', 'Series Code', 'Country Name', 'Country Code',
       '1990 [YR1990]', '2000 [YR2000]', '2011 [YR2011]', '2012 [YR2012]',
       '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2016 [YR2016]',
       '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]', '2020 [YR2020]'],
      dtype='object')

In [106]:
# clean data
df = df.replace('..', np.nan)

df[['1990 [YR1990]', '2000 [YR2000]', '2011 [YR2011]', '2012 [YR2012]',
       '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2016 [YR2016]',
       '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]', '2020 [YR2020]']] = df[['1990 [YR1990]', '2000 [YR2000]', '2011 [YR2011]', '2012 [YR2012]',
       '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2016 [YR2016]',
       '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]', '2020 [YR2020]']].astype(float)

In [107]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266 entries, 0 to 265
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Series Name    266 non-null    object 
 1   Series Code    266 non-null    object 
 2   Country Name   266 non-null    object 
 3   Country Code   266 non-null    object 
 4   1990 [YR1990]  0 non-null      float64
 5   2000 [YR2000]  0 non-null      float64
 6   2011 [YR2011]  0 non-null      float64
 7   2012 [YR2012]  0 non-null      float64
 8   2013 [YR2013]  236 non-null    float64
 9   2014 [YR2014]  236 non-null    float64
 10  2015 [YR2015]  237 non-null    float64
 11  2016 [YR2016]  237 non-null    float64
 12  2017 [YR2017]  237 non-null    float64
 13  2018 [YR2018]  238 non-null    float64
 14  2019 [YR2019]  238 non-null    float64
 15  2020 [YR2020]  0 non-null      float64
dtypes: float64(12), object(4)
memory usage: 43.4+ KB


In [108]:
# keep only most recent value
df = df[['Series Name', 'Series Code', 'Country Name', 'Country Code','2019 [YR2019]']]

df['higher_is_better'] = True
df['Indicator'] = df['Series Name']
df['data_col'] = df['2019 [YR2019]']


#### Convert Scales

In [109]:
# convert 0-12 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=0,old_max=12))

In [110]:
df.head(16)

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2019 [YR2019],higher_is_better,Indicator,data_col,new_rank_score
0,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Afghanistan,AFG,10.0,True,Strength of legal rights index (0=weak to 12=s...,10.0,5.166667
1,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Albania,ALB,8.0,True,Strength of legal rights index (0=weak to 12=s...,8.0,4.333333
2,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Algeria,DZA,2.0,True,Strength of legal rights index (0=weak to 12=s...,2.0,1.833333
3,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,American Samoa,ASM,,True,Strength of legal rights index (0=weak to 12=s...,,
4,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Andorra,AND,,True,Strength of legal rights index (0=weak to 12=s...,,
5,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Angola,AGO,1.0,True,Strength of legal rights index (0=weak to 12=s...,1.0,1.416667
6,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Antigua and Barbuda,ATG,5.0,True,Strength of legal rights index (0=weak to 12=s...,5.0,3.083333
7,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Argentina,ARG,2.0,True,Strength of legal rights index (0=weak to 12=s...,2.0,1.833333
8,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Armenia,ARM,6.0,True,Strength of legal rights index (0=weak to 12=s...,6.0,3.5
9,Strength of legal rights index (0=weak to 12=s...,IC.LGL.CRED.XQ,Aruba,ABW,,True,Strength of legal rights index (0=weak to 12=s...,,


In [111]:
df.columns

Index(['Series Name', 'Series Code', 'Country Name', 'Country Code',
       '2019 [YR2019]', 'higher_is_better', 'Indicator', 'data_col',
       'new_rank_score'],
      dtype='object')

#### Prepare Output

In [112]:
df = df[['Country Name', 'Country Code','Indicator', 'data_col', 'new_rank_score','higher_is_better']]
df.sort_values(by='new_rank_score', ascending=False).head(16)

Unnamed: 0,Country Name,Country Code,Indicator,data_col,new_rank_score,higher_is_better
12,Azerbaijan,AZE,Strength of legal rights index (0=weak to 12=s...,12.0,6.0,True
132,Montenegro,MNE,Strength of legal rights index (0=weak to 12=s...,12.0,6.0,True
141,New Zealand,NZL,Strength of legal rights index (0=weak to 12=s...,12.0,6.0,True
158,Puerto Rico,PRI,Strength of legal rights index (0=weak to 12=s...,12.0,6.0,True
28,Brunei Darussalam,BRN,Strength of legal rights index (0=weak to 12=s...,12.0,6.0,True
119,Malawi,MWI,Strength of legal rights index (0=weak to 12=s...,11.0,5.583333,True
105,Kosovo,XKX,Strength of legal rights index (0=weak to 12=s...,11.0,5.583333,True
101,Kenya,KEN,Strength of legal rights index (0=weak to 12=s...,11.0,5.583333,True
162,Rwanda,RWA,Strength of legal rights index (0=weak to 12=s...,11.0,5.583333,True
42,Colombia,COL,Strength of legal rights index (0=weak to 12=s...,11.0,5.583333,True


In [113]:
# output scores
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

### 12. Time to start business


#### Load Data

In [114]:
indicator = indicators[11]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Time to Start a Business
time_start_bus


In [115]:
# remove unwanted rows
df = df[~df['Series Code'].isna()]
df = df.replace('..', np.nan)

In [116]:
df[['1990 [YR1990]', '2000 [YR2000]', '2011 [YR2011]', '2012 [YR2012]',
       '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2016 [YR2016]',
       '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]', '2020 [YR2020]']] = df[['1990 [YR1990]', '2000 [YR2000]', '2011 [YR2011]', '2012 [YR2012]',
       '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2016 [YR2016]',
       '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]', '2020 [YR2020]']].astype(float)

In [117]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266 entries, 0 to 265
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Series Name    266 non-null    object 
 1   Series Code    266 non-null    object 
 2   Country Name   266 non-null    object 
 3   Country Code   266 non-null    object 
 4   1990 [YR1990]  0 non-null      float64
 5   2000 [YR2000]  0 non-null      float64
 6   2011 [YR2011]  221 non-null    float64
 7   2012 [YR2012]  225 non-null    float64
 8   2013 [YR2013]  236 non-null    float64
 9   2014 [YR2014]  236 non-null    float64
 10  2015 [YR2015]  237 non-null    float64
 11  2016 [YR2016]  237 non-null    float64
 12  2017 [YR2017]  237 non-null    float64
 13  2018 [YR2018]  238 non-null    float64
 14  2019 [YR2019]  238 non-null    float64
 15  2020 [YR2020]  0 non-null      float64
dtypes: float64(12), object(4)
memory usage: 35.3+ KB


In [118]:
# keep only most recent value
df = df[['Series Name', 'Series Code', 'Country Name', 'Country Code','2019 [YR2019]']]

df['higher_is_better'] = True
df['Indicator'] = df['Series Name']
df['data_col'] = df['2019 [YR2019]']


In [119]:
df.head()

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2019 [YR2019],higher_is_better,Indicator,data_col
0,Time required to start a business (days),IC.REG.DURS,Afghanistan,AFG,8.5,True,Time required to start a business (days),8.5
1,Time required to start a business (days),IC.REG.DURS,Albania,ALB,4.5,True,Time required to start a business (days),4.5
2,Time required to start a business (days),IC.REG.DURS,Algeria,DZA,18.0,True,Time required to start a business (days),18.0
3,Time required to start a business (days),IC.REG.DURS,American Samoa,ASM,,True,Time required to start a business (days),
4,Time required to start a business (days),IC.REG.DURS,Andorra,AND,,True,Time required to start a business (days),


In [120]:
def map_days_to_scores(number):
    if number<=2:
        return 4
    if number <6 and number>2:
        return 3
    elif number >=6 and number <11:
        return 2
    elif number >=11:
        return 1

In [121]:
# map days to scores 
df['data_col'] = df['data_col'].apply(map_days_to_scores)

In [122]:
# convert 1-3 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=1,old_max=4))

In [123]:
df.head()

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2019 [YR2019],higher_is_better,Indicator,data_col,new_rank_score
0,Time required to start a business (days),IC.REG.DURS,Afghanistan,AFG,8.5,True,Time required to start a business (days),2.0,2.666667
1,Time required to start a business (days),IC.REG.DURS,Albania,ALB,4.5,True,Time required to start a business (days),3.0,4.333333
2,Time required to start a business (days),IC.REG.DURS,Algeria,DZA,18.0,True,Time required to start a business (days),1.0,1.0
3,Time required to start a business (days),IC.REG.DURS,American Samoa,ASM,,True,Time required to start a business (days),,
4,Time required to start a business (days),IC.REG.DURS,Andorra,AND,,True,Time required to start a business (days),,


In [124]:
df[df['2019 [YR2019]']<3]

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2019 [YR2019],higher_is_better,Indicator,data_col,new_rank_score
10,Time required to start a business (days),IC.REG.DURS,Australia,AUS,2.0,True,Time required to start a business (days),4.0,6.0
35,Time required to start a business (days),IC.REG.DURS,Canada,CAN,1.5,True,Time required to start a business (days),4.0,6.0
72,Time required to start a business (days),IC.REG.DURS,Georgia,GEO,1.0,True,Time required to start a business (days),4.0,6.0
86,Time required to start a business (days),IC.REG.DURS,"Hong Kong SAR, China",HKG,1.5,True,Time required to start a business (days),4.0,6.0
141,Time required to start a business (days),IC.REG.DURS,New Zealand,NZL,0.5,True,Time required to start a business (days),4.0,6.0
171,Time required to start a business (days),IC.REG.DURS,Singapore,SGP,1.5,True,Time required to start a business (days),4.0,6.0
194,Time required to start a business (days),IC.REG.DURS,Togo,TGO,2.5,True,Time required to start a business (days),3.0,4.333333
251,Time required to start a business (days),IC.REG.DURS,North America,NAC,2.85,True,Time required to start a business (days),3.0,4.333333


In [125]:
df['2019 [YR2019]'].describe()

count    238.000000
mean      19.771819
std       23.169794
min        0.500000
25%        8.075000
50%       14.000000
75%       22.991844
max      230.000000
Name: 2019 [YR2019], dtype: float64

In [126]:
df = df[['Country Name', 'Country Code','Indicator', 'data_col', 'new_rank_score','higher_is_better']]
df.sort_values(by='new_rank_score', ascending=False).head(16)

Unnamed: 0,Country Name,Country Code,Indicator,data_col,new_rank_score,higher_is_better
171,Singapore,SGP,Time required to start a business (days),4.0,6.0,True
141,New Zealand,NZL,Time required to start a business (days),4.0,6.0,True
10,Australia,AUS,Time required to start a business (days),4.0,6.0,True
86,"Hong Kong SAR, China",HKG,Time required to start a business (days),4.0,6.0,True
35,Canada,CAN,Time required to start a business (days),4.0,6.0,True
72,Georgia,GEO,Time required to start a business (days),4.0,6.0,True
1,Albania,ALB,Time required to start a business (days),3.0,4.333333,True
205,United Kingdom,GBR,Time required to start a business (days),3.0,4.333333,True
31,Burundi,BDI,Time required to start a business (days),3.0,4.333333,True
62,Estonia,EST,Time required to start a business (days),3.0,4.333333,True


#### Prepare Output

In [127]:
bf

'time_start_bus'

In [128]:
# output scores
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

### 13. Ease doing business


#### Load Data

In [129]:
indicator = indicators[12]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Ease of Doing Business
ease_doing_bus


In [130]:
# remove unwanted rows
df = df.replace('..', np.nan)
df = df[~df['Series Code'].isna()]


In [131]:
df['Series Name'].unique()

array(['Ease of doing business index (1=most business-friendly regulations)'],
      dtype=object)

In [132]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266 entries, 0 to 265
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Series Name    266 non-null    object 
 1   Series Code    266 non-null    object 
 2   Country Name   266 non-null    object 
 3   Country Code   266 non-null    object 
 4   1990 [YR1990]  0 non-null      float64
 5   2000 [YR2000]  0 non-null      float64
 6   2011 [YR2011]  0 non-null      float64
 7   2012 [YR2012]  0 non-null      float64
 8   2013 [YR2013]  0 non-null      float64
 9   2014 [YR2014]  0 non-null      float64
 10  2015 [YR2015]  0 non-null      float64
 11  2016 [YR2016]  0 non-null      float64
 12  2017 [YR2017]  0 non-null      float64
 13  2018 [YR2018]  0 non-null      float64
 14  2019 [YR2019]  189 non-null    object 
 15  2020 [YR2020]  0 non-null      float64
dtypes: float64(11), object(5)
memory usage: 35.3+ KB


In [133]:
# keep only most recent value
df = df[['Series Name', 'Series Code', 'Country Name', 'Country Code','2019 [YR2019]']]

# df['higher_is_better'] = False
df['Indicator'] = df['Series Name']
df['2019 [YR2019]'] = df['2019 [YR2019]'].astype(float)
df['data_col'] = df['2019 [YR2019]']
# df['data_norm'] = df['data_norm'] = (df.data_col - df.data_col.mean())/df.data_col.std()

In [134]:
rank_min = df.data_col.min()
rank_max = df.data_col.max()

In [135]:
rank_min, rank_max

(1.0, 190.0)

In [136]:
# convert 1-190 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=rank_min,old_max=rank_max))

In [137]:
# invert since to put as higher is  better
df['new_rank_score'] = df['new_rank_score'].apply(lambda row: (6-row)+1)

In [138]:
df.sort_values(by='new_rank_score', ascending=False).head(16)

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2019 [YR2019],Indicator,data_col,new_rank_score
141,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,New Zealand,NZL,1.0,Ease of doing business index (1=most business-...,1.0,6.0
171,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Singapore,SGP,2.0,Ease of doing business index (1=most business-...,2.0,5.973545
86,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,"Hong Kong SAR, China",HKG,3.0,Ease of doing business index (1=most business-...,3.0,5.94709
53,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Denmark,DNK,4.0,Ease of doing business index (1=most business-...,4.0,5.920635
104,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,"Korea, Rep.",KOR,5.0,Ease of doing business index (1=most business-...,5.0,5.89418
206,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,United States,USA,6.0,Ease of doing business index (1=most business-...,6.0,5.867725
72,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Georgia,GEO,7.0,Ease of doing business index (1=most business-...,7.0,5.84127
205,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,United Kingdom,GBR,8.0,Ease of doing business index (1=most business-...,8.0,5.814815
147,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Norway,NOR,9.0,Ease of doing business index (1=most business-...,9.0,5.78836
187,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Sweden,SWE,10.0,Ease of doing business index (1=most business-...,10.0,5.761905


In [139]:
df['higher_is_better'] = True
df.head(15)

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2019 [YR2019],Indicator,data_col,new_rank_score,higher_is_better
0,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Afghanistan,AFG,173.0,Ease of doing business index (1=most business-...,173.0,1.449735,True
1,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Albania,ALB,82.0,Ease of doing business index (1=most business-...,82.0,3.857143,True
2,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Algeria,DZA,157.0,Ease of doing business index (1=most business-...,157.0,1.873016,True
3,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,American Samoa,ASM,,Ease of doing business index (1=most business-...,,,True
4,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Andorra,AND,,Ease of doing business index (1=most business-...,,,True
5,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Angola,AGO,177.0,Ease of doing business index (1=most business-...,177.0,1.343915,True
6,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Antigua and Barbuda,ATG,113.0,Ease of doing business index (1=most business-...,113.0,3.037037,True
7,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Argentina,ARG,126.0,Ease of doing business index (1=most business-...,126.0,2.693122,True
8,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Armenia,ARM,47.0,Ease of doing business index (1=most business-...,47.0,4.783069,True
9,Ease of doing business index (1=most business-...,IC.BUS.EASE.XQ,Aruba,ABW,,Ease of doing business index (1=most business-...,,,True


#### Prepare Output

In [140]:
df = df[['Country Name', 'Country Code','Indicator', 'data_col', 'new_rank_score','higher_is_better']]

# output scores
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

## 14. Ease of finding skilled employees

In [141]:
indicator = indicators[13]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Ease of finding skilled employees
ease_of_finding_skilled_employees


In [142]:
df.head()

Unnamed: 0,2017,2018,2019,Country,Unnamed: 4
0,3.88,4.03,3.89,Albania,
1,3.98,3.84,4.12,Algeria,
2,No data,2.08,2.76,Angola,
3,4.35,4.1,4.19,Argentina,
4,3.82,3.87,4.03,Armenia,


In [143]:
values = ['2017','2018','2019']

In [144]:
df = df.replace('No data', np.nan)

In [145]:
df[values] = df[values].astype(float)

In [146]:
df.head()

Unnamed: 0,2017,2018,2019,Country,Unnamed: 4
0,3.88,4.03,3.89,Albania,
1,3.98,3.84,4.12,Algeria,
2,,2.08,2.76,Angola,
3,4.35,4.1,4.19,Argentina,
4,3.82,3.87,4.03,Armenia,


In [147]:
df[values].describe()

Unnamed: 0,2017,2018,2019
count,132.0,136.0,137.0
mean,4.183258,4.136176,4.196058
std,0.664458,0.659186,0.589124
min,2.72,2.08,2.76
25%,3.685,3.6375,3.84
50%,4.065,4.095,4.17
75%,4.6925,4.655,4.63
max,5.67,5.75,5.32


In [148]:
# create standard columns
df['data_col'] = df['2019']
df['new_rank_score'] = df['data_col']
df['higher_is_better'] = True
df['Indicator'] = indicator
df['Year'] = 2019


df.rename(columns={'Country':'Country Name'}, inplace=True)


df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(15)


# output scores to csv
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

## 15. Amount invested into startups yearly from private, public, blended sources (respectively)


In [149]:
indicator = indicators[14]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Amount invested into startups yearly from private, public, blended sources (respectively)
start_up_investment


In [150]:
df.head(15)

Unnamed: 0,LOCATION,Country,SUBJECT,Subject,STAGES,Development stages,MEASURE,Measure,TIME,Year,Unit Code,Unit,PowerCode Code,PowerCode,Reference Period Code,Reference Period,Value,Flag Codes,Flags
0,AUS,Australia,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2006,2006,USD,US Dollar,6,Millions,,,456.334579,,
1,AUS,Australia,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2007,2007,USD,US Dollar,6,Millions,,,680.29317,,
2,AUS,Australia,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2008,2008,USD,US Dollar,6,Millions,,,755.759626,,
3,AUS,Australia,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2009,2009,USD,US Dollar,6,Millions,,,532.682779,,
4,AUS,Australia,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2010,2010,USD,US Dollar,6,Millions,,,367.836251,,
5,AUS,Australia,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2011,2011,USD,US Dollar,6,Millions,,,246.528233,,
6,AUS,Australia,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2012,2012,USD,US Dollar,6,Millions,,,331.331196,,
7,AUS,Australia,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2013,2013,USD,US Dollar,6,Millions,,,252.934084,,
8,AUS,Australia,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2014,2014,USD,US Dollar,6,Millions,,,265.918369,,
9,AUS,Australia,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2015,2015,USD,US Dollar,6,Millions,,,288.485377,,


In [151]:
df = df[(df['Development stages']=='Total') & (df.Year == 2019) & (df.MEASURE == 'USD_V')]
df.head(15)

Unnamed: 0,LOCATION,Country,SUBJECT,Subject,STAGES,Development stages,MEASURE,Measure,TIME,Year,Unit Code,Unit,PowerCode Code,PowerCode,Reference Period Code,Reference Period,Value,Flag Codes,Flags
13,AUS,Australia,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2019,2019,USD,US Dollar,6,Millions,,,487.310802,,
120,AUT,Austria,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2019,2019,USD,US Dollar,6,Millions,,,90.416881,,
232,BEL,Belgium,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2019,2019,USD,US Dollar,6,Millions,,,395.828165,,
343,CAN,Canada,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2019,2019,USD,US Dollar,6,Millions,,,3286.872933,,
448,CZE,Czech Republic,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2019,2019,USD,US Dollar,6,Millions,,,27.204683,,
538,DNK,Denmark,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2019,2019,USD,US Dollar,6,Millions,,,352.437466,,
650,FIN,Finland,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2019,2019,USD,US Dollar,6,Millions,,,321.566548,,
762,FRA,France,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2019,2019,USD,US Dollar,6,Millions,,,2164.94444,,
874,DEU,Germany,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2019,2019,USD,US Dollar,6,Millions,,,2379.672789,,
985,GRC,Greece,VC_INV,Venture capital investments,VC_T,Total,USD_V,"USD, current prices",2019,2019,USD,US Dollar,6,Millions,,,27.057841,,


In [152]:
# create standard columns
df['Country Name'] = df['Country']
df['Indicator'] = indicator
df['data_col'] = df['Value']
df['higher_is_better'] = True
df['Year'] = 2019

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

df = df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']]

In [153]:
df

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
13,Australia,2019,Amount invested into startups yearly from priv...,487.310802,1.017817,True
120,Austria,2019,Amount invested into startups yearly from priv...,90.416881,1.003187,True
232,Belgium,2019,Amount invested into startups yearly from priv...,395.828165,1.014445,True
343,Canada,2019,Amount invested into startups yearly from priv...,3286.872933,1.121012,True
448,Czech Republic,2019,Amount invested into startups yearly from priv...,27.204683,1.000857,True
538,Denmark,2019,Amount invested into startups yearly from priv...,352.437466,1.012846,True
650,Finland,2019,Amount invested into startups yearly from priv...,321.566548,1.011708,True
762,France,2019,Amount invested into startups yearly from priv...,2164.94444,1.079656,True
874,Germany,2019,Amount invested into startups yearly from priv...,2379.672789,1.087572,True
985,Greece,2019,Amount invested into startups yearly from priv...,27.057841,1.000852,True


In [154]:
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(bf), index=False)

## 16. Doing Business Index


In [155]:
indicator = indicators[15]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Doing Business Index
doing_bus_idx


In [156]:
df

Unnamed: 0,Country code,Economy,Region,Income group,DB Year,Ease of doing business rank (DB19),Ease of doing business score (DB17-19 methodology),Ease of doing business score (DB16 methodology),Ease of doing business score (DB15 methodology),Ease of doing business score (DB10-14 methodology),...,Recovery rate (cents on the dollar),Strength of insolvency framework index (0-16) (DB15-19 methodology),Commencement of proceedings index (0-3) (DB15-19 methodology),Management of debtor's assets index (0-6) (DB15-19 methodology),Reorganization proceedings index (0-3) (DB15-19 methodology),Creditor participation index (0-4) (DB15-19 methodology),Score-Recovery rate (cents on the dollar),Score-Strength of insolvency framework index (0-16) (DB15-19 methodology),data_country,data_year
0,AFG,Afghanistan,South Asia,Low income,2005,,,,,,...,0.0,,,,,,0.00,,,
1,AFG,Afghanistan,South Asia,Low income,2006,,,,,,...,0.0,,,,,,0.00,,,
2,AFG,Afghanistan,South Asia,Low income,2007,,,,,,...,0.0,,,,,,0.00,,,
3,AFG,Afghanistan,South Asia,Low income,2008,,,,,,...,25.1,,,,,,27.07,,,
4,AFG,Afghanistan,South Asia,Low income,2009,,,,,,...,24.3,,,,,,26.16,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3020,ZWE,Zimbabwe,Sub-Saharan Africa,Low income,2015,,,47.11,44.36,,...,13.8,5.0,3.0,2.0,0.0,0.0,14.81,31.25,,
3021,ZWE,Zimbabwe,Sub-Saharan Africa,Low income,2016,,47.74,47.94,,,...,16.1,5.0,3.0,2.0,0.0,0.0,17.38,31.25,,
3022,ZWE,Zimbabwe,Sub-Saharan Africa,Low income,2017,,47.73,,,,...,18.0,5.0,3.0,2.0,0.0,0.0,19.43,31.25,,
3023,ZWE,Zimbabwe,Sub-Saharan Africa,Low income,2018,,48.52,,,,...,19.7,5.0,3.0,2.0,0.0,0.0,21.17,31.25,,


In [157]:
df = df[(df['DB Year'] == 2019)]
df.head(15)

Unnamed: 0,Country code,Economy,Region,Income group,DB Year,Ease of doing business rank (DB19),Ease of doing business score (DB17-19 methodology),Ease of doing business score (DB16 methodology),Ease of doing business score (DB15 methodology),Ease of doing business score (DB10-14 methodology),...,Recovery rate (cents on the dollar),Strength of insolvency framework index (0-16) (DB15-19 methodology),Commencement of proceedings index (0-3) (DB15-19 methodology),Management of debtor's assets index (0-6) (DB15-19 methodology),Reorganization proceedings index (0-3) (DB15-19 methodology),Creditor participation index (0-4) (DB15-19 methodology),Score-Recovery rate (cents on the dollar),Score-Strength of insolvency framework index (0-16) (DB15-19 methodology),data_country,data_year
14,AFG,Afghanistan,South Asia,Low income,2019,167.0,47.77,,,,...,26.5,12.0,2.0,6.0,2.0,2.0,28.57,75.0,,
30,ALB,Albania,Europe & Central Asia,Upper middle income,2019,63.0,69.51,,,,...,44.0,14.0,3.0,6.0,3.0,2.0,47.33,87.5,,
46,DZA,Algeria,Middle East & North Africa,Upper middle income,2019,157.0,49.65,,,,...,50.8,7.0,3.0,2.0,1.0,1.0,54.72,43.75,,
62,AGO,Angola,Sub-Saharan Africa,Lower middle income,2019,173.0,43.86,,,,...,0.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,,
76,ATG,Antigua and Barbuda,Latin America & Caribbean,High income,2019,112.0,59.48,,,,...,36.8,5.0,2.0,2.0,0.0,1.0,39.56,31.25,,
92,ARG,Argentina,Latin America & Caribbean,High income,2019,119.0,58.8,,,,...,21.5,9.5,2.5,4.0,2.0,1.0,23.11,59.38,,
108,ARM,Armenia,Europe & Central Asia,Upper middle income,2019,41.0,75.37,,,,...,38.2,7.5,2.5,2.0,2.0,1.0,41.1,46.88,,
124,AUS,Australia,High income: OECD,High income,2019,18.0,80.13,,,,...,82.7,11.0,2.5,5.0,0.5,3.0,88.99,68.75,,
140,AUT,Austria,High income: OECD,High income,2019,26.0,78.57,,,,...,80.1,11.0,2.5,5.5,1.0,2.0,86.2,68.75,,
156,AZE,Azerbaijan,Europe & Central Asia,Upper middle income,2019,25.0,78.64,,,,...,40.1,13.5,3.0,6.0,1.5,3.0,43.21,84.38,,


In [158]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = False
df['Indicator'] = indicator
df['data_col'] = df.iloc[:,5]
df['Year'] = df.iloc[:,4]
df['Country Name'] = df.iloc[:,1]

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# convert 1-190 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=min_rank,old_max=max_rank))

# invert since to put as higher is  better
df['new_rank_score'] = df['new_rank_score'].apply(lambda row: (6-row)+1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['higher_is_better'] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Indicator'] = indicator
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['data_col'] = df.iloc[:,5]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value i

In [159]:
df = df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']]
df.head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
14,Afghanistan,2019,Doing Business Index,167.0,1.608466,False
30,Albania,2019,Doing Business Index,63.0,4.359788,False
46,Algeria,2019,Doing Business Index,157.0,1.873016,False
62,Angola,2019,Doing Business Index,173.0,1.449735,False
76,Antigua and Barbuda,2019,Doing Business Index,112.0,3.063492,False
92,Argentina,2019,Doing Business Index,119.0,2.878307,False
108,Armenia,2019,Doing Business Index,41.0,4.941799,False
124,Australia,2019,Doing Business Index,18.0,5.550265,False
140,Austria,2019,Doing Business Index,26.0,5.338624,False
156,Azerbaijan,2019,Doing Business Index,25.0,5.365079,False


In [160]:
df.to_csv('../indicator_scores/business_{}_scores.csv'.format(indicator), index=False)

### Score Aggregating

In [161]:
import os


In [162]:
# get list of files in scores folder
scores = os.listdir('../indicator_scores/')
scores = [s for s in scores if s.startswith('business')]

In [163]:
scores

['business Networking Services (Spend, IT Forecast Data)_scores.csv',
 'business_Cloud Services (Spend, IT Forecast Data)_scores.csv',
 'business_Doing Business Index_scores.csv',
 'business_Ease of Doing Business_scores.csv',
 'business_Ease of finding skilled employees_scores.csv',
 'business_ICT task-intensive jobs as a percentage of total employment_scores.csv',
 'business_Share of business with internet_scores.csv',
 'business_Share of businesses with broadband_scores.csv',
 'business_Share of businesses with online presence_scores.csv',
 'business_Size of digital economy (% of transactions)_scores.csv',
 'business_Size of gig economy (% of GDP)_scores.csv',
 'business_start_up_investment_scores.csv',
 'business_Strength of Legal Rights_scores.csv',
 'business_Time to Start a Business_scores.csv',
 'business_UNCTAD Business-to-Consumer (B2C) E-commerce Index_scores.csv',
 'business_Venture Capital Availability_scores.csv']

In [164]:
# create a dataframe that concatenates all these file into one table
df = pd.concat([pd.read_csv('../indicator_scores/{}'.format(s)) for s in scores])    

In [165]:
df

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better,Year,Country Code
0,Sweden,"Networking Services (Spend, IT Forecast Data)",82.75,5.128788,True,,
1,Denmark,"Networking Services (Spend, IT Forecast Data)",82.19,5.100505,True,,
2,Singapore,"Networking Services (Spend, IT Forecast Data)",81.39,5.060101,True,,
3,Netherlands,"Networking Services (Spend, IT Forecast Data)",81.37,5.059091,True,,
4,Switzerland,"Networking Services (Spend, IT Forecast Data)",80.41,5.010606,True,,
...,...,...,...,...,...,...,...
147,Vietnam,"Venture capital availability, 1-7 (best)",61.00,3.857143,True,,VNM
148,"Yemen, Rep.","Venture capital availability, 1-7 (best)",125.00,1.571429,True,,YEM
149,South Africa,"Venture capital availability, 1-7 (best)",77.00,3.285714,True,,ZAF
150,Zambia,"Venture capital availability, 1-7 (best)",139.00,1.071429,True,,ZMB


In [166]:
# Data cleaning
df['new_rank_score'] = df['new_rank_score'].fillna(0)
df.sort_values(by=['Country Name'], ascending=True, inplace=True)
df.reset_index(drop=True, inplace=True)

In [167]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2126 entries, 0 to 2125
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country Name      2126 non-null   object 
 1   Indicator         2126 non-null   object 
 2   data_col          1902 non-null   float64
 3   new_rank_score    2126 non-null   float64
 4   higher_is_better  2126 non-null   bool   
 5   Year              895 non-null    float64
 6   Country Code      1097 non-null   object 
dtypes: bool(1), float64(3), object(3)
memory usage: 101.9+ KB


In [168]:
df

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better,Year,Country Code
0,Afghanistan,Country rank and value in the UNCTAD B2C E-com...,132.00,1.419580,True,,AFG
1,Afghanistan,Time required to start a business (days),2.00,2.666667,True,,AFG
2,Afghanistan,Ease of doing business index (1=most business-...,173.00,1.449735,True,,AFG
3,Afghanistan,Doing Business Index,167.00,1.608466,False,2019.0,
4,Afghanistan,Strength of legal rights index (0=weak to 12=s...,10.00,5.166667,True,,AFG
...,...,...,...,...,...,...,...
2121,Zimbabwe,Size of digital economy (% of transactions),21.82,2.384359,True,2019.0,
2122,Zimbabwe,Ease of doing business index (1=most business-...,140.00,2.322751,True,,ZWE
2123,Zimbabwe,Time required to start a business (days),1.00,1.000000,True,,ZWE
2124,Zimbabwe,"Venture capital availability, 1-7 (best)",130.00,1.392857,True,,ZWE


In [169]:
df.head(15)

Unnamed: 0,Country Name,Indicator,data_col,new_rank_score,higher_is_better,Year,Country Code
0,Afghanistan,Country rank and value in the UNCTAD B2C E-com...,132.0,1.41958,True,,AFG
1,Afghanistan,Time required to start a business (days),2.0,2.666667,True,,AFG
2,Afghanistan,Ease of doing business index (1=most business-...,173.0,1.449735,True,,AFG
3,Afghanistan,Doing Business Index,167.0,1.608466,False,2019.0,
4,Afghanistan,Strength of legal rights index (0=weak to 12=s...,10.0,5.166667,True,,AFG
5,Africa Eastern and Southern,Time required to start a business (days),1.0,1.0,True,,AFE
6,Africa Eastern and Southern,Ease of doing business index (1=most business-...,,0.0,True,,AFE
7,Africa Eastern and Southern,Strength of legal rights index (0=weak to 12=s...,4.538462,2.891026,True,,AFE
8,Africa Western and Central,Time required to start a business (days),1.0,1.0,True,,AFW
9,Africa Western and Central,Ease of doing business index (1=most business-...,,0.0,True,,AFW


In [170]:
df.describe()

Unnamed: 0,data_col,new_rank_score,Year
count,1902.0,2126.0,895.0
mean,124.774771,2.880729,2018.613408
std,3113.416528,1.668241,1.411141
min,0.0,0.0,2010.0
25%,4.03,1.473003,2019.0
50%,22.35,3.012195,2019.0
75%,73.0,4.258492,2019.0
max,135648.690714,6.0,2020.0


In [171]:
df['Country Name'] = df['Country Name'].astype(str)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2126 entries, 0 to 2125
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country Name      2126 non-null   object 
 1   Indicator         2126 non-null   object 
 2   data_col          1902 non-null   float64
 3   new_rank_score    2126 non-null   float64
 4   higher_is_better  2126 non-null   bool   
 5   Year              895 non-null    float64
 6   Country Code      1097 non-null   object 
dtypes: bool(1), float64(3), object(3)
memory usage: 101.9+ KB


In [172]:
# checking country names
sorted(df['Country Name'].unique().tolist())

['Afghanistan',
 'Africa Eastern and Southern',
 'Africa Western and Central',
 'Albania',
 'Albania ',
 'Algeria',
 'Algeria ',
 'American Samoa',
 'Andorra',
 'Angola',
 'Angola ',
 'Antigua and Barbuda',
 'Arab World',
 'Argentina',
 'Argentina ',
 'Armenia',
 'Armenia ',
 'Aruba',
 'Australia',
 'Australia ',
 'Austria',
 'Austria ',
 'Azerbaijan',
 'Azerbaijan ',
 'B:',
 'Bahamas, The',
 'Bahrain',
 'Bahrain ',
 'Bangladesh',
 'Bangladesh ',
 'Bangladesh Chittagong',
 'Bangladesh Dhaka',
 'Barbados',
 'Belarus',
 'Belarus ',
 'Belgium',
 'Belgium ',
 'Belize',
 'Benin',
 'Benin ',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bolivia ',
 'Bosnia and Herzegovina',
 'Bosnia and Herzegovina ',
 'Botswana',
 'Botswana ',
 'Brazil',
 'Brazil ',
 'Brazil Rio de Janeiro',
 'Brazil São Paulo',
 'British Virgin Islands',
 'Brunei Darussalam',
 'Bulgaria',
 'Bulgaria ',
 'Burkina Faso',
 'Burkina Faso ',
 'Burundi',
 'Burundi ',
 'Cabo Verde',
 'Cabo Verde ',
 'Cambodia',
 'Cambodia ',
 'Cameroon',


In [173]:
# remove trailing whitespaces from country name
df['Country Name'] = df['Country Name'].str.strip()


In [174]:
# checking country names
sorted(df['Country Name'].unique().tolist())

['Afghanistan',
 'Africa Eastern and Southern',
 'Africa Western and Central',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Arab World',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'B:',
 'Bahamas, The',
 'Bahrain',
 'Bangladesh',
 'Bangladesh Chittagong',
 'Bangladesh Dhaka',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brazil Rio de Janeiro',
 'Brazil São Paulo',
 'British Virgin Islands',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Caribbean small states',
 'Cayman Islands',
 'Central African Republic',
 'Central Europe and the Baltics',
 'Chad',
 'Channel Islands',
 'Chile',
 'China',
 'China Beijing',
 'China Shanghai',
 'Colombia',
 'Comoros',
 'Congo, Dem. Rep.',
 'Congo, Rep.',
 'Costa Rica',
 "Cote d'Ivoire",
 'Croatia',
 'Cu

In [175]:
# average indicator scores per country
agg_df = df.groupby(['Country Name']).agg({'new_rank_score':'mean','data_col':'count'})

In [176]:
agg_df.columns = ['agg_score', 'count_source' ]

In [177]:
max_number_sources = agg_df.describe()['count_source']['max']

In [178]:
agg_df['agg_score_wt'] = agg_df['agg_score']*(agg_df['count_source']/max_number_sources)

In [179]:
agg_df.sort_values(by='agg_score', ascending=False, inplace=True)

In [180]:
agg_df.head(25)

Unnamed: 0_level_0,agg_score,count_source,agg_score_wt
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Taiwan, China",5.325132,3,0.998462
Singapore,5.020704,11,3.451734
Kosovo,4.824405,4,1.206101
New Zealand,4.718127,13,3.833479
United States,4.686297,14,4.10051
Sweden,4.488101,15,4.207594
Brunei Darussalam,4.487531,6,1.682824
Denmark,4.461287,12,3.345965
Germany,4.447015,16,4.447015
Israel,4.409727,11,3.031687


In [181]:
agg_df.to_csv('../pillar_scores/business_scores_v0.csv')