In [1]:
import pandas as pd
import numpy as np

In [2]:
### Get all the pillar names from the excel

In [3]:
names = pd.read_excel('../../UNDP Digital Assessment Data Framework Filename Matching V7.xlsx')

In [4]:
col_names = ['Indicator','check', 'Data Source','Index','Filename']

In [5]:
names = names[col_names]

In [6]:
names.head()

Unnamed: 0,Indicator,check,Data Source,Index,Filename
0,Countries,,United Nations,False,Countries
1,"Database of Global Administrative Areas (GADM,...",,GADM maps and data,False,
2,High Resolution Population Density Maps + Demo...,,Facebook,False,
3,population density vs openstreetmap object den...,,Kontur,False,
4,Population Density,Infrastructure,World Bank,False,population_density


In [7]:
# get all the files per pillar
data_stats = names.groupby('check').agg({'Filename':'count','Indicator':'count'})

In [8]:
data_stats

Unnamed: 0_level_0,Filename,Indicator
check,Unnamed: 1_level_1,Unnamed: 2_level_1
Business,20,25
Foundations,9,12
Government,11,15
Infrastructure,45,48
People,38,46
Regulation,6,7
Strategy,1,1


In [9]:
### People

In [10]:
bnames = names[(names.check=='People')&(~names.Filename.isna())]#&(names.Index==False)]
bnames

Unnamed: 0,Indicator,check,Data Source,Index,Filename
99,Human Capital Index (HCI),People,DESA,True,e_government_index
100,% of population using internet (all),People,World Bank,False,population_using_internet
101,% of population using internet (female),People,ITU,False,gender_gap_internet_usage
102,% of population using internet (male),People,ITU,False,gender_gap_internet_usage
103,SDG 4.4 Digital literacy data,People,UNESCO,False,SDG 4.4_Digital_literacy_data
104,UNDP Human Development Index (HDI),People,UNDP,True,undp_human_developmnt
105,Facebook Social Connectedness Index,People,Facebook,True,fb_social_connectedness
106,Share of individuals using the Internet to int...,People,OECD,False,population_interacting_public_officials
107,Level of satisfaction for online public servic...,People,Boston Consulting Group/SalesForce,False,digital_public_service_use
108,Number of mobile apps available in national la...,People,GSMA Mobile Connectivity Index,False,apps_in_national_language


In [11]:
# get list of names for all indicators
indicators = bnames.Indicator.unique()

In [12]:
# get all file names
bfiles = bnames.Filename.unique()

In [13]:
bfiles

array(['e_government_index', 'population_using_internet',
       'gender_gap_internet_usage', 'SDG 4.4_Digital_literacy_data',
       'undp_human_developmnt', 'fb_social_connectedness',
       'population_interacting_public_officials',
       'digital_public_service_use', 'apps_in_national_language',
       'time_spent_online', ' happiness_score', 'cryptocurrency_adoption',
       'not_buying_online_concern_about_returning',
       'not_buying_online_concern_about_security',
       'ewaste_per_inhabitant', 'automation_led_unemployment',
       'cyberbullying_rate', 'global_wellbeing_initiative ',
       'financial_inclusiveness ',
       'individuals buying online and frequency', 'e-commerce_activity',
       'top_sites', 'youtube_searches', 'google_trends', 'intenet_usage',
       'household_internet_access', 'FB_users', 'gender_gaps',
       'population_digital_financial_services',
       'mobile_broadband_pricing', 'tax_percent_mobile_ownership',
       'percent_mobile_subscription'

In [14]:
# formula for converting scale
def convert_rank(old_value, old_min=1, old_max=7, new_min=1, new_max=6 ):
    """ Convert old scale values scale into new scale values"""
    old_range = old_max - old_min
    new_range = new_max - new_min
    new_value = (((old_value-old_min)*new_range)/old_range)+new_min
    return new_value

In [15]:
### 1. Human Capital Index (HCI)

In [16]:
indicators[0]

# load data
indicator = indicators[0]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Human Capital Index (HCI)
e_government_index


In [17]:
df.head(10)

Unnamed: 0,Survey Year,Country Name,E-Government Rank,E-Government Index,E-Participation Index,Online Service Index,Human Capital Index,Telecommunication Infrastructure Index
0,2020,Iraq,143,0.436,0.3095,0.3353,0.4358,0.537
1,2020,Ireland,27,0.8433,0.8571,0.7706,0.9494,0.81
2,2020,Israel,30,0.8361,0.7143,0.7471,0.8924,0.8689
3,2020,Italy,37,0.8231,0.8214,0.8294,0.8466,0.7932
4,2020,Jamaica,114,0.5392,0.369,0.3882,0.7142,0.5151
5,2020,Japan,14,0.8989,0.9881,0.9059,0.8684,0.9223
6,2020,Jordan,117,0.5309,0.3333,0.3588,0.68,0.554
7,2020,Kazakhstan,29,0.8375,0.881,0.9235,0.8866,0.7024
8,2020,Kenya,116,0.5326,0.5952,0.6765,0.5812,0.3402
9,2020,Kiribati,145,0.432,0.5595,0.4941,0.6778,0.1241


In [18]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Human Capital Index'] 
df['Year'] = df['Survey Year']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [19]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Iraq,2020,Human Capital Index (HCI),0.4358,3.179,True
1,Ireland,2020,Human Capital Index (HCI),0.9494,5.747,True
2,Israel,2020,Human Capital Index (HCI),0.8924,5.462,True
3,Italy,2020,Human Capital Index (HCI),0.8466,5.233,True
4,Jamaica,2020,Human Capital Index (HCI),0.7142,4.571,True
5,Japan,2020,Human Capital Index (HCI),0.8684,5.342,True
6,Jordan,2020,Human Capital Index (HCI),0.68,4.4,True
7,Kazakhstan,2020,Human Capital Index (HCI),0.8866,5.433,True
8,Kenya,2020,Human Capital Index (HCI),0.5812,3.906,True
9,Kiribati,2020,Human Capital Index (HCI),0.6778,4.389,True


In [20]:
### 2. % of population using internet (all)

In [21]:
indicators[1]

# load data
indicator = indicators[1]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population using internet (all)
population_using_internet


FileNotFoundError: [Errno 2] No such file or directory: '../../processed/population_using_internet.csv'

In [None]:
### 3. % of population using internet (female)

In [101]:
indicators[2]

# load data
indicator = indicators[2]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population using internet (female)
gender_gap_internet_usage


In [99]:
df.head(10)

# Must convert the string in the dataset to float

Economy name     object
Unnamed: 1      float64
year            float64
Individuals     float64
Male             object
Female           object
Total            object
data_country    float64
data_year       float64
dtype: object

In [95]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Female'].to_numeric
df['Country Name'] = df['Economy name']
df['Year'] = df['year']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

AttributeError: 'Series' object has no attribute 'to_numeric'

In [89]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Albania,2020.0,% of population using internet (female),72.2,4.492408,True
1,Algeria,2018.0,% of population using internet (female),49.0,3.234273,True
2,Andorra,2017.0,% of population using internet (female),91.6,5.544469,True
3,Argentina,2017.0,% of population using internet (female),74.3,4.606291,True
4,Armenia,2019.0,% of population using internet (female),66.5,4.183297,True
5,Australia,2017.0,% of population using internet (female),86.5,5.267896,True
6,Austria,2020.0,% of population using internet (female),87.5,5.322126,True
7,Azerbaijan,2019.0,% of population using internet (female),81.1,4.975054,True
8,Bahrain,2020.0,% of population using internet (female),99.5,5.972885,True
9,Bangladesh,2019.0,% of population using internet (female),12.9,1.276573,True


In [None]:
### 4. % of population using internet (male)

In [102]:
indicators[3]

# load data
indicator = indicators[3]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of population using internet (male)
gender_gap_internet_usage


In [103]:
df.head(10)

# Must convert the string in the dataset to float

Unnamed: 0,Economy name,Unnamed: 1,year,Individuals,Male,Female,Total,data_country,data_year
0,Albania,,2020.0,72.2,...,...,...,,
1,Algeria,,2018.0,49.0,41.9,25.9,34.1,,
2,Andorra,,2017.0,91.6,...,...,...,,
3,Argentina,,2017.0,74.3,...,...,...,,
4,Armenia,,2019.0,66.5,59.2,60.7,59.9,,
5,Australia,,2017.0,86.5,81.4,85.5,82.8,,
6,Austria,,2020.0,87.5,...,...,...,,
7,Azerbaijan,,2019.0,81.1,68.3,67.1,67.7,,
8,Bahrain,,2020.0,99.5,...,...,...,,
9,Bangladesh,,2019.0,12.9,...,...,...,,


In [None]:
### 5. SDG 4.4 Digital literacy data

In [104]:
indicators[4]

# load data
indicator = indicators[4]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

# Possibly have to change the name of the file to something like SDG_digital_literacy_data

SDG 4.4 Digital literacy data
SDG 4.4_Digital_literacy_data


FileNotFoundError: [Errno 2] No such file or directory: '../../processed/SDG 4.4_Digital_literacy_data.csv'

In [None]:
### 6. UNDP Human Development Index (HDI)

In [105]:
indicators[5]

# load data
indicator = indicators[5]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

UNDP Human Development Index (HDI) 
undp_human_developmnt


In [106]:
df.head(15)
# Need to move the top row down
# Maybe something is wrong with the link although the one in data manifest works perfectly

Unnamed: 0,Table 3. Inequality-adjusted Human Development Index,Unnamed: 1,data_country,data_year
0,Table 4. Gender Development Index,,,
1,Table 5. Gender Inequality Index,,,
2,Table 6. Multidimensional Poverty Index: devel...,,,
3,,,,
4,Human development indicators,,,
5,Table 7. Population trends,,,
6,Table 8. Health outcomes,,,
7,Table 9. Education achievements,,,
8,Table 10. National income and composition of r...,,,
9,Table 11. Work and employment,,,


In [None]:
### 7. Facebook Social Connectedness Index

In [None]:
indicators[6]

# load data
indicator = indicators[6]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(25)

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['scaled_sci'] 
df['Country Name'] = df['fr_loc']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name','Indicator','data_col','new_rank_score','higher_is_better']].head(30)

# Need to find a way to convert ISO codes to full country names

In [None]:
### 8. Share of individuals using the Internet to interact with officials

In [None]:
indicators[7]

# load data
indicator = indicators[7]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
# filter most recent year
df = df[(df.Time==2019)]
df = df[(df.Indicator=='Individuals using the Internet for visiting or interacting with public authorities websites - last 12 m (%)')]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Value'] 
df['Year'] = df['Time']
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

# Need to find a way to convert ISO codes to full country names

In [None]:
### 9. Level of satisfaction for online public service

In [None]:
indicators[8]

# load data
indicator = indicators[8]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Net Perception'] 
df['Year'] = 2020
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

# Need to replace the % mark 

In [None]:
### 10. Number of mobile apps available in national language

In [None]:
indicators[9]

# load data
indicator = indicators[9]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
# filter most recent year and global value
df = df[(df.Year==2019)]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Number of apps in national language'] 
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [None]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

In [None]:
### 11. Device Addiction (time of use on internet)

In [107]:
indicators[10]

# load data
indicator = indicators[10]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Device Addiction (time of use on internet / on devices)
time_spent_online


In [108]:
df.head(15)

# Need to move the row further down

Unnamed: 0,Country,"Average daily time spent using the internet by online users worldwide as of 3rd quarter 2020, by region (in hours.minutes)"
0,Philippines,10.56
1,Brazil,10.08
2,Colombia,10.07
3,South Africa,10.06
4,Argentina,9.39
5,Malaysia,9.17
6,Mexico,9.01
7,Indonesia,8.52
8,Thailand,8.44
9,Taiwan,8.08


In [111]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Average daily time spent using the internet by online users worldwide as of 3rd quarter 2020, by region (in hours.minutes)'] 
df['Country Name'] = df['Country']
df['Year'] = 2020

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [112]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Philippines,2020,Device Addiction (time of use on internet / on...,10.56,6.0,True
1,Brazil,2020,Device Addiction (time of use on internet / on...,10.08,5.619651,True
2,Colombia,2020,Device Addiction (time of use on internet / on...,10.07,5.611727,True
3,South Africa,2020,Device Addiction (time of use on internet / on...,10.06,5.603803,True
4,Argentina,2020,Device Addiction (time of use on internet / on...,9.39,5.0729,True
5,Malaysia,2020,Device Addiction (time of use on internet / on...,9.17,4.898574,True
6,Mexico,2020,Device Addiction (time of use on internet / on...,9.01,4.771791,True
7,Indonesia,2020,Device Addiction (time of use on internet / on...,8.52,4.383518,True
8,Thailand,2020,Device Addiction (time of use on internet / on...,8.44,4.320127,True
9,Taiwan,2020,Device Addiction (time of use on internet / on...,8.08,4.034865,True


In [None]:
### 12. Gross National Wellbeing

In [113]:
indicators[11]

# load data
indicator = indicators[11]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

# happiness_score is missing looks like the problem is in a redundant space before it in the filename matching

Gross National Wellbeing
 happiness_score


FileNotFoundError: [Errno 2] No such file or directory: '../../processed/ happiness_score.csv'

In [None]:
### 13. % of internet users who own cryptocurrency

In [114]:
indicators[12]

# load data
indicator = indicators[12]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

% of internet users who own cryptocurrency
cryptocurrency_adoption


In [116]:
df.head(15)

# Need to move the top row down

Unnamed: 0,Country,Share of respondents who indicated they either owned or used cryptocurrencies in 55 countries worldwide in 2020,Units
0,Nigeria,31.9,in %
1,Vietnam,21.1,in %
2,Philippines,19.8,in %
3,South Africa,17.8,in %
4,Thailand,17.6,in %
5,Peru,16.1,in %
6,Turkey,16.1,in %
7,Colombia,15.3,in %
8,Argentina,14.4,in %
9,Indonesia,13.0,in %


In [118]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Share of respondents who indicated they either owned or used cryptocurrencies in 55 countries worldwide in 2020'] 
df['Country Name'] = df['Country']
df['Year'] = 2020

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

In [119]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Nigeria,2020,% of internet users who own cryptocurrency,31.9,6.0,True
1,Vietnam,2020,% of internet users who own cryptocurrency,21.1,4.085106,True
2,Philippines,2020,% of internet users who own cryptocurrency,19.8,3.85461,True
3,South Africa,2020,% of internet users who own cryptocurrency,17.8,3.5,True
4,Thailand,2020,% of internet users who own cryptocurrency,17.6,3.464539,True
5,Peru,2020,% of internet users who own cryptocurrency,16.1,3.198582,True
6,Turkey,2020,% of internet users who own cryptocurrency,16.1,3.198582,True
7,Colombia,2020,% of internet users who own cryptocurrency,15.3,3.056738,True
8,Argentina,2020,% of internet users who own cryptocurrency,14.4,2.897163,True
9,Indonesia,2020,% of internet users who own cryptocurrency,13.0,2.648936,True


In [None]:
### 14. Percentage of individuals not buying online due to concerns about returning products

In [None]:
indicators[13]

# load data
indicator = indicators[13]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

In [None]:
df.head(15)

In [None]:
# filter most recent year and global value
df = df[(df.Time==2019)]
df = df[(df.Scope =='All individuals (aged 16-74)')]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = False
df['Indicator'] = indicator
df['data_col'] = df['Value'] 
df['Year'] = df['Time']
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# convert 1-190 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=min_rank,old_max=max_rank))

# invert since to put as higher is  better
df['new_rank_score'] = df['new_rank_score'].apply(lambda row: (6-row)+1)

In [None]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

In [22]:
### 15. Percentage of individuals not buying online due to payment security concerns

In [23]:
indicators[14]

# load data
indicator = indicators[14]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Percentage of individuals not buying online due to payment security concerns
not_buying_online_concern_about_security


In [24]:
df.head(15)

Unnamed: 0,Indicator,Country,Variable,Unit,Scope,Time,Value,Flags
0,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,All individuals (aged 16-74),2009,38.8093,
1,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,All individuals (aged 16-74),2015,34.8985,
2,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,All individuals (aged 16-74),2017,33.04743,
3,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,All individuals (aged 16-74),2019,31.19888,
4,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,Individuals aged 16-24,2009,38.3401,
5,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,Individuals aged 16-24,2015,34.9687,
6,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,Individuals aged 16-24,2017,16.65675,
7,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,Individuals aged 16-24,2019,12.88121,
8,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,Individuals aged 55-74,2009,40.5761,
9,Individuals who did not buy online for payment...,Austria,"Individuals who, in the last 12 months, haven'...",Percentage of individuals who ordered goods or...,Individuals aged 55-74,2015,42.602,


In [25]:
# filter most recent year and global value
df = df[(df.Time==2019)]
df = df[(df.Scope =='All individuals (aged 16-74)')]

# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = False
df['Indicator'] = indicator
df['data_col'] = df['Value'] 
df['Year'] = df['Time']
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# convert 1-190 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=min_rank,old_max=max_rank))

# invert since to put as higher is  better
df['new_rank_score'] = df['new_rank_score'].apply(lambda row: (6-row)+1)

In [26]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
3,Austria,2019,Percentage of individuals not buying online du...,31.19888,3.765819,False
15,Belgium,2019,Percentage of individuals not buying online du...,18.12961,4.853663,False
27,Czech Republic,2019,Percentage of individuals not buying online du...,9.814267,5.545805,False
39,Denmark,2019,Percentage of individuals not buying online du...,18.77055,4.800313,False
51,Estonia,2019,Percentage of individuals not buying online du...,4.357608,6.0,False
84,European Union (28 countries),2019,Percentage of individuals not buying online du...,24.3086,4.339344,False
96,Finland,2019,Percentage of individuals not buying online du...,64.42721,1.0,False
108,France,2019,Percentage of individuals not buying online du...,40.84349,2.963033,False
120,Germany,2019,Percentage of individuals not buying online du...,25.12184,4.271652,False
132,Greece,2019,Percentage of individuals not buying online du...,23.22381,4.429638,False


In [37]:
### 16. E-waste generated, kilograms per inhabitant

In [41]:
indicators[15]

# load data
indicator = indicators[15]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

E-waste generated, kilograms per inhabitant
ewaste_per_inhabitant


In [42]:
df.head(15)

Unnamed: 0,iso3c,region_id,country_name,income_id,gdp,composition_food_organic_waste_percent,composition_glass_percent,composition_metal_percent,composition_other_percent,composition_paper_cardboard_percent,...,waste_treatment_controlled_landfill_percent,waste_treatment_incineration_percent,waste_treatment_landfill_unspecified_percent,waste_treatment_open_dump_percent,waste_treatment_other_percent,waste_treatment_recycling_percent,waste_treatment_sanitary_landfill_landfill_gas_system_percent,waste_treatment_unaccounted_for_percent,waste_treatment_waterways_marine_percent,where_where_is_this_data_measured
0,ABW,LCN,Aruba,HIC,35563.3125,,,,,,...,,,,,,11.0,,89.0,,
1,AFG,SAS,Afghanistan,LIC,2057.062256,,,,,,...,,,,,,,,,,Other
2,AGO,SSF,Angola,LMC,8036.69043,51.8,6.7,4.4,11.5,11.9,...,,,,,,,,,,
3,ALB,ECS,Albania,UMC,13724.058594,51.4,4.5,4.8,15.21,9.9,...,,,,,,,,,,Some disposal sites
4,AND,ECS,Andorra,HIC,43711.800781,31.2,8.2,2.6,11.6,35.1,...,,52.1,,,,,,47.9,,
5,ARE,MEA,United Arab Emirates,HIC,67119.132812,39.0,4.0,3.0,10.0,25.0,...,,,9.0,62.0,,20.0,,,,
6,ARG,LCN,Argentina,HIC,23550.099609,38.74,3.16,1.84,15.36,13.96,...,8.9,,,22.6,,6.0,62.5,,,Other
7,ARM,ECS,Armenia,UMC,11019.838867,57.0,3.2,3.4,17.4,6.7,...,,,,100.0,,,,,,Other
8,ASM,EAS,American Samoa,UMC,11113.442383,19.7,3.4,7.9,25.6,26.4,...,,,,,,,,,,
9,ATG,LCN,Antigua and Barbuda,HIC,17965.501953,46.0,7.0,7.0,12.0,15.0,...,98.68,,,,,,,1.14,0.1,Disposal Site


In [44]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
# Use special_waste_e_waste_tons_year times one thousand and divde by total population
df['data_col'] = df['special_waste_e_waste_tons_year']*1000/df['population_population_number_of_people'] 
df['Year'] = 2021
df['Country Name'] = df['country_name']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# convert 1-190 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=min_rank,old_max=max_rank))

In [46]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Aruba,2021,"E-waste generated, kilograms per inhabitant",,,True
1,Afghanistan,2021,"E-waste generated, kilograms per inhabitant",0.5771,1.040883,True
2,Angola,2021,"E-waste generated, kilograms per inhabitant",3.665901,1.261006,True
3,Albania,2021,"E-waste generated, kilograms per inhabitant",7.00724,1.499126,True
4,Andorra,2021,"E-waste generated, kilograms per inhabitant",,,True
5,United Arab Emirates,2021,"E-waste generated, kilograms per inhabitant",13.714713,1.977133,True
6,Argentina,2021,"E-waste generated, kilograms per inhabitant",6.786638,1.483405,True
7,Armenia,2021,"E-waste generated, kilograms per inhabitant",4.817254,1.343057,True
8,American Samoa,2021,"E-waste generated, kilograms per inhabitant",,,True
9,Antigua and Barbuda,2021,"E-waste generated, kilograms per inhabitant",11.366337,1.809777,True


In [47]:
### 17. Automation-led unemployment

In [50]:
indicators[16]

# load data
indicator = indicators[16]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Automation-led unemployment
automation_led_unemployment


In [51]:
df.head(15)

Unnamed: 0,Country,No. of employees potentially automable (millions),Total employees (millions),Potential Rate of Automation (%)
0,Japan,35.6,63.9,55.71
1,Thailand,21.0,38.4,54.69
2,Senegal,2.2,4.07,54.0
3,Colombia,9.3,17.5,53.14
4,Peru,6.9,13.0,53.08
5,Taiwan,5.2,9.8,53.06
6,Kenya,7.4,14.2,52.11
7,South Korea,12.5,24.0,52.08
8,Sweden,2.1,4.04,52.0
9,Costa Rica,1.1,2.12,52.0


In [54]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['Potential Rate of Automation (%)'] 
df['Year'] = 2018
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# convert 1-190 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=min_rank,old_max=max_rank))

In [55]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,Japan,2018,Automation-led unemployment,55.71,6.0,True
1,Thailand,2018,Automation-led unemployment,54.69,5.656797,True
2,Senegal,2018,Automation-led unemployment,54.0,5.42463,True
3,Colombia,2018,Automation-led unemployment,53.14,5.135262,True
4,Peru,2018,Automation-led unemployment,53.08,5.115074,True
5,Taiwan,2018,Automation-led unemployment,53.06,5.108345,True
6,Kenya,2018,Automation-led unemployment,52.11,4.788694,True
7,South Korea,2018,Automation-led unemployment,52.08,4.7786,True
8,Sweden,2018,Automation-led unemployment,52.0,4.751682,True
9,Costa Rica,2018,Automation-led unemployment,52.0,4.751682,True


In [56]:
### 18. Cyberbullying

In [152]:
indicators[17]

# load data
indicator = indicators[17]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Cyberbullying
cyberbullying_rate


In [153]:
df.head(15)

# Need to move the top row down further

Unnamed: 0,2011,2016,2018,Country
0,32,32,37,India
1,20,19,29,Brazil
2,15,34,26,United States
3,12,13,25,Belgium
4,10,25,26,South Africa
5,--,--,23,Malaysia
6,14,20,23,Sweden
7,18,17,20,Canada
8,5,14,20,Turkey
9,18,17,19,Saudi Arabia


In [157]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['2018'] 
df['Year'] = 2018
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# convert 1-190 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row, old_min=min_rank,old_max=max_rank))

# invert since to put as higher is  better
df['new_rank_score'] = df['new_rank_score'].apply(lambda row: (6-row)+1)

In [158]:
df[['Country Name', 'Year','Indicator','data_col','new_rank_score','higher_is_better']].head(20)

Unnamed: 0,Country Name,Year,Indicator,data_col,new_rank_score,higher_is_better
0,India,2018,Cyberbullying,37,1.0,True
1,Brazil,2018,Cyberbullying,29,2.111111,True
2,United States,2018,Cyberbullying,26,2.527778,True
3,Belgium,2018,Cyberbullying,25,2.666667,True
4,South Africa,2018,Cyberbullying,26,2.527778,True
5,Malaysia,2018,Cyberbullying,23,2.944444,True
6,Sweden,2018,Cyberbullying,23,2.944444,True
7,Canada,2018,Cyberbullying,20,3.361111,True
8,Turkey,2018,Cyberbullying,20,3.361111,True
9,Saudi Arabia,2018,Cyberbullying,19,3.5,True


In [65]:
### 19. Global Wellbeing Initiative

In [69]:
indicators[18]

# load data
indicator = indicators[18]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

# Strange the global_wellbeing_initiative is clearly in the data manifest and the link works well 
# but it has not been moved to the processed folder

Global Wellbeing Initiative (World Happiness Index)
global_wellbeing_initiative 


FileNotFoundError: [Errno 2] No such file or directory: '../../processed/global_wellbeing_initiative .csv'

In [70]:
### 20. Financial Inclusiveness

In [72]:
indicators[19]

# load data
indicator = indicators[19]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)


df = pd.read_csv('../../processed/{}.csv'.format(bf))
# Strange the global_wellbeing_initiative is clearly in the data manifest and the link works well 
# but it has not been moved to the processed folder

Financial Inclusiveness
financial_inclusiveness 


FileNotFoundError: [Errno 2] No such file or directory: '../../processed/financial_inclusiveness .csv'

In [73]:
### 21. E-commerce activity (% of individuals buying online and frequency)

In [75]:
indicators[20]

# load data
indicator = indicators[20]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

# Added _ between spaces to see if this changes anything

E-commerce activity (% of individuals buying online and frequency)
individuals buying online and frequency


FileNotFoundError: [Errno 2] No such file or directory: '../../processed/individuals buying online and frequency.csv'

In [76]:
### 22. E-commerce activity (Types of goods and services purchased online) 

In [78]:
indicators[21]

# load data
indicator = indicators[21]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

# Move the top row down further
# This is a list of categories, wonder if it will do

E-commerce activity (Types of goods and services purchased online) 
e-commerce_activity


In [81]:
df.head(15)

Unnamed: 0,"Global online shopper per visit spend 2021, by category",Unnamed: 1
0,Average e-commerce spending per online shopper...,
1,Luxury apparel,3.45
2,Active apparel,2.96
3,General apparel,2.7
4,All verticals,3.39
5,Health and beauty,2.4
6,Home and appliances,2.08


In [121]:
### 23. Top Visited websites

In [123]:
indicators[22]

# load data
indicator = indicators[22]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Top Visited websites
top_sites


In [125]:
df.head(15)

Unnamed: 0,1,google.com
0,2,youtube.com
1,3,facebook.com
2,4,baidu.com
3,5,wikipedia.org
4,6,yahoo.com
5,7,google.co.in
6,8,reddit.com
7,9,qq.com
8,10,amazon.com
9,11,taobao.com


In [127]:
### 24. Top YouTube Searches

In [129]:
indicators[23]

# load data
indicator = indicators[23]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Top YouTube Searches
youtube_searches


In [131]:
df.head(15)

Unnamed: 0,#,Keyword,Search Volume
0,1,bts,16723304
1,2,pewdiepie,16495659
2,3,asmr,14655088
3,4,billie eilish,13801247
4,5,baby shark,12110100
5,6,old town road,10456524
6,7,music,10232134
7,8,badabun,10188997
8,9,blackpink,9580131
9,10,fortnite,9117342


In [133]:
### 25. Top Google searches

In [135]:
indicators[24]

# load data
indicator = indicators[24]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Top Google searches
google_trends


In [137]:
df.head(15)

Unnamed: 0.1,Unnamed: 0,data_country,data_year
0,13.6,13.6,13.6
1,4.1,4.1,4.1
2,4.1,4.1,4.1
3,4.1,4.1,4.1
4,2.2,2.2,2.2
5,2.2,2.2,2.2
6,1.8,1.8,1.8
7,1.5,1.5,1.5
8,1.5,1.5,1.5
9,1.2,1.2,1.2


In [None]:
### 26. Internet Usage

In [140]:
indicators[25]

# load data
indicator = indicators[25]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Internet Usage
intenet_usage


In [142]:
df.head(15)

# Need to rename the filename in filename matching 


Unnamed: 0,Country Code,Region,IncomeGroup,SpecialNotes,TableName,Unnamed: 5
0,ABW,Latin America & Caribbean,High income,,Aruba,
1,AFE,,,"26 countries, stretching from the Red Sea in t...",Africa Eastern and Southern,
2,AFG,South Asia,Low income,Fiscal year end: March 20; reporting period fo...,Afghanistan,
3,AFW,,,"22 countries, stretching from the westernmost ...",Africa Western and Central,
4,AGO,Sub-Saharan Africa,Lower middle income,,Angola,
5,ALB,Europe & Central Asia,Upper middle income,,Albania,
6,AND,Europe & Central Asia,High income,,Andorra,
7,ARB,,,Arab World aggregate. Arab World is composed o...,Arab World,
8,ARE,Middle East & North Africa,High income,,United Arab Emirates,
9,ARG,Latin America & Caribbean,Upper middle income,,Argentina,


In [143]:
### 27. Households with a computer and with Internet Access

In [177]:
indicators[26]

# load data
indicator = indicators[26]
print(indicator)
bf = bnames[bnames['Indicator']==indicator]['Filename'].values[0]
print(bf)

df = pd.read_csv('../../processed/{}.csv'.format(bf))

Households with a computer and with Internet access
household_internet_access


In [178]:
df['2016'].replace('No data', np.NaN)
df.head(15)

Unnamed: 0,2014,2015,2016,Country,Unnamed: 4
0,45.37,42.4,48,Brazil,
1,20.5,24.5,26.64,Albania,
2,19.36,23.8,25.94,Algeria,
3,7.17,7.9,No data,Angola,
4,47.5,53.9,52,Argentina,
5,25.4,35.6,46.6,Armenia,
6,81.37,83,86.89,Australia,
7,79.0,80.86,80.99,Austria,
8,46.8,51.5,54.62,Azerbaijan,
9,79.0,82,81,Bahrain,


In [175]:
# create standard columns
# df.rename(columns={'COUNTRY/ECONOMY':'Country Name'}, inplace=True)
df['higher_is_better'] = True
df['Indicator'] = indicator
df['data_col'] = df['2016'] 
df['Year'] = 2016
df['Country Name'] = df['Country']

min_rank = df['data_col'].min()
max_rank = df['data_col'].max()

# transform 0-1 rank into 1-6
df['new_rank_score'] = df['data_col'].apply(lambda row: convert_rank(row,old_min=min_rank,old_max=max_rank))

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [162]:
df[['Country Name','Year','Indicator','data_col','new_rank_score','higher_is_better']].head(15)

KeyError: "None of [Index(['Country Name', 'Year', 'Indicator', 'data_col', 'new_rank_score',\n       'higher_is_better'],\n      dtype='object')] are in the [columns]"