In [None]:
import pandas as pd
import numpy as np

## Data Source
- "data/DB_Indicators.xlsx" => World Bank economic indicators  
The last 5 rows are a footer, so they are dropped.

In [None]:
# Load the indicator workbook and discard its final five rows (the footer).
indicators_workbook = pd.read_excel("data/DB_Indicators.xlsx")
df = indicators_workbook.iloc[:-5]

## Filling missing values from existing values
1. If 2017 is NaN, fill it with 2018's values
2. If 2018 is NaN, fill it with 2017's values
3. If 2019 is NaN, fill it with the mean of 2017 & 2018

In [71]:
# Fill gaps year by year. Order matters: 2018 is filled from the already
# patched 2017 column, and 2019 from the average of the two patched columns.
df["2017"] = df["2017"].fillna(df["2018"])
df["2018"] = df["2018"].fillna(df["2017"])
two_year_average = (df["2017"] + df["2018"]) / 2
df["2019"] = df["2019"].fillna(two_year_average)

## Calculating the mean with values from 2017 to 2019

In [91]:
# Average only the three year columns. Averaging every numeric column would
# fold an already existing "Mean" column back into the result whenever this
# cell is re-run, silently changing the values.
year_columns = ["2017", "2018", "2019"]
df["Mean"] = df[year_columns].mean(axis=1)

In [78]:
# Export the cleaned indicator table to an Excel workbook.
# NOTE(review): the output directory is a hard-coded absolute path; confirm it
# exists on the machine running this notebook.
directory_path = "/mnt/s/Downloads/"
file_name = "cleaned_DB_Indicators.xlsx"
full_directory_path = directory_path + file_name
# The context manager saves and closes the workbook on exit, even if
# to_excel raises. ExcelWriter.save() was deprecated in pandas 1.5 and
# removed in pandas 2.0.
with pd.ExcelWriter(full_directory_path, engine="xlsxwriter") as writer:
    df.to_excel(writer, sheet_name="World Indicator", index=False)

## Extract the series (column) names and the country names

In [111]:
# The first 14 rows supply the series names; every 14th row (starting at the
# first) supplies one country name.
series_name = df["Series Name"].iloc[:14].tolist()
countries_name = df["Country Name"].iloc[::14].tolist()

['Adjusted net national income per capita (current US$)',
 'UHC service coverage index',
 'International tourism, number of arrivals',
 'Rail lines (total route-km)',
 'Fixed broadband subscriptions (per 100 people)',
 'Fixed telephone subscriptions (per 100 people)',
 'Mobile cellular subscriptions (per 100 people)',
 'Current health expenditure (% of GDP)',
 'Domestic general government health expenditure per capita, PPP (current international $)',
 'PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)',
 'Nitrous oxide emissions (thousand metric tons of CO2 equivalent)',
 'Population density (people per sq. km of land area)',
 'Urban population (% of total population)',
 'GDP per capita (current US$)']

## Extract each country's 14 values into a list

In [112]:
# Reshape the flat "Mean" column into one list of 14 values per country.
# This replaces a row-by-row iterrows() loop that depended on the index
# labels being 0..n-1 and silently discarded an incomplete trailing group.
group_size = 14
mean_values = df["Mean"].to_numpy()
if len(mean_values) % group_size != 0:
    raise ValueError(
        f"Expected a multiple of {group_size} rows, got {len(mean_values)}"
    )
# rows[i] holds the 14 means of the i-th consecutive block of 14 rows.
rows = mean_values.reshape(-1, group_size).tolist()

In [116]:
# Build the aggregated table: one row per entry of countries_name and one
# column per entry of series_name, filled from rows.
df_aggregated = pd.DataFrame(
    rows,
    columns=series_name,
    index=countries_name,
)

In [119]:
# Export the aggregated indicator table (index included) to an Excel workbook.
# NOTE(review): the output directory is a hard-coded absolute path; confirm it
# exists on the machine running this notebook.
directory_path = "/mnt/s/Downloads/"
file_name = "aggregated_DB_Indicators.xlsx"
full_directory_path = directory_path + file_name
# The context manager saves and closes the workbook on exit, even if
# to_excel raises. ExcelWriter.save() was deprecated in pandas 1.5 and
# removed in pandas 2.0.
with pd.ExcelWriter(full_directory_path, engine="xlsxwriter") as writer:
    df_aggregated.to_excel(writer, sheet_name="World Indicator")

## Data Source
- "data/DB_Nutrition.xlsx" => World Bank health indicators  
The last 5 rows are a footer, so they are dropped.

In [133]:
# Load the nutrition workbook and discard its final five rows (the footer).
nutrition_workbook = pd.read_excel("data/DB_Nutrition.xlsx")
df_n = nutrition_workbook.iloc[:-5]

In [134]:
# Fill gaps in the nutrition table from its OWN neighbouring years, then
# average the three years. The null masks, the fill values and the mean must
# all come from df_n; taking them from df (the indicators table) would mix
# unrelated indicator values into the nutrition data.
df_n.loc[df_n["2017"].isnull(), "2017"] = df_n["2018"]
df_n.loc[df_n["2018"].isnull(), "2018"] = df_n["2017"]
df_n.loc[df_n["2019"].isnull(), "2019"] = (df_n["2017"] + df_n["2018"]) / 2
# Average only the year columns so that re-running the cell does not fold an
# existing "Mean" column back into the result.
year_columns = ["2017", "2018", "2019"]
df_n["Mean"] = df_n[year_columns].mean(axis=1)

In [139]:
# The first 9 rows supply the nutrition series names; every 9th row (starting
# at the first) supplies one country name.
nutrition_series_name = df_n["Series Name"].iloc[:9].tolist()
nutrition_countries_name = df_n["Country Name"].iloc[::9].tolist()
# Displayed result: whether both workbooks list the same countries in the
# same order.
nutrition_countries_name == countries_name

True

In [140]:
# Reshape the flat "Mean" column into one list of 9 values per country.
# This replaces a row-by-row iterrows() loop that depended on the index
# labels being 0..n-1 and silently discarded an incomplete trailing group.
group_size = 9
mean_values = df_n["Mean"].to_numpy()
if len(mean_values) % group_size != 0:
    raise ValueError(
        f"Expected a multiple of {group_size} rows, got {len(mean_values)}"
    )
# rows[i] holds the 9 means of the i-th consecutive block of 9 rows.
rows = mean_values.reshape(-1, group_size).tolist()

In [142]:
# Build the aggregated nutrition table. It is indexed by countries_name (the
# list taken from the indicators workbook) so it lines up with df_aggregated.
df_aggregated_n = pd.DataFrame(
    rows,
    columns=nutrition_series_name,
    index=countries_name,
)

In [148]:
# Place the two aggregated tables side by side, keeping only index labels
# present in both.
frames_to_join = [df_aggregated, df_aggregated_n]
aggregated_result = pd.concat(frames_to_join, axis="columns", join="inner")

In [149]:
# Export the combined indicator + nutrition table (index included) to Excel.
# NOTE(review): the output directory is a hard-coded absolute path; confirm it
# exists on the machine running this notebook.
directory_path = "/mnt/s/Downloads/"
file_name = "aggregated_all.xlsx"
full_directory_path = directory_path + file_name
# The context manager saves and closes the workbook on exit, even if
# to_excel raises. ExcelWriter.save() was deprecated in pandas 1.5 and
# removed in pandas 2.0.
with pd.ExcelWriter(full_directory_path, engine="xlsxwriter") as writer:
    aggregated_result.to_excel(writer, sheet_name="All indicators")

## Confirmed cases and death cases

In [202]:
# Load the confirmed-case table from CSV.
confirmed_csv_path = "data/confirmed_cases.csv"
df_confirmed = pd.read_csv(confirmed_csv_path)

In [203]:
# Select the rows whose Province is Hong Kong or Macau. Despite the "_index"
# suffix these are row subsets (DataFrames); their .index is what gets used
# when the Country label is rewritten.
is_hong_kong = df_confirmed["Province"].eq("Hong Kong")
is_macau = df_confirmed["Province"].eq("Macau")
hk_index = df_confirmed[is_hong_kong]
macau_index = df_confirmed[is_macau]

Hong Kong and Macau are separated from China in the calculation.

In [209]:
# Overwrite the Country label of the previously selected Hong Kong and Macau
# rows so they are grouped on their own.
for region_rows, region_label in ((hk_index, "Hong Kong"), (macau_index, "Macau")):
    df_confirmed.loc[region_rows.index, "Country"] = region_label

Unnamed: 0,Province,Country,Confirmed
60,Henan,China,1276
61,Hong Kong,Hong Kong,714
62,Hubei,China,67801
63,Hunan,China,1018
64,Inner Mongolia,China,107
65,Jiangsu,China,646
66,Jiangxi,China,937
67,Jilin,China,98
68,Liaoning,China,139
69,Macau,Macau,41


In [210]:
# Attach to every row the sum of "Confirmed" over all rows that share its
# Country value.
confirmed_by_country = df_confirmed.groupby(["Country"])["Confirmed"]
df_confirmed["Total"] = confirmed_by_country.transform("sum")

In [211]:
# Keep only the first row encountered for each Country value.
df_c = df_confirmed.drop_duplicates(subset=["Country"], keep="first")

In [217]:
# Remove the "Province" column from the one-row-per-country table in place.
# Re-running this cell raises KeyError because the column is already gone.
del df_c["Province"]

In [219]:
# Remove the per-row "Confirmed" column in place; the per-country sum is kept
# in "Total". Re-running this cell raises KeyError because the column is
# already gone.
del df_c["Confirmed"]