# Visualization COVID-19 data using Bar Chart Race
* you can find it here: [https://app.flourish.studio/projects](https://app.flourish.studio/projects)

## 1. Get raw data

In [3]:
import pandas as pd
doc = pd.read_csv('Examples/COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports/04-01-2020.csv', encoding='utf-8-sig')
doc.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001.0,Abbeville,South Carolina,US,2020-04-01 21:58:49,34.223334,-82.461707,4,0,0,0,"Abbeville, South Carolina, US"
1,22001.0,Acadia,Louisiana,US,2020-04-01 21:58:49,30.295065,-92.414197,47,1,0,0,"Acadia, Louisiana, US"
2,51001.0,Accomack,Virginia,US,2020-04-01 21:58:49,37.767072,-75.632346,7,0,0,0,"Accomack, Virginia, US"
3,16001.0,Ada,Idaho,US,2020-04-01 21:58:49,43.452658,-116.241552,195,3,0,0,"Ada, Idaho, US"
4,19001.0,Adair,Iowa,US,2020-04-01 21:58:49,41.330756,-94.471059,1,0,0,0,"Adair, Iowa, US"


## 2. Preprocess data
* select object columns
* delete NaN data
* change types
* unify column names

In [4]:
try:
    doc = doc[['Province_State', 'Country_Region', 'Confirmed']]  # select columns
except:
    doc = doc[['Province/State', 'Country/Region', 'Confirmed']]  # select columns
    doc.columns = ['Province_State', 'Country_Region', 'Confirmed']  # change columnn names

doc = doc.dropna(subset=['Confirmed'])    # delete NaN data
doc = doc.astype({'Confirmed': 'int64'})  # change data type
doc.head()

Unnamed: 0,Province_State,Country_Region,Confirmed
0,South Carolina,US,4
1,Louisiana,US,47
2,Virginia,US,7
3,Idaho,US,195
4,Iowa,US,1


* check country flags with their name

In [5]:
country_info = pd.read_csv('Examples/COVID-19-master/csse_covid_19_data/UID_ISO_FIPS_LookUP_Table.csv', encoding='utf-8-sig')
country_info.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key
0,0,0,,BW,,,,,,Botswana,,,Botswana
1,1,1,,BI,,,,,,Burundi,,,Burundi
2,2,2,,SL,,,,,,Sierra Leone,,,Sierra Leone
3,3,3,4.0,AF,AFG,4.0,,,,Afghanistan,33.93911,67.709953,Afghanistan
4,4,4,8.0,AL,ALB,8.0,,,,Albania,41.1533,20.1683,Albania


* merge two dataframe

In [6]:
df = pd.merge(doc, country_info, on='Country_Region', how='left')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7365315 entries, 0 to 7365314
Data columns (total 15 columns):
 #   Column            Dtype  
---  ------            -----  
 0   Province_State_x  object 
 1   Country_Region    object 
 2   Confirmed         int64  
 3   Unnamed: 0        float64
 4   Unnamed: 0.1      float64
 5   UID               float64
 6   iso2              object 
 7   iso3              object 
 8   code3             float64
 9   FIPS              float64
 10  Admin2            object 
 11  Province_State_y  object 
 12  Lat               float64
 13  Long_             float64
 14  Combined_Key      object 
dtypes: float64(7), int64(1), object(7)
memory usage: 899.1+ MB


In [7]:
nan_rows = df[df['iso2'].isnull()]
nan_rows.head()

Unnamed: 0.2,Province_State_x,Country_Region,Confirmed,Unnamed: 0,Unnamed: 0.1,UID,iso2,iso3,code3,FIPS,Admin2,Province_State_y,Lat,Long_,Combined_Key
7365162,,Diamond Princess,712,45.0,45.0,9999.0,,,,,,,0.0,0.0,Diamond Princess
7365226,,MS Zaandam,9,114.0,114.0,8888.0,,,,,,,0.0,0.0,",,MS Zaandam"
7365241,,Namibia,14,115.0,115.0,516.0,,NAM,516.0,,,,-22.9576,18.4904,Namibia
7365287,,Taiwan*,329,,,,,,,,,,,,


* As raw data varies, we need to unify column names manually...

In [8]:
import json

with open('Examples/COVID-19-master/csse_covid_19_data/country_convert.json', 'r', encoding='utf-8-sig') as json_file:
    json_data = json.load(json_file)
    print(json_data.keys())

dict_keys(['Mainland China', 'Macau', 'South Korea', 'Aruba', ' Azerbaijan', 'Bahamas, The', 'Cape Verde', 'Cayman Islands', 'Channel Islands', 'Curacao', 'Czech Republic', 'East Timor', 'Faroe Islands', 'French Guiana', 'Gambia, The', 'Gibraltar', 'Greenland', 'Guadeloupe', 'Guam', 'Guernsey', 'Hong Kong', 'Hong Kong SAR', 'Iran (Islamic Republic of)', 'Ivory Coast', 'Jersey', 'Macao SAR', 'Martinique', 'Mayotte', 'North Ireland', 'Palestine', 'Puerto Rico', 'Republic of Ireland', 'Republic of Korea', 'Republic of Moldova', 'Republic of the Congo', 'Reunion', 'Russian Federation', 'Saint Barthelemy', 'Saint Martin', 'St. Martin', 'Taipei and environs', 'The Bahamas', 'The Gambia', 'UK', 'Vatican City', 'Viet Nam', 'occupied Palestinian territory', 'Taiwan*', 'Malawi', 'South Sudan', 'Western Sahara', 'Namibia'])


### Using apply() function
* Using `apply()`, you can update selected column's values

In [9]:
df = pd.DataFrame({
    'A': [80, 90, 100],
    'B': [50, 60, 70]
}, index=['Top', 'Middle', 'Bottom'])
df

Unnamed: 0,A,B
Top,80,50
Middle,90,60
Bottom,100,70


In [10]:
def func(df_data):
    print(type(df_data))
    print(df_data.index)
    print(df_data.values)
    return df_data

df_func = df.apply(func, axis=0)  # by default: axis=0
df_func

<class 'pandas.core.series.Series'>
Index(['Top', 'Middle', 'Bottom'], dtype='object')
[ 80  90 100]
<class 'pandas.core.series.Series'>
Index(['Top', 'Middle', 'Bottom'], dtype='object')
[50 60 70]


Unnamed: 0,A,B
Top,80,50
Middle,90,60
Bottom,100,70


In [11]:
df_func = df.apply(func, axis=1)

<class 'pandas.core.series.Series'>
Index(['A', 'B'], dtype='object')
[80 50]
<class 'pandas.core.series.Series'>
Index(['A', 'B'], dtype='object')
[90 60]
<class 'pandas.core.series.Series'>
Index(['A', 'B'], dtype='object')
[100  70]


In [13]:
def func(df_data):
    df_data['A'] = 100
    return df_data

df_func = df.apply(func, axis=1)
df_func

Unnamed: 0,A,B
Top,100,50
Middle,100,60
Bottom,100,70


In [14]:
def func(row):
    if row['Country_Region'] in json_data:
        row['Country_Region'] = json_data[row['Country_Region']]
    return row

doc = doc.apply(func, axis=1)
doc.head()

Unnamed: 0,Province_State,Country_Region,Confirmed
0,South Carolina,US,4
1,Louisiana,US,47
2,Virginia,US,7
3,Idaho,US,195
4,Iowa,US,1


### Using groupby() function
* Using `groupby()`, you can group columns

In [22]:
import json

PATH = 'Examples/COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports/'

with open('Examples/COVID-19-master/csse_covid_19_data/country_convert.json', 'r', encoding='utf-8-sig') as json_file:
    json_data = json.load(json_file)

def country_name_convert(row):
    if row['Country_Region'] in json_data:
        return json_data[row['Country_Region']]
    return row['Country_Region']

def create_dateframe(filename):
    doc = pd.read_csv(PATH + filename, encoding='utf-8-sig')
    try:
        doc = doc[['Country_Region', 'Confirmed']]
    except:
        doc = doc[['Country/Region', 'Confirmed']]
        doc.columns = ['Country_Region', 'Confirmed']
    doc = doc.dropna(subset=['Confirmed'])
    doc['Country_Region'] = doc.apply(country_name_convert, axis=1)
    doc = doc.astype({'Confirmed': 'int64'})
    doc = doc.groupby('Country_Region').sum()
    
    date_column = filename.split(".")[0].lstrip('0').replace('-', '/')
    doc.columns = [date_column]
    return doc

In [24]:
doc1 = create_dateframe("01-22-2020.csv")
doc2 = create_dateframe("04-01-2020.csv")
doc2.head()

Unnamed: 0_level_0,4/01/2020
Country_Region,Unnamed: 1_level_1
Afghanistan,237
Albania,259
Algeria,847
Andorra,390
Angola,8


In [25]:
doc = pd.merge(doc1, doc2, how='outer', left_index=True, right_index=True)
doc.head()

Unnamed: 0_level_0,1/22/2020,4/01/2020
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,,237
Albania,,259
Algeria,,847
Andorra,,390
Angola,,8


In [28]:
doc.fillna(0)

Unnamed: 0_level_0,1/22/2020,4/01/2020
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,0.0,237
Albania,0.0,259
Algeria,0.0,847
Andorra,0.0,390
Angola,0.0,8
...,...,...
Venezuela,0.0,143
Vietnam,0.0,218
West Bank and Gaza,0.0,134
Zambia,0.0,36


In [35]:
import os

PATH = 'Examples/COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports/'
file_list, csv_list = os.listdir(PATH), list()

for file in file_list:
    if file.split(".")[-1] == 'csv':
        csv_list.append(file)

print(csv_list)

['02-26-2020.csv', '02-27-2020.csv', '06-07-2020.csv', '06-06-2020.csv', '04-08-2020.csv', '04-09-2020.csv', '02-18-2020.csv', '02-19-2020.csv', '03-24-2020.csv', '03-25-2020.csv', '04-02-2020.csv', '04-03-2020.csv', '03-10-2020.csv', '03-11-2020.csv', '05-01-2020.csv', '02-12-2020.csv', '02-13-2020.csv', '05-23-2020.csv', '05-22-2020.csv', '06-10-2020.csv', '06-11-2020.csv', '04-21-2020.csv', '04-20-2020.csv', '03-07-2020.csv', '03-06-2020.csv', '05-29-2020.csv', '05-28-2020.csv', '04-15-2020.csv', '04-14-2020.csv', '02-05-2020.csv', '02-04-2020.csv', '05-17-2020.csv', '05-16-2020.csv', '05-24-2020.csv', '05-25-2020.csv', '06-17-2020.csv', '06-16-2020.csv', '04-18-2020.csv', '04-19-2020.csv', '04-26-2020.csv', '04-27-2020.csv', '02-08-2020.csv', '02-09-2020.csv', '03-01-2020.csv', '04-12-2020.csv', '04-13-2020.csv', '02-02-2020.csv', '02-03-2020.csv', '01-31-2020.csv', '01-30-2020.csv', '05-10-2020.csv', '05-11-2020.csv', '02-21-2020.csv', '02-20-2020.csv', '06-01-2020.csv', '03-23-20

In [36]:
csv_list.sort()
csv_list

['01-22-2020.csv',
 '01-23-2020.csv',
 '01-24-2020.csv',
 '01-25-2020.csv',
 '01-26-2020.csv',
 '01-27-2020.csv',
 '01-28-2020.csv',
 '01-29-2020.csv',
 '01-30-2020.csv',
 '01-31-2020.csv',
 '02-01-2020.csv',
 '02-02-2020.csv',
 '02-03-2020.csv',
 '02-04-2020.csv',
 '02-05-2020.csv',
 '02-06-2020.csv',
 '02-07-2020.csv',
 '02-08-2020.csv',
 '02-09-2020.csv',
 '02-10-2020.csv',
 '02-11-2020.csv',
 '02-12-2020.csv',
 '02-13-2020.csv',
 '02-14-2020.csv',
 '02-15-2020.csv',
 '02-16-2020.csv',
 '02-17-2020.csv',
 '02-18-2020.csv',
 '02-19-2020.csv',
 '02-20-2020.csv',
 '02-21-2020.csv',
 '02-22-2020.csv',
 '02-23-2020.csv',
 '02-24-2020.csv',
 '02-25-2020.csv',
 '02-26-2020.csv',
 '02-27-2020.csv',
 '02-28-2020.csv',
 '02-29-2020.csv',
 '03-01-2020.csv',
 '03-02-2020.csv',
 '03-03-2020.csv',
 '03-04-2020.csv',
 '03-05-2020.csv',
 '03-06-2020.csv',
 '03-07-2020.csv',
 '03-08-2020.csv',
 '03-09-2020.csv',
 '03-10-2020.csv',
 '03-11-2020.csv',
 '03-12-2020.csv',
 '03-13-2020.csv',
 '03-14-2020

In [37]:
import os

def generate_dateframe_by_path(PATH):
    file_list, csv_list = os.listdir(PATH), list()
    first_doc = True
    for file in file_list:
        if file.split(".")[-1] == 'csv':
            csv_list.append(file)
    csv_list.sort()
    
    for file in csv_list:
        doc = create_dateframe(file)
        if first_doc:
            final_doc, first_doc = doc, False
        else:
            final_doc = pd.merge(final_doc, doc, how='outer', left_index=True, right_index=True)

    final_doc = final_doc.fillna(0)
    return final_doc

In [38]:
PATH = 'Examples/COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports/'
doc = generate_dateframe_by_path(PATH)
doc

Unnamed: 0_level_0,1/22/2020,1/23/2020,1/24/2020,1/25/2020,1/26/2020,1/27/2020,1/28/2020,1/29/2020,1/30/2020,1/31/2020,...,6/08/2020,6/09/2020,6/10/2020,6/11/2020,6/12/2020,6/13/2020,6/14/2020,6/15/2020,6/16/2020,6/17/2020
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,20917.0,21459.0,22142.0,22890.0,23546.0,24102.0,24766.0,25527.0,26310.0,26874.0
Albania,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1263.0,1299.0,1341.0,1385.0,1416.0,1464.0,1521.0,1590.0,1672.0,1722.0
Algeria,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10265.0,10382.0,10484.0,10589.0,10698.0,10810.0,10919.0,11031.0,11147.0,11268.0
Andorra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,852.0,852.0,852.0,852.0,853.0,853.0,853.0,853.0,854.0,854.0
Angola,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,92.0,96.0,113.0,118.0,130.0,138.0,140.0,142.0,148.0,155.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vietnam,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,332.0,332.0,332.0,332.0,333.0,334.0,334.0,334.0,334.0,335.0
West Bank and Gaza,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,473.0,481.0,485.0,487.0,489.0,489.0,492.0,505.0,514.0,555.0
Yemen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,496.0,524.0,560.0,591.0,632.0,705.0,728.0,844.0,885.0,902.0
Zambia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1200.0,1200.0,1200.0,1200.0,1321.0,1357.0,1358.0,1382.0,1405.0,1412.0


In [39]:
doc.astype('int64')

Unnamed: 0_level_0,1/22/2020,1/23/2020,1/24/2020,1/25/2020,1/26/2020,1/27/2020,1/28/2020,1/29/2020,1/30/2020,1/31/2020,...,6/08/2020,6/09/2020,6/10/2020,6/11/2020,6/12/2020,6/13/2020,6/14/2020,6/15/2020,6/16/2020,6/17/2020
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0,0,0,0,0,0,0,0,0,0,...,20917,21459,22142,22890,23546,24102,24766,25527,26310,26874
Albania,0,0,0,0,0,0,0,0,0,0,...,1263,1299,1341,1385,1416,1464,1521,1590,1672,1722
Algeria,0,0,0,0,0,0,0,0,0,0,...,10265,10382,10484,10589,10698,10810,10919,11031,11147,11268
Andorra,0,0,0,0,0,0,0,0,0,0,...,852,852,852,852,853,853,853,853,854,854
Angola,0,0,0,0,0,0,0,0,0,0,...,92,96,113,118,130,138,140,142,148,155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vietnam,0,2,2,2,2,2,2,2,2,2,...,332,332,332,332,333,334,334,334,334,335
West Bank and Gaza,0,0,0,0,0,0,0,0,0,0,...,473,481,485,487,489,489,492,505,514,555
Yemen,0,0,0,0,0,0,0,0,0,0,...,496,524,560,591,632,705,728,844,885,902
Zambia,0,0,0,0,0,0,0,0,0,0,...,1200,1200,1200,1200,1321,1357,1358,1382,1405,1412


In [40]:
doc.to_csv('Examples/COVID-19-master/final_df.csv')